# Normalization on Concatenation

## Z-score and Min-Max Normalization

In [1]:
from scipy import stats
import numpy as np
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from tqdm import tqdm

def zm(start_partition, end_partition, data, labels, data_dir, first_name, last_name,
       num_timestamps=60, num_attributes=24):
    """Z-score then min-max normalize each attribute of the chosen partitions and pickle them.

    Every partition is treated as a 2-D array whose columns are laid out
    attribute by attribute: attribute ``j`` occupies columns
    ``j*num_timestamps : (j+1)*num_timestamps``.  For each attribute, all values
    of all samples are pooled, z-scored with ``scipy.stats.zscore`` and then
    rescaled with the module-level ``scaler`` (a default ``MinMaxScaler``).
    An attribute whose pooled standard deviation is exactly 0 is replaced by
    ones.  Columns beyond ``num_timestamps*num_attributes`` are copied through
    unchanged.

    Two files are written per partition ``p`` (1-based):
    ``data_dir + first_name + "Partition" + p + last_name + ".pkl"`` with the
    normalized array, and the same name with ``"_Labels"`` inserted after the
    partition number holding ``labels[p-1]`` unchanged.

    Args:
        start_partition: First partition to process, 1-based and inclusive.
        end_partition: Last partition to process, 1-based and inclusive.
        data: Sequence of per-partition 2-D arrays (or array-likes).
        labels: Sequence of per-partition label objects, indexed like ``data``.
        data_dir: Output path prefix; it is concatenated as-is, so it must
            already end with a path separator.
        first_name: File-name part placed before ``"Partition"``.
        last_name: File-name part placed before ``".pkl"``.
        num_timestamps: Number of consecutive columns per attribute.
        num_attributes: Number of attributes to normalize.

    Raises:
        ValueError: If a partition is not 2-D or has fewer than
            ``num_timestamps*num_attributes`` columns.
    """
    # Local import so the function does not depend on a later notebook cell
    # having imported pickle before it is called.
    import pickle

    required_width = num_timestamps * num_attributes

    for i in range(start_partition - 1, end_partition):
        # np.array copies, so the caller's data[i] is never modified.
        new_partition = np.array(data[i])
        if new_partition.ndim != 2 or new_partition.shape[1] < required_width:
            raise ValueError(
                "Partition " + str(i + 1) + " has shape " + str(new_partition.shape)
                + "; expected 2-D with at least " + str(required_width) + " columns"
            )
        num_samples = new_partition.shape[0]

        # total= gives the bar a known length; a bare positional int would be
        # taken as tqdm's `iterable` argument instead.
        with tqdm(total=num_attributes) as pbar:
            for j in range(num_attributes):
                cols = slice(j * num_timestamps, (j + 1) * num_timestamps)

                # Pool the attribute over all samples in row-major order:
                # sample m lands at [m*num_timestamps : (m+1)*num_timestamps].
                new_attribute = np.array(new_partition[:, cols], dtype=np.float64).reshape(-1)

                if np.std(new_attribute) == 0.0:
                    # Constant attribute: z-score is undefined, use ones.
                    minmax = np.ones(num_samples * num_timestamps)
                else:
                    zscore = stats.zscore(new_attribute)
                    minmax = scaler.fit_transform(zscore.reshape(-1, 1)).flatten()
                if np.isnan(minmax).any() or np.isinf(minmax).any():
                    print('nan-zscore')

                # Undo the pooling: one row of num_timestamps values per sample.
                new_partition[:, cols] = minmax.reshape(num_samples, num_timestamps)

                pbar.update(1)

        with open(data_dir + first_name + "Partition" + str(i+1) + last_name +".pkl", 'wb') as f:
            pickle.dump(new_partition, f)
        with open(data_dir + first_name + "Partition" + str(i+1) + "_Labels" + last_name +".pkl", 'wb') as f:
            pickle.dump(labels[i], f)


### RUS_Tomek_Smote

In [2]:
import pickle
import pandas as pd

# Read the pickled feature arrays and labels of partitions 1..num_partitions
# for the RUS_Tomek_Smote files into two parallel lists.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/6_1_OUSampling_Concatenation_KnnImputation/"
num_partitions = 5

X_train_array = []
Y_train_array = []

for i in range(num_partitions):
    # Shared prefix of the feature file and the label file of this partition.
    stem = data_dir + "RUS_Tomek_Smote_Partition" + str(i + 1)
    with open(stem + "_WithoutC_Concatenation_KnnImputation.pkl", "rb") as f:
        X_train_array.append(pickle.load(f))
    with open(stem + "_Labels_WithoutC_Concatenation_KnnImputation.pkl", "rb") as f:
        Y_train_array.append(pickle.load(f))

In [3]:
# Output prefix and file-name parts for the normalized RUS_Tomek_Smote partitions.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/8_3_FinalData_OUSampling_Concatenation_ZM_KnnImputation/"
first_name = 'RUS_Tomek_Smote_'
last_name = '_WithoutC_Concatenation_ZM_KnnImputation'

# Normalize partitions 1 through 5 and write them (with their labels) to data_dir.
zm(
    start_partition=1,
    end_partition=5,
    data=X_train_array,
    labels=Y_train_array,
    data_dir=data_dir,
    first_name=first_name,
    last_name=last_name,
)

24it [00:00, 44.74it/s]
24it [00:00, 41.17it/s]
24it [00:00, 45.87it/s]
24it [00:00, 49.55it/s]
24it [00:00, 48.77it/s]


### RUS_Tomek_Adasyn

In [4]:
import pickle
import pandas as pd

# Read the pickled feature arrays and labels of partitions 1..num_partitions
# for the RUS_Tomek_Adasyn files into two parallel lists.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/6_1_OUSampling_Concatenation_KnnImputation/"
num_partitions = 5

X_train_array = []
Y_train_array = []

for i in range(num_partitions):
    # Shared prefix of the feature file and the label file of this partition.
    stem = data_dir + "RUS_Tomek_Adasyn_Partition" + str(i + 1)
    with open(stem + "_WithoutC_Concatenation_KnnImputation.pkl", "rb") as f:
        X_train_array.append(pickle.load(f))
    with open(stem + "_Labels_WithoutC_Concatenation_KnnImputation.pkl", "rb") as f:
        Y_train_array.append(pickle.load(f))

In [5]:
# Output prefix and file-name parts for the normalized RUS_Tomek_Adasyn partitions.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/8_3_FinalData_OUSampling_Concatenation_ZM_KnnImputation/"
first_name = 'RUS_Tomek_Adasyn_'
last_name = '_WithoutC_Concatenation_ZM_KnnImputation'

# Normalize partitions 1 through 5 and write them (with their labels) to data_dir.
zm(
    start_partition=1,
    end_partition=5,
    data=X_train_array,
    labels=Y_train_array,
    data_dir=data_dir,
    first_name=first_name,
    last_name=last_name,
)

24it [00:00, 46.63it/s]
24it [00:00, 49.99it/s]
24it [00:00, 50.58it/s]
24it [00:00, 51.08it/s]
24it [00:00, 51.02it/s]


### Smote

In [6]:
import pickle
import pandas as pd

# Read the pickled feature arrays and labels of partitions 1..num_partitions
# for the Smote files into two parallel lists.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/6_2_OSampling_Concatenation_KnnImputation/"
num_partitions = 5

X_train_array = []
Y_train_array = []

for i in range(num_partitions):
    # Shared prefix of the feature file and the label file of this partition.
    stem = data_dir + "Smote_Partition" + str(i + 1)
    with open(stem + "_OSampling_Concatenation_KnnImputation.pkl", "rb") as f:
        X_train_array.append(pickle.load(f))
    with open(stem + "_Labels_OSampling_Concatenation_KnnImputation.pkl", "rb") as f:
        Y_train_array.append(pickle.load(f))

In [7]:
# Output prefix and file-name parts for the normalized Smote partitions.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/8_4_FinalData_OSampling_Concatenation_ZM_KnnImputation/"
first_name = 'Smote_'
last_name = '_OSampling_Concatenation_ZM_KnnImputation'

# Normalize partitions 1 through 5 and write them (with their labels) to data_dir.
zm(
    start_partition=1,
    end_partition=5,
    data=X_train_array,
    labels=Y_train_array,
    data_dir=data_dir,
    first_name=first_name,
    last_name=last_name,
)

24it [00:04,  5.20it/s]
24it [00:05,  4.16it/s]
24it [00:02,  9.59it/s]
24it [00:03,  7.78it/s]
24it [00:04,  5.29it/s]


### Adasyn 

In [8]:
import pickle
import pandas as pd

# Read the pickled feature arrays and labels of partitions 1..num_partitions
# for the Adasyn files into two parallel lists.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/6_2_OSampling_Concatenation_KnnImputation/"
num_partitions = 5

X_train_array = []
Y_train_array = []

for i in range(num_partitions):
    # Shared prefix of the feature file and the label file of this partition.
    stem = data_dir + "Adasyn_Partition" + str(i + 1)
    with open(stem + "_OSampling_Concatenation_KnnImputation.pkl", "rb") as f:
        X_train_array.append(pickle.load(f))
    with open(stem + "_Labels_OSampling_Concatenation_KnnImputation.pkl", "rb") as f:
        Y_train_array.append(pickle.load(f))

In [9]:
# Output prefix and file-name parts for the normalized Adasyn partitions.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/8_4_FinalData_OSampling_Concatenation_ZM_KnnImputation/"
first_name = 'Adasyn_'
last_name = '_OSampling_Concatenation_ZM_KnnImputation'

# Normalize partitions 1 through 5 and write them (with their labels) to data_dir.
zm(
    start_partition=1,
    end_partition=5,
    data=X_train_array,
    labels=Y_train_array,
    data_dir=data_dir,
    first_name=first_name,
    last_name=last_name,
)

24it [00:05,  4.66it/s]
24it [00:05,  4.51it/s]
24it [00:02,  9.61it/s]
24it [00:03,  7.74it/s]
24it [00:04,  5.21it/s]
