# Normalization on Concatenated Multivariate Time Series Data (2D)

## LSBZM Normalization Technique

In [4]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from scipy import stats
from tqdm import tqdm

def lsbzm(start_partition, end_partition, data, labels, data_dir, first_name, last_name,
          num_timestamps=60, num_attributes=24):
    """Normalize a range of partitions with LSBZM and pickle the results.

    Every partition is handled as a 2-D array whose columns are grouped by
    attribute: columns ``j*num_timestamps`` up to (not including)
    ``(j+1)*num_timestamps`` hold the time steps of attribute ``j``. For each
    attribute the values of all rows are pooled into one vector, transformed
    (log, square root, Box-Cox or z-score, chosen from the pooled range and
    skewness), min-max scaled with the module-level ``scaler`` and written
    back over the original columns.

    Two pickle files are written per partition:
    ``data_dir + first_name + "Partition<k>" + last_name + ".pkl"`` for the
    normalized array and the same name with ``"_Labels"`` inserted before
    ``last_name`` for ``labels[k-1]``.

    Args:
        start_partition: 1-based number of the first partition to process.
        end_partition: 1-based number of the last partition to process
            (inclusive).
        data: Indexable collection of partitions; ``data[i]`` must convert
            to a 2-D numeric array with at least
            ``num_timestamps * num_attributes`` columns. It is copied, never
            modified in place.
        labels: Indexable collection; ``labels[i]`` is pickled unchanged.
        data_dir: Output directory, concatenated verbatim with the file
            name (so it has to end with a path separator).
        first_name: File-name prefix placed before ``"Partition<k>"``.
        last_name: File-name suffix placed before ``".pkl"``.
        num_timestamps: Number of consecutive columns per attribute.
        num_attributes: Number of attributes to normalize.

    Raises:
        ValueError: If a partition is not 2-D or has fewer than
            ``num_timestamps * num_attributes`` columns.
    """
    required_columns = num_timestamps * num_attributes

    for i in range(start_partition - 1, end_partition):
        # np.array copies its input, so the caller's data[i] stays untouched.
        new_partition = np.array(data[i])
        if new_partition.ndim != 2 or new_partition.shape[1] < required_columns:
            raise ValueError(
                f"Partition {i + 1} must be 2-D with at least {required_columns} "
                f"columns, got shape {new_partition.shape}"
            )
        # The normalized values lie in [0, 1]; writing them into an integer
        # array would silently truncate them, so promote non-float input.
        if not np.issubdtype(new_partition.dtype, np.floating):
            new_partition = new_partition.astype(np.float64)

        num_rows = new_partition.shape[0]

        # total= makes tqdm show real progress; a bare int as the first
        # positional argument is taken as the iterable and gives no total.
        with tqdm(total=num_attributes) as pbar:
            for j in range(num_attributes):
                columns = slice(j * num_timestamps, (j + 1) * num_timestamps)
                # Row-major flatten: row m occupies
                # [m*num_timestamps, (m+1)*num_timestamps) of the pooled vector.
                pooled = new_partition[:, columns].astype(np.float64).reshape(-1)
                normalized = _lsbzm_normalize_attribute(pooled)
                new_partition[:, columns] = normalized.reshape(num_rows, num_timestamps)
                pbar.update(1)

        partition_stem = data_dir + first_name + "Partition" + str(i + 1)
        with open(partition_stem + last_name + ".pkl", 'wb') as f:
            pickle.dump(new_partition, f)
        with open(partition_stem + "_Labels" + last_name + ".pkl", 'wb') as f:
            pickle.dump(labels[i], f)


def _lsbzm_normalize_attribute(values):
    """Return the LSBZM-normalized version of one pooled attribute vector.

    Transform selection (``range`` is ``max - min`` of ``values``):

    * range > 100000 or range < 0.00001:
      skewness > 1 -> log, skewness < -1 -> square root, otherwise z-score.
    * any other range:
      abs(skewness) > 1 -> Box-Cox, otherwise z-score.

    Before log and Box-Cox a negative minimum is shifted by ``2*abs(min)``;
    before the square root it is shifted by ``abs(min)``. The transformed
    values are min-max scaled with the module-level ``scaler``. A vector with
    zero standard deviation is mapped to all ones. A diagnostic tag
    (``nan-log``, ``nan-sqrt``, ``nan-boxcox`` or ``nan-zscore``) is printed
    when the result contains NaN or infinity.
    """
    the_min = np.min(values)
    value_range = np.max(values) - the_min
    skewness = stats.skew(values)
    extreme_range = value_range > 100000 or value_range < 0.00001

    if extreme_range and skewness > 1:
        label, shift_factor, transform = 'log', 2, np.log
    elif extreme_range and skewness < -1:
        label, shift_factor, transform = 'sqrt', 1, np.sqrt
    elif not extreme_range and (skewness > 1 or skewness < -1):
        # stats.boxcox returns (transformed, lambda); only the data is needed.
        label, shift_factor, transform = 'boxcox', 2, lambda x: stats.boxcox(x)[0]
    else:
        label, shift_factor, transform = 'zscore', 0, stats.zscore

    # NOTE(review): log and Box-Cox need strictly positive input, but a pooled
    # minimum of exactly 0 is not shifted (unchanged from the original logic).
    # Confirm whether such attributes can occur and which offset is wanted.
    if shift_factor and the_min < 0:
        shifted = values + shift_factor * abs(the_min)
    else:
        shifted = values

    if np.std(shifted) == 0.0:
        normalized = np.ones(values.shape[0])
    else:
        normalized = scaler.fit_transform(transform(shifted).reshape(-1, 1)).flatten()

    if np.isnan(normalized).any() or np.isinf(normalized).any():
        print('nan-' + label)

    return normalized

### Reading the Data built by the Previous Notebooks (RUS_Tomek_Smote)

In [5]:
import pickle
import pandas as pd
import numpy as np

# Input folder holding the pickled partitions built by the previous notebooks.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/6_1_OUSampling_Concatenation_KnnImputation/"

num_partitions = 5
X_train_array, Y_train_array = [], []

# Partition files are numbered from 1; each has a matching "_Labels" pickle.
for i in range(num_partitions):
    partition_stem = data_dir + "RUS_Tomek_Smote_Partition" + str(i + 1)
    with open(partition_stem + "_OUSampling_WithoutC_Concatenation_KnnImputation.pkl", 'rb') as f:
        X_train_array.append(pickle.load(f))
    with open(partition_stem + "_Labels_OUSampling_WithoutC_Concatenation_KnnImputation.pkl", 'rb') as f:
        Y_train_array.append(pickle.load(f))

In [6]:
# Output folder plus the file-name prefix/suffix used for the normalized pickles.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/8_3_FinalData_OUSampling_Concatenation_LSBZM_KnnImputation/"
first_name = 'RUS_Tomek_Smote_'
last_name = '_WithoutC_Concatenation_LSBZM_KnnImputation'

# Normalize partitions 1 through 5 and store each one together with its labels.
lsbzm(
    start_partition=1,
    end_partition=5,
    data=X_train_array,
    labels=Y_train_array,
    data_dir=data_dir,
    first_name=first_name,
    last_name=last_name,
)

24it [00:02, 10.97it/s]
24it [00:01, 13.86it/s]
24it [00:02,  9.96it/s]
24it [00:02, 11.65it/s]
24it [00:01, 13.82it/s]


### Reading the Data built by the Previous Notebooks (RUS_Tomek_Adasyn)

In [7]:
import pickle
import pandas as pd

# Input folder holding the pickled partitions built by the previous notebooks.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/6_1_OUSampling_Concatenation_KnnImputation/"

num_partitions = 5
X_train_array, Y_train_array = [], []

# Partition files are numbered from 1; each has a matching "_Labels" pickle.
for i in range(num_partitions):
    partition_stem = data_dir + "RUS_Tomek_Adasyn_Partition" + str(i + 1)
    with open(partition_stem + "_OUSampling_WithoutC_Concatenation_KnnImputation.pkl", 'rb') as f:
        X_train_array.append(pickle.load(f))
    with open(partition_stem + "_Labels_OUSampling_WithoutC_Concatenation_KnnImputation.pkl", 'rb') as f:
        Y_train_array.append(pickle.load(f))

In [8]:
# Output folder plus the file-name prefix/suffix used for the normalized pickles.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/8_3_FinalData_OUSampling_Concatenation_LSBZM_KnnImputation/"
first_name = 'RUS_Tomek_Adasyn_'
last_name = '_WithoutC_Concatenation_LSBZM_KnnImputation'

# Normalize partitions 1 through 5 and store each one together with its labels.
lsbzm(
    start_partition=1,
    end_partition=5,
    data=X_train_array,
    labels=Y_train_array,
    data_dir=data_dir,
    first_name=first_name,
    last_name=last_name,
)

24it [00:02, 11.53it/s]
24it [00:01, 13.24it/s]
24it [00:02, 10.02it/s]
24it [00:02, 11.16it/s]
24it [00:01, 13.50it/s]


### Reading the Data built by the Previous Notebooks (Smote)

In [9]:
import pickle
import pandas as pd

# Input folder holding the pickled partitions built by the previous notebooks.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/6_2_OSampling_Concatenation_KnnImputation/"

num_partitions = 5
X_train_array, Y_train_array = [], []

# Partition files are numbered from 1; each has a matching "_Labels" pickle.
for i in range(num_partitions):
    partition_stem = data_dir + "Smote_Partition" + str(i + 1)
    with open(partition_stem + "_OSampling_WithoutC_Concatenation_KnnImputation.pkl", 'rb') as f:
        X_train_array.append(pickle.load(f))
    with open(partition_stem + "_Labels_OSampling_WithoutC_Concatenation_KnnImputation.pkl", 'rb') as f:
        Y_train_array.append(pickle.load(f))

In [10]:
# Output folder plus the file-name prefix/suffix used for the normalized pickles.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/8_4_FinalData_OSampling_Concatenation_LSBZM_KnnImputation/"
first_name = 'Smote_'
last_name = '_OSampling_WithoutC_Concatenation_LSBZM_KnnImputation'

# Normalize partitions 1 through 5 and store each one together with its labels.
lsbzm(
    start_partition=1,
    end_partition=5,
    data=X_train_array,
    labels=Y_train_array,
    data_dir=data_dir,
    first_name=first_name,
    last_name=last_name,
)

24it [00:15,  1.51it/s]
24it [00:18,  1.27it/s]
24it [00:09,  2.61it/s]
24it [00:10,  2.37it/s]
24it [00:15,  1.53it/s]


### Reading the Data built by the Previous Notebooks (Adasyn)

In [11]:
import pickle
import pandas as pd

# Input folder holding the pickled partitions built by the previous notebooks.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/6_2_OSampling_Concatenation_KnnImputation/"

num_partitions = 5
X_train_array, Y_train_array = [], []

# Partition files are numbered from 1; each has a matching "_Labels" pickle.
for i in range(num_partitions):
    partition_stem = data_dir + "Adasyn_Partition" + str(i + 1)
    with open(partition_stem + "_OSampling_WithoutC_Concatenation_KnnImputation.pkl", 'rb') as f:
        X_train_array.append(pickle.load(f))
    with open(partition_stem + "_Labels_OSampling_WithoutC_Concatenation_KnnImputation.pkl", 'rb') as f:
        Y_train_array.append(pickle.load(f))

In [12]:
# Output folder plus the file-name prefix/suffix used for the normalized pickles.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/8_4_FinalData_OSampling_Concatenation_LSBZM_KnnImputation/"
first_name = 'Adasyn_'
last_name = '_OSampling_WithoutC_Concatenation_LSBZM_KnnImputation'

# Normalize partitions 1 through 5 and store each one together with its labels.
lsbzm(
    start_partition=1,
    end_partition=5,
    data=X_train_array,
    labels=Y_train_array,
    data_dir=data_dir,
    first_name=first_name,
    last_name=last_name,
)

24it [00:15,  1.56it/s]
24it [00:18,  1.29it/s]
24it [00:08,  2.71it/s]
24it [00:09,  2.57it/s]
24it [00:14,  1.61it/s]


### Reading the Data built by the Previous Notebooks (GNI)

In [13]:
import pickle
import pandas as pd

# Input folder holding the pickled partitions built by the previous notebooks.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/6_2_OSampling_Concatenation_KnnImputation/"

num_partitions = 5
X_train_array, Y_train_array = [], []

# Partition files are numbered from 1; each has a matching "_Labels" pickle.
for i in range(num_partitions):
    partition_stem = data_dir + "GaussianNoise_Partition" + str(i + 1)
    with open(partition_stem + "_OSampling_WithoutC_Concatenation_KnnImputation.pkl", 'rb') as f:
        X_train_array.append(pickle.load(f))
    with open(partition_stem + "_Labels_OSampling_WithoutC_Concatenation_KnnImputation.pkl", 'rb') as f:
        Y_train_array.append(pickle.load(f))

In [14]:
# Output folder plus the file-name prefix/suffix used for the normalized pickles.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/8_4_FinalData_OSampling_Concatenation_LSBZM_KnnImputation/"
first_name = 'GaussianNoise_'
last_name = '_OSampling_WithoutC_Concatenation_LSBZM_KnnImputation'

# Normalize partitions 1 through 5 and store each one together with its labels.
lsbzm(
    start_partition=1,
    end_partition=5,
    data=X_train_array,
    labels=Y_train_array,
    data_dir=data_dir,
    first_name=first_name,
    last_name=last_name,
)

24it [00:16,  1.48it/s]
24it [00:18,  1.29it/s]
24it [00:08,  2.69it/s]
24it [00:08,  2.68it/s]
24it [00:15,  1.60it/s]


### Reading the Data built by the Previous Notebooks (GNI, Tomek, RUS)

In [15]:
import pickle
import pandas as pd

# Input folder holding the pickled partitions built by the previous notebooks.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/6_1_OUSampling_Concatenation_KnnImputation/"

num_partitions = 5
X_train_array, Y_train_array = [], []

# Partition files are numbered from 1; each has a matching "_Labels" pickle.
for i in range(num_partitions):
    partition_stem = data_dir + "RUS_Tomek_GNI_Partition" + str(i + 1)
    with open(partition_stem + "_OUSampling_WithoutC_Concatenation_KnnImputation.pkl", 'rb') as f:
        X_train_array.append(pickle.load(f))
    with open(partition_stem + "_Labels_OUSampling_WithoutC_Concatenation_KnnImputation.pkl", 'rb') as f:
        Y_train_array.append(pickle.load(f))

In [16]:
# Output folder plus the file-name prefix/suffix used for the normalized pickles.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/8_3_FinalData_OUSampling_Concatenation_LSBZM_KnnImputation/"
first_name = 'RUS_Tomek_GNI_'
last_name = '_WithoutC_Concatenation_LSBZM_KnnImputation'

# Normalize partitions 1 through 5 and store each one together with its labels.
lsbzm(
    start_partition=1,
    end_partition=5,
    data=X_train_array,
    labels=Y_train_array,
    data_dir=data_dir,
    first_name=first_name,
    last_name=last_name,
)

24it [00:02, 10.86it/s]
24it [00:02, 11.20it/s]
24it [00:01, 13.08it/s]
24it [00:02, 11.35it/s]
24it [00:01, 14.41it/s]


### Reading the Data built by the Previous Notebooks (TimeGAN)

In [17]:
import pickle
import pandas as pd

# Input folder holding the pickled partitions built by the previous notebooks.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/6_2_OSampling_Concatenation_KnnImputation/"

X_train_array = []
Y_train_array = []

num_partitions = 5
for i in range(0, num_partitions):
    with open(data_dir + "TimeGAN_" + "Partition" + str(i+1) + "_OSampling_WithoutC_Concatenation_KnnImputation" + ".pkl", 'rb') as f:
        X_train_array.append(pickle.load(f))
    # The label path previously used the prefix "timeGAN_" (lowercase t), which
    # differs from the data path in case and so only matches on a
    # case-insensitive filesystem. Assumes the upstream notebook wrote both
    # files with the same "TimeGAN_" prefix -- TODO confirm against that notebook.
    with open(data_dir + "TimeGAN_" + "Partition" + str(i+1) + "_Labels_OSampling_WithoutC_Concatenation_KnnImputation" + ".pkl", 'rb') as f:
        Y_train_array.append(pickle.load(f))

In [18]:
# Output folder plus the file-name prefix/suffix used for the normalized pickles.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/8_4_FinalData_OSampling_Concatenation_LSBZM_KnnImputation/"
first_name = 'TimeGAN_'
last_name = '_OSampling_WithoutC_Concatenation_LSBZM_KnnImputation'

# Normalize partitions 1 through 5 and store each one together with its labels.
lsbzm(
    start_partition=1,
    end_partition=5,
    data=X_train_array,
    labels=Y_train_array,
    data_dir=data_dir,
    first_name=first_name,
    last_name=last_name,
)

24it [00:13,  1.80it/s]
24it [00:20,  1.19it/s]
24it [00:08,  2.98it/s]
24it [00:10,  2.38it/s]
24it [00:12,  1.91it/s]


### Reading the Data built by the Previous Notebooks (TimeGAN, Tomek, RUS)

In [19]:
import pickle
import pandas as pd

# Input folder holding the pickled partitions built by the previous notebooks.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/6_1_OUSampling_Concatenation_KnnImputation/"

num_partitions = 5
X_train_array, Y_train_array = [], []

# Partition files are numbered from 1; each has a matching "_Labels" pickle.
for i in range(num_partitions):
    partition_stem = data_dir + "RUS_Tomek_TimeGAN_Partition" + str(i + 1)
    with open(partition_stem + "_OUSampling_WithoutC_Concatenation_KnnImputation.pkl", 'rb') as f:
        X_train_array.append(pickle.load(f))
    with open(partition_stem + "_Labels_OUSampling_WithoutC_Concatenation_KnnImputation.pkl", 'rb') as f:
        Y_train_array.append(pickle.load(f))

In [20]:
# Output folder plus the file-name prefix/suffix used for the normalized pickles.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/8_3_FinalData_OUSampling_Concatenation_LSBZM_KnnImputation/"
first_name = 'RUS_Tomek_TimeGAN_'
last_name = '_WithoutC_Concatenation_LSBZM_KnnImputation'

# Normalize partitions 1 through 5 and store each one together with its labels.
lsbzm(
    start_partition=1,
    end_partition=5,
    data=X_train_array,
    labels=Y_train_array,
    data_dir=data_dir,
    first_name=first_name,
    last_name=last_name,
)

24it [00:01, 13.75it/s]
24it [00:02, 11.52it/s]
24it [00:01, 13.49it/s]
24it [00:02, 11.49it/s]
24it [00:01, 13.00it/s]
