# Normalization on NewFeatures

In [11]:
import numpy as np
import pandas as pd
import pickle

# Partition feature arrays and their label files are read from this directory.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/4_2_FinalData_NewFeatures_KnnImputation/"
num_partitions = 5

newf_imputed_data = []
labels = []

for part_no in range(1, num_partitions + 1):
    features_path = f"{data_dir}Partition{part_no}_NewFeatures_KnnImputation.pkl"
    labels_path = f"{data_dir}Partition{part_no}_Labels_NewFeatures_KnnImputation.pkl"

    # Unpickle this partition's feature array (only load pickles you trust).
    with open(features_path, 'rb') as features_file:
        partition = pickle.load(features_file)
    newf_imputed_data.append(partition)

    # Report whether the partition just loaded contains any NaN or +/-inf.
    has_bad_values = np.isnan(partition).any() or np.isinf(partition).any()
    print(f"P{part_no} Nan-Value: {has_bad_values}")

    # Unpickle the matching labels.
    with open(labels_path, 'rb') as labels_file:
        labels.append(pickle.load(labels_file))

P1 Nan-Value: False
P2 Nan-Value: False
P3 Nan-Value: False
P4 Nan-Value: False
P5 Nan-Value: False


## Zscore Normalization

In [2]:
from scipy import stats
import numpy as np
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from tqdm import tqdm

def zscore(start_partition, end_partition, data, labels, data_dir, name,
           num_attributes=24, num_new_features=9):
    """Z-score normalize every feature column of the selected partitions and pickle them.

    Each column is normalized independently within its own partition using
    ``scipy.stats.zscore``. The normalized array and the untouched labels of
    each partition are written to
    ``<data_dir>Partition<k>_<name>.pkl`` and
    ``<data_dir>Partition<k>_Labels_<name>.pkl``.

    Parameters
    ----------
    start_partition, end_partition : int
        1-based, inclusive range of partitions to process
        (``data[start_partition - 1]`` .. ``data[end_partition - 1]``).
    data : sequence of array-like
        One 2-D (rows x features) array per partition. The input is copied,
        never modified in place.
    labels : sequence
        ``labels[k]`` is pickled unchanged next to partition ``k + 1``.
    data_dir : str
        Output directory prefix; it is concatenated directly with the file
        name, so it must end with a path separator.
    name : str
        File-name stem used for the output pickles.
    num_attributes, num_new_features : int, optional
        Their product is the number of leading columns that get normalized
        (defaults 24 * 9 = 216, the previously hard-coded value).

    Returns
    -------
    None
        Results are only written to disk.
    """
    num_all_features = num_attributes * num_new_features

    for i in range(start_partition - 1, end_partition):
        # np.array makes a copy, so the caller's data[i] stays untouched.
        new_partition = np.array(data[i])
        # Integer/bool storage would silently truncate the float z-scores
        # when they are written back column by column.
        if new_partition.dtype.kind in "iub":
            new_partition = new_partition.astype(float)

        # total= gives tqdm a real progress bar; passing the count positionally
        # makes tqdm treat it as the (unsized) iterable instead.
        with tqdm(total=num_all_features) as pbar:
            for j in range(num_all_features):
                column = new_partition[:, j]

                if np.std(column) == 0.0:
                    # Constant column: stats.zscore would divide by zero.
                    # NOTE(review): such columns are set to all ones (not
                    # zeros); kept exactly as in the original implementation.
                    normalized = np.ones(new_partition.shape[0])
                else:
                    normalized = stats.zscore(column)

                if np.isnan(normalized).any() or np.isinf(normalized).any():
                    print('nan-zscore')

                new_partition[:, j] = normalized
                pbar.update(1)

        with open(data_dir + "Partition" + str(i + 1) + "_" + name + ".pkl", 'wb') as f:
            pickle.dump(new_partition, f)
        with open(data_dir + "Partition" + str(i + 1) + "_Labels_" + name + ".pkl", 'wb') as f:
            pickle.dump(labels[i], f)

        
                   

## MinMax Normalization

In [3]:
from scipy import stats
import numpy as np
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from tqdm import tqdm

def minmax(start_partition, end_partition, data, labels, data_dir, name,
           num_attributes=24, num_new_features=9):
    """Min-max scale every feature column of the selected partitions and pickle them.

    Each column is scaled independently within its own partition with a
    scikit-learn ``MinMaxScaler``. The scaled array and the untouched labels
    of each partition are written to
    ``<data_dir>Partition<k>_<name>.pkl`` and
    ``<data_dir>Partition<k>_Labels_<name>.pkl``.

    Parameters
    ----------
    start_partition, end_partition : int
        1-based, inclusive range of partitions to process
        (``data[start_partition - 1]`` .. ``data[end_partition - 1]``).
    data : sequence of array-like
        One 2-D (rows x features) array per partition. The input is copied,
        never modified in place.
    labels : sequence
        ``labels[k]`` is pickled unchanged next to partition ``k + 1``.
    data_dir : str
        Output directory prefix; it is concatenated directly with the file
        name, so it must end with a path separator.
    name : str
        File-name stem used for the output pickles.
    num_attributes, num_new_features : int, optional
        Their product is the number of leading columns that get scaled
        (defaults 24 * 9 = 216, the previously hard-coded value).

    Returns
    -------
    None
        Results are only written to disk.
    """
    num_all_features = num_attributes * num_new_features
    # Local scaler: the function no longer depends on a module-level `scaler`
    # object that another notebook cell may have rebound.
    column_scaler = MinMaxScaler()

    for i in range(start_partition - 1, end_partition):
        # np.array makes a copy, so the caller's data[i] stays untouched.
        new_partition = np.array(data[i])
        # Integer/bool storage would silently truncate the scaled floats
        # when they are written back column by column.
        if new_partition.dtype.kind in "iub":
            new_partition = new_partition.astype(float)

        # total= gives tqdm a real progress bar; passing the count positionally
        # makes tqdm treat it as the (unsized) iterable instead.
        with tqdm(total=num_all_features) as pbar:
            for j in range(num_all_features):
                column = new_partition[:, j]

                if np.std(column) == 0.0:
                    # Constant column: there is no range to scale, so it is
                    # set to all ones (same convention as the original code).
                    scaled = np.ones(new_partition.shape[0])
                else:
                    # MinMaxScaler expects a 2-D (n_samples, 1) input.
                    scaled = column_scaler.fit_transform(column.reshape(-1, 1)).flatten()

                if np.isnan(scaled).any() or np.isinf(scaled).any():
                    # Previously printed the copy-pasted label 'nan-zscore'.
                    print('nan-minmax')

                new_partition[:, j] = scaled
                pbar.update(1)

        with open(data_dir + "Partition" + str(i + 1) + "_" + name + ".pkl", 'wb') as f:
            pickle.dump(new_partition, f)
        with open(data_dir + "Partition" + str(i + 1) + "_Labels_" + name + ".pkl", 'wb') as f:
            pickle.dump(labels[i], f)

        
                   

In [4]:
# File-name stem and output directory for the z-score normalized partitions.
name = 'NewFeatures_ZNorm_KnnImputation'
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/5_1_FinalData_NewFeatures_ZM_KnnImputation/"

# Normalize partitions 1 through 5 and pickle them (with their labels) into data_dir.
zscore(
    start_partition=1,
    end_partition=5,
    data=newf_imputed_data,
    labels=labels,
    data_dir=data_dir,
    name=name,
)

216it [00:00, 949.52it/s] 
216it [00:00, 848.12it/s] 
216it [00:00, 1781.60it/s]
216it [00:00, 1520.89it/s]
216it [00:00, 939.91it/s] 


In [5]:
# File-name stem and output directory for the min-max normalized partitions.
name = 'NewFeatures_MinMaxNorm_KnnImputation'
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/5_1_FinalData_NewFeatures_ZM_KnnImputation/"

# Scale partitions 1 through 5 and pickle them (with their labels) into data_dir.
minmax(
    start_partition=1,
    end_partition=5,
    data=newf_imputed_data,
    labels=labels,
    data_dir=data_dir,
    name=name,
)

216it [00:00, 1034.65it/s]
216it [00:00, 878.62it/s] 
216it [00:00, 1814.47it/s]
216it [00:00, 1534.84it/s]
216it [00:00, 1048.31it/s]


## Log, Square Root, BoxCox, Zscore, and MinMax Normalization (LSBZM Norm)

In [38]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from scipy import stats
from tqdm import tqdm

def _lsbzm_transform_column(column, column_scaler):
    """Pick and apply the LSBZM transform for one 1-D column, then min-max scale it.

    Transform selection (same decisions as the original nested branches):

    * skewness > 1  and extreme range -> natural log
    * skewness < -1 and extreme range -> square root
    * |skewness| > 1 and normal range -> Box-Cox
    * otherwise                       -> z-score

    "Extreme range" means ``max - min`` is above 100000 or below 0.00001.
    Columns with a negative minimum are shifted first: by ``2 * |min|`` for
    log and Box-Cox, by ``|min|`` for square root. A column whose standard
    deviation is exactly 0 is returned as all ones.

    Prints ``nan-<transform>`` when the result contains NaN or inf.
    """
    the_min = np.min(column)
    value_range = np.max(column) - the_min
    skewness = stats.skew(column)

    extreme_range = value_range > 100000 or value_range < 0.00001

    # A NaN skewness fails both comparisons and falls through to z-score,
    # exactly as in the original if/elif/else chain.
    if skewness > 1:
        transform = 'log' if extreme_range else 'boxcox'
    elif skewness < -1:
        transform = 'sqrt' if extreme_range else 'boxcox'
    else:
        transform = 'zscore'

    # NOTE(review): only strictly negative minima are shifted. A column whose
    # minimum is exactly 0 reaches np.log (which yields -inf for 0) or
    # stats.boxcox (documented to need positive data) unshifted. Left
    # unchanged because the right offset is a modelling decision - confirm.
    if transform == 'zscore' or not the_min < 0:
        shifted = column
    elif transform == 'sqrt':
        shifted = column + abs(the_min)
    else:
        shifted = column + 2 * abs(the_min)

    if np.std(shifted) == 0.0:
        # Constant column: nothing to transform or scale.
        scaled = np.ones(column.shape[0])
    else:
        if transform == 'log':
            transformed = np.log(shifted)
        elif transform == 'sqrt':
            transformed = np.sqrt(shifted)
        elif transform == 'boxcox':
            # The fitted lambda returned by boxcox is not needed here.
            transformed, _ = stats.boxcox(shifted)
        else:
            transformed = stats.zscore(shifted)
        # MinMaxScaler expects a 2-D (n_samples, 1) input.
        scaled = column_scaler.fit_transform(transformed.reshape(-1, 1)).flatten()

    if np.isnan(scaled).any() or np.isinf(scaled).any():
        print('nan-' + transform)

    return scaled


def lsbzm(start_partition, end_partition, data, labels, data_dir, name,
          num_attributes=24, num_new_features=9):
    """Apply Log / Sqrt / Box-Cox / Z-score + MinMax normalization and pickle the result.

    Every feature column of every selected partition is transformed
    independently: a skewness- and range-dependent transform is applied and
    the result is min-max scaled (see ``_lsbzm_transform_column``). The
    normalized array and the untouched labels of each partition are written to
    ``<data_dir>Partition<k>_<name>.pkl`` and
    ``<data_dir>Partition<k>_Labels_<name>.pkl``.

    Parameters
    ----------
    start_partition, end_partition : int
        1-based, inclusive range of partitions to process
        (``data[start_partition - 1]`` .. ``data[end_partition - 1]``).
    data : sequence of array-like
        One 2-D (rows x features) array per partition. The input is copied,
        never modified in place.
    labels : sequence
        ``labels[k]`` is pickled unchanged next to partition ``k + 1``.
    data_dir : str
        Output directory prefix; it is concatenated directly with the file
        name, so it must end with a path separator.
    name : str
        File-name stem used for the output pickles.
    num_attributes, num_new_features : int, optional
        Their product is the number of leading columns that get normalized
        (defaults 24 * 9 = 216, the previously hard-coded value).

    Returns
    -------
    None
        Results are only written to disk.
    """
    num_all_features = num_attributes * num_new_features
    # Local scaler: the function no longer depends on a module-level `scaler`
    # object that another notebook cell may have rebound.
    column_scaler = MinMaxScaler()

    for i in range(start_partition - 1, end_partition):
        # np.array makes a copy, so the caller's data[i] stays untouched.
        new_partition = np.array(data[i])
        # Integer/bool storage would silently truncate the scaled floats
        # when they are written back column by column.
        if new_partition.dtype.kind in "iub":
            new_partition = new_partition.astype(float)

        # total= gives tqdm a real progress bar; passing the count positionally
        # makes tqdm treat it as the (unsized) iterable instead.
        with tqdm(total=num_all_features) as pbar:
            for j in range(num_all_features):
                new_partition[:, j] = _lsbzm_transform_column(new_partition[:, j], column_scaler)
                pbar.update(1)

        with open(data_dir + "Partition" + str(i + 1) + "_" + name + ".pkl", 'wb') as f:
            pickle.dump(new_partition, f)
        with open(data_dir + "Partition" + str(i + 1) + "_Labels_" + name + ".pkl", 'wb') as f:
            pickle.dump(labels[i], f)

In [39]:
# File-name stem and output directory for the LSBZM-normalized partitions.
name = 'NewFeatures_LSBZM_KnnImputation'
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/5_2_FinalData_NewFeatures_LSBZM_KnnImputation/"

# Normalize partitions 1 through 5 and pickle them (with their labels) into data_dir.
lsbzm(
    start_partition=1,
    end_partition=5,
    data=newf_imputed_data,
    labels=labels,
    data_dir=data_dir,
    name=name,
)

216it [00:00, 427.03it/s]
216it [00:00, 264.51it/s]
216it [00:00, 688.64it/s]
216it [00:00, 634.15it/s]
216it [00:00, 363.69it/s]


## Missing value Exploration

In [28]:
# Loading the LSBZM-normalized data and checking it for NaN/inf values
import pickle
import numpy as np

# Directory the LSBZM-normalized partitions were written to earlier in this notebook.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/5_2_FinalData_NewFeatures_LSBZM_KnnImputation/"
num_partitions = 5

normalized_data = []

for part_no in range(1, num_partitions + 1):
    partition_path = f"{data_dir}Partition{part_no}_NewFeatures_LSBZM_KnnImputation.pkl"

    # Unpickle the normalized partition (only load pickles you trust).
    with open(partition_path, 'rb') as partition_file:
        partition = pickle.load(partition_file)
    normalized_data.append(partition)

    # Report whether normalization left any NaN or +/-inf in this partition.
    has_bad_values = np.isnan(partition).any() or np.isinf(partition).any()
    print(f"P{part_no} Nan-Value: {has_bad_values}")

P1 Nan-Value: False
P2 Nan-Value: False
P3 Nan-Value: False
P4 Nan-Value: False
P5 Nan-Value: False


In [29]:
# Loading the z-score-normalized data and checking it for NaN/inf values
import pickle
import numpy as np

# Directory the z-score-normalized partitions were written to earlier in this notebook.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/5_1_FinalData_NewFeatures_ZM_KnnImputation/"
num_partitions = 5

normalized_data = []

for part_no in range(1, num_partitions + 1):
    partition_path = f"{data_dir}Partition{part_no}_NewFeatures_ZNorm_KnnImputation.pkl"

    # Unpickle the normalized partition (only load pickles you trust).
    with open(partition_path, 'rb') as partition_file:
        partition = pickle.load(partition_file)
    normalized_data.append(partition)

    # Report whether normalization left any NaN or +/-inf in this partition.
    has_bad_values = np.isnan(partition).any() or np.isinf(partition).any()
    print(f"P{part_no} Nan-Value: {has_bad_values}")

P1 Nan-Value: False
P2 Nan-Value: False
P3 Nan-Value: False
P4 Nan-Value: False
P5 Nan-Value: False
