# Normalization and Imputation

In [1]:
# Loading the Raw Data
import pickle
import numpy as np

# Directory holding the pickled raw partitions (Partition1.pkl, Partition2.pkl, ...).
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/1_Raw/"
num_partitions = 5

# Unpickle the partitions in order; raw_data[p] holds the contents of Partition<p+1>.pkl.
# NOTE: pickle.load executes arbitrary code from the file -- only use trusted files.
raw_data = []
for i in range(num_partitions):
    partition_path = data_dir + "Partition" + str(i + 1) + ".pkl"
    with open(partition_path, 'rb') as f:
        raw_data.append(pickle.load(f))

# Imputation

# KNN Imputation 

In [2]:
# Inter Column and Between Instance Imputation
#
# For every instance (third axis) and every attribute except the timestamp
# column (index 0), zeros are treated as missing values and filled in:
#   * attribute entirely missing -> copy the attribute from a nearby instance
#     whose series contains no NaN (between-instance imputation);
#   * attribute partially missing -> KNNImputer fitted on the same attribute
#     of the two preceding instances plus the current one (or, for the first
#     two instances, on the current series alone).
# Each imputed partition is pickled to `data_dir`.

from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=6)

data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/3_KnnImputation/"

from tqdm import tqdm

k = 100  # how many neighbouring instances are searched for a complete donor series
number_of_partitions = 5
num_attributes = 25
num_timestamps = 60

for i in range(0, number_of_partitions):
    # np.array(...) copies, so raw_data itself is left untouched.
    # The indexing below treats the array as (timestamp, attribute, instance).
    new_partition = np.array(raw_data[i])

    # `total=` is required: passing the count positionally makes tqdm treat it
    # as the iterable, so the bar would have no total / ETA.
    with tqdm(total=new_partition.shape[2]) as pbar:
        for j in range(0, new_partition.shape[2]):
            # Basic slicing returns a view: writes to new_column land directly
            # in new_partition, so no explicit write-back is needed.
            new_column = new_partition[:, :, j]

            # Start from 1: index 0 is the timestamp column and is not imputed.
            for m in range(1, num_attributes):
                series = new_column[:, m]
                # Zeros mark missing measurements in the raw data.
                series[series == 0.0] = np.nan

                if np.isnan(series).all():
                    # Whole attribute missing: borrow it from another instance.
                    # NOTE(review): for j < k the index j-n-1 becomes negative
                    # and wraps around to the END of the partition, i.e. to
                    # instances that have not been processed yet (their missing
                    # values are still 0.0, not NaN, so they pass the check
                    # below). Kept as-is to preserve the published outputs --
                    # confirm whether this wrap-around is intended.
                    for n in range(0, k):
                        donor = new_partition[:, m, j - n - 1]
                        if not np.isnan(donor).any():
                            new_column[:, m] = donor
                            break
                else:
                    if j > 1:
                        # Rows = the two preceding instances and the current
                        # one; columns = timestamps. Keep the imputed current row.
                        new_2d = [new_partition[:, m, j - 2], new_partition[:, m, j - 1], series]
                        new_column[:, m] = imputer.fit_transform(new_2d)[2, :]
                    else:
                        # Not enough preceding instances: impute from the
                        # series itself, treated as a single feature.
                        new_2d = series.reshape(-1, 1)
                        new_column[:, m] = imputer.fit_transform(new_2d)[:, 0]

            pbar.update(1)

        with open(data_dir + "Partition" + str(i+1) + "_KnnImputation" + ".pkl", 'wb') as f:
            pickle.dump(new_partition, f)
            
            
# Between Instance Imputation

73492it [03:53, 314.29it/s]
88557it [08:55, 165.31it/s]
42510it [02:33, 276.31it/s]
51261it [03:51, 221.76it/s]
75365it [04:58, 252.71it/s]


# Missing Value Exploration

In [1]:
import pickle
import numpy as np

# Directory holding the KNN-imputed partitions written by the imputation step.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/3_KnnImputation/"
number_of_partitions = 5

# Unpickle Partition1..Partition5 in order; imputed_data[p] is partition p+1.
# NOTE: pickle.load executes arbitrary code from the file -- only use trusted files.
imputed_data = []
for i in range(1, number_of_partitions + 1):
    partition_path = data_dir + "Partition" + str(i) + "_KnnImputation" + ".pkl"
    with open(partition_path, 'rb') as f:
        imputed_data.append(pickle.load(f))

In [4]:
def print_missing_values(data, start_partition, end_partition):
    """Print missing-value counts for a range of partitions.

    A value counts as missing when it equals 0.0 or is NaN. For every selected
    partition the function prints the total number of missing and non-missing
    values, followed by the number of missing values per feature.

    Args:
        data: Sequence of partitions. Each partition is converted with
            ``np.array`` and indexed as (timestamp, feature, instance); only
            the first 60 timestamps and first 25 features are inspected.
        start_partition: 1-based number of the first partition to report.
        end_partition: 1-based number of the last partition to report
            (inclusive).

    Returns:
        None. The report is written to standard output.

    Raises:
        IndexError: If a selected partition does not exist in ``data`` or is
            not a 3-D array with at least 60 timestamps and 25 features.
    """
    abt_header = ['Timestamp', 'R_VALUE','TOTUSJH','TOTBSQ','TOTPOT','TOTUSJZ','ABSNJZH','SAVNCPP',
                               'USFLUX','TOTFZ','MEANPOT', 'EPSX', 'EPSY','EPSZ','MEANSHR','SHRGT45','MEANGAM',
                                  'MEANGBT','MEANGBZ','MEANGBH','MEANJZH','TOTFY','MEANJZD','MEANALP','TOTFX']
    num_columns = 25
    num_timestamps = 60

    for i in range(start_partition-1, end_partition):
        partition = np.array(data[i])
        if (partition.ndim != 3 or partition.shape[0] < num_timestamps
                or partition.shape[1] < num_columns):
            raise IndexError("partition " + str(i+1) + " must be 3-D with at least "
                             + str(num_timestamps) + " timestamps and "
                             + str(num_columns) + " features, got shape "
                             + str(partition.shape))

        # Vectorized replacement for the per-element Python loops: one boolean
        # mask over the inspected window, reduced per feature.
        window = partition[:num_timestamps, :num_columns, :]
        missing = (window == 0.0) | np.isnan(window)
        null_count_per_feature = missing.sum(axis=(0, 2))
        null_count = int(np.count_nonzero(missing))
        non_null_count = int(missing.size) - null_count

        print("Partition" + str(i+1) + ":")
        print("null counts in P" + str(i+1) + ": " + str(null_count))
        print("non-null counts in P"+ str(i+1) + ": " + str(non_null_count))
        for x in range(0,num_columns):
            print(abt_header[x] + ": " + str(int(null_count_per_feature[x])))

        print("\n")

In [5]:
# Report zero/NaN counts (total and per feature) for partitions 1 through 5 of the imputed data.
print_missing_values(imputed_data,1,5)

Partition1:
null counts in P1: 0
non-null counts in P1: 110238000
Timestamp: 0
R_VALUE: 0
TOTUSJH: 0
TOTBSQ: 0
TOTPOT: 0
TOTUSJZ: 0
ABSNJZH: 0
SAVNCPP: 0
USFLUX: 0
TOTFZ: 0
MEANPOT: 0
EPSX: 0
EPSY: 0
EPSZ: 0
MEANSHR: 0
SHRGT45: 0
MEANGAM: 0
MEANGBT: 0
MEANGBZ: 0
MEANGBH: 0
MEANJZH: 0
TOTFY: 0
MEANJZD: 0
MEANALP: 0
TOTFX: 0


Partition2:
null counts in P2: 0
non-null counts in P2: 132835500
Timestamp: 0
R_VALUE: 0
TOTUSJH: 0
TOTBSQ: 0
TOTPOT: 0
TOTUSJZ: 0
ABSNJZH: 0
SAVNCPP: 0
USFLUX: 0
TOTFZ: 0
MEANPOT: 0
EPSX: 0
EPSY: 0
EPSZ: 0
MEANSHR: 0
SHRGT45: 0
MEANGAM: 0
MEANGBT: 0
MEANGBZ: 0
MEANGBH: 0
MEANJZH: 0
TOTFY: 0
MEANJZD: 0
MEANALP: 0
TOTFX: 0


Partition3:
null counts in P3: 0
non-null counts in P3: 63765000
Timestamp: 0
R_VALUE: 0
TOTUSJH: 0
TOTBSQ: 0
TOTPOT: 0
TOTUSJZ: 0
ABSNJZH: 0
SAVNCPP: 0
USFLUX: 0
TOTFZ: 0
MEANPOT: 0
EPSX: 0
EPSY: 0
EPSZ: 0
MEANSHR: 0
SHRGT45: 0
MEANGAM: 0
MEANGBT: 0
MEANGBZ: 0
MEANGBH: 0
MEANJZH: 0
TOTFY: 0
MEANJZD: 0
MEANALP: 0
TOTFX: 0


Partition4:
null cou

# Normalization

# Zscore and MinMax Normalization

In [None]:
# Z-score followed by min-max scaling, applied independently to every
# attribute series (all timestamps of one attribute in one instance).
# The timestamp column (index 0) is left untouched. Each normalized partition
# is pickled to `data_dir`.
from scipy import stats
import numpy as np
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from tqdm import tqdm

data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/4_1_KnnImputation_ZscoreMinMaxNormalization/"

number_of_partitions = 5
num_attributes = 25
num_timestamps = 60

for i in range(0, number_of_partitions):
    # np.array(...) copies, so imputed_data is not modified.
    # The indexing below treats the array as (timestamp, attribute, instance).
    new_partition = np.array(imputed_data[i])

    # `total=` is required: passing the count positionally makes tqdm treat it
    # as the iterable, so the bar would have no total / ETA.
    with tqdm(total=new_partition.shape[2]) as pbar:
        for j in range(0, new_partition.shape[2]):
            # View into new_partition: assignments below update it in place.
            new_column = new_partition[:, :, j]

            # Start from 1: index 0 is the timestamp column.
            for m in range(1, num_attributes):
                series = new_column[:, m]
                if np.std(series) == 0.0:
                    # Constant series: z-score is undefined (division by zero),
                    # so the series is mapped to all ones instead.
                    minmax = np.ones(series.shape[0])
                else:
                    zscore = stats.zscore(series)
                    data_2d = zscore.reshape(-1, 1)
                    minmax = scaler.fit_transform(data_2d).flatten()
                if np.isnan(minmax).any():
                    print('nan-zscore')

                new_column[:, m] = minmax

            pbar.update(1)

        with open(data_dir + "Partition" + str(i+1) + "_KnnImputation_ZScoreMinMaxNormalization" + ".pkl", 'wb') as f:
            pickle.dump(new_partition, f)

        
                   

73492it [02:51, 427.29it/s]
88557it [03:53, 379.53it/s]
42510it [01:51, 381.03it/s]
51261it [02:16, 375.96it/s]
33695it [01:29, 367.62it/s]

# Log, Square Root, BoxCox, Zscore, and MinMax Normalization

In [2]:
# Skew-aware normalization of every attribute series (all timestamps of one
# attribute in one instance); the timestamp column (index 0) is left untouched.
#
#   value range > 10000 and skewness >  1  -> log      -> z-score -> min-max
#   value range > 10000 and skewness < -1  -> sqrt     -> z-score -> min-max
#   value range <= 10000 and |skewness| > 1 -> Box-Cox -> z-score -> min-max
#   otherwise                               ->            z-score -> min-max
#
# Constant series are mapped to all ones. Each normalized partition is
# pickled to `data_dir`.
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

from scipy import stats

data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/4_2_KnnImputation_LogSquareBoxCoxZscoreMinMaxNormalization/"

from tqdm import tqdm

number_of_partitions = 5
num_attributes = 25
num_timestamps = 60


def _shift_positive(values, the_min, zero_ok):
    """Shift `values` so they are valid input for the following transform.

    Negative minima are always shifted by |min| + 0.1. A minimum of exactly
    zero is shifted only when `zero_ok` is False, i.e. for log and Box-Cox,
    which need strictly positive data (log(0) is -inf, Box-Cox rejects it).
    """
    if the_min < 0 or (the_min == 0 and not zero_ok):
        return values + abs(the_min) + 0.1
    return values


def _transform_and_scale(values, nan_label, transform=None):
    """Apply `transform` (if any), then z-score, then min-max scaling.

    A constant series is returned as all ones because its z-score is
    undefined. `nan_label` is printed when the result contains NaN.
    """
    if np.std(values) == 0.0:
        scaled = np.ones(len(values))
    else:
        transformed = values if transform is None else transform(values)
        zscore = stats.zscore(transformed)
        scaled = scaler.fit_transform(zscore.reshape(-1, 1)).flatten()
    if np.isnan(scaled).any():
        print(nan_label)
    return scaled


def _boxcox_values(values):
    """Return only the transformed data from stats.boxcox (drop the fitted lambda)."""
    return stats.boxcox(values)[0]


for i in range(0, number_of_partitions):
    # np.array(...) copies, so imputed_data is not modified.
    # The indexing below treats the array as (timestamp, attribute, instance).
    new_partition = np.array(imputed_data[i])

    # `total=` is required: passing the count positionally makes tqdm treat it
    # as the iterable, so the bar would have no total / ETA.
    with tqdm(total=new_partition.shape[2]) as pbar:
        for j in range(0, new_partition.shape[2]):
            # View into new_partition: assignments below update it in place.
            new_column = new_partition[:, :, j]

            # Start from 1: index 0 is the timestamp column.
            for m in range(1, num_attributes):
                series = new_column[:, m]
                the_min = np.min(series)
                the_max = np.max(series)
                skewness = stats.skew(series)

                wide_range = (the_max - the_min) > 10000

                if skewness > 1 or skewness < -1:
                    if not wide_range:
                        # Fix: the original stored the Box-Cox z-score in
                        # `minmax` and then scaled a stale `zscore` left over
                        # from a previous attribute.
                        all_positive = _shift_positive(series, the_min, zero_ok=False)
                        minmax = _transform_and_scale(all_positive, 'nan-boxcox', _boxcox_values)
                    elif skewness > 1:
                        all_positive = _shift_positive(series, the_min, zero_ok=False)
                        minmax = _transform_and_scale(all_positive, 'nan-log', np.log)
                    else:
                        all_positive = _shift_positive(series, the_min, zero_ok=True)
                        minmax = _transform_and_scale(all_positive, 'nan-sqrt', np.sqrt)
                else:
                    # Roughly symmetric (or undefined skewness): plain z-score + min-max.
                    minmax = _transform_and_scale(series, 'nan-zscore')

                new_column[:, m] = minmax

            pbar.update(1)

        with open(data_dir + "Partition" + str(i+1) + "_KnnImputation_LogSquareBoxCoxZscoreMinMaxNormalization" + ".pkl", 'wb') as f:
            pickle.dump(new_partition, f)

73492it [07:31, 162.69it/s]
88557it [09:59, 147.65it/s]
42510it [04:45, 148.99it/s]
51261it [05:52, 145.43it/s]
75365it [08:44, 143.71it/s]


In [8]:
# Loading the Normalized Data
import pickle
import numpy as np

# Directory holding the partitions written by the Log/Sqrt/BoxCox/Zscore/MinMax step.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/4_2_KnnImputation_LogSquareBoxCoxZscoreMinMaxNormalization/"
num_partitions = 5

# Unpickle the normalized partitions in order; normalized_data[p] is partition p+1.
# NOTE: pickle.load executes arbitrary code from the file -- only use trusted files.
normalized_data = []
for i in range(num_partitions):
    partition_path = data_dir + "Partition" + str(i + 1) + "_KnnImputation_LogSquareBoxCoxZscoreMinMaxNormalization" + ".pkl"
    with open(partition_path, 'rb') as f:
        normalized_data.append(pickle.load(f))

In [9]:
def print_missing_values_normalized(data, start_partition, end_partition):
    """Print NaN counts for a range of normalized partitions.

    Only NaN counts as missing here; zeros are treated as regular values.
    For every selected partition the function prints the total number of NaN
    and non-NaN values, followed by the number of NaN values per feature.

    Args:
        data: Sequence of partitions. Each partition is converted with
            ``np.array`` and indexed as (timestamp, feature, instance); only
            the first 60 timestamps and first 25 features are inspected.
        start_partition: 1-based number of the first partition to report.
        end_partition: 1-based number of the last partition to report
            (inclusive).

    Returns:
        None. The report is written to standard output.

    Raises:
        IndexError: If a selected partition does not exist in ``data`` or is
            not a 3-D array with at least 60 timestamps and 25 features.
    """
    abt_header = ['Timestamp', 'R_VALUE','TOTUSJH','TOTBSQ','TOTPOT','TOTUSJZ','ABSNJZH','SAVNCPP',
                               'USFLUX','TOTFZ','MEANPOT', 'EPSX', 'EPSY','EPSZ','MEANSHR','SHRGT45','MEANGAM',
                                  'MEANGBT','MEANGBZ','MEANGBH','MEANJZH','TOTFY','MEANJZD','MEANALP','TOTFX']
    num_columns = 25
    num_timestamps = 60

    for i in range(start_partition-1, end_partition):
        partition = np.array(data[i])
        if (partition.ndim != 3 or partition.shape[0] < num_timestamps
                or partition.shape[1] < num_columns):
            raise IndexError("partition " + str(i+1) + " must be 3-D with at least "
                             + str(num_timestamps) + " timestamps and "
                             + str(num_columns) + " features, got shape "
                             + str(partition.shape))

        # Vectorized replacement for the per-element Python loops: one boolean
        # mask over the inspected window, reduced per feature.
        window = partition[:num_timestamps, :num_columns, :]
        missing = np.isnan(window)
        null_count_per_feature = missing.sum(axis=(0, 2))
        null_count = int(np.count_nonzero(missing))
        non_null_count = int(missing.size) - null_count

        print("Partition" + str(i+1) + ":")
        print("null counts in P" + str(i+1) + ": " + str(null_count))
        print("non-null counts in P"+ str(i+1) + ": " + str(non_null_count))
        for x in range(0,num_columns):
            print(abt_header[x] + ": " + str(int(null_count_per_feature[x])))

        print("\n")

In [10]:
# Report NaN counts (total and per feature) for partitions 1 through 5 of the normalized data.
print_missing_values_normalized(normalized_data,1,5)

Partition1:
null counts in P1: 0
non-null counts in P1: 110238000
Timestamp: 0
R_VALUE: 0
TOTUSJH: 0
TOTBSQ: 0
TOTPOT: 0
TOTUSJZ: 0
ABSNJZH: 0
SAVNCPP: 0
USFLUX: 0
TOTFZ: 0
MEANPOT: 0
EPSX: 0
EPSY: 0
EPSZ: 0
MEANSHR: 0
SHRGT45: 0
MEANGAM: 0
MEANGBT: 0
MEANGBZ: 0
MEANGBH: 0
MEANJZH: 0
TOTFY: 0
MEANJZD: 0
MEANALP: 0
TOTFX: 0


Partition2:
null counts in P2: 0
non-null counts in P2: 132835500
Timestamp: 0
R_VALUE: 0
TOTUSJH: 0
TOTBSQ: 0
TOTPOT: 0
TOTUSJZ: 0
ABSNJZH: 0
SAVNCPP: 0
USFLUX: 0
TOTFZ: 0
MEANPOT: 0
EPSX: 0
EPSY: 0
EPSZ: 0
MEANSHR: 0
SHRGT45: 0
MEANGAM: 0
MEANGBT: 0
MEANGBZ: 0
MEANGBH: 0
MEANJZH: 0
TOTFY: 0
MEANJZD: 0
MEANALP: 0
TOTFX: 0


Partition3:
null counts in P3: 0
non-null counts in P3: 63765000
Timestamp: 0
R_VALUE: 0
TOTUSJH: 0
TOTBSQ: 0
TOTPOT: 0
TOTUSJZ: 0
ABSNJZH: 0
SAVNCPP: 0
USFLUX: 0
TOTFZ: 0
MEANPOT: 0
EPSX: 0
EPSY: 0
EPSZ: 0
MEANSHR: 0
SHRGT45: 0
MEANGAM: 0
MEANGBT: 0
MEANGBZ: 0
MEANGBH: 0
MEANJZH: 0
TOTFY: 0
MEANJZD: 0
MEANALP: 0
TOTFX: 0


Partition4:
null cou