# Preparing Final Data for ML and DL (ZScore and MinMax Norm)

# ML

In [1]:
# Loading the Raw Data
import pickle
import numpy as np

data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/4_1_KnnImputation_ZscoreMinMaxNormalization/"
normalized_data = []

num_partitions = 5

for i in range(0,num_partitions):
# Load the array with Pickle
    with open(data_dir + "Partition" + str(i+1) + "_KnnImputation_ZscoreMinMaxNormalization" + ".pkl", 'rb') as f:
        normalized_data.append(pickle.load(f))

In [2]:
import pandas as pd
labels = []
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/2_Labels/"

for i in range(1,6):
    labels.append(pd.read_csv(data_dir + "Partition" + str(i) + "_labels.csv"))

## 2D PKL Concatenation

In [3]:
# Concatenation with Shuffle

def multi_to_uni(start_partition, end_partition, data_dir, data, labels):
    category_mapping = {'X': 1, 'M': 1, 'B': 0, 'C': 0, 'FQ': 0}
    
    num_attributes = 25
    num_timestamps = 60
    
    for i in range(start_partition-1,end_partition):
        
        the_labels = pd.DataFrame()
        the_labels['FLARE_CLASS'] = labels[i]['FLARE_CLASS'].map(category_mapping)
        new_partition = np.zeros((np.array(data[i]).shape[2], num_timestamps*(num_attributes-1)))
        new_partition_label = np.zeros(new_partition.shape[0])
        
        each_partition = np.zeros((num_timestamps, num_attributes, new_partition.shape[0]))
        each_partition = np.array(data[i])
        
        with tqdm(new_partition.shape[0]) as pbar:
            for j in range(0,new_partition.shape[0]):
                new_column = np.zeros((num_timestamps,num_attributes)) 
                new_column = each_partition[:,:,j]

                flettened = np.zeros(num_timestamps*(num_attributes-1))

                for m in range(1,num_attributes):
                    flettened[(m-1)*num_timestamps:m*num_timestamps] = new_column[:,m]

                new_partition[j,:] = flettened
                new_partition_label[j] = the_labels.iloc[j]
                
                
                pbar.update(1)
        
        print("P"+str(i+1)+" Nan-Value: "+ str(np.isnan(new_partition).any()))
        X_train = new_partition
        Y_train = new_partition_label


        num_samples = X_train.shape[0]
        shuffle_indices = np.random.permutation(num_samples)

        X_train_shuffled = X_train[shuffle_indices]
        Y_train_shuffled = Y_train[shuffle_indices]
    

        with open(data_dir + "Partition" + str(i+1) 
                       + "_2DPKL_Concatenation_KnnImputation_ZscoreMinMaxNormalization" + ".pkl", 'wb') as f:
            pickle.dump(X_train_shuffled, f)

        with open(data_dir + "Partition" + str(i+1) 
                       + "_Labels_2DPKL_Concatenation_KnnImputation_ZscoreMinMaxNormalization" + ".pkl", 'wb') as f:
            pickle.dump(Y_train_shuffled, f)

In [4]:
from tqdm import tqdm
import pandas as pd
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/5_4_FinalData_ML_2DPKL_Concatenation_KnnImputation_ZscoreMinMaxNormalization/"

multi_to_uni(1, 5, data_dir, normalized_data, labels)

73492it [00:01, 44949.92it/s]


P1 Nan-Value: False


88557it [00:02, 38910.30it/s]


P2 Nan-Value: False


42510it [00:01, 42218.40it/s]


P3 Nan-Value: False


51261it [00:01, 44431.83it/s]


P4 Nan-Value: False


75365it [00:01, 44094.07it/s]


P5 Nan-Value: False


## 2D PKL With New Features 

In [5]:
# NewFeatures with Shuffle

def new_Features_pkl(start_partition, end_partition, data_dir, data, labels):
    category_mapping = {'X': 1, 'M': 1, 'B': 0, 'C': 0, 'FQ': 0}

    number_of_new_features = 9
    num_attributes = 25
    num_timestamps = 60
    
    for i in range(start_partition-1,end_partition):
        
        the_labels = pd.DataFrame()
        the_labels['FLARE_CLASS'] = labels[i]['FLARE_CLASS'].map(category_mapping)
        new_partition = np.zeros((np.array(data[i]).shape[2], number_of_new_features*24))
        new_partition_label = np.zeros(new_partition.shape[0])
        
        each_partition = np.zeros((num_timestamps, num_attributes, new_partition.shape[0]))
        each_partition = np.array(data[i])
        
        with tqdm(new_partition.shape[0]) as pbar:
            for j in range(0,new_partition.shape[0]):
                new_column = np.zeros((num_timestamps,num_attributes)) 
                new_column = each_partition[:,:,j]
                

                new_features = np.zeros(number_of_new_features*(num_attributes-1))

                for m in range(1,num_attributes):
                    
                    mean = np.mean(new_column[:,m])
                    new_features[((m-1)*number_of_new_features) + 0] = mean
                    median = np.median(new_column[:,m])
                    new_features[((m-1)*number_of_new_features) + 1] = median
                    std = np.std(new_column[:,m])
                    new_features[((m-1)*number_of_new_features) + 2] = std
                    
                    skewness = skew(new_column[:,m])
                    if skewness == np.nan:
                        skewness = new_partition[j-1, ((m-1)*number_of_new_features) + 3]
                    new_features[((m-1)*number_of_new_features) + 3] = skewness
                    
                    kurtosis_value = kurtosis(new_column[:,m])
                    if kurtosis_value == np.nan:
                        kurtosis_value = new_partition[j-1, ((m-1)*number_of_new_features) + 4]
                    new_features[((m-1)*number_of_new_features) + 4] = kurtosis_value
                    
                    indices = np.arange(num_timestamps)
                    weight_array = indices / num_timestamps
                    weighted_avg = np.average(new_column[:,m], weights=weight_array)
                    if weighted_avg == np.nan:
                        weighted_avg = new_partition[j-1, ((m-1)*number_of_new_features) + 5]
                    new_features[((m-1)*number_of_new_features) + 5] = weighted_avg
                    
                    last_value = new_column[59,m]
                    new_features[((m-1)*number_of_new_features) + 6] = last_value
                    first_value = new_column[0,m]
                    new_features[((m-1)*number_of_new_features) + 7] = first_value
                    
                    numerator = np.sum((new_column[:,m] - mean) * (indices - np.mean(indices)))
                    denominator = np.sum((new_column[:,m] - mean) ** 2)
                    slope = numerator / denominator
                    if slope == np.nan:
                        slope = new_partition[j-1, ((m-1)*number_of_new_features) + 8]
                    new_features[((m-1)*number_of_new_features) + 8] = slope
                
                    
                new_partition[j,:] = new_features
                new_partition_label[j] = the_labels.iloc[j]
                
                pbar.update(1)
                
        for z in range(0,num_attributes-1):

            data_2d = new_partition[:,z*9+3].reshape(-1, 1)
            new_partition[:,z*9+3] = scaler.fit_transform(data_2d).flatten()

            data_2d = new_partition[:,z*9+4].reshape(-1, 1)
            new_partition[:,z*9+4] = scaler.fit_transform(data_2d).flatten()

            data_2d = new_partition[:,z*9+8].reshape(-1, 1)
            new_partition[:,z*9+8] = scaler.fit_transform(data_2d).flatten()

        print("P"+str(i+1)+" Nan-Value: "+ str(np.isnan(new_partition).any()))  
        X_train = new_partition
        Y_train = new_partition_label


        num_samples = X_train.shape[0]
        shuffle_indices = np.random.permutation(num_samples)

        X_train_shuffled = X_train[shuffle_indices]
        Y_train_shuffled = Y_train[shuffle_indices]
    

        with open(data_dir + "Partition" + str(i+1) 
                       + "_2DPKL_NewFeatures_KnnImputation_ZscoreMinMaxNormalization" + ".pkl", 'wb') as f:
            pickle.dump(X_train_shuffled, f)

        with open(data_dir + "Partition" + str(i+1) 
                       + "_Labels_2DPKL_NewFeatures_KnnImputation_ZscoreMinMaxNormalization" + ".pkl", 'wb') as f:
            pickle.dump(Y_train_shuffled, f)

In [6]:
from tqdm import tqdm
import pandas as pd
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/5_5_FinalData_ML_2DPKL_NewFeatures_KnnImputation_ZscoreMinMaxNormalization/"

new_Features_pkl(1, 5, data_dir, normalized_data, labels)

73492it [08:16, 148.06it/s]


P1 Nan-Value: False


88557it [20:23, 72.36it/s] 


P2 Nan-Value: False


42510it [04:49, 147.00it/s]


P3 Nan-Value: False


51261it [05:47, 147.62it/s]


P4 Nan-Value: False


75365it [08:31, 147.28it/s]


P5 Nan-Value: False


# DL

## 3D PKL

In [7]:
# 3D pickle with shuffle
import pickle

def data_for_sequencemodels(start_partition, end_partition, data_dir, data, labels):
    
    category_mapping = {'X': 1, 'M': 1, 'B': 0, 'C': 0, 'FQ': 0}
    
    sequence_length = 60
    num_features = 25

    for i in range(start_partition-1,end_partition):
        
        num_samples = np.array(data[i]).shape[2]
        X_train = np.zeros((num_samples, sequence_length, num_features-1))
        Y_train = np.zeros(num_samples)
        
        the_labels = pd.DataFrame()
        the_labels['FLARE_CLASS'] = labels[i]['FLARE_CLASS'].map(category_mapping)
        each_partition = np.zeros((sequence_length, num_features, num_samples))
        each_partition = np.array(data[i])

        with tqdm(num_samples) as pbar:
            for j in range(0, num_samples):

                X_train[j, :, :] = each_partition[:,1:num_features,j]
                Y_train[j] = the_labels['FLARE_CLASS'].iloc[j]
                
                pbar.update(1)

        print("P"+str(i+1)+" Nan-Value: "+ str(np.isnan(X_train).any()))            
        num_samples = X_train.shape[0]
        shuffle_indices = np.random.permutation(num_samples)

        X_train_shuffled = X_train[shuffle_indices]
        Y_train_shuffled = Y_train[shuffle_indices]
    

        with open(data_dir + "Partition" + str(i+1) 
                       + "_3DPKL_KnnImputation_ZscoreMinMaxNormalization" + ".pkl", 'wb') as f:
            pickle.dump(X_train_shuffled, f)

        with open(data_dir + "Partition" + str(i+1) 
                       + "_Labels_3DPKL_KnnImputation_ZscoreMinMaxNormalization" + ".pkl", 'wb') as f:
            pickle.dump(Y_train_shuffled, f)

In [8]:
from tqdm import tqdm
import pandas as pd
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/5_6_FinalData_DL_3DPKL_KnnImputation_ZscoreMinMaxNormalization/"

data_for_sequencemodels(1, 5, data_dir, normalized_data, labels)

73492it [00:00, 132330.74it/s]


P1 Nan-Value: False


88557it [00:00, 135305.60it/s]


P2 Nan-Value: False


42510it [00:00, 123685.96it/s]


P3 Nan-Value: False


51261it [00:00, 134622.14it/s]


P4 Nan-Value: False


75365it [00:00, 137389.99it/s]


P5 Nan-Value: False
