# Implementation of all the Imputation Techniques including FPCKNN

## Doing Imputation on Multivariate Time Series Data (3D)

### Reading the Data built by the Previous Notebooks

In [8]:
# Loading the Raw Data
import warnings
warnings.filterwarnings("ignore")

import pickle
import numpy as np

data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/1_Raw/"
raw_data = []

num_partitions = 5

for i in range(0,num_partitions):
# Load the array with Pickle
    with open(data_dir + "Partition" + str(i+1) + ".pkl", 'rb') as f:
        raw_data.append(pickle.load(f))

### Nextvalue Imputation Technique

In [6]:
import pandas as pd
import numpy as np

def next_or_previous_value_imputation(vector):
    # Convert the vector to a Pandas Series
    series = pd.Series(vector)
    
    # Perform next value imputation (backfill)
    next_imputed_series = series[::-1].fillna(method='ffill')[::-1]

    # Perform previous value imputation (forward fill) only for NaNs at the end
    previous_imputed_series = next_imputed_series.fillna(method='ffill')

    return previous_imputed_series

In [7]:
# Inter Column and Between Instance Imputation

data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/3_2_BaselineImputation/"

from tqdm import tqdm

k = 100
number_of_partitions = 5
for i in range(0,number_of_partitions):
    new_partition = np.zeros((60,25,np.array(raw_data[i]).shape[2]))
    new_partition = np.array(raw_data[i])
    
    with tqdm(np.array(raw_data[i]).shape[2]) as pbar:
        for j in range(0,np.array(raw_data[i]).shape[2]):
            new_column = np.zeros((60,25))  
            new_column = new_partition[:,:,j]
            new_column[new_column == 0.0] = np.nan
            for m in range(0,24):

                if np.isnan(new_column[:,m+1]).all():
                    new_column[:,m+1] = np.ones(60) 
                else:
                    new_column[:,m+1] = next_or_previous_value_imputation(new_column[:,m+1])
                 
                
            new_partition[:,:,j] = new_column

            pbar.update(1)

        with open(data_dir + "Partition" + str(i+1) + "_NextvalueImputation" + ".pkl", 'wb') as f:
            pickle.dump(new_partition, f)
            
            
# Between Instance Imputation

73492it [01:58, 622.50it/s]
88557it [02:28, 596.86it/s]
42510it [01:13, 575.67it/s]
51261it [01:27, 585.09it/s]
75365it [02:12, 566.97it/s]


### Mean Imputation Technique

In [2]:
# Inter Column and Between Instance Imputation

data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/3_2_BaselineImputation/"

from tqdm import tqdm

number_of_partitions = 5
for i in range(0,number_of_partitions):
    new_partition = np.zeros((60,25,np.array(raw_data[i]).shape[2]))
    new_partition = np.array(raw_data[i])
    
    with tqdm(np.array(raw_data[i]).shape[2]) as pbar:
        for j in range(0,np.array(raw_data[i]).shape[2]):
            new_column = np.zeros((60,25))  
            new_column = new_partition[:,:,j]
            new_column[new_column == 0.0] = np.nan
            for m in range(0,24):

                if np.isnan(new_column[:,m+1]).all():
                    new_column[:,m+1] = np.ones(60) 
                else:
                    mean_non_zero = np.mean(new_column[:,m+1][~np.isnan(new_column[:,m+1])])
                    new_column[:,m+1][np.isnan(new_column[:,m+1])] = mean_non_zero
                 
                
            new_partition[:,:,j] = new_column

            pbar.update(1)

        with open(data_dir + "Partition" + str(i+1) + "_MeanImputation" + ".pkl", 'wb') as f:
            pickle.dump(new_partition, f)
            
            
# Between Instance Imputation

73492it [00:09, 8046.90it/s]
88557it [00:10, 8127.42it/s]
42510it [00:05, 7531.84it/s]
51261it [00:06, 7532.10it/s]
75365it [00:09, 7755.86it/s]


# FPCKNN Imputation Technique (our Novelty)

In [2]:
# Correlation Function
from scipy.stats import pearsonr

def mvts_pearson_correlation(X, Y):
    if X.shape != Y.shape:
        raise ValueError("Input arrays X and Y must have the same shape.")

    # Calculate the Pearson correlation coefficient between X and Y
    X = X.T.flatten()
    Y = Y.T.flatten()
    try:
        correlation_coefficient , p = pearsonr(X, Y)
        if np.isnan(correlation_coefficient):
            correlation_coefficient = -1.0
    except:
        correlation_coefficient = -1.0
    
    return correlation_coefficient

In [3]:
# Inter Column and Between Instance Imputation
import warnings
warnings.filterwarnings("ignore")

from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=10)

data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/3_KnnImputation/"

from tqdm import tqdm

k_count = 50
k = 50
number_of_partitions = 5
num_attributes = 25
num_timestamps = 60

for i in range(0,number_of_partitions):
    new_partition = np.zeros((num_timestamps,num_attributes,np.array(raw_data[i]).shape[2]))
    new_partition = np.array(raw_data[i])
    
    with tqdm(new_partition.shape[2]) as pbar:
        for j in range(0,new_partition.shape[2]):
            new_column = np.zeros((num_timestamps,num_attributes))  
            new_column = new_partition[:,:,j]
            
            
            
            new_column[new_column == 0.] = np.nan
            
            if np.isnan(new_column[:,1:25]).all():
                new_column = new_partition[:,:,j-1]
            
            nan_index = []
            if np.isnan(new_column).any():
                for m in range(0,num_attributes-1):
                    if np.isnan(new_column[:,m+1]).any():
                        nan_index.append(m+1)
                
                if j < k_count:
                    if j == 0:
                        k = 1
                    else:
                        k = j
                else:
                    k = k_count
                correlation_coefficient = np.full(k, -2.0)
                the_X = new_column
                
                the_X = np.nan_to_num(the_X, nan=0.0)
                
                for n in range(0, k):
                    the_Y = new_partition[:,:,j-n-1]
                    correlation_coefficient[n] = mvts_pearson_correlation(the_X[:,1:25], the_Y[:,1:25])
                    
                for m in range(0,num_attributes-1):

                    if np.isnan(new_column[:,m+1]).all():
                        
                        indices_of_largest = np.where(correlation_coefficient == np.max(correlation_coefficient))
                        first_occurrence_index = indices_of_largest[0][0]
                        new_column[:,m+1] = new_partition[:,m+1,j-first_occurrence_index-1]

                    else:
                        if j>=2:

                            sorted_indices = np.argsort(correlation_coefficient)[::-1]  # Sort in descending order
                            largest_index = sorted_indices[0]  # Index of the largest item
                            second_largest_index = sorted_indices[1]

                            new_2d = [new_partition[:,m+1,j-second_largest_index-1], new_partition[:,m+1,j-largest_index-1], new_column[:,m+1]]
                            new_column[:,m+1] = imputer.fit_transform(new_2d)[2,:]
                        else:
                            new_2d = new_column[:,m+1].reshape(-1, 1)
                            new_column[:,m+1] = imputer.fit_transform(new_2d)[:,0]
                
                
            new_partition[:,:,j] = new_column

            pbar.update(1)

        with open(data_dir + "Partition" + str(i+1) + "_KnnImputation" + ".pkl", 'wb') as f:
            pickle.dump(new_partition, f)
            
            
# Between Instance Imputation

73492it [14:03, 87.11it/s] 
88557it [17:51, 82.66it/s] 
42510it [08:15, 85.86it/s] 
51261it [10:58, 77.81it/s]
75365it [16:19, 76.97it/s] 


### Missing Value Exploration

In [4]:
import pickle
import numpy as np

data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/3_KnnImputation/"
imputed_data = []

number_of_partitions = 5
for i in range(1,number_of_partitions +1):
# Load the array with Pickle
    with open(data_dir + "Partition" + str(i) + "_KnnImputation" + ".pkl", 'rb') as f:
        imputed_data.append(pickle.load(f))

In [5]:
def print_missing_values(data, start_partition, end_partition):
    abt_header = ['Timestamp', 'R_VALUE','TOTUSJH','TOTBSQ','TOTPOT','TOTUSJZ','ABSNJZH','SAVNCPP',
                               'USFLUX','TOTFZ','MEANPOT', 'EPSX', 'EPSY','EPSZ','MEANSHR','SHRGT45','MEANGAM',
                                  'MEANGBT','MEANGBZ','MEANGBH','MEANJZH','TOTFY','MEANJZD','MEANALP','TOTFX']
    num_columns = 25
    num_timestamps = 60
    num_partitions = 5
    null_count = [0,0,0,0,0]
    non_null_count = [0,0,0,0,0]
    null_count_per_feature = np.zeros((num_partitions,num_columns), dtype=int)

    for i in range(start_partition-1, end_partition):
        partition = np.array(data[i])

        for j in range(0,partition.shape[2]):
            mvts = partition[:,:, j]
            for m in range(0,num_columns):
                for n in range (0,num_timestamps):
                    if (mvts[n,m] == 0.0 or np.isnan(mvts[n,m]) or np.isinf(mvts[n,m])):
                        null_count[i] += 1
                        null_count_per_feature[i,m] += 1
                    else:
                        non_null_count[i] += 1

        print("Partition" + str(i+1) + ":")
        print("null counts in P" + str(i+1) + ": " + str(null_count[i]))
        print("non-null counts in P"+ str(i+1) + ": " + str(non_null_count[i]))
        for x in range(0,num_columns):
            print(abt_header[x] + ": " + str(null_count_per_feature[i,x]))

        print("\n")

In [15]:
def print_values(data, start_partition, end_partition):
    abt_header = ['Timestamp', 'R_VALUE','TOTUSJH','TOTBSQ','TOTPOT','TOTUSJZ','ABSNJZH','SAVNCPP',
                               'USFLUX','TOTFZ','MEANPOT', 'EPSX', 'EPSY','EPSZ','MEANSHR','SHRGT45','MEANGAM',
                                  'MEANGBT','MEANGBZ','MEANGBH','MEANJZH','TOTFY','MEANJZD','MEANALP','TOTFX']
    num_columns = 2
    num_partitions = 5

    for i in range(start_partition-1, end_partition):
        partition = np.array(data[i])
        print('Partition: ' + str(i+1))
        for j in range(1,partition.shape[1]):
            mvts = partition[:,j, :]
            print('Min ' + abt_header[j] + ': ' + str(np.min(mvts)))
            print('Max ' + abt_header[j] + ': ' + str(np.max(mvts)))

            print("\n")

In [16]:
print_values(imputed_data,1,5)

Partition: 1
Min R_VALUE: 0.0017177569608999
Max R_VALUE: 5.674543728986246


Min TOTUSJH: 0.0001809902644906
Max TOTUSJH: 6078.23759924654


Min TOTBSQ: 31324.6457
Max TOTBSQ: 102662810417.13412


Min TOTPOT: 5.350247853635544e+16
Max TOTPOT: 2.820688751045556e+25


Min TOTUSJZ: 128034521.39929274
Max TOTUSJZ: 187665592928940.44


Min ABSNJZH: 5.330749738963192e-06
Max ABSNJZH: 2582.482977714385


Min SAVNCPP: 5204636.654815674
Max SAVNCPP: 92546068787422.56


Min USFLUX: 6175406931195526.0
Max USFLUX: 8.775930293666179e+22


Min TOTFZ: -3.928351131267749e+25
Max TOTFZ: 1.28553633794317e+24


Min MEANPOT: 40.286658827899146
Max MEANPOT: 55455407.40811653


Min EPSX: -0.4803478799556892
Max EPSX: 0.4596854816593336


Min EPSY: -0.4722807843417359
Max EPSY: 0.4917124917771694


Min EPSZ: -0.9905897598353742
Max EPSZ: 0.9992097972611852


Min MEANSHR: 2.6141170203785107
Max MEANSHR: 78.62631173441771


Min SHRGT45: 0.0097399435083276
Max SHRGT45: 100.0


Min MEANGAM: 8.845779332189387
Ma

### Saving Final Data Imputed by FPCKNN

We chose to name our imputation technique KnnImputation instead of FPCKNN for saving as a file, as it more clearly indicates that it is based on the KNN technique.

In [7]:
import pickle
import numpy as np

data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/3_KnnImputation/"
imputed_data = []

number_of_partitions = 5
for i in range(1,number_of_partitions +1):
# Load the array with Pickle
    with open(data_dir + "Partition" + str(i) + "_KnnImputation" + ".pkl", 'rb') as f:
        imputed_data.append(pickle.load(f))

In [2]:
import pandas as pd
labels = []
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/2_Labels/"

for i in range(1,6):
    labels.append(pd.read_csv(data_dir + "Partition" + str(i) + "_labels.csv"))

## Doing Imputation on Concatenated Multivariate Time Series Data (2D)

### Converting the 3D Multivariate Time Series Data to a 2D Data

In [1]:
# Concatenation with Shuffle

def multi_to_uni(start_partition, end_partition, data_dir, data, labels, name):
    category_mapping = {'X': 1, 'M': 1, 'B': 0, 'C': 0, 'FQ': 0}
    
    num_attributes = 25
    num_timestamps = 60
    
    for i in range(start_partition-1,end_partition):
        
        the_labels = pd.DataFrame()
        the_labels['FLARE_CLASS'] = labels[i]['FLARE_CLASS'].map(category_mapping)
        new_partition = np.zeros((np.array(data[i]).shape[2], num_timestamps*(num_attributes-1)))
        new_partition_label = np.zeros(new_partition.shape[0])
        
        each_partition = np.zeros((num_timestamps, num_attributes, new_partition.shape[0]))
        each_partition = np.array(data[i])
        
        with tqdm(new_partition.shape[0]) as pbar:
            for j in range(0,new_partition.shape[0]):
                new_column = np.zeros((num_timestamps,num_attributes)) 
                new_column = each_partition[:,:,j]

                flettened = np.zeros(num_timestamps*(num_attributes-1))

                for m in range(1,num_attributes):
                    flettened[(m-1)*num_timestamps:m*num_timestamps] = new_column[:,m]

                new_partition[j,:] = flettened
                new_partition_label[j] = the_labels.iloc[j]
                
                
                pbar.update(1)
        
        print("P"+str(i+1)+" Nan-Value: "+ str(np.isnan(new_partition).any()))
        X_train = new_partition
        Y_train = new_partition_label


        num_samples = X_train.shape[0]
        shuffle_indices = np.random.permutation(num_samples)

        X_train_shuffled = X_train[shuffle_indices]
        Y_train_shuffled = Y_train[shuffle_indices]
    

        with open(data_dir + "Partition" + str(i+1) 
                       + "_Concatenation_" + name + ".pkl", 'wb') as f:
            pickle.dump(X_train_shuffled, f)

        with open(data_dir + "Partition" + str(i+1) 
                       + "_Labels_Concatenation_" + name + ".pkl", 'wb') as f:
            pickle.dump(Y_train_shuffled, f)

In [10]:
from tqdm import tqdm
import pandas as pd
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/4_1_FinalData_Concatenation_KnnImputation/"

multi_to_uni(1, 5, data_dir, imputed_data, labels)

73492it [00:01, 41704.33it/s]


P1 Nan-Value: False


88557it [00:02, 37354.93it/s]


P2 Nan-Value: False


42510it [00:01, 42142.43it/s]


P3 Nan-Value: False


51261it [00:01, 43855.74it/s]


P4 Nan-Value: False


75365it [00:01, 43659.81it/s]


P5 Nan-Value: False


### Mean Imputation Technique

In [1]:
import pickle
import numpy as np

data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/3_2_BaselineImputation/"
imputed_data = []

number_of_partitions = 5
for i in range(1,number_of_partitions +1):
# Load the array with Pickle
    with open(data_dir + "Partition" + str(i) + "_MeanImputation" + ".pkl", 'rb') as f:
        imputed_data.append(pickle.load(f))

In [2]:
import pandas as pd
labels = []
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/2_Labels/"

for i in range(1,6):
    labels.append(pd.read_csv(data_dir + "Partition" + str(i) + "_labels.csv"))

In [4]:
from tqdm import tqdm
import pandas as pd
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/4_0_FinalData_BaseLineImputation/"
name = 'MeanImputation'
multi_to_uni(1, 5, data_dir, imputed_data, labels, name)

73492it [00:01, 40753.18it/s]


P1 Nan-Value: False


88557it [00:02, 37072.17it/s]


P2 Nan-Value: False


42510it [00:01, 38876.49it/s]


P3 Nan-Value: False


51261it [00:01, 40859.67it/s]


P4 Nan-Value: False


75365it [00:01, 40216.50it/s]


P5 Nan-Value: False


### Nextvalue Imputation Technique

In [2]:
import pickle
import numpy as np

data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/3_2_BaselineImputation/"
imputed_data = []

number_of_partitions = 5
for i in range(1,number_of_partitions +1):
# Load the array with Pickle
    with open(data_dir + "Partition" + str(i) + "_NextvalueImputation" + ".pkl", 'rb') as f:
        imputed_data.append(pickle.load(f))

In [3]:
import pandas as pd
labels = []
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/2_Labels/"

for i in range(1,6):
    labels.append(pd.read_csv(data_dir + "Partition" + str(i) + "_labels.csv"))

In [4]:
from tqdm import tqdm
import pandas as pd
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/4_0_FinalData_BaseLineImputation/"
name = 'NextvalueImputation'
multi_to_uni(1, 5, data_dir, imputed_data, labels, name)

73492it [00:01, 42710.72it/s]


P1 Nan-Value: False


88557it [00:02, 40533.38it/s]


P2 Nan-Value: False


42510it [00:01, 39358.24it/s]


P3 Nan-Value: False


51261it [00:01, 41648.52it/s]


P4 Nan-Value: False


75365it [00:01, 40292.19it/s]


P5 Nan-Value: False


## Doing Imputation on Vector Data

For each time series feature of SWAN-SF, we calculate nine statistical features to convert the multivariate time series data into vector data.

In [5]:
# NewFeatures with Shuffle

def new_Features_pkl(start_partition, end_partition, data_dir, data, labels, name):
    category_mapping = {'X': 1, 'M': 1, 'B': 0, 'C': 0, 'FQ': 0}

    number_of_new_features = 9
    num_attributes = 25
    num_timestamps = 60
    
    for i in range(start_partition-1,end_partition):
        
        the_labels = pd.DataFrame()
        the_labels['FLARE_CLASS'] = labels[i]['FLARE_CLASS'].map(category_mapping)
        new_partition = np.zeros((np.array(data[i]).shape[2], number_of_new_features*24))
        new_partition_label = np.zeros(new_partition.shape[0])
        
        each_partition = np.zeros((num_timestamps, num_attributes, new_partition.shape[0]))
        each_partition = np.array(data[i])
        
        with tqdm(new_partition.shape[0]) as pbar:
            for j in range(0,new_partition.shape[0]):
                new_column = np.zeros((num_timestamps,num_attributes)) 
                new_column = each_partition[:,:,j]
                

                new_features = np.zeros(number_of_new_features*(num_attributes-1))

                for m in range(1,num_attributes):
                    
                    mean = np.mean(new_column[:,m])
                    new_features[((m-1)*number_of_new_features) + 0] = mean
                    median = np.median(new_column[:,m])
                    new_features[((m-1)*number_of_new_features) + 1] = median
                    std = np.std(new_column[:,m])
                    new_features[((m-1)*number_of_new_features) + 2] = std
                    
                    skewness = skew(new_column[:,m])
                    if np.isnan(skewness):
                        skewness = new_partition[j-1, ((m-1)*number_of_new_features) + 3]
                    new_features[((m-1)*number_of_new_features) + 3] = skewness
                    
                    kurtosis_value = kurtosis(new_column[:,m])
                    if np.isnan(kurtosis_value):
                        kurtosis_value = new_partition[j-1, ((m-1)*number_of_new_features) + 4]
                    new_features[((m-1)*number_of_new_features) + 4] = kurtosis_value
                    
                    indices = np.arange(num_timestamps)
                    weight_array = indices / num_timestamps
                    weighted_avg = np.average(new_column[:,m], weights=weight_array)
                    if np.isnan(weighted_avg):
                        weighted_avg = new_partition[j-1, ((m-1)*number_of_new_features) + 5]
                    new_features[((m-1)*number_of_new_features) + 5] = weighted_avg
                    
                    last_value = new_column[59,m]
                    new_features[((m-1)*number_of_new_features) + 6] = last_value
                    first_value = new_column[0,m]
                    new_features[((m-1)*number_of_new_features) + 7] = first_value
                    
                    numerator = np.sum((new_column[:,m] - mean) * (indices - np.mean(indices)))
                    denominator = np.sum((new_column[:,m] - mean) ** 2)
                    slope = numerator / denominator
                    if np.isnan(slope):
                        slope = new_partition[j-1, ((m-1)*number_of_new_features) + 8]
                    new_features[((m-1)*number_of_new_features) + 8] = slope
                    
                    if all(value == 1.0 for value in new_column[:,m]):
                        new_features[((m-1)*number_of_new_features) + 0 : ((m-1)*number_of_new_features) + 9] = np.ones(9)
                
                
                if np.isnan(new_features).any():
                    print('nan')
                    print()
                    mean_non_zero = np.mean(new_features[~np.isnan(new_features)])
                    new_features[np.isnan(new_features)] = mean_non_zero    
                    
                new_partition[j,:] = new_features
                new_partition_label[j] = the_labels.iloc[j]
                
                pbar.update(1)
                

        print("P"+str(i+1)+" Nan-Value: "+ str(np.isnan(new_partition).any()))  
        X_train = new_partition
        Y_train = new_partition_label


        num_samples = X_train.shape[0]
        shuffle_indices = np.random.permutation(num_samples)

        X_train_shuffled = X_train[shuffle_indices]
        Y_train_shuffled = Y_train[shuffle_indices]
    

        with open(data_dir + "Partition" + str(i+1) 
                       + "_NewFeatures_" + name + ".pkl", 'wb') as f:
            pickle.dump(X_train_shuffled, f)

        with open(data_dir + "Partition" + str(i+1) 
                       + "_Labels_NewFeatures_" + name + ".pkl", 'wb') as f:
            pickle.dump(Y_train_shuffled, f)

In [None]:
from tqdm import tqdm
import pandas as pd
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/4_2_FinalData_NewFeatures_KnnImputation/"

new_Features_pkl(1, 5, data_dir, imputed_data, labels)

### Mean Imputation Technique

In [12]:
from tqdm import tqdm
import pandas as pd
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/4_0_FinalData_BaseLineImputation/"
name = 'MeanImputation'
new_Features_pkl(1, 5, data_dir, imputed_data, labels, name)

  skewness = skew(new_column[:,m])
  kurtosis_value = kurtosis(new_column[:,m])
  slope = numerator / denominator
73492it [07:41, 159.09it/s]


P1 Nan-Value: False


  skewness = skew(new_column[:,m])
  kurtosis_value = kurtosis(new_column[:,m])
  slope = numerator / denominator
88557it [09:21, 157.83it/s]


P2 Nan-Value: False


  skewness = skew(new_column[:,m])
  kurtosis_value = kurtosis(new_column[:,m])
  slope = numerator / denominator
42510it [04:28, 158.60it/s]


P3 Nan-Value: False


  skewness = skew(new_column[:,m])
  kurtosis_value = kurtosis(new_column[:,m])
  slope = numerator / denominator
51261it [05:24, 157.96it/s]


P4 Nan-Value: False


  skewness = skew(new_column[:,m])
  kurtosis_value = kurtosis(new_column[:,m])
  slope = numerator / denominator
75365it [07:53, 159.07it/s]


P5 Nan-Value: False


### Nextvalue Imputation Technique

In [6]:
from tqdm import tqdm
import pandas as pd
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/4_0_FinalData_BaseLineImputation/"
name = 'NextvalueImputation'
new_Features_pkl(1, 5, data_dir, imputed_data, labels, name)

  skewness = skew(new_column[:,m])
  kurtosis_value = kurtosis(new_column[:,m])
  slope = numerator / denominator
73492it [10:57, 111.74it/s]


P1 Nan-Value: False


  skewness = skew(new_column[:,m])
  kurtosis_value = kurtosis(new_column[:,m])
  slope = numerator / denominator
88557it [14:19, 103.08it/s]


P2 Nan-Value: False


  skewness = skew(new_column[:,m])
  kurtosis_value = kurtosis(new_column[:,m])
  slope = numerator / denominator
42510it [07:08, 99.31it/s] 


P3 Nan-Value: False


  skewness = skew(new_column[:,m])
  kurtosis_value = kurtosis(new_column[:,m])
  slope = numerator / denominator
51261it [08:08, 104.83it/s]


P4 Nan-Value: False


  skewness = skew(new_column[:,m])
  kurtosis_value = kurtosis(new_column[:,m])
  slope = numerator / denominator
75365it [11:33, 108.71it/s]


P5 Nan-Value: False


## Final Data Without Imputation

### Concatenated Multivariate Time Series Data (2D)

In [13]:
# Concatenation with Shuffle

def multi_to_uni(start_partition, end_partition, data_dir, data, labels):
    category_mapping = {'X': 1, 'M': 1, 'B': 0, 'C': 0, 'FQ': 0}
    
    num_attributes = 25
    num_timestamps = 60
    
    for i in range(start_partition-1,end_partition):
        
        the_labels = pd.DataFrame()
        the_labels['FLARE_CLASS'] = labels[i]['FLARE_CLASS'].map(category_mapping)
        new_partition = np.zeros((np.array(data[i]).shape[2], num_timestamps*(num_attributes-1)))
        new_partition_label = np.zeros(new_partition.shape[0])
        
        each_partition = np.zeros((num_timestamps, num_attributes, new_partition.shape[0]))
        each_partition = np.array(data[i])
        
        with tqdm(new_partition.shape[0]) as pbar:
            for j in range(0,new_partition.shape[0]):
                new_column = np.zeros((num_timestamps,num_attributes)) 
                new_column = each_partition[:,:,j]
                
                new_column = np.nan_to_num(new_column, nan=0.0)
                
                flettened = np.zeros(num_timestamps*(num_attributes-1))

                for m in range(1,num_attributes):
                    flettened[(m-1)*num_timestamps:m*num_timestamps] = new_column[:,m]

                new_partition[j,:] = flettened
                new_partition_label[j] = the_labels.iloc[j]
                
                
                pbar.update(1)
        
        print("P"+str(i+1)+" Nan-Value: "+ str(np.isnan(new_partition).any()))
        X_train = new_partition
        Y_train = new_partition_label


        num_samples = X_train.shape[0]
        shuffle_indices = np.random.permutation(num_samples)

        X_train_shuffled = X_train[shuffle_indices]
        Y_train_shuffled = Y_train[shuffle_indices]
    

        with open(data_dir + "Partition" + str(i+1) 
                       + "_Concatenation_Raw" + ".pkl", 'wb') as f:
            pickle.dump(X_train_shuffled, f)

        with open(data_dir + "Partition" + str(i+1) 
                       + "_Labels_Concatenation_Raw" + ".pkl", 'wb') as f:
            pickle.dump(Y_train_shuffled, f)

In [14]:
from tqdm import tqdm
import pandas as pd
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/4_6_FinalData_Concatenation_Raw/"

multi_to_uni(1, 5, data_dir, raw_data, labels)

73492it [00:02, 28942.39it/s]


P1 Nan-Value: False


88557it [00:03, 28486.30it/s]


P2 Nan-Value: False


42510it [00:01, 27675.02it/s]


P3 Nan-Value: False


51261it [00:01, 28415.85it/s]


P4 Nan-Value: False


75365it [00:02, 26911.71it/s]


P5 Nan-Value: False


### Vector Data

In [5]:
# NewFeatures with Shuffle

def new_Features_pkl(start_partition, end_partition, data_dir, data, labels):
    category_mapping = {'X': 1, 'M': 1, 'B': 0, 'C': 0, 'FQ': 0}

    number_of_new_features = 9
    num_attributes = 25
    num_timestamps = 60
    
    for i in range(start_partition-1,end_partition):
        
        the_labels = pd.DataFrame()
        the_labels['FLARE_CLASS'] = labels[i]['FLARE_CLASS'].map(category_mapping)
        new_partition = np.zeros((np.array(data[i]).shape[2], number_of_new_features*24))
        new_partition_label = np.zeros(new_partition.shape[0])
        
        each_partition = np.zeros((num_timestamps, num_attributes, new_partition.shape[0]))
        each_partition = np.array(data[i])
        
        with tqdm(new_partition.shape[0]) as pbar:
            for j in range(0,new_partition.shape[0]):
                new_column = np.zeros((num_timestamps,num_attributes)) 
                new_column = each_partition[:,:,j]
                
                new_column = np.nan_to_num(new_column, nan=0.0)

                new_features = np.zeros(number_of_new_features*(num_attributes-1))

                for m in range(1,num_attributes):
                    all_zeros = np.all(new_column[:,m] == 0.0)
                    if all_zeros:
                        new_features[((m-1)*number_of_new_features):((m-1)*number_of_new_features)+9] = 0
                    else:
                        mean = np.mean(new_column[:,m])
                        new_features[((m-1)*number_of_new_features) + 0] = mean
                        median = np.median(new_column[:,m])
                        new_features[((m-1)*number_of_new_features) + 1] = median
                        std = np.std(new_column[:,m])
                        new_features[((m-1)*number_of_new_features) + 2] = std

                        skewness = skew(new_column[:,m])
                        if np.isreal(skewness) == False:
                            skewness = new_partition[j-1, ((m-1)*number_of_new_features) + 3]
                        new_features[((m-1)*number_of_new_features) + 3] = skewness

                        kurtosis_value = kurtosis(new_column[:,m])
                        if np.isreal(kurtosis_value) == False:
                            kurtosis_value = new_partition[j-1, ((m-1)*number_of_new_features) + 4]
                        new_features[((m-1)*number_of_new_features) + 4] = kurtosis_value

                        indices = np.arange(num_timestamps)
                        weight_array = indices / num_timestamps
                        weighted_avg = np.average(new_column[:,m], weights=weight_array)
                        if weighted_avg == np.nan:
                            weighted_avg = new_partition[j-1, ((m-1)*number_of_new_features) + 5]
                        new_features[((m-1)*number_of_new_features) + 5] = weighted_avg

                        last_value = new_column[59,m]
                        new_features[((m-1)*number_of_new_features) + 6] = last_value
                        first_value = new_column[0,m]
                        new_features[((m-1)*number_of_new_features) + 7] = first_value

                        numerator = np.sum((new_column[:,m] - mean) * (indices - np.mean(indices)))
                        denominator = np.sum((new_column[:,m] - mean) ** 2)
                        slope = numerator / denominator
                        if np.isreal(slope) == False:
                            slope = new_partition[j-1, ((m-1)*number_of_new_features) + 8]
                        new_features[((m-1)*number_of_new_features) + 8] = slope
                
                new_partition[j,:] = new_features
                new_partition_label[j] = the_labels.iloc[j]
                
                pbar.update(1)
                

        print("P"+str(i+1)+" Nan-Value: "+ str(np.isnan(new_partition).any()))  
        X_train = new_partition
        Y_train = new_partition_label


        num_samples = X_train.shape[0]
        shuffle_indices = np.random.permutation(num_samples)

        X_train_shuffled = X_train[shuffle_indices]
        Y_train_shuffled = Y_train[shuffle_indices]
    

        with open(data_dir + "Partition" + str(i+1) 
                       + "_NewFeatures_Raw" + ".pkl", 'wb') as f:
            pickle.dump(X_train_shuffled, f)

        with open(data_dir + "Partition" + str(i+1) 
                       + "_Labels_NewFeatures_Raw" + ".pkl", 'wb') as f:
            pickle.dump(Y_train_shuffled, f)

In [6]:
from tqdm import tqdm
import pandas as pd
from scipy.stats import skew, kurtosis

data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/4_7_FinalData_NewFeatures_Raw/"

new_Features_pkl(1, 5, data_dir, raw_data, labels)

73492it [07:42, 158.87it/s]


P1 Nan-Value: False


88557it [09:06, 162.00it/s]


P2 Nan-Value: False


42510it [04:25, 160.27it/s]


P3 Nan-Value: False


51261it [05:19, 160.53it/s]


P4 Nan-Value: False


75365it [07:49, 160.61it/s]


P5 Nan-Value: False
