# Oversampling

In [5]:
from imblearn.over_sampling import SMOTE, ADASYN
import numpy as np

## SMOTE and ADASYN

In [6]:
# SMOTE oversampling and ADASYN oversampling of the minority class.
# (The function below applies neither TomekLinks nor RUS.)

def smote_adasyn(start_position, end_position, data_dir, name, X_train, Y_train):
    """Resample partitions with SMOTE and ADASYN, shuffle them, and pickle them.

    Partitions are addressed with 1-based, inclusive bounds: partition ``p``
    reads ``X_train[p - 1]`` and ``Y_train[p - 1]``.

    For each partition the data is resampled twice, once with ``SMOTE`` and
    once with ``ADASYN`` (both built with ``sampling_strategy='minority'`` and
    ``random_state=42``). The four resulting shapes are printed, each result
    is shuffled with ``np.random.permutation`` (NumPy's global random state),
    and features and labels are written to

        <data_dir><Method>_Partition<p>_<name>.pkl
        <data_dir><Method>_Partition<p>_Labels_<name>.pkl

    where ``<Method>`` is ``Smote`` or ``Adasyn``.

    Args:
        start_position: Number of the first partition to process (1-based).
        end_position: Number of the last partition to process (inclusive).
        data_dir: Output path prefix; it is concatenated directly with the
            file name, so it needs its own trailing separator.
        name: Suffix placed in every output file name.
        X_train: Sequence of per-partition feature arrays.
        Y_train: Sequence of per-partition label arrays.
    """
    for partition_number in range(start_position, end_position + 1):
        features = X_train[partition_number - 1]
        labels = Y_train[partition_number - 1]

        smote_X, smote_Y = SMOTE(
            sampling_strategy='minority', random_state=42
        ).fit_resample(features, labels)
        adasyn_X, adasyn_Y = ADASYN(
            sampling_strategy='minority', random_state=42
        ).fit_resample(features, labels)

        print('Partition' + str(partition_number) + ': ')
        for shape in (smote_X.shape, smote_Y.shape, adasyn_X.shape):
            print(shape)
        print(str(adasyn_Y.shape) + '\n')

        resampled_sets = (
            ("Smote", smote_X, smote_Y),
            ("Adasyn", adasyn_X, adasyn_Y),
        )
        for method, resampled_X, resampled_Y in resampled_sets:
            # One permutation is applied to both arrays so that every row
            # keeps its label.
            # assumes fit_resample returned NumPy arrays (fancy indexing) —
            # TODO confirm for non-ndarray inputs.
            order = np.random.permutation(resampled_X.shape[0])
            shuffled_X = resampled_X[order]
            shuffled_Y = resampled_Y[order]

            stem = data_dir + method + "_Partition" + str(partition_number)

            with open(stem + "_" + name + ".pkl", 'wb') as features_file:
                pickle.dump(shuffled_X, features_file)

            with open(stem + "_Labels_" + name + ".pkl", 'wb') as labels_file:
                pickle.dump(shuffled_Y, labels_file)



## Gaussian Noise Injection

In [None]:
def _inject_gaussian_noise(X_minority, copies, num_attributes, noise_level):
    """Return ``copies`` noisy duplicates of every row of ``X_minority``.

    Each row is treated as ``num_attributes`` equally long, contiguous
    segments. The noise added to a segment is zero-mean Gaussian with a
    standard deviation of ``noise_level`` times that segment's own standard
    deviation, so every attribute is perturbed relative to its own scale.
    A constant segment (standard deviation 0) therefore receives no noise.
    """
    num_rows, row_length = X_minority.shape
    if num_attributes <= 0 or row_length % num_attributes != 0:
        raise ValueError(
            "Row length " + str(row_length) + " cannot be split into "
            + str(num_attributes) + " equally sized attribute segments"
        )

    segments = X_minority.reshape(
        num_rows, num_attributes, row_length // num_attributes
    )
    segment_std = segments.std(axis=2, keepdims=True)

    # Copies of one instance are adjacent (instance-major order).
    repeated = np.repeat(segments, copies, axis=0)
    noise_scale = noise_level * np.repeat(segment_std, copies, axis=0)

    noise = np.random.normal(0.0, 1.0, size=repeated.shape) * noise_scale
    return (repeated + noise).reshape(num_rows * copies, row_length)


def gni(start_position, end_position, data_dir, name, X_train, Y_train,
        num_attributes=24, noise_level=0.05):
    """Oversample class 1 by Gaussian noise injection, shuffle, and pickle.

    Partitions are addressed with 1-based, inclusive bounds: partition ``p``
    reads ``X_train[p - 1]`` and ``Y_train[p - 1]``.

    For each partition, ``ratio = round(n_class_0 / n_class_1)`` noisy copies
    of every class-1 row are generated and labelled 1. The original rows and
    the synthetic rows are concatenated, the shapes are printed, the result
    is shuffled with ``np.random.permutation`` and written to

        <data_dir>GNI_Partition<p>_<name>.pkl
        <data_dir>GNI_Partition<p>_Labels_<name>.pkl

    NOTE(review): generating ``ratio`` copies per class-1 row (kept from the
    original code) leaves class 1 with ``n_class_1 * (ratio + 1)`` rows,
    which exceeds ``n_class_0``; confirm whether ``ratio - 1`` copies were
    intended for an exactly balanced set.

    Args:
        start_position: Number of the first partition to process (1-based).
        end_position: Number of the last partition to process (inclusive).
        data_dir: Output path prefix; it is concatenated directly with the
            file name, so it needs its own trailing separator.
        name: Suffix placed in every output file name.
        X_train: Sequence of per-partition 2-D feature arrays
            (rows are instances).
        Y_train: Sequence of per-partition label arrays holding 0 and 1.
        num_attributes: Number of equally long attribute segments per row.
            assumes attributes are stored as contiguous segments of the
            flattened row — TODO confirm against the concatenation step.
        noise_level: Noise standard deviation expressed as a fraction of each
            attribute segment's standard deviation. The original code left
            the noise scale undefined; 0.05 is a conservative default.

    Raises:
        ValueError: If a partition has no class-1 rows, or if the row length
            is not divisible by ``num_attributes``.
    """
    # The partition index gets its own name: the original reused ``i`` in the
    # inner generation loop, which corrupted the partition number used for
    # printing and for the output file names.
    for partition_index in range(start_position - 1, end_position):
        partition_number = partition_index + 1

        X_partition = np.asarray(X_train[partition_index])
        Y_partition = np.asarray(Y_train[partition_index])

        class_0_indices = np.where(Y_partition == 0)[0]
        class_1_indices = np.where(Y_partition == 1)[0]

        X_train_0 = X_partition[class_0_indices]
        Y_train_0 = Y_partition[class_0_indices]
        X_train_1 = X_partition[class_1_indices]
        Y_train_1 = Y_partition[class_1_indices]

        if Y_train_1.shape[0] == 0:
            raise ValueError(
                "Partition" + str(partition_number)
                + " has no class-1 samples to oversample"
            )

        ratio = round(Y_train_0.shape[0] / Y_train_1.shape[0])

        synthetic_time_series = _inject_gaussian_noise(
            X_train_1, ratio, num_attributes, noise_level
        )
        # Match the label dtype so the concatenated labels are not upcast.
        synthetic_time_series_labels = np.ones(
            synthetic_time_series.shape[0], dtype=Y_partition.dtype
        )

        X_train_GNI = np.concatenate(
            (X_train_0, X_train_1, synthetic_time_series), axis=0
        )
        Y_train_GNI = np.concatenate(
            (Y_train_0, Y_train_1, synthetic_time_series_labels), axis=0
        )

        print('Partition' + str(partition_number) + ': ')
        print(X_train_GNI.shape)
        print(str(Y_train_GNI.shape) + '\n')

        # One permutation is applied to both arrays so rows keep their labels.
        shuffle_indices = np.random.permutation(Y_train_GNI.shape[0])
        X_train_GNI_shuffled = X_train_GNI[shuffle_indices]
        Y_train_GNI_shuffled = Y_train_GNI[shuffle_indices]

        stem = data_dir + "GNI" + "_Partition" + str(partition_number)

        with open(stem + "_" + name + ".pkl", 'wb') as f:
            pickle.dump(X_train_GNI_shuffled, f)

        with open(stem + "_Labels_" + name + ".pkl", 'wb') as f:
            pickle.dump(Y_train_GNI_shuffled, f)
            

## TimeGAN (state-of-the-art)

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import warnings
warnings.filterwarnings("ignore")

import numpy as np
# 1. TimeGAN model
from timegan import timegan


def timegan(start_position, end_position, data_dir, name, X_train, Y_train,
            parameters=None):
    """Oversample class 1 with TimeGAN-generated rows, shuffle, and pickle.

    Partitions are addressed with 1-based, inclusive bounds: partition ``p``
    reads ``X_train[p - 1]`` and ``Y_train[p - 1]``.

    For each partition the TimeGAN generator is run
    ``ratio = round(n_class_0 / n_class_1)`` times on the class-1 rows. All
    generated rows are labelled 1 and concatenated with the original rows.
    The shapes are printed, the result is shuffled with
    ``np.random.permutation`` and written to

        <data_dir>TimeGAN_Partition<p>_<name>.pkl
        <data_dir>TimeGAN_Partition<p>_Labels_<name>.pkl

    NOTE(review): the number of generator runs is kept from the original
    code; confirm it yields the intended class balance.

    Args:
        start_position: Number of the first partition to process (1-based).
        end_position: Number of the last partition to process (inclusive).
        data_dir: Output path prefix; it is concatenated directly with the
            file name, so it needs its own trailing separator.
        name: Suffix placed in every output file name.
        X_train: Sequence of per-partition feature arrays.
        Y_train: Sequence of per-partition label arrays holding 0 and 1.
        parameters: Optional network-parameter dict passed to the generator.
            Defaults to the GRU configuration previously hard-coded here.

    Raises:
        ValueError: If a partition has no class-1 rows.
    """
    # This wrapper has the same name as the generator imported at module
    # level and therefore replaces it in the module namespace: a plain
    # ``timegan(X, parameters)`` call would recurse into this function with
    # the wrong arguments. Import the generator locally under an alias.
    from timegan import timegan as timegan_generate

    if parameters is None:
        # Network parameters
        parameters = {
            'module': 'gru',
            'hidden_dim': 1440,
            'num_layer': 3,
            'iterations': 10000,
            'batch_size': 128,
        }

    # The partition index gets its own name: the original reused ``i`` in the
    # generation loop, which corrupted the partition number used for printing
    # and for the output file names.
    for partition_index in range(start_position - 1, end_position):
        partition_number = partition_index + 1

        X_partition = np.asarray(X_train[partition_index])
        Y_partition = np.asarray(Y_train[partition_index])

        class_0_indices = np.where(Y_partition == 0)[0]
        class_1_indices = np.where(Y_partition == 1)[0]

        X_train_0 = X_partition[class_0_indices]
        Y_train_0 = Y_partition[class_0_indices]
        X_train_1 = X_partition[class_1_indices]
        Y_train_1 = Y_partition[class_1_indices]

        if Y_train_1.shape[0] == 0:
            raise ValueError(
                "Partition" + str(partition_number)
                + " has no class-1 samples to oversample"
            )

        ratio = round(Y_train_0.shape[0] / Y_train_1.shape[0])

        # Run TimeGAN. Results are collected in a list because NumPy arrays
        # have no ``append`` method.
        # NOTE(review): assumes the generator accepts X_train_1 as given and
        # returns data whose trailing dimensions match it — confirm against
        # the timegan module.
        generated_runs = [
            np.asarray(timegan_generate(X_train_1, parameters))
            for _ in range(ratio)
        ]
        if generated_runs:
            synthetic_time_series = np.concatenate(generated_runs, axis=0)
        else:
            # ratio == 0: nothing to generate, keep shapes compatible.
            synthetic_time_series = np.empty(
                (0,) + X_train_1.shape[1:], dtype=X_train_1.dtype
            )

        # Match the label dtype so the concatenated labels are not upcast.
        synthetic_time_series_labels = np.ones(
            synthetic_time_series.shape[0], dtype=Y_partition.dtype
        )

        X_train_TimeGAN = np.concatenate(
            (X_train_0, X_train_1, synthetic_time_series), axis=0
        )
        Y_train_TimeGAN = np.concatenate(
            (Y_train_0, Y_train_1, synthetic_time_series_labels), axis=0
        )

        print('Partition' + str(partition_number) + ': ')
        print(X_train_TimeGAN.shape)
        print(str(Y_train_TimeGAN.shape) + '\n')

        # One permutation is applied to both arrays so rows keep their labels.
        shuffle_indices = np.random.permutation(X_train_TimeGAN.shape[0])
        X_train_TimeGAN_shuffled = X_train_TimeGAN[shuffle_indices]
        Y_train_TimeGAN_shuffled = Y_train_TimeGAN[shuffle_indices]

        stem = data_dir + "TimeGAN" + "_Partition" + str(partition_number)

        with open(stem + "_" + name + ".pkl", 'wb') as f:
            pickle.dump(X_train_TimeGAN_shuffled, f)

        with open(stem + "_Labels_" + name + ".pkl", 'wb') as f:
            pickle.dump(Y_train_TimeGAN_shuffled, f)
            

In [7]:
import pickle
import pandas as pd

# Directory that holds the pickled training partitions and their label files.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/4_4_FinalData_WithoutB&C_Concatenation_KnnImputation/"

num_partitions = 5

# One entry per partition, appended in partition order (Partition1 first).
X_train_array, Y_train_array = [], []

for i in range(num_partitions):
    partition_prefix = data_dir + "Partition" + str(i + 1)

    # Features of partition i + 1.
    with open(partition_prefix + "_WithoutB&C_Concatenation_KnnImputation" + ".pkl", 'rb') as f:
        X_train_array.append(pickle.load(f))

    # Labels of partition i + 1.
    with open(partition_prefix + "_Labels_WithoutB&C_Concatenation_KnnImputation" + ".pkl", 'rb') as f:
        Y_train_array.append(pickle.load(f))

In [8]:
# Output path prefix for the oversampled partitions (keeps its trailing slash
# because file names are appended to it directly).
data_dir = (
    "/Users/samskanderi/Documents/Research_Project/SWANSF/code/"
    "6_2_OSampling_Concatenation_KnnImputation/"
)

# Suffix placed in every output file name.
name = "OSampling_Concatenation_KnnImputation"

# Process partitions 1 through 5 (bounds are 1-based and inclusive).
smote_adasyn(1, 5, data_dir, name, X_train_array, Y_train_array)

Partition1: 
(120260, 1440)
(120260,)
(120300, 1440)
(120300,)

Partition2: 
(146736, 1440)
(146736,)
(146767, 1440)
(146767,)

Partition3: 
(69524, 1440)
(69524,)
(69586, 1440)
(69586,)

Partition4: 
(86588, 1440)
(86588,)
(86615, 1440)
(86615,)

Partition5: 
(125376, 1440)
(125376,)
(125382, 1440)
(125382,)

