# Undersampling and Oversampling

In [1]:
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import TomekLinks
import numpy as np

## SMOTE and ADASYN (with RUS and TomekLinks)

In [5]:
# SMOTE, TomekLinks, and RUS
# ADASYN, TomekLinks, and RUS

def rus_tomek_smote_adasyn(start_position, end_position, data_dir, name, X_train, Y_train,
                           num_nf_to_keep=10000):
    """Resample partitions with RUS + TomekLinks, then SMOTE and ADASYN, and pickle both results.

    For every partition in the requested range the function:
      1. randomly keeps at most ``num_nf_to_keep`` rows labelled 0 and all rows labelled 1,
      2. runs ``TomekLinks`` on that subset,
      3. oversamples the result once with ``SMOTE`` and once with ``ADASYN``,
      4. prints the resulting shapes, shuffles each result and pickles features
         and labels to ``data_dir``.

    Args:
        start_position: 1-based index of the first partition to process.
        end_position: 1-based index of the last partition to process (inclusive).
        data_dir: Output directory prefix; it is concatenated directly with the
            file name, so it must end with a path separator.
        name: Suffix appended to every output file name.
        X_train: Sequence of per-partition feature arrays.
        Y_train: Sequence of per-partition label arrays (labels 0 and 1).
        num_nf_to_keep: Maximum number of label-0 rows kept by the random
            under-sampling step. If a partition has fewer label-0 rows, all of
            them are kept instead of raising.

    Returns:
        None. Results are written to disk as
        ``RUS_Tomek_{Smote,Adasyn}_Partition<k>_<name>.pkl`` (features) and
        ``RUS_Tomek_{Smote,Adasyn}_Partition<k>_Labels_<name>.pkl`` (labels).
    """
    # Imported here so the function does not depend on another cell having
    # imported pickle first.
    import pickle

    for i in range(start_position - 1, end_position):
        features = X_train[i]
        labels = Y_train[i]

        # --- Random under-sampling of class 0 ---
        class_0_indices = np.where(labels == 0)[0]
        class_1_indices = np.where(labels == 1)[0]
        # Cap the request: sampling without replacement cannot return more
        # rows than exist.
        num_kept = min(num_nf_to_keep, len(class_0_indices))
        selected_class_0_indices = np.random.choice(class_0_indices, size=num_kept, replace=False)
        new_indices = np.concatenate((selected_class_0_indices, class_1_indices))
        X_train_with_RUS = features[new_indices]
        Y_train_with_RUS = labels[new_indices]

        # --- Tomek links cleaning ---
        tomek_links = TomekLinks(sampling_strategy='auto')
        X_train_RUS_Tomek, Y_train_RUS_Tomek = tomek_links.fit_resample(X_train_with_RUS, Y_train_with_RUS)

        # --- Oversampling: both methods start from the same cleaned subset ---
        smote = SMOTE(sampling_strategy='minority', random_state=42)
        X_smote, Y_smote = smote.fit_resample(X_train_RUS_Tomek, Y_train_RUS_Tomek)

        adasyn = ADASYN(sampling_strategy='minority', random_state=42)
        X_adasyn, Y_adasyn = adasyn.fit_resample(X_train_RUS_Tomek, Y_train_RUS_Tomek)

        print('Partition' + str(i+1) + ': ')
        print(X_smote.shape)
        print(Y_smote.shape)
        print(X_adasyn.shape)
        print(str(Y_adasyn.shape) + '\n')

        # --- Shuffle and save each result (SMOTE first, then ADASYN) ---
        resampled_sets = (
            ("RUS_Tomek_Smote", X_smote, Y_smote),
            ("RUS_Tomek_Adasyn", X_adasyn, Y_adasyn),
        )
        for tag, X_resampled, Y_resampled in resampled_sets:
            # One permutation is applied to features and labels so rows stay aligned.
            shuffle_indices = np.random.permutation(X_resampled.shape[0])
            file_prefix = data_dir + tag + "_Partition" + str(i+1)

            with open(file_prefix + "_" + name + ".pkl", 'wb') as f:
                pickle.dump(X_resampled[shuffle_indices], f)

            with open(file_prefix + "_Labels_" + name + ".pkl", 'wb') as f:
                pickle.dump(Y_resampled[shuffle_indices], f)



# Gaussian Noise Injection (GNI), TomekLinks, RUS

In [6]:
import numpy as np
import pickle
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE

def rus_tomek_gni(start_position, end_position, data_dir, name, X_train, Y_train, noise_proportion=0.05,
                  num_nf_to_keep=10000):
    """Resample partitions with RUS + TomekLinks, then balance them by Gaussian noise injection.

    For every partition in the requested range the function:
      1. randomly keeps at most ``num_nf_to_keep`` rows labelled 0 and all rows labelled 1,
      2. runs ``TomekLinks`` on that subset,
      3. adds noisy copies of randomly chosen minority-class rows until both
         classes have the same count,
      4. prints the resulting shapes, shuffles the data and pickles features
         and labels to ``data_dir``.

    Args:
        start_position: 1-based index of the first partition to process.
        end_position: 1-based index of the last partition to process (inclusive).
        data_dir: Output directory prefix; it is concatenated directly with the
            file name, so it must end with a path separator.
        name: Suffix appended to every output file name.
        X_train: Sequence of per-partition 2D feature arrays.
        Y_train: Sequence of per-partition label arrays (labels 0 and 1).
        noise_proportion: Factor applied to each feature's standard deviation
            to obtain the standard deviation of the injected noise.
        num_nf_to_keep: Maximum number of label-0 rows kept by the random
            under-sampling step. If a partition has fewer label-0 rows, all of
            them are kept instead of raising.

    Returns:
        None. Results are written to disk as
        ``RUS_Tomek_GNI_Partition<k>_<name>.pkl`` (features) and
        ``RUS_Tomek_GNI_Partition<k>_Labels_<name>.pkl`` (labels).
    """
    for i in range(start_position - 1, end_position):
        features = X_train[i]
        labels = Y_train[i]

        # Random Under Sampling
        class_0_indices = np.where(labels == 0)[0]
        class_1_indices = np.where(labels == 1)[0]
        # Cap the request: sampling without replacement cannot return more
        # rows than exist.
        num_kept = min(num_nf_to_keep, len(class_0_indices))
        selected_class_0_indices = np.random.choice(class_0_indices, size=num_kept, replace=False)
        new_indices = np.concatenate((selected_class_0_indices, class_1_indices))
        X_train_RUS = features[new_indices]
        Y_train_RUS = labels[new_indices]

        # Tomek Links
        tomek_links = TomekLinks(sampling_strategy='auto')
        X_train_RUS_Tomek, Y_train_RUS_Tomek = tomek_links.fit_resample(X_train_RUS, Y_train_RUS)

        # Gaussian Noise Injection
        # The per-feature noise scale is derived from the std of ALL rows
        # (both classes), not just the minority class.
        noise_level = np.std(X_train_RUS_Tomek, axis=0) * noise_proportion
        class_counts = np.bincount(Y_train_RUS_Tomek.astype(int))
        minority_class = np.argmin(class_counts)
        num_samples_to_add = np.max(class_counts) - class_counts[minority_class]
        minority_indices = np.where(Y_train_RUS_Tomek == minority_class)[0]

        if num_samples_to_add > 0:
            # Draw all source rows (with replacement) and all noise at once;
            # noise_level broadcasts across the rows.
            source_rows = np.random.choice(minority_indices, size=num_samples_to_add)
            noise = np.random.normal(
                0, noise_level, size=(num_samples_to_add, X_train_RUS_Tomek.shape[1])
            )
            new_samples = X_train_RUS_Tomek[source_rows] + noise
            X_train_augmented = np.vstack((X_train_RUS_Tomek, new_samples))
        else:
            # Classes are already balanced: nothing to add. (Stacking an empty
            # list onto a 2D array would raise a shape mismatch.)
            X_train_augmented = X_train_RUS_Tomek
        Y_train_augmented = np.append(Y_train_RUS_Tomek, np.full(num_samples_to_add, minority_class))

        print('Partition' + str(i+1) + ': ')
        print(X_train_augmented.shape)
        print(str(Y_train_augmented.shape) + '\n')

        # Shuffle and save the augmented data; one permutation keeps features
        # and labels aligned.
        shuffle_indices = np.random.permutation(len(Y_train_augmented))
        X_train_augmented = X_train_augmented[shuffle_indices]
        Y_train_augmented = Y_train_augmented[shuffle_indices]

        file_prefix = data_dir + "RUS_Tomek_GNI" + "_Partition" + str(i+1)

        with open(file_prefix + "_" + name + ".pkl", 'wb') as f:
            pickle.dump(X_train_augmented, f)

        with open(file_prefix + "_Labels_" + name + ".pkl", 'wb') as f:
            pickle.dump(Y_train_augmented, f)


# TimeGAN, TomekLinks, RUS

In [2]:
import numpy as np
import pickle
from imblearn.under_sampling import TomekLinks
from timegan import timegan

def rus_tomek_timegan(start_position, end_position, data_dir, name, X_train, Y_train,
                      num_nf_to_keep=10000, num_timestamps=60, num_attributes=24):
    """Resample partitions with RUS + TomekLinks, then balance them with TimeGAN samples.

    For every partition in the requested range the function:
      1. randomly keeps at most ``num_nf_to_keep`` rows labelled 0 and all rows labelled 1,
      2. runs ``TomekLinks`` on that subset,
      3. reshapes the flat rows into ``(rows, num_timestamps, num_attributes)``
         sequences and passes the minority-class sequences to ``timegan``,
      4. appends the generated sequences, flattens everything back to 2D,
         prints the shapes, shuffles and pickles features and labels.

    Each flat row is laid out attribute-major: columns
    ``[m * num_timestamps, (m + 1) * num_timestamps)`` hold attribute ``m``.

    Args:
        start_position: 1-based index of the first partition to process.
        end_position: 1-based index of the last partition to process (inclusive).
        data_dir: Output directory prefix; it is concatenated directly with the
            file name, so it must end with a path separator.
        name: Suffix appended to every output file name.
        X_train: Sequence of per-partition 2D feature arrays whose width is
            ``num_timestamps * num_attributes``.
        Y_train: Sequence of per-partition label arrays (labels 0 and 1).
        num_nf_to_keep: Maximum number of label-0 rows kept by the random
            under-sampling step. If a partition has fewer label-0 rows, all of
            them are kept instead of raising.
        num_timestamps: Number of time steps per attribute in a flat row.
        num_attributes: Number of attributes in a flat row.

    Returns:
        None. Results are written to disk as
        ``RUS_Tomek_TimeGAN_Partition<k>_<name>.pkl`` (features) and
        ``RUS_Tomek_TimeGAN_Partition<k>_Labels_<name>.pkl`` (labels).

    Raises:
        ValueError: If a partition's row width is not
            ``num_timestamps * num_attributes``.
    """
    # TimeGAN network parameters
    timegan_params = {
        'module': 'gru',
        'hidden_dim': 24,
        'num_layer': 3,
        'iterations': 6000,
        'batch_size': 128,
    }
    row_width = num_timestamps * num_attributes

    for i in range(start_position - 1, end_position):
        features = X_train[i]
        labels = Y_train[i]

        # Random Under Sampling
        class_0_indices = np.where(labels == 0)[0]
        class_1_indices = np.where(labels == 1)[0]
        # Cap the request: sampling without replacement cannot return more
        # rows than exist.
        num_kept = min(num_nf_to_keep, len(class_0_indices))
        selected_class_0_indices = np.random.choice(class_0_indices, size=num_kept, replace=False)
        new_indices = np.concatenate((selected_class_0_indices, class_1_indices))
        X_train_with_RUS = features[new_indices]
        Y_train_with_RUS = labels[new_indices]

        # Apply Tomek Links
        tomek_links = TomekLinks(sampling_strategy='auto')
        X_train_RUS_Tomek, Y_train_RUS_Tomek = tomek_links.fit_resample(X_train_with_RUS, Y_train_with_RUS)

        if X_train_RUS_Tomek.shape[1] != row_width:
            raise ValueError(
                "Partition " + str(i+1) + " has " + str(X_train_RUS_Tomek.shape[1])
                + " columns; expected num_timestamps * num_attributes = " + str(row_width)
            )

        # Flat attribute-major rows -> (rows, timestamps, attributes).
        # reshape splits each row into one block per attribute; the transpose
        # then moves the time axis in front of the attribute axis.
        X_train_RUS_Tomek_3D = (
            np.asarray(X_train_RUS_Tomek, dtype=np.float64)
            .reshape(X_train_RUS_Tomek.shape[0], num_attributes, num_timestamps)
            .transpose(0, 2, 1)
        )
        del X_train_RUS_Tomek

        # Identify the minority class after RUS and Tomek Links
        class_counts = np.bincount(Y_train_RUS_Tomek.astype(int))
        minority_class = np.argmin(class_counts)
        minority_indices = np.where(Y_train_RUS_Tomek == minority_class)[0]
        minority_class_data = X_train_RUS_Tomek_3D[minority_indices]

        # Number of data to be generated by TimeGAN: with two classes this is
        # the majority count minus the minority count.
        num_of_data_to_be_generated = len(Y_train_RUS_Tomek) - 2 * len(minority_indices)

        if num_of_data_to_be_generated > 0:
            # Generate synthetic data using TimeGAN.
            # NOTE(review): the third argument is presumably the number of
            # sequences to synthesise - confirm against the local timegan module.
            generated_data = timegan(minority_class_data, timegan_params, num_of_data_to_be_generated)
        else:
            # Nothing to generate; skip the TimeGAN training entirely.
            generated_data = np.empty((0, num_timestamps, num_attributes))

        # Combine the original data with generated data
        X_train_augmented = np.vstack((X_train_RUS_Tomek_3D, generated_data))
        Y_train_augmented = np.append(Y_train_RUS_Tomek, np.full(len(generated_data), minority_class))

        # (rows, timestamps, attributes) -> flat attribute-major rows, the
        # inverse of the conversion above.
        new_partition = (
            np.asarray(X_train_augmented, dtype=np.float64)
            .transpose(0, 2, 1)
            .reshape(X_train_augmented.shape[0], row_width)
        )

        print('Partition' + str(i+1) + ': ')
        print(new_partition.shape)
        print(str(Y_train_augmented.shape) + '\n')

        # Shuffle the augmented dataset; one permutation keeps features and
        # labels aligned.
        shuffle_indices = np.random.permutation(len(Y_train_augmented))
        X_augmented_shuffled = new_partition[shuffle_indices]
        Y_augmented_shuffled = Y_train_augmented[shuffle_indices]

        # Save the augmented data
        file_prefix = data_dir + "RUS_Tomek_TimeGAN" + "_Partition" + str(i+1)
        with open(file_prefix + "_" + name + ".pkl", 'wb') as f:
            pickle.dump(X_augmented_shuffled, f)
        with open(file_prefix + "_Labels_" + name + ".pkl", 'wb') as f:
            pickle.dump(Y_augmented_shuffled, f)


In [3]:
import pickle
import pandas as pd

# Directory holding the pickled per-partition feature and label arrays.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/4_4_FinalData_WithoutC_Concatenation_KnnImputation/"

num_partitions = 5

# Index k of each list holds partition k+1.
X_train_array = []
Y_train_array = []

for i in range(num_partitions):
    partition_prefix = data_dir + "Partition" + str(i+1)

    # Features first, then the matching labels, for each partition.
    with open(partition_prefix + "_WithoutC_Concatenation_KnnImputation" + ".pkl", 'rb') as f:
        partition_features = pickle.load(f)
    X_train_array.append(partition_features)

    with open(partition_prefix + "_Labels_WithoutC_Concatenation_KnnImputation" + ".pkl", 'rb') as f:
        partition_labels = pickle.load(f)
    Y_train_array.append(partition_labels)

In [9]:
# Output directory for the resampled partitions; the adjacent literals are
# joined into a single path string.
data_dir = (
    "/Users/samskanderi/Documents/Research_Project/SWANSF/code/"
    "6_1_OUSampling_Concatenation_KnnImputation/"
)

# Suffix appended to every output file name.
name = "OUSampling_WithoutC_Concatenation_KnnImputation"

# Process partitions 1 through 5 with RUS + TomekLinks + SMOTE/ADASYN.
rus_tomek_smote_adasyn(
    start_position=1,
    end_position=5,
    data_dir=data_dir,
    name=name,
    X_train=X_train_array,
    Y_train=Y_train_array,
)

Partition1: 
(19996, 1440)
(19996,)
(20061, 1440)
(20061,)

Partition2: 
(19998, 1440)
(19998,)
(20043, 1440)
(20043,)

Partition3: 
(19994, 1440)
(19994,)
(19955, 1440)
(19955,)

Partition4: 
(19996, 1440)
(19996,)
(19981, 1440)
(19981,)

Partition5: 
(19998, 1440)
(19998,)
(20044, 1440)
(20044,)



In [10]:
# Output directory for the resampled partitions; the adjacent literals are
# joined into a single path string.
data_dir = (
    "/Users/samskanderi/Documents/Research_Project/SWANSF/code/"
    "6_1_OUSampling_Concatenation_KnnImputation/"
)

# Suffix appended to every output file name.
name = "OUSampling_WithoutC_Concatenation_KnnImputation"

# Process partitions 1 through 5 with RUS + TomekLinks + Gaussian noise injection.
rus_tomek_gni(
    start_position=1,
    end_position=5,
    data_dir=data_dir,
    name=name,
    X_train=X_train_array,
    Y_train=Y_train_array,
)

Partition1: 
(19996, 1440)
(19996,)

Partition2: 
(19998, 1440)
(19998,)

Partition3: 
(19998, 1440)
(19998,)

Partition4: 
(20000, 1440)
(20000,)

Partition5: 
(19994, 1440)
(19994,)



In [4]:
# Output directory for the resampled partitions; the adjacent literals are
# joined into a single path string.
data_dir = (
    "/Users/samskanderi/Documents/Research_Project/SWANSF/code/"
    "6_1_OUSampling_Concatenation_KnnImputation/"
)

# Suffix appended to every output file name.
name = "OUSampling_WithoutC_Concatenation_KnnImputation"

# Process partitions 1 through 5 with RUS + TomekLinks + TimeGAN.
rus_tomek_timegan(
    start_position=1,
    end_position=5,
    data_dir=data_dir,
    name=name,
    X_train=X_train_array,
    Y_train=Y_train_array,
)

Start Embedding Network Training
step: 0/6000, e_loss: 0.2455
step: 1000/6000, e_loss: 0.0455
step: 2000/6000, e_loss: 0.0369
step: 3000/6000, e_loss: 0.028
step: 4000/6000, e_loss: 0.0235
step: 5000/6000, e_loss: 0.021
Finish Embedding Network Training
Start Training with Supervised Loss Only
step: 0/6000, s_loss: 0.2197
step: 1000/6000, s_loss: 0.0196
step: 2000/6000, s_loss: 0.0137
step: 3000/6000, s_loss: 0.0114
step: 4000/6000, s_loss: 0.0096
step: 5000/6000, s_loss: 0.0098
Finish Training with Supervised Loss Only
Start Joint Training
step: 0/6000, d_loss: 2.3775, g_loss_u: 0.468, g_loss_s: 0.0201, g_loss_v: 0.2965, e_loss_t0: 0.0412
step: 1000/6000, d_loss: 1.0942, g_loss_u: 1.7368, g_loss_s: 0.0212, g_loss_v: 0.0646, e_loss_t0: 0.0192
step: 2000/6000, d_loss: 1.0218, g_loss_u: 2.0445, g_loss_s: 0.0224, g_loss_v: 0.0436, e_loss_t0: 0.0173
step: 3000/6000, d_loss: 1.2536, g_loss_u: 1.8108, g_loss_s: 0.0201, g_loss_v: 0.0385, e_loss_t0: 0.0164
step: 4000/6000, d_loss: 1.0601, g_lo