# Undersampling and Oversampling

In [21]:
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import TomekLinks
import numpy as np

## SMOTE and ADASYN (with RUS and TomekLinks)

In [22]:
# Pipeline 1: RUS (random undersampling) -> TomekLinks -> SMOTE
# Pipeline 2: RUS (random undersampling) -> TomekLinks -> ADASYN

def _shuffle_and_save(X, Y, data_dir, prefix, partition_number, name, rng):
    """Shuffle X and Y with one shared permutation and pickle both to disk.

    Two files are written:
    ``<data_dir><prefix>_Partition<k>_<name>.pkl`` for the features and
    ``<data_dir><prefix>_Partition<k>_Labels_<name>.pkl`` for the labels.
    """
    # Imported locally so the helper does not depend on a later notebook cell
    # having imported pickle first.
    import pickle

    # A single permutation is applied to both arrays so rows stay aligned.
    shuffle_indices = rng.permutation(X.shape[0])
    X_shuffled = X[shuffle_indices]
    Y_shuffled = Y[shuffle_indices]

    base_path = data_dir + prefix + "_Partition" + str(partition_number)

    with open(base_path + "_" + name + ".pkl", 'wb') as f:
        pickle.dump(X_shuffled, f)

    with open(base_path + "_Labels_" + name + ".pkl", 'wb') as f:
        pickle.dump(Y_shuffled, f)


def rus_tomek_smote_adasyn(start_position, end_position, data_dir, name, X_train, Y_train,
                           num_nf_to_keep=7000, random_state=None):
    """Rebalance partitions with RUS -> TomekLinks -> SMOTE and -> ADASYN.

    For every partition in the requested range the function:

    1. randomly keeps at most ``num_nf_to_keep`` samples labelled 0 and all
       samples labelled 1 (random undersampling, "RUS");
    2. cleans the result with ``TomekLinks``;
    3. oversamples the minority class of the cleaned data twice,
       independently: once with ``SMOTE`` and once with ``ADASYN``;
    4. prints the resulting shapes, shuffles each result and pickles the
       features and labels under ``data_dir``.

    Parameters
    ----------
    start_position, end_position : int
        1-based, inclusive range of partitions to process.
    data_dir : str
        Output location. It is concatenated directly with the file name, so
        it must already end with a path separator.
    name : str
        Suffix embedded in every output file name.
    X_train, Y_train : sequence
        Per-partition features and labels; ``X_train[k]`` / ``Y_train[k]``
        must support integer-array indexing (e.g. NumPy arrays).
        NOTE(review): labels are assumed to be binary 0/1 - confirm.
    num_nf_to_keep : int, optional
        Maximum number of label-0 samples kept by the undersampling step.
        Defaults to 7000, the value that used to be hard-coded.
    random_state : int or None, optional
        Seed for the undersampling draw and the final shuffles. ``None``
        (default) uses NumPy's global random state, as before. SMOTE and
        ADASYN always use ``random_state=42``.

    Raises
    ------
    ValueError
        If ``start_position`` is smaller than 1.
    """
    if start_position < 1:
        # A value below 1 would become a negative index and silently pick a
        # partition from the end of the list.
        raise ValueError("start_position is 1-based and must be >= 1, got "
                         + str(start_position))

    # np.random exposes the same choice/permutation API as a RandomState, so
    # the default path keeps using the global generator exactly as before.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    for i in range(start_position - 1, end_position):
        partition_number = i + 1

        # --- Random undersampling of label 0 ---
        class_0_indices = np.where(Y_train[i] == 0)[0]
        class_1_indices = np.where(Y_train[i] == 1)[0]

        # Sampling without replacement cannot draw more items than exist, so
        # keep every label-0 sample when the partition has fewer than asked.
        num_to_keep = min(num_nf_to_keep, len(class_0_indices))
        selected_class_0_indices = rng.choice(class_0_indices, size=num_to_keep, replace=False)

        new_indices = np.concatenate((selected_class_0_indices, class_1_indices))
        X_train_with_RUS = X_train[i][new_indices]
        Y_train_with_RUS = Y_train[i][new_indices]

        # --- Tomek-link cleaning ---
        tomek_links = TomekLinks(sampling_strategy='auto')
        X_train_RUS_Tomek, Y_train_RUS_Tomek = tomek_links.fit_resample(
            X_train_with_RUS, Y_train_with_RUS)

        # --- Oversampling: both samplers start from the same cleaned data ---
        smote = SMOTE(sampling_strategy='minority', random_state=42)
        X_train_RUS_Tomek_Smote, Y_train_RUS_Tomek_Smote = smote.fit_resample(
            X_train_RUS_Tomek, Y_train_RUS_Tomek)

        adasyn = ADASYN(sampling_strategy='minority', random_state=42)
        X_train_RUS_Tomek_Adasyn, Y_train_RUS_Tomek_Adasyn = adasyn.fit_resample(
            X_train_RUS_Tomek, Y_train_RUS_Tomek)

        print('Partition' + str(partition_number) + ': ')
        print(X_train_RUS_Tomek_Smote.shape)
        print(Y_train_RUS_Tomek_Smote.shape)
        print(X_train_RUS_Tomek_Adasyn.shape)
        print(str(Y_train_RUS_Tomek_Adasyn.shape) + '\n')

        # --- Shuffle and persist (SMOTE first, then ADASYN) ---
        _shuffle_and_save(X_train_RUS_Tomek_Smote, Y_train_RUS_Tomek_Smote,
                          data_dir, "RUS_Tomek_Smote", partition_number, name, rng)
        _shuffle_and_save(X_train_RUS_Tomek_Adasyn, Y_train_RUS_Tomek_Adasyn,
                          data_dir, "RUS_Tomek_Adasyn", partition_number, name, rng)



In [23]:
import pickle
import pandas as pd

# Directory holding the pickled, KNN-imputed partitions produced by an earlier step.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/4_4_FinalData_WithoutB&C_Concatenation_KnnImputation/"

num_partitions = 5

# One entry per partition, in partition order: features and their labels.
X_train_array = []
Y_train_array = []

for i in range(num_partitions):
    # Partition files are numbered from 1 on disk.
    features_path = f"{data_dir}Partition{i + 1}_WithoutB&C_Concatenation_KnnImputation.pkl"
    labels_path = f"{data_dir}Partition{i + 1}_Labels_WithoutB&C_Concatenation_KnnImputation.pkl"

    # NOTE: pickle.load executes arbitrary code; only load files you produced yourself.
    with open(features_path, 'rb') as f:
        X_train_array.append(pickle.load(f))
    with open(labels_path, 'rb') as f:
        Y_train_array.append(pickle.load(f))

In [24]:
# Tag embedded in every output file name.
name = "OUSampling_Concatenation_KnnImputation"

# Output directory for the resampled partitions (must end with a separator,
# because the file name is appended by plain string concatenation).
data_dir = (
    "/Users/samskanderi/Documents/Research_Project/SWANSF/code/"
    "6_1_OUSampling_Concatenation_KnnImputation/"
)

# Resample partitions 1 through 5 and write the results to data_dir.
rus_tomek_smote_adasyn(
    start_position=1,
    end_position=5,
    data_dir=data_dir,
    name=name,
    X_train=X_train_array,
    Y_train=Y_train_array,
)

Partition1: 
(13996, 1440)
(13996,)
(13939, 1440)
(13939,)

Partition2: 
(13998, 1440)
(13998,)
(13965, 1440)
(13965,)

Partition3: 
(13996, 1440)
(13996,)
(13867, 1440)
(13867,)

Partition4: 
(14000, 1440)
(14000,)
(13982, 1440)
(13982,)

Partition5: 
(13998, 1440)
(13998,)
(13995, 1440)
(13995,)

