# Undersampling and Oversampling

In [27]:
import pickle
import pandas as pd
import numpy as np

data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/5_2_FinalTrain&Test_ML_NewFeatures_KnnImputation_LogSquareBoxCoxZscoreMinMaxNormalization/"

# Feature and label files share the same trailing name; label files insert
# "_Labels" right after the partition number.
feature_suffix = '_NewFeatures_KnnImputation_LogSquareBoxCoxZscoreMinMaxNormalization.csv'
label_suffix = '_Labels' + feature_suffix

num_partitions = 5

# One DataFrame per partition, stored at list position (partition number - 1).
X_train_array = []
Y_train_array = []

for i in range(num_partitions):
    partition_stem = f"{data_dir}Partition{i + 1}"

    # Everything is cast to float32 right after loading.
    features = pd.read_csv(partition_stem + feature_suffix).astype('float32')
    X_train_array.append(features)

    labels = pd.read_csv(partition_stem + label_suffix).astype('float32')
    Y_train_array.append(labels)
   

## SMOTE and ADASYN (with RUS and TomekLinks)

In [32]:
# Pipeline 1: RUS -> TomekLinks -> SMOTE
# Pipeline 2: RUS -> TomekLinks -> ADASYN

def rus_tomek_smote_adasyn(start_position, end_position, data_dir, X_train, Y_train,
                           num_nf_to_keep=7000):
    """Rebalance each partition with RUS + TomekLinks, then SMOTE and ADASYN.

    For every selected partition the function:

    1. randomly keeps ``num_nf_to_keep`` rows whose ``Flare_Class`` is 0
       (sampled without replacement) together with every row whose
       ``Flare_Class`` is 1;
    2. applies ``TomekLinks(sampling_strategy='auto')`` to that subset;
    3. oversamples the result twice, independently: once with SMOTE and once
       with ADASYN (both ``sampling_strategy='minority'``, ``random_state=42``);
    4. prints the shapes of both resampled sets;
    5. shuffles each resampled set and writes features and labels to CSV
       files under ``data_dir``.

    The undersampling draw and the shuffles use NumPy's global random state,
    so call ``np.random.seed(...)`` beforehand if a repeatable run is needed.
    ``TomekLinks``, ``SMOTE`` and ``ADASYN`` are looked up when the function
    is called, so they must already be imported in the enclosing module.

    Args:
        start_position: 1-based number of the first partition to process.
        end_position: 1-based number of the last partition to process
            (inclusive).
        data_dir: Prefix prepended to every output file name; it is
            concatenated as-is, so it should end with a path separator.
        X_train: Sequence of feature DataFrames, one per partition.
        Y_train: Sequence of label DataFrames, one per partition, each with a
            ``Flare_Class`` column and the same index as the matching
            ``X_train`` entry.
        num_nf_to_keep: Number of ``Flare_Class == 0`` rows kept by the random
            undersampling step.

    Raises:
        ValueError: If a partition has fewer ``Flare_Class == 0`` rows than
            ``num_nf_to_keep``.
    """
    for i in range(start_position - 1, end_position):
        partition_number = i + 1
        features = X_train[i]
        labels = Y_train[i]

        nf_indices = labels[labels['Flare_Class'] == 0].index
        fl_indices = labels[labels['Flare_Class'] == 1].index

        # Sampling without replacement cannot return more rows than exist;
        # fail early with a message that names the offending partition.
        if len(nf_indices) < num_nf_to_keep:
            raise ValueError(
                f"Partition{partition_number} has only {len(nf_indices)} rows with "
                f"Flare_Class == 0; cannot keep {num_nf_to_keep} without replacement"
            )

        # Random undersampling (RUS) of class 0; class 1 is kept in full.
        kept_nf_indices = np.random.choice(nf_indices, num_nf_to_keep, replace=False)

        X_train_with_RUS = pd.concat([features.loc[kept_nf_indices], features.loc[fl_indices]], axis=0)
        Y_train_with_RUS = pd.concat([labels.loc[kept_nf_indices], labels.loc[fl_indices]], axis=0)

        tomek_links = TomekLinks(sampling_strategy='auto')
        X_train_RUS_Tomek, Y_train_RUS_Tomek = tomek_links.fit_resample(X_train_with_RUS, Y_train_with_RUS)

        # Both oversamplers start from the same Tomek-cleaned data.
        smote = SMOTE(sampling_strategy='minority', random_state=42)
        X_train_RUS_Tomek_Smote, Y_train_RUS_Tomek_Smote = smote.fit_resample(X_train_RUS_Tomek, Y_train_RUS_Tomek)

        adasyn = ADASYN(sampling_strategy='minority', random_state=42)
        X_train_RUS_Tomek_Adasyn, Y_train_RUS_Tomek_Adasyn = adasyn.fit_resample(X_train_RUS_Tomek, Y_train_RUS_Tomek)

        print(f'Partition{partition_number}: ')
        print(X_train_RUS_Tomek_Smote.shape)
        print(Y_train_RUS_Tomek_Smote.shape)
        print(X_train_RUS_Tomek_Adasyn.shape)
        print(str(Y_train_RUS_Tomek_Adasyn.shape) + '\n')

        # SMOTE output is shuffled and written first, then ADASYN, so the
        # sequence of draws from the global RNG stays the same as before.
        _shuffle_and_save(X_train_RUS_Tomek_Smote, Y_train_RUS_Tomek_Smote,
                          data_dir, 'RUS_Tomek_Smote', partition_number)
        _shuffle_and_save(X_train_RUS_Tomek_Adasyn, Y_train_RUS_Tomek_Adasyn,
                          data_dir, 'RUS_Tomek_Adasyn', partition_number)


def _shuffle_and_save(X_resampled, Y_resampled, data_dir, method_tag, partition_number):
    """Shuffle features and labels with one shared permutation and write both to CSV."""
    shuffle_indices = np.random.permutation(len(X_resampled))
    X_shuffled = X_resampled.iloc[shuffle_indices].reset_index(drop=True)
    Y_shuffled = Y_resampled.iloc[shuffle_indices].reset_index(drop=True)

    file_stem = f"{data_dir}{method_tag}_Partition{partition_number}"
    name_suffix = "_NewFeatures_KnnImputation_LogSquareBoxCoxZscoreMinMaxNormalization.csv"

    X_shuffled.to_csv(file_stem + name_suffix, index=False)
    Y_shuffled.to_csv(file_stem + "_Labels" + name_suffix, index=False)

In [33]:
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import TomekLinks
import numpy as np

# Output folder for the resampled partitions; the trailing "/" matters because
# file names are appended to this string directly.
data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/6_1_OverUnderSamplingData_NewFeatures_KnnImputation_LogSquareBoxCoxZscoreMinMaxNormalization/"

# Resample partitions 1 through 5 (both bounds are 1-based and inclusive).
rus_tomek_smote_adasyn(
    start_position=1,
    end_position=5,
    data_dir=data_dir,
    X_train=X_train_array,
    Y_train=Y_train_array,
)

Partition1: 
(13956, 216)
(13956, 1)
(13803, 216)
(13803, 1)

Partition2: 
(13962, 216)
(13962, 1)
(14153, 216)
(14153, 1)

Partition3: 
(13944, 216)
(13944, 1)
(14019, 216)
(14019, 1)

Partition4: 
(13962, 216)
(13962, 1)
(13928, 216)
(13928, 1)

Partition5: 
(13984, 216)
(13984, 1)
(13850, 216)
(13850, 1)

