In [4]:
import os
import time
import numpy as np
import pandas as pd

import smote_variants as sv
from imblearn import FunctionSampler
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, InstanceHardnessThreshold, NearMiss
from imblearn.over_sampling import SMOTEN

from Dataset_Construction import Balance_Ratio

# Every relative path used in this notebook is resolved against this directory.
# The location can be overridden through the DATA_AFTER_MID_DIR environment
# variable so the notebook is not tied to one user's machine; when the variable
# is unset the original location is used.
DATA_DIR = os.environ.get('DATA_AFTER_MID_DIR', 'C:/Users/Darui Yen/OneDrive/桌面/data_after_mid')
os.chdir(DATA_DIR)
os.getcwd()

'C:\\Users\\Darui Yen\\OneDrive\\桌面\\data_after_mid'

In [5]:
def label_divide(train, test, label = 'GB', train_only = False):
    """Separate the label column from the features of the train (and test) frame.

    Parameters
    ----------
    train : DataFrame containing the `label` column.
    test : DataFrame containing the `label` column; not read when `train_only` is true.
    label : name of the label column.
    train_only : when true, only the training frame is split.

    Returns
    -------
    (train_x, train_y) when `train_only` is true, otherwise
    (train_x, train_y, test_x, test_y).
    """
    train_x, train_y = train.drop(columns = label), train[label]
    if train_only:
        return train_x, train_y

    test_x, test_y = test.drop(columns = label), test[label]
    return train_x, train_y, test_x, test_y

### Oversampling 

In [6]:
def before_over(dataset, label = 'GB'):
    """Convert a DataFrame into the numpy arrays handed to the oversamplers.

    Parameters
    ----------
    dataset : DataFrame containing the `label` column.
    label : name of the label column.

    Returns
    -------
    X : 2-D numpy array of every column except `label` (row index discarded).
    Y : 1-D numpy array of the `label` column.
    colnames : Index holding the feature column names followed by `label`.
    """
    Y = np.array(dataset[label].reset_index(drop = True))

    features = dataset.drop(columns = [label])
    X = features.reset_index(drop = True).to_numpy()

    # Feature names first, label last: after_over keeps the first X.shape[1]
    # names, so this ordering guarantees they line up with the columns of X
    # wherever the label sits in `dataset`. When the label is already the last
    # column this equals dataset.columns.
    colnames = features.columns.insert(len(features.columns), label)

    return X, Y, colnames


def after_over(X, Y, colnames):
    """Wrap oversampled arrays back into pandas objects.

    Parameters
    ----------
    X : 2-D array of features.
    Y : 1-D array of labels.
    colnames : column names; only the first X.shape[1] entries are used.

    Returns
    -------
    (DataFrame of X with those column names, unnamed Series of Y).
    """
    feature_names = colnames[:X.shape[1]]
    return pd.DataFrame(X, columns = feature_names), pd.Series(Y)


def over_sample(X, Y, method, proportion = 0.5, n_neighbors = 5, *args):
    """Oversample (X, Y) with the requested method.

    Parameters
    ----------
    X : 2-D numpy array of features.
    Y : 1-D numpy array of labels.
    method : one of 'NoSMOTE', 'SMOTE', 'MSMOTE', 'ROSE', 'SMOTEN'.
    proportion : passed as the first argument of the smote_variants samplers
        and as `sampling_strategy` to SMOTEN.
        NOTE(review): the two libraries appear to interpret this ratio
        differently (the same value yields different output sizes) - confirm
        that this is intended.
    n_neighbors : neighbour count for SMOTE, MSMOTE and SMOTEN (unused by
        NoSMOTE and ROSE).
    *args : accepted and ignored.

    Returns
    -------
    (over_X, over_Y) as returned by the sampler; for 'ROSE' the features are
    additionally clipped to [-1, 1].

    Raises
    ------
    ValueError : if `method` is not one of the supported names.
    """
    # smote_variants samplers expose .sample(); imblearn's SMOTEN exposes
    # .fit_resample().
    sv_methods = ('NoSMOTE', 'SMOTE', 'MSMOTE', 'ROSE')
    if method not in sv_methods + ('SMOTEN',):
        raise ValueError('Invalid method !')

    if method == 'NoSMOTE':
        over_sampler = sv.NoSMOTE()
    elif method == 'SMOTE':
        over_sampler = sv.SMOTE(proportion, n_neighbors)
    elif method == 'MSMOTE':
        over_sampler = sv.MSMOTE(proportion, n_neighbors)
    elif method == 'ROSE':
        over_sampler = sv.ROSE(proportion)
    else:
        over_sampler = SMOTEN(sampling_strategy = proportion, k_neighbors = n_neighbors)

    if method in sv_methods:
        over_X, over_Y = over_sampler.sample(X, Y)
    else:
        over_X, over_Y = over_sampler.fit_resample(X, Y)

    if method == 'ROSE':
        # Clamp generated values back into [-1, 1] in one vectorized, in-place
        # pass instead of a Python loop over every element.
        # presumably the features are scaled to [-1, 1] - TODO confirm
        np.clip(over_X, -1, 1, out = over_X)

    return over_X, over_Y

### Undersampling

In [7]:
def before_under(dataset, label = 'GB'):
    """Split a DataFrame into its feature columns and its label column.

    Returns
    -------
    (DataFrame without `label`, Series of `label`); the index is kept as is.
    """
    target = dataset[label]
    features = dataset.drop(columns = [label])
    return features, target


def under_sample(X, Y, method, proportion = 0.2, *args):
    """Undersample (X, Y) with the requested method.

    Parameters
    ----------
    X : features.
    Y : labels.
    method : None (no undersampling), 'random', 'Tomek', 'IHT' or 'NM'.
    proportion : passed as `sampling_strategy` to the 'random', 'IHT' and
        'NM' samplers; not used for 'Tomek', which always targets 'majority'.
    *args : accepted and ignored.

    Returns
    -------
    (under_X, under_Y); the inputs themselves when `method` is None.

    Raises
    ------
    ValueError : if `method` is not one of the supported values.
    """
    if method not in (None, 'random', 'Tomek', 'IHT', 'NM'):
        raise ValueError('Invalid method !')

    # None means "skip undersampling": hand the inputs back untouched.
    if method is None:
        return X, Y

    if method == 'random':
        under_sampler = RandomUnderSampler(sampling_strategy = proportion)
    elif method == 'Tomek':
        under_sampler = TomekLinks(sampling_strategy = 'majority')
    elif method == 'IHT':
        under_sampler = InstanceHardnessThreshold(sampling_strategy = proportion, cv = 5)
    else:
        under_sampler = NearMiss(sampling_strategy = proportion, version = 2)

    under_X, under_Y = under_sampler.fit_resample(X, Y)
    return under_X, under_Y


def over_under(dataset, over_method, under_method, *args, label = 'GB'):
    """Optionally undersample `dataset`, then oversample it.

    Parameters
    ----------
    dataset : DataFrame containing the `label` column.
    over_method : method name forwarded to over_sample.
    under_method : method name forwarded to under_sample; None skips the
        undersampling stage.
    *args : accepted and ignored.
    label : name of the label column (keyword-only because it follows *args).

    Returns
    -------
    (X, Y) as produced by after_over. Sizes before and after each stage are
    printed.
    """
    # Undersampling stage (skipped when no method is given)
    if under_method is not None:
        raw_X, raw_Y = before_under(dataset, label)
        raw_Y = raw_Y.astype(int)
        print('Size before Undersampling:', len(raw_Y))
        reduced_X, reduced_Y = under_sample(raw_X, raw_Y, under_method)
        # Re-assemble with the label as the last column for the next stage.
        dataset = pd.concat([reduced_X, reduced_Y], axis = 1)
        print('Size after Undersampling:', len(reduced_Y))

    # Oversampling stage
    array_X, array_Y, colnames = before_over(dataset, label)
    print('Size before Oversampling:', len(array_Y))
    grown_X, grown_Y = over_sample(array_X, array_Y, over_method)
    result_X, result_Y = after_over(grown_X, grown_Y, colnames)
    print('Size after Oversampling:', len(result_Y), '\n')

    return result_X, result_Y

### Generate multiple dataset files

In [8]:
def generate_set(train_data, over_method, under_method, index, label = 'GB'):
    """Resample `train_data` and save the result as dataset_{index}.csv.

    Parameters
    ----------
    train_data : DataFrame containing the `label` column.
    over_method : oversampling method name forwarded to over_under.
    under_method : undersampling method name forwarded to over_under (None to skip).
    index : value used in the output file name.
    label : name of the label column.

    Returns
    -------
    The resampled DataFrame (features plus the `label` column), which is also
    written to 'dataset_{index}.csv' relative to the current working directory.
    """
    # label must be passed by keyword: over_under declares it after *args, so a
    # positional value would be absorbed by *args and the default used instead.
    train_x, train_y = over_under(train_data, over_method, under_method, label = label)
    train = pd.concat([train_x, train_y], axis = 1)
    # The label Series comes back unnamed, so concat names its column 0.
    train = train.rename(columns = {0: label})
    train.to_csv(f'dataset_{index}.csv')

    return train

## Data processing


In [9]:
# Load the training panel: rows are indexed by the 'id' column and the first
# remaining column is dropped.
panel = pd.read_csv('original_data/TrainingSet_new.csv', index_col = 'id').iloc[:, 1:]
# Load the "kinds" table, dropping its first two columns.
bad_kinds = pd.read_csv('original_data/Kinds.csv').iloc[:, 2:]
print('Dimension of Data:', panel.shape, '\nBalance Ratio:', Balance_Ratio(panel))
print('Bad Kinds:', bad_kinds.shape)

# Keep the kinds whose GB_count is at most 2 and drop the last three columns.
# NOTE(review): sub_panel is not used by the cells visible here - confirm it is still needed.
sub_panel = bad_kinds[bad_kinds.GB_count <= 2].iloc[:, :-3]

Dimension of Data: (77138, 83) 
Balance Ratio: 18.17902
Bad Kinds: (6093, 86)


In [10]:
###for single dataset###
train_x, train_y = over_under(panel, 'NoSMOTE', None)
train = pd.concat([train_x, train_y], axis = 1)
train = train.rename(columns = {0: 'GB'})
train.to_csv('Train_sample.csv')

print('Dimension:', '\ntrain x:', train_x.shape, '\ntrain y:', train_y.shape, '\nBalance Ratio:', Balance_Ratio(train))

2021-07-05 23:04:04,382:INFO:NoSMOTE: Running sampling via ('NoSMOTE', '{}')


Size before Oversampling: 77138
Size after Oversampling: 77138 

Dimension: 
train x: (77138, 82) 
train y: (77138,) 
Balance Ratio: 18.17902


In [12]:
# Time the whole batch; each generate_set call also writes dataset_<index>.csv.
# NOTE(review): no random seed is passed to the samplers, so the resampled
# files may differ between runs - confirm whether reproducibility is required.
start = time.time()

# Dataset 0: no undersampling, 'NoSMOTE' sampler.
dataset_0 = generate_set(panel, 'NoSMOTE', None, 0)

# Datasets 1-4: oversampling only.
dataset_1 = generate_set(panel, 'SMOTE', None, 1)
dataset_2 = generate_set(panel, 'MSMOTE', None, 2)
dataset_3 = generate_set(panel, 'ROSE', None, 3)
dataset_4 = generate_set(panel, 'SMOTEN', None, 4)

# Datasets 5-8: 'NM' (NearMiss) undersampling followed by oversampling.
dataset_5 = generate_set(panel, 'SMOTE', 'NM', 5)
dataset_6 = generate_set(panel, 'MSMOTE', 'NM', 6)
dataset_7 = generate_set(panel, 'ROSE', 'NM', 7)
dataset_8 = generate_set(panel, 'SMOTEN', 'NM', 8)

end = time.time()
print("\nRun Time：%f seconds" % (end - start))

2021-07-05 23:04:43,150:INFO:NoSMOTE: Running sampling via ('NoSMOTE', '{}')


Size before Oversampling: 77138
Size after Oversampling: 77138 



2021-07-05 23:04:45,969:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}")


Size before Oversampling: 77138
Size after Oversampling: 111685 



2021-07-05 23:04:50,853:INFO:MSMOTE: Running sampling via ('MSMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}")


Size before Oversampling: 77138
Size after Oversampling: 111685 



2021-07-05 23:05:02,026:INFO:ROSE: Running sampling via ('ROSE', "{'proportion': 0.5, 'random_state': None}")


Size before Oversampling: 77138
Size after Oversampling: 111685 

Size before Oversampling: 77138
Size after Oversampling: 109674 

Size before Undersampling: 77138


2021-07-05 23:07:52,345:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}")


Size after Undersampling: 24132
Size before Oversampling: 24132
Size after Oversampling: 32176 

Size before Undersampling: 77138


2021-07-05 23:08:15,737:INFO:MSMOTE: Running sampling via ('MSMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}")


Size after Undersampling: 24132
Size before Oversampling: 24132
Size after Oversampling: 32176 

Size before Undersampling: 77138


2021-07-05 23:08:40,440:INFO:ROSE: Running sampling via ('ROSE', "{'proportion': 0.5, 'random_state': None}")


Size after Undersampling: 24132
Size before Oversampling: 24132
Size after Oversampling: 32176 

Size before Undersampling: 77138
Size after Undersampling: 24132
Size before Oversampling: 24132
Size after Oversampling: 30165 


Run Time：326.670034 seconds
