In [1]:
import os
import time
import math
import random
import numpy as np
import pandas as pd
import itertools
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

import smote_variants as sv
#from imblearn import FunctionSampler
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, InstanceHardnessThreshold, NearMiss
from imblearn.over_sampling import ADASYN, SMOTEN

from Dataset_Construction import Balance_Ratio
from Training_Data_Processing import Corner, Kind

os.chdir('C:/Users/user/Desktop/Darui_R08621110')
os.getcwd()

'C:\\Users\\user\\Desktop\\Darui_R08621110'

In [2]:
def label_divide(train, test, label = 'GB', train_only = False):
    
    train_x = train.drop(columns = label)
    train_y = train[label]
    
    if not train_only:
        test_x = test.drop(columns = label)
        test_y = test[label]    
        return train_x, train_y, test_x, test_y
    else:
        return train_x, train_y

### Self-defined Oversampling

In [3]:
'''DEF 1'''
## Input : X+Y(146 cols)
## 計算距離 data1 及 data2 的距離
## Output : 資料點之間的距離
def Distance(data1, data2):
    
    data1 = data1.iloc[:,:-1].values
    data2 = data2.iloc[:,:-1].values

    df=pd.DataFrame()
    for i in tqdm(range(len(data1))):
        
        hamming_set=[]
        for j in range(len(data2)):
            
            hamming = abs(data1[i] - data2[j]).sum()
            hamming_set=np.append(hamming_set,hamming)
            
        hamming_set=pd.DataFrame(hamming_set).T
        df=pd.concat([df,hamming_set])
        
    dis_df = df.reset_index().iloc[:,1:]
    
    return dis_df


'''DEF 2'''
## 給定 df 和 value 找出其 row 和 col
def getIndexes(dfObj, value):
    ''' Get index positions of value in dataframe i.e. dfObj.'''
    listOfPos = list()
    # Get bool dataframe with True at positions where the given value exists
    result = dfObj.isin([value])
    # Get list of columns that contains the value
    seriesObj = result.any()
    columnNames = list(seriesObj[seriesObj == True].index)
    # Iterate over list of columns and fetch the rows indexes where value exists
    for col in columnNames:
        rows = list(result[col][result[col] == True].index)
        for row in rows:
            listOfPos.append((row, col))
    # Return a list of tuples indicating the positions of value in the dataframe
    
    return listOfPos


'''DEF 3'''
## Input : X+Y(level)
#給定一個 資料集 和 值 後, 找出相對應的位置
def ID_Given_Distance_2(data1, data2, d):
    
    D_Matrix = Distance(data1, data2)
    ##數量不同 無法使用上三角
    D_Matrix = D_Matrix.where(np.triu(np.ones(D_Matrix.shape)).astype(np.bool))
    combine = getIndexes(D_Matrix, d)
    
    return combine


'''DEF 4'''
# 2個以上不同 的排列組合
def perm(cols):
    
    from itertools import permutations
    s=set()
    num=cols-1 

    random_list = []
    for i in range(3) :
        TF_list_1 = [True]*(cols-1-i) + [False]*(i+1)
        TF_list_2 = [False]*(cols-1-i) + [True]*(i+1)
        shuffle_time = [1, 3, 6]
        
        for j in range(shuffle_time[i]) :
            temp1 = TF_list_1.copy()
            temp2 = TF_list_2.copy()
            random.shuffle(temp1)
            random.shuffle(temp2)
            random_list.append(temp1)
            random_list.append(temp2)
            
    return(random_list)
    

'''DEF 7'''
#找到各組間隔為 rank 的座標
def cumu_conbine(data, rank, level = 'GB'):
    
    data1=data[data[level]==1]
    data2=data[data[level]==0]
    combine=[]
    
    for i in range(rank):
        combine=combine+ID_Given_Distance_2(data1,data2,i+1)
        
    return combine


'''Main Function 1'''
def Border(data, Near_Minor = 3, Major_Ratio_max = 0.5, n_major_corner = 20, level = 'GB'):
    
    data1=data[data[level]==1]
    data2=data[data[level]==0]
    d=data.iloc[:,:-1].copy()
    training_df=pd.DataFrame()
    
    for a in tqdm(range(2,Near_Minor+1)):

        combine=ID_Given_Distance_2(data1, data1, a)
        smote_df=pd.DataFrame()
        if len(combine)!=0:

            for b in tqdm(range(len(combine))):

                ##選定一組數字,一個當中心點
                a_pair=combine[b]
                point_0=a_pair[0]
                point_1=a_pair[1]

                # d_smote 的初始值和中心點一樣
                d_smote=d.loc[point_0].copy()

                # 找出有差異處的 cols
                d_X=d.loc[point_0]-d.loc[point_1]
                cols=d_X[d_X!=0].index

                ## L為距離 2 的點之間的排列組合
                L=perm(len(cols))

                '''創造DATA_SMOTE(給定中心點:d_smote 和 L)'''
                s_df=pd.DataFrame()

                for i in tqdm(range(len(L))):

                    cb=L[i]
                    s=pd.DataFrame([d_smote]).copy()

                    for j in range(len(cb)):
                        if cb[j]==True:
                            s[cols[j]]=1-d_smote[cols[j]]
                        elif cb[j]==False:
                            s[cols[j]]=d_smote[cols[j]]
                    s_df=pd.concat([s_df,s])
                s_df=s_df.reset_index(drop=True)

                smote_df = pd.concat([smote_df, s_df]) #new added
                smote_df['GB'] = 1 #new_added

        smote_df=smote_df.drop_duplicates().reset_index(drop=True)
        training_df=pd.concat([training_df,smote_df])

    training_df=training_df.drop_duplicates().reset_index(drop=True)
    
    return training_df

### Oversampling 

In [4]:
def before_over(dataset, label = 'GB'):
    
    colnames = dataset.columns
    Y = dataset[label]
    Y = Y.reset_index(drop = True)
    Y = np.array(Y)
    X = dataset.drop(columns = [label])
    X = X.reset_index(drop = True)
    X = X.to_numpy()
    
    return X, Y, colnames


def after_over(X, Y, colnames, back_to_category = False):
    
    colnames = colnames[:X.shape[1]]
    X = pd.DataFrame(X, columns = colnames)
    
    if back_to_category:
        for j in tqdm(range(X.shape[1])):
            colvalue = X.iloc[:, j]
            upper = np.array(colvalue[colvalue < 1])
            lower = np.array(upper[upper > 0])
            colmean = np.mean(lower)
            
            mask = colvalue >= colmean
            X.iloc[mask, j] = 1
            X.iloc[~mask, j] = 0
    
    Y = pd.Series(Y)
    
    return X, Y


def over_sample(X, Y, method, ratio, n_neighbors = 5, *args):
    
    method_list = ['NoSMOTE', 'SMOTE', 'MSMOTE', 'ROSE', 'SMOTEN', 'ADASYN']
    if method not in method_list:
        raise Exception('Invalid method !')
    
    if method == method_list[0]:
        over_sampler = sv.NoSMOTE()
    elif method == method_list[1]:
        over_sampler = sv.SMOTE(ratio, n_neighbors)
    elif method == method_list[2]:
        over_sampler = sv.MSMOTE(ratio, n_neighbors)
    elif method == method_list[3]:
        over_sampler = sv.ROSE(ratio)   
    elif method == method_list[4]:
        over_sampler = SMOTEN(sampling_strategy = ratio, k_neighbors = n_neighbors)
    elif method == method_list[5]:
        over_sampler = ADASYN(sampling_strategy = ratio, n_neighbors = n_neighbors)    
    
    if method in method_list[0:4]:
        over_X, over_Y = over_sampler.sample(X, Y)
    else:
        over_X, over_Y = over_sampler.fit_resample(X, Y)
    
#     if method == 'ROSE':
#         for i in range(over_X.shape[0]):
#             for j in range(over_X.shape[1]):
#                 if over_X[i, j] > 1:
#                     over_X[i, j] = 1
#                 elif over_X[i, j] < -1:
#                     over_X[i, j] = -1
    
    return over_X, over_Y

### Undersampling

In [5]:
def before_under(dataset, label = 'GB'):
    
    Y = dataset[label]
    X = dataset.drop(columns = [label])
    
    return X, Y


def under_sample(X, Y, method, ratio, *args):
    
    method_list = [None, 'random', 'Tomek', 'IHT', 'NM', 'one-sided', 'r-one-sided']
    if method not in method_list:
        raise Exception('Invalid method !')
    
    if method == method_list[0]:
        return X, Y
        
    elif method == method_list[1]:
        under_sampler = RandomUnderSampler(sampling_strategy = ratio)    
    elif method == method_list[2]:
        under_sampler = TomekLinks(sampling_strategy = 'majority')
    elif method == method_list[3]:
        under_sampler = InstanceHardnessThreshold(sampling_strategy = ratio, cv = 5, n_jobs = -1)
    elif method in (method_list[4] + method_list[5]):
        under_sampler = NearMiss(sampling_strategy = ratio, version = 2, n_jobs = -1)
    elif method == method_list[6]:
        under_sampler = InstanceHardnessThreshold(sampling_strategy = 1, cv = 5, n_jobs = -1)
    
    under_X, under_Y = under_sampler.fit_resample(X, Y)
    
    if method == method_list[5]:
        second_sampler = InstanceHardnessThreshold(sampling_strategy = 1, cv = 5, n_jobs = -1)
        under_X, under_Y = second_sampler.fit_resample(under_X, under_Y)
    elif method == method_list[6]:
        second_sampler = NearMiss(sampling_strategy = ratio, version = 2, n_jobs = -1)
        under_X, under_Y = second_sampler.fit_resample(under_X, under_Y)
    
    return under_X, under_Y

### Generate multiple dataset file

In [6]:
def under_over(dataset, over_method, under_method, over_ratio, under_ratio, label = 'GB'):
    
    #undersampling
    if under_method != None:
        X, Y = before_under(dataset, label)
        Y = Y.astype(int)
        print('Size before Undersampling:', len(Y))
        under_X, under_Y = under_sample(X, Y, under_method, under_ratio)
        dataset = pd.concat([under_X, under_Y], axis = 1)
        print('Size after Undersampling:', len(under_Y))
    
    #oversampling
    temp_X, temp_Y, colnames = before_over(dataset, label)
    print('Size before Oversampling:', len(temp_Y))
    over_X, over_Y = over_sample(temp_X, temp_Y, over_method, over_ratio)
    X, Y = after_over(over_X, over_Y, colnames)
    print('Size after Oversampling:', len(Y))
        
    return X, Y


def over_under(dataset, over_method, under_method, over_ratio, under_ratio, label = 'GB') :
    
    #oversampling
    if over_method != None :
        X, Y, colnames = before_over(dataset, label)
        print('Size before Oversampling:', len(Y))
        temp_X, temp_Y = over_sample(X, Y, over_method, over_ratio)
        over_X, over_Y = after_over(temp_X, temp_Y, colnames)
        print('Size after Oversampling:', len(over_Y))
        over_dataset = pd.concat([over_X, over_Y], axis = 1)
        dataset = over_dataset.rename(columns = {0 : label})

    #undersampling
    X, Y = before_under(dataset, label)
    Y = Y.astype(int)
    under_X, under_Y = under_sample(X, Y, under_method, under_ratio)
    print('Size after Undersampling:', len(under_Y))
    
    return under_X, under_Y
    
    
def generate_set(train_data, over_method, under_method, index, over_ratio, under_ratio, order, label = 'GB'):
    
    print('\n', f'Generating Dataset {index}')
    
    if order == 'under' :
        train_x, train_y = under_over(train_data, over_method, under_method, over_ratio, under_ratio, label)
    elif order == 'over' :
        train_x, train_y = over_under(train_data, over_method, under_method, over_ratio, under_ratio, label)
        
    train = pd.concat([train_x, train_y], axis = 1)
    train = train.rename(columns = {0: label})
    train.to_csv(f'dataset_{index}.csv')
    
    return train

## Data processing


### Event data

In [None]:
panel = pd.read_csv('original_data/TrainingSet_new.csv', index_col = 'id').iloc[:, 1:]
kinds = pd.read_csv('original_data/Kinds.csv').iloc[:, 2:]
print('Dimension of Data:', panel.shape, '\nBalance Ratio:', Balance_Ratio(panel))
print('Kinds:', kinds.shape)

#sub_panel = kinds[kinds.GB_count <= 2].iloc[:, :-3]

In [None]:
###for single dataset###
train_x, train_y = over_under(panel, 'NoSMOTE', None)
train = pd.concat([train_x, train_y], axis = 1)
train = train.rename(columns = {0: 'GB'})
train.to_csv('Train_sample.csv')

print('Dimension:', '\ntrain x:', train_x.shape, '\ntrain y:', train_y.shape, '\nBalance Ratio:', Balance_Ratio(train))

In [None]:
###for multiole datasets###
dataset_0 = generate_set(panel, 'NoSMOTE', None, 0)

dataset_1 = generate_set(panel, 'ADASYN', None, 1)
dataset_2 = generate_set(panel, 'MSMOTE', None, 2)
dataset_3 = generate_set(panel, 'ROSE', None, 3)
dataset_4 = generate_set(panel, 'SMOTEN', None, 4)

dataset_5 = generate_set(panel, 'ADASYN', 'NM', 5)
dataset_6 = generate_set(panel, 'MSMOTE', 'NM', 6)
dataset_7 = generate_set(panel, 'ROSE', 'NM', 7)
dataset_8 = generate_set(panel, 'SMOTEN', 'NM', 8)

### Runhist data

In [7]:
runhist = pd.read_csv('train_runhist_new.csv', index_col = 'id').iloc[:, 1:]
run_kind = pd.read_csv('run_kind.csv').iloc[:, 2:-3]
# run_dic = pd.read_csv('run_dictionary.csv').iloc[:, 1:]

print('Dimension of Data:', runhist.shape, '\nBalance Ratio:', Balance_Ratio(runhist))
print('Kinds:', run_kind.shape)

Dimension of Data: (80518, 141) 
Balance Ratio: 1101.9863
Kinds: (63061, 141)


In [None]:
# ##### distance evaluated by hamming distance ##### (very slow)
g_runkind = run_kind[run_kind.GB == 0].iloc[:, :-1]
b_runkind = run_kind[run_kind.GB == 1].iloc[:, :-1]
dis_goodbad = Distance(g_runkind, b_runkind)
dis_bad = Distance(b_runkind, b_runkind)
all_goodbad = dis_goodbad.values.flatten()
all_bad = dis_bad.values.flatten()

fig, axs = plt.subplots(1, 2, figsize = (12, 4))
axs[0].hist(all_goodbad, bins = 40, color = 'red')
axs[0].set_title('Distance between Good & Bad Instances')
axs[0].set_xlabel('Hamming Distance')
axs[0].set_ylabel('Counts')
axs[1].hist(all_bad, bins = 40, color = 'blue')
axs[1].set_title('Distance between Bad Instances')
axs[1].set_xlabel('Hamming Distance')
axs[1].set_ylabel('Counts')

#### oversampling by self-defined method

In [8]:
##### oversampling first #####
OS_B = Border(run_kind, Near_Minor = 15)
self_runhist = pd.concat([runhist, OS_B], axis = 0).reset_index(drop = True)

dataset_2 = generate_set(self_runhist, None, 'NM', 2, over_ratio = None, under_ratio = 1, order = 'over')
print('Size of dataset2:', dataset_2.shape, ', Balance Ratio:', Balance_Ratio(dataset_2))
dataset_2.to_csv('dataset_2.csv')

##### undersampling first #####
self_under = generate_set(runhist, None, 'NM', 6, over_ratio = None, under_ratio = 0.09, order = 'over')

corner_overlap = Corner(self_under)
under_kind = Kind(corner_overlap).iloc[:, :-3]

US_B = Border(under_kind, Near_Minor = 15)
dataset_6 = pd.concat([self_under, US_B], axis = 0).reset_index(drop = True)
print('Size of dataset6:', dataset_6.shape, ', Balance Ratio:', Balance_Ratio(dataset_6))
dataset_6.to_csv('dataset_6.csv')

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  D_Matrix = D_Matrix.where(np.triu(np.ones(D_Matrix.shape)).astype(np.bool))


  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]


 Generating Dataset 2
Size after Undersampling: 1484
Size of dataset2: (1484, 141) , Balance Ratio: 1.0

 Generating Dataset 6
Size after Undersampling: 884


  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  D_Matrix = D_Matrix.where(np.triu(np.ones(D_Matrix.shape)).astype(np.bool))


  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Size of dataset6: (1555, 141) , Balance Ratio: 1.09005


#### oversampling & undersampling 

In [9]:
##### generate datasets #####
dataset_0 = generate_set(runhist, 'NoSMOTE', None, 0, over_ratio = None, under_ratio = None, order = 'over')

dataset_1 = generate_set(runhist, 'ADASYN', 'NM', 1, over_ratio = 0.01, under_ratio = 1, order = 'over')
#dataset_2 = generate_set(runhist, 'MSMOTE', 'NM', 2, over_ratio = 0.01, under_ratio = 1, order = 'over')
dataset_3 = generate_set(runhist, 'ROSE', 'NM', 3, over_ratio = 0.01, under_ratio = 1, order = 'over')
dataset_4 = generate_set(runhist, 'SMOTEN', 'NM', 4, over_ratio = 0.01, under_ratio = 1, order = 'over')

dataset_5 = generate_set(runhist, 'ADASYN', 'NM', 5, over_ratio = 1, under_ratio = 0.09, order = 'under')
#dataset_6 = generate_set(runhist, 'MSMOTE', 'NM', 6, over_ratio = 1, under_ratio = 0.09, order = 'under')
dataset_7 = generate_set(runhist, 'ROSE', 'NM', 7, over_ratio = 1, under_ratio = 0.09, order = 'under')
dataset_8 = generate_set(runhist, 'SMOTEN', 'NM', 8, over_ratio = 1, under_ratio = 0.09, order = 'under')

dataset_9 = generate_set(runhist, None, 'NM', 9, over_ratio = None, under_ratio = 0.1, order = 'over')

2021-09-13 15:49:34,566:INFO:NoSMOTE: Running sampling via ('NoSMOTE', '{}')



 Generating Dataset 0
Size before Oversampling: 80518
Size after Oversampling: 80518
Size after Undersampling: 80518

 Generating Dataset 1
Size before Oversampling: 80518
Size after Oversampling: 81262


2021-09-13 15:49:40,143:INFO:ROSE: Running sampling via ('ROSE', "{'proportion': 0.01, 'random_state': None}")


Size after Undersampling: 1634

 Generating Dataset 3
Size before Oversampling: 80518
Size after Oversampling: 81321
Size after Undersampling: 1752

 Generating Dataset 4
Size before Oversampling: 80518
Size after Oversampling: 81249
Size after Undersampling: 1608

 Generating Dataset 5
Size before Undersampling: 80518
Size after Undersampling: 884
Size before Oversampling: 884
Size after Oversampling: 1618

 Generating Dataset 7
Size before Undersampling: 80518


2021-09-13 15:49:53,973:INFO:ROSE: Running sampling via ('ROSE', "{'proportion': 1, 'random_state': None}")


Size after Undersampling: 884
Size before Oversampling: 884
Size after Oversampling: 1622

 Generating Dataset 8
Size before Undersampling: 80518
Size after Undersampling: 884
Size before Oversampling: 884
Size after Oversampling: 1622

 Generating Dataset 9
Size after Undersampling: 803


#### onde-sided selection 

In [9]:
##### onde-sided selection #####
dataset_0 = generate_set(runhist, 'NoSMOTE', None, 0, over_ratio = None, under_ratio = None, order = 'over')

dataset_1 = generate_set(runhist, 'ADASYN', 'one-sided', 1, over_ratio = 0.01, under_ratio = 1, order = 'over')
#dataset_2 = generate_set(runhist, 'MSMOTE', 'one-sided', 2, over_ratio = 0.01, under_ratio = 1, order = 'over')
dataset_3 = generate_set(runhist, 'ROSE', 'one-sided', 3, over_ratio = 0.01, under_ratio = 1, order = 'over')
dataset_4 = generate_set(runhist, 'SMOTEN', 'one-sided', 4, over_ratio = 0.01, under_ratio = 1, order = 'over')

dataset_5 = generate_set(runhist, 'ADASYN', 'one-sided', 5, over_ratio = 1, under_ratio = 0.05, order = 'under')
#dataset_6 = generate_set(runhist, 'MSMOTE', 'one-sided', 6, over_ratio = 1, under_ratio = 0.05, order = 'under')
dataset_7 = generate_set(runhist, 'ROSE', 'one-sided', 7, over_ratio = 1, under_ratio = 0.05, order = 'under')
dataset_8 = generate_set(runhist, 'SMOTEN', 'one-sided', 8, over_ratio = 1, under_ratio = 0.05, order = 'under')

dataset_9 = generate_set(runhist, None, 'one-sided', 9, over_ratio = None, under_ratio = 0.05, order = 'over')

2021-09-12 00:12:56,733:INFO:NoSMOTE: Running sampling via ('NoSMOTE', '{}')



 Generating Dataset 0
Size before Oversampling: 80518
Size after Oversampling: 80518
Size after Undersampling: 80518

 Generating Dataset 1
Size before Oversampling: 80518
Size after Oversampling: 81262


2021-09-12 00:13:02,546:INFO:ROSE: Running sampling via ('ROSE', "{'proportion': 0.01, 'random_state': None}")


Size after Undersampling: 1634

 Generating Dataset 3
Size before Oversampling: 80518
Size after Oversampling: 81321
Size after Undersampling: 1752

 Generating Dataset 4
Size before Oversampling: 80518
Size after Oversampling: 81249
Size after Undersampling: 1608

 Generating Dataset 5
Size before Undersampling: 80518
Size after Undersampling: 845
Size before Oversampling: 845
Size after Oversampling: 1551

 Generating Dataset 7
Size before Undersampling: 80518


2021-09-12 00:13:17,091:INFO:ROSE: Running sampling via ('ROSE', "{'proportion': 1, 'random_state': None}")


Size after Undersampling: 828
Size before Oversampling: 828
Size after Oversampling: 1510

 Generating Dataset 8
Size before Undersampling: 80518
Size after Undersampling: 836
Size before Oversampling: 836
Size after Oversampling: 1526

 Generating Dataset 9
Size after Undersampling: 831
