In [1]:
import os
import math
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

import smote_variants as sv
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, InstanceHardnessThreshold, NearMiss
from imblearn.over_sampling import ADASYN, SMOTEN

from library.Data_Preprocessing import Balance_Ratio, training_def
from library.Training_Data_Processing import Corner, Kind

os.chdir('C:/Users/user/Desktop/Darui_R08621110')
os.getcwd()

'C:\\Users\\user\\Desktop\\Darui_R08621110'

## 

In [2]:
def label_divide(train, test, label = 'GB', train_only = False):
    
    train_x = train.drop(columns = label)
    train_y = train[label]
    
    if not train_only:
        test_x = test.drop(columns = label)
        test_y = test[label]    
        return train_x, train_y, test_x, test_y
    else:
        return train_x, train_y

### self-defined oversampling (border)
first writen by chungcheng, and then modified 

In [3]:
## Output : 資料點之間的距離
def distance_matrix(data1, data2, triangle = False):
    
    data1 = np.array(data1.iloc[:, :-1])
    data2 = np.array(data2.iloc[:, :-1])
    dis_mat = pd.DataFrame((data1[:, None, :] != data2).sum(2))
    if triangle:
        dis_mat = dis_mat.where(np.triu(np.ones(dis_mat.shape)).astype(bool))
    
    return dis_mat


## 給定 df 和 value 找出其 row 和 col
def get_indexes(dis_mat, value):

    pos_list = []
    # Get bool dataframe with True at positions where the given value exists
    result = dis_mat.isin([value])
    # Get list of columns that contains the value
    col_target = result.any()
    colnames = list(col_target[col_target == True].index)
    # Iterate over list of columns and fetch the rows indexes where value exists
    for col in colnames:
        rows = list(result[col][result[col] == True].index)
        for row in rows:
            pos_list.append((row, col))
    
    return pos_list


#給定一個 資料集 和 值 後, 找出相對應的位置
def ID_given_distance(data1, data2, target_dis):
    
    dis_mat = distance_matrix(data1, data2)
    index_set = get_indexes(dis_mat, target_dis)
    
    return index_set


# 2個以上不同 的排列組合
def perm(diff_series, num_over, farest_generate = 3):
    
    random_list = []
    for i in range(num_over) :
        change_num = random.sample(range(1, farest_generate+1), 1)[0]
        diff_index = diff_series[diff_series == True].index.tolist()
        change_index = random.sample(diff_index, change_num)
        mask = diff_series.copy()
        for j in change_index:
            mask[j] = False
        random_list.append(mask.values.tolist())
            
    return random_list


def Border(data, max_distance, num_over):
    
    data1 = data[data.GB == 1]
    data2 = data[data.GB == 0]
    full_data = data.iloc[:, :-1].copy()
    training_df = pd.DataFrame()
    for k in range(3, max_distance+1):

        bad_indexes = ID_given_distance(data1, data1, k)
        smote_df = pd.DataFrame()
        if len(bad_indexes) != 0:
            for l in range(len(bad_indexes)):

                ##選定一組數字,一個當中心點
                point_0 = bad_indexes[l][0]
                point_1 = bad_indexes[l][1]

                # d_smote 的初始值和中心點一樣
                d_smote = full_data.loc[point_0].copy()

                # 找出有差異處的 cols
                points_diff = (full_data.loc[point_0] != full_data.loc[point_1])
                diff_mask = points_diff.values
                diff_cols = points_diff.index

                ## L為距離 i 的點之間的排列組合
                perm_set = perm(points_diff, num_over) 
                
                '''創造DATA_SMOTE(給定中心點:d_smote 和 L)'''
                s_df = pd.DataFrame()
                for i in range(len(perm_set)):

                    s = pd.DataFrame([d_smote]).copy()
                    
                    for j in range(len(perm_set[i])):
                        
                        s[diff_cols[j]] = (1 - d_smote[diff_cols[j]]) if perm_set[i][j] else d_smote[diff_cols[j]]                     
                    s_df = pd.concat([s_df, s], axis = 0)
                smote_df = pd.concat([smote_df, s_df], axis = 0) # new added
                smote_df['GB'] = 1 
        training_df = pd.concat([training_df, smote_df], axis = 0)  
    training_df = training_df.drop_duplicates().reset_index(drop = True)
    
    return training_df

### oversampling

In [4]:
def before_over(dataset, label = 'GB'):
    
    colnames = dataset.columns
    Y = dataset[label]
    Y = Y.reset_index(drop = True)
    Y = np.array(Y)
    X = dataset.drop(columns = [label])
    X = X.reset_index(drop = True)
    X = X.to_numpy()
    
    return X, Y, colnames


def after_over(X, Y, colnames, back_to_category = False):
    
    colnames = colnames[:X.shape[1]]
    X = pd.DataFrame(X, columns = colnames)
    
    if back_to_category:
        for j in tqdm(range(X.shape[1])):
            colvalue = X.iloc[:, j]
            upper = np.array(colvalue[colvalue < 1])
            lower = np.array(upper[upper > 0])
            colmean = np.mean(lower)
            
            mask = colvalue >= colmean
            X.iloc[mask, j] = 1
            X.iloc[~mask, j] = 0
    
    Y = pd.Series(Y)
    
    return X, Y


def over_sample(X, Y, method, ratio, n_neighbors = 5, *args):
    
    method_list = ['NoSMOTE', 'SMOTE', 'MSMOTE', 'ROSE', 'SMOTEN', 'ADASYN']
    if method not in method_list:
        raise Exception('Invalid method !')
    
    if method == method_list[0]:
        over_sampler = sv.NoSMOTE()
    elif method == method_list[1]:
        over_sampler = sv.SMOTE(ratio, n_neighbors)
    elif method == method_list[2]:
        over_sampler = sv.MSMOTE(ratio, n_neighbors)
    elif method == method_list[3]:
        over_sampler = sv.ROSE(ratio)   
    elif method == method_list[4]:
        over_sampler = SMOTEN(sampling_strategy = ratio, k_neighbors = n_neighbors)
    elif method == method_list[5]:
        over_sampler = ADASYN(sampling_strategy = ratio, n_neighbors = n_neighbors)    
    
    if method in method_list[0:4]:
        over_X, over_Y = over_sampler.sample(X, Y)
    else:
        over_X, over_Y = over_sampler.fit_resample(X, Y)
    
    return over_X, over_Y

### undersampling

In [5]:
def before_under(dataset, label = 'GB'):
    
    Y = dataset[label]
    X = dataset.drop(columns = [label])
    
    return X, Y


def under_sample(X, Y, method, ratio, *args):
    
    method_list = [None, 'random', 'Tomek', 'IHT', 'NM', 'one-sided', 'r-one-sided']
    if method not in method_list:
        raise Exception('Invalid method !')
    
    if method == method_list[0]:
        return X, Y
        
    elif method == method_list[1]:
        under_sampler = RandomUnderSampler(sampling_strategy = ratio)    
    elif method == method_list[2]:
        under_sampler = TomekLinks(sampling_strategy = 'majority')
    elif method == method_list[3]:
        under_sampler = InstanceHardnessThreshold(sampling_strategy = ratio, cv = 5, n_jobs = -1)
    elif method in (method_list[4] + method_list[5]):
        under_sampler = NearMiss(sampling_strategy = ratio, version = 2, n_jobs = -1)
    elif method == method_list[6]:
        under_sampler = InstanceHardnessThreshold(sampling_strategy = 1, cv = 5, n_jobs = -1)
    
    under_X, under_Y = under_sampler.fit_resample(X, Y)
    
    if method == method_list[5]:
        second_sampler = InstanceHardnessThreshold(sampling_strategy = 1, cv = 5, n_jobs = -1)
        under_X, under_Y = second_sampler.fit_resample(under_X, under_Y)
    elif method == method_list[6]:
        second_sampler = NearMiss(sampling_strategy = ratio, version = 2, n_jobs = -1)
        under_X, under_Y = second_sampler.fit_resample(under_X, under_Y)
    
    return under_X, under_Y

### protocol to generate datasets

In [6]:
def under_over(dataset, over_method, under_method, over_ratio, under_ratio, label = 'GB'):
    
    #undersampling
    if under_method != None:
        X, Y = before_under(dataset, label)
        Y = Y.astype(int)
        print('Size before Undersampling:', len(Y))
        under_X, under_Y = under_sample(X, Y, under_method, under_ratio)
        dataset = pd.concat([under_X, under_Y], axis = 1)
        print('Size after Undersampling:', len(under_Y))
    
    #oversampling
    temp_X, temp_Y, colnames = before_over(dataset, label)
    print('Size before Oversampling:', len(temp_Y))
    over_X, over_Y = over_sample(temp_X, temp_Y, over_method, over_ratio)
    X, Y = after_over(over_X, over_Y, colnames)
    print('Size after Oversampling:', len(Y))
        
    return X, Y


def over_under(dataset, over_method, under_method, over_ratio, under_ratio, label = 'GB') :
    
    #oversampling
    if over_method != None :
        X, Y, colnames = before_over(dataset, label)
        print('Size before Oversampling:', len(Y))
        temp_X, temp_Y = over_sample(X, Y, over_method, over_ratio)
        over_X, over_Y = after_over(temp_X, temp_Y, colnames)
        print('Size after Oversampling:', len(over_Y))
        over_dataset = pd.concat([over_X, over_Y], axis = 1)
        dataset = over_dataset.rename(columns = {0 : label})

    #undersampling
    X, Y = before_under(dataset, label)
    Y = Y.astype(int)
    under_X, under_Y = under_sample(X, Y, under_method, under_ratio)
    print('Size after Undersampling:', len(under_Y))
    
    return under_X, under_Y
    
    
def generate_set(train_data, over_method, under_method, index, over_ratio, under_ratio, order, label = 'GB'):
    
    print('\n', f'Generating Dataset {index}')
    
    if order == 'under' :
        train_x, train_y = under_over(train_data, over_method, under_method, over_ratio, under_ratio, label)
    elif order == 'over' :
        train_x, train_y = over_under(train_data, over_method, under_method, over_ratio, under_ratio, label)
        
    train = pd.concat([train_x, train_y], axis = 1)
    train = train.rename(columns = {0: label})
    
    return train


def border_set(train_data, kind_data, under_method, index, num_over, under_ratio, order, distance_init = 12):
    
    ##### oversampling first #####
    if order == 'over':
        print('Size before Border:', len(train_data))
        redo = True
        distance = distance_init
        while redo:
            print(f'Border distance = {distance}...')
            OS_B = Border(kind_data, distance, num_over)
            self_runhist = pd.concat([train_data, OS_B], axis = 0).reset_index(drop = True)
            if len(OS_B) < sum(train_data.GB)*num_over*0.9:
                distance += 1
            else:
                redo = False
        print('Size after Border:', len(self_runhist))
        
        dataset = generate_set(self_runhist, None, under_method, index, over_ratio = None, under_ratio = under_ratio, 
                               order = 'over')
        print(f'Size after Undersampling:', dataset.shape, ', Balance Ratio:', Balance_Ratio(dataset), \
              ', distance:', distance)
        
        return dataset
    
    ##### undersampling first #####
    elif order == 'under':
        print('Size before Undersampling:', len(train_data))
        self_under = generate_set(train_data, None, under_method, index, over_ratio = None, under_ratio = under_ratio, 
                                  order = 'over')
        print('Size after Undersampling:', len(self_under))
        
        corner_overlap = Corner(self_under)
        under_kind = Kind(corner_overlap).iloc[:, :-3]
        
        redo = True
        distance = distance_init
        while redo:
            print(f'Border distance = {distance}...')
            US_B = Border(under_kind, distance, num_over)
            dataset = pd.concat([self_under, US_B], axis = 0).reset_index(drop = True)
            if len(US_B) < sum(train_data.GB)*num_over*0.9:
                distance += 1
            else:
                redo = False
        print('Size after Border:', dataset.shape, ', Balance Ratio:', Balance_Ratio(dataset), ', distance:', distance)
        
        return dataset

## 

### loading training data & kind

In [7]:
##### training data #####
training_month = range(2, 5)

runhist = {}
for i in training_month:
    runhist[f'm{i}'] = pd.read_csv(f'relabel_runhist_m{i}.csv', index_col = 'id').iloc[:, 1:]
    print(f'Month {i}:')
    print(f'Dimension:', runhist[f'm{i}'].shape, ', # Bad:', sum(runhist[f'm{i}'].GB))
runhist['all'] = training_def(runhist, training_month)
print('Dimension of all runhist:', runhist['all'].shape, ', # Bad:', sum(runhist['all'].GB), '\n')

##### kind data (for border) #####
kinds = {}
for i in training_month:
    kinds[f'm{i}'] = pd.read_csv(f'kind_m{i}.csv').iloc[:, 2:-3]
    print(f'Month {i}:')
    print(f'# kinds:', len(kinds[f'm{i}']))

Month 2:
Dimension: (39009, 88) , # Bad: 69
Month 3:
Dimension: (60396, 97) , # Bad: 113
Month 4:
Dimension: (57743, 100) , # Bad: 122
Dimension of all runhist: (157148, 128) , # Bad: 304 

Month 2:
# kinds: 23088
Month 3:
# kinds: 33754
Month 4:
# kinds: 32861


### oversampling & undersampling

In [8]:
##### generate datasets #####
dataset = {}
combine_dataset = {}
for i in range(10):
    combine_dataset[i] = pd.DataFrame()

for i in tqdm(training_month):
    
    print(f'Month {i}:')
    print('# bad:', sum(runhist[f'm{i}'].GB))
    br = Balance_Ratio(runhist[f'm{i}'])
    final_br = 1
    num_os = 10
    over_br = num_os / br
    under_br = final_br / num_os
    
    
    dataset[2] = border_set(runhist[f'm{i}'], kinds[f'm{i}'], 'NM', 2, num_over = num_os, under_ratio = final_br, 
                            order = 'over')
    dataset[6] = border_set(runhist[f'm{i}'], kinds[f'm{i}'], 'NM', 6, num_over = num_os, under_ratio = under_br, 
                            order = 'under')
    
    dataset[0] = generate_set(runhist[f'm{i}'], 'NoSMOTE', None, 0, over_ratio = None, under_ratio = None, order = 'over')

    dataset[1] = generate_set(runhist[f'm{i}'], 'ADASYN', 'NM', 1, over_ratio = over_br, under_ratio = final_br, 
                              order = 'over')
    dataset[3] = generate_set(runhist[f'm{i}'], 'ROSE', 'NM', 3, over_ratio = over_br, under_ratio = final_br,
                              order = 'over')
    dataset[4] = generate_set(runhist[f'm{i}'], 'SMOTEN', 'NM', 4, over_ratio = over_br, under_ratio = final_br, 
                              order = 'over')

    dataset[5] = generate_set(runhist[f'm{i}'], 'ADASYN', 'NM', 5, over_ratio = final_br, under_ratio = under_br, 
                              order = 'under')
    dataset[7] = generate_set(runhist[f'm{i}'], 'ROSE', 'NM', 7, over_ratio = final_br, under_ratio = under_br, 
                              order = 'under')
    dataset[8] = generate_set(runhist[f'm{i}'], 'SMOTEN', 'NM', 8, over_ratio = final_br, under_ratio = under_br, 
                              order = 'under')

    dataset[9] = generate_set(runhist[f'm{i}'], None, 'NM', 9, over_ratio = None, under_ratio = 0.1, order = 'over')
    
    ### combine all training data after sampling by each month ###
    for j in range(10):
        temp_combine = pd.concat([combine_dataset[j], dataset[j]], axis = 0).fillna(0)
        temp_cols = temp_combine.columns.to_list()
        GB_pos = temp_cols.index('GB')
        fine_cols = temp_cols[: GB_pos] + temp_cols[GB_pos+1: ] + temp_cols[GB_pos: GB_pos+1]
        combine_dataset[j] = temp_combine[fine_cols]
        
        dataset[j].to_csv(f'm{i}_dataset_{j}.csv')
        combine_dataset[j].to_csv(f'dataset_{j}.csv')

  0%|          | 0/3 [00:00<?, ?it/s]

Month 2:
# bad: 69
Size before Border: 39009
Border distance = 12...
Border distance = 13...
Border distance = 14...
Border distance = 15...
Size after Border: 39904

 Generating Dataset 2
Size after Undersampling: 1928
Size after Undersampling: (1928, 88) , Balance Ratio: 1.0 , distance: 15
Size before Undersampling: 39009

 Generating Dataset 6
Size after Undersampling: 759
Size after Undersampling: 759
Border distance = 12...
Border distance = 13...
Border distance = 14...
Border distance = 15...


2021-12-14 14:33:22,137:INFO:NoSMOTE: Running sampling via ('NoSMOTE', '{}')


Size after Border: (1659, 88) , Balance Ratio: 0.71 , distance: 15

 Generating Dataset 0
Size before Oversampling: 39009
Size after Oversampling: 39009
Size after Undersampling: 39009

 Generating Dataset 1
Size before Oversampling: 39009
Size after Oversampling: 39642


2021-12-14 14:33:23,731:INFO:ROSE: Running sampling via ('ROSE', "{'proportion': 0.017719500310091254, 'random_state': None}")


Size after Undersampling: 1404

 Generating Dataset 3
Size before Oversampling: 39009
Size after Oversampling: 39697
Size after Undersampling: 1514

 Generating Dataset 4
Size before Oversampling: 39009
Size after Oversampling: 39629
Size after Undersampling: 1378

 Generating Dataset 5
Size before Undersampling: 39009
Size after Undersampling: 759
Size before Oversampling: 759
Size after Oversampling: 1370

 Generating Dataset 7
Size before Undersampling: 39009


2021-12-14 14:33:28,934:INFO:ROSE: Running sampling via ('ROSE', "{'proportion': 1, 'random_state': None}")


Size after Undersampling: 759
Size before Oversampling: 759
Size after Oversampling: 1380

 Generating Dataset 8
Size before Undersampling: 39009
Size after Undersampling: 759
Size before Oversampling: 759
Size after Oversampling: 1380

 Generating Dataset 9
Size after Undersampling: 759
Month 3:
# bad: 113
Size before Border: 60396
Border distance = 12...
Border distance = 13...
Border distance = 14...
Border distance = 15...
Border distance = 16...
Size after Border: 61849

 Generating Dataset 2
Size after Undersampling: 3132
Size after Undersampling: (3132, 97) , Balance Ratio: 1.0 , distance: 16
Size before Undersampling: 60396

 Generating Dataset 6
Size after Undersampling: 1243
Size after Undersampling: 1243
Border distance = 12...
Border distance = 13...
Border distance = 14...
Border distance = 15...
Border distance = 16...


2021-12-14 14:34:16,747:INFO:NoSMOTE: Running sampling via ('NoSMOTE', '{}')


Size after Border: (2693, 97) , Balance Ratio: 0.72 , distance: 16

 Generating Dataset 0
Size before Oversampling: 60396
Size after Oversampling: 60396
Size after Undersampling: 60396

 Generating Dataset 1
Size before Oversampling: 60396
Size after Oversampling: 61435


2021-12-14 14:34:21,044:INFO:ROSE: Running sampling via ('ROSE', "{'proportion': 0.018744845167578916, 'random_state': None}")


Size after Undersampling: 2304

 Generating Dataset 3
Size before Oversampling: 60396
Size after Oversampling: 61523
Size after Undersampling: 2480

 Generating Dataset 4
Size before Oversampling: 60396
Size after Oversampling: 61412
Size after Undersampling: 2258

 Generating Dataset 5
Size before Undersampling: 60396
Size after Undersampling: 1243
Size before Oversampling: 1243
Size after Oversampling: 2271

 Generating Dataset 7
Size before Undersampling: 60396


2021-12-14 14:34:33,814:INFO:ROSE: Running sampling via ('ROSE', "{'proportion': 1, 'random_state': None}")


Size after Undersampling: 1243
Size before Oversampling: 1243
Size after Oversampling: 2260

 Generating Dataset 8
Size before Undersampling: 60396
Size after Undersampling: 1243
Size before Oversampling: 1243
Size after Oversampling: 2260

 Generating Dataset 9
Size after Undersampling: 1243
Month 4:
# bad: 122
Size before Border: 57743
Border distance = 12...
Border distance = 13...
Border distance = 14...
Border distance = 15...
Size after Border: 59307

 Generating Dataset 2
Size after Undersampling: 3372
Size after Undersampling: (3372, 100) , Balance Ratio: 1.0 , distance: 15
Size before Undersampling: 57743

 Generating Dataset 6
Size after Undersampling: 1342
Size after Undersampling: 1342
Border distance = 12...
Border distance = 13...
Border distance = 14...
Border distance = 15...


2021-12-14 14:35:28,435:INFO:NoSMOTE: Running sampling via ('NoSMOTE', '{}')


Size after Border: (2894, 100) , Balance Ratio: 0.73 , distance: 15

 Generating Dataset 0
Size before Oversampling: 57743
Size after Oversampling: 57743
Size after Undersampling: 57743

 Generating Dataset 1
Size before Oversampling: 57743
Size after Oversampling: 58834


2021-12-14 14:35:32,859:INFO:ROSE: Running sampling via ('ROSE', "{'proportion': 0.021172983273343212, 'random_state': None}")


Size after Undersampling: 2426

 Generating Dataset 3
Size before Oversampling: 57743
Size after Oversampling: 58960
Size after Undersampling: 2678

 Generating Dataset 4
Size before Oversampling: 57743
Size after Oversampling: 58841
Size after Undersampling: 2440

 Generating Dataset 5
Size before Undersampling: 57743
Size after Undersampling: 1342
Size before Oversampling: 1342
Size after Oversampling: 2419

 Generating Dataset 7
Size before Undersampling: 57743


2021-12-14 14:35:46,294:INFO:ROSE: Running sampling via ('ROSE', "{'proportion': 1, 'random_state': None}")


Size after Undersampling: 1342
Size before Oversampling: 1342
Size after Oversampling: 2440

 Generating Dataset 8
Size before Undersampling: 57743
Size after Undersampling: 1342
Size before Oversampling: 1342
Size after Oversampling: 2440

 Generating Dataset 9
Size after Undersampling: 1342
