In [1]:
import os
import numpy as np
import pandas as pd
import pickle
import random
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

from lightgbm import LGBMClassifier
import optuna
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from library.Data_Preprocessing import Balance_Ratio
from library.Imbalance_Sampling import label_divide, under_over, over_under
from library.Aging_Score_Contour import score1
from library.AdaBoost import train_set, multiple_set, multiple_month, line_chart, cf_matrix, all_optuna, optuna_history

# Project root used for every relative path below (data files, exports).
# Defaults to the original author's folder; set the BC_PROJECT_DIR environment
# variable to run the notebook on another machine without editing this cell.
PROJECT_DIR = os.environ.get('BC_PROJECT_DIR', 'C:/Users/user/Desktop/Darui_R08621110')
os.chdir(PROJECT_DIR)
os.getcwd()

'C:\\Users\\user\\Desktop\\Darui_R08621110'

## 

### Balance Cascade

In [16]:
# train & test of balance cascade (scheme 1 & 2)
class BalanceCascade:
    """Cascade of binary classifiers trained on progressively fewer 'good' instances.

    The label column is hard-coded as 'GB' (good: GB == 0, bad: GB == 1). At each
    iteration a base learner is fitted on a resampled training set, every remaining
    good instance is scored, and only the highest-scored fraction of good instances
    is carried to the next iteration. At prediction time an instance is labelled 1
    only if every classifier of the cascade scores it above its own threshold.

    Parameters
    ----------
    base_clf : {'LightGBM', 'RandomForest'}
        Base learner type.
    num_iter : int
        Number of cascade iterations (must be >= 2 to train).
    over_method : {'ADASYN', 'SMOTEN', None}
        Oversampling method; None disables oversampling (scheme 1).
    under_method : {'NM', 'random'}
        Undersampling method forwarded to the project helper ``over_under``.
    over_num : int
        Oversampling multiple of the bad class; forced to 1 when over_method is None.
    verbose : bool
        Print progress messages during training.

    Raises
    ------
    ValueError
        If base_clf, over_method or under_method is not one of the supported values.
    """

    def __init__(self, base_clf = 'LightGBM', num_iter = 10, over_method = None, under_method = 'NM', 
                 over_num = 5, verbose = True):
        # ValueError is a subclass of Exception, so callers catching Exception still work.
        if over_method not in ['ADASYN', 'SMOTEN', None]:
            raise ValueError(f'{over_method} is not implemented !')
        if under_method not in ['NM', 'random']:
            raise ValueError(f'{under_method} is not implemented !')
        if base_clf not in ['LightGBM', 'RandomForest']:
            raise ValueError(f'{base_clf} is not implemented !')
        self.classifier = base_clf
        self.num_iter = num_iter
        self.over_method = over_method
        self.under_method = under_method
        # without oversampling the bad class keeps its original size
        self.over_num = over_num if over_method else 1
        self.verbose = verbose

    def training(self, train_data, clf_config):
        """Fit the cascade.

        Parameters
        ----------
        train_data : pandas.DataFrame
            Features plus the 'GB' label column.
        clf_config : dict
            Keyword arguments passed to the base learner constructor.

        Sets ``good_data``, ``bad_data``, ``threshold``, ``cascade`` and ``balance_ratio``.

        Raises
        ------
        ValueError
            If num_iter < 2 or one of the two classes is missing (both cases
            previously surfaced as a bare ZeroDivisionError).
        """
        if self.num_iter < 2:
            raise ValueError('num_iter must be at least 2 to train a cascade')
        origin_good = train_data[train_data.GB == 0]
        origin_bad = train_data[train_data.GB == 1]
        if len(origin_good) == 0 or len(origin_bad) == 0:
            raise ValueError('training data must contain both good (GB == 0) and bad (GB == 1) instances')

        # fraction of good instances kept per iteration, chosen so that after
        # (num_iter - 1) reductions the good count is about over_num * (bad count)
        br_0 = (len(origin_bad)*self.over_num) / len(origin_good)
        false_rate = br_0**(1/(self.num_iter - 1))

        keep_bad = origin_bad.copy()
        keep_good = {0: origin_good.copy()}
        br_list = []
        clf_threshold = []
        clf_cascade = {}
        for j in range(self.num_iter):
            temp_train = pd.concat([keep_good[j], keep_bad], axis = 0)
            if self.verbose:
                print(f'Iteration {j+1}:')
                print('Size before undersampling:', len(temp_train))
            br_list.append(len(keep_bad) / len(keep_good[j]))

            # undersampling first (skipped in the last iteration)
            if j < (self.num_iter - 1):
                under_ratio = 1/self.over_num
                under_X, under_Y = over_under(temp_train, None, self.under_method, 0, under_ratio)
                temp_combine = pd.concat([under_X, under_Y], axis = 1)
            else:
                temp_combine = temp_train.copy()
                if self.verbose:
                    print('Stop Undersampling !')
            under_good = temp_combine[temp_combine.GB == 0]

            # oversampling, only while the good class is still larger than the bad class
            if self.over_method and len(under_good) > len(keep_bad):
                over_ratio = 1
                over_X, over_Y = under_over(temp_combine, self.over_method, None, over_ratio, 0)
                over_sample = pd.concat([over_X, over_Y], axis = 1)
                # NOTE(review): the label column apparently comes back named 0 -- confirm in under_over
                train_combine = over_sample.rename({0: 'GB'}, axis = 'columns')
            else:
                train_combine = temp_combine.copy()
                if self.verbose:
                    print('Stop Oversampling !')

            # train the base learner, find the threshold, and discard the redundant good instances
            valid_good = keep_good[j].copy()
            train_x, train_y, valid_x, valid_y = label_divide(train_combine, valid_good, 'GB', train_only = False)
            if self.classifier == 'LightGBM':
                clf = LGBMClassifier(**clf_config)
            elif self.classifier == 'RandomForest':
                clf = RandomForestClassifier(**clf_config)
            clf.fit(train_x, train_y)
            predict = clf.predict_proba(valid_x)[:, 1]
            predict_df = pd.DataFrame(dict(predict = predict), index = valid_x.index)
            predict_df = predict_df.sort_values(by = 'predict', ascending = False)
            keep_num = int(len(predict_df)*false_rate) + 1
            keep_index = predict_df.index[:keep_num]
            # lowest score among the kept instances; positional lookup stays a
            # scalar even if index labels happen to be duplicated
            threshold = predict_df['predict'].iloc[len(keep_index) - 1]
            clf_threshold.append(threshold)
            clf_cascade[j] = clf

            if j != (self.num_iter - 1):
                keep_good[j+1] = keep_good[j].loc[keep_index].copy()
        self.good_data = keep_good
        self.bad_data = keep_bad
        self.threshold = clf_threshold
        self.cascade = clf_cascade
        self.balance_ratio = br_list

    def testing(self, test_data):
        """Predict with the trained cascade.

        Parameters
        ----------
        test_data : pandas.DataFrame
            Features plus the 'GB' label column.

        Returns
        -------
        pandas.DataFrame
            Columns 'predict' (1 only if all classifiers exceed their threshold)
            and 'truth', indexed like ``test_data``.
        """
        clf_cascade = self.cascade
        # a single scalar threshold (int or float) is broadcast to every classifier
        if np.isscalar(self.threshold):
            clf_threshold = [self.threshold]*len(clf_cascade)
        else:
            clf_threshold = self.threshold

        test_x, test_y = label_divide(test_data, None, 'GB', train_only = True)
        votes = {}
        for i in range(len(clf_cascade)):
            score = clf_cascade[i].predict_proba(test_x)[:, 1]
            votes[str(i)] = (score > clf_threshold[i]).astype(int)
        # keep the test index so predictions stay aligned with test_y when the
        # index is not 0..n-1 (e.g. a shuffled validation split)
        predict_df = pd.DataFrame(votes, index = test_x.index)
        predict_y = (predict_df.sum(axis = 1) == len(clf_cascade)).astype(int)
        result = pd.DataFrame(dict(predict = predict_y, truth = test_y))

        return result


# run all resampling datasets
def runall_cascade(train_set, test_data, base_config, base_clf = 'LightGBM', num_iter = 10, meta_config = None):
    """Train and evaluate one BalanceCascade per resampled dataset.

    Datasets 'set1' ... 'set{len(train_set) - 1}' are processed; 'set0' is skipped.

    Parameters
    ----------
    train_set : dict
        Maps 'set{i}' to a training DataFrame containing the 'GB' label.
    test_data : pandas.DataFrame
        Test data passed to ``BalanceCascade.testing``.
    base_config : dict
        Maps 'set{i}' to the base-learner keyword arguments.
    base_clf : str
        Base learner type forwarded to BalanceCascade.
    num_iter : int
        Number of cascade iterations; used whenever the dataset's meta_config
        entry does not define 'num_iter' itself.
    meta_config : dict or None
        Optional map 'set{i}' -> BalanceCascade keyword arguments.

    Returns
    -------
    pandas.DataFrame
        One ``cf_matrix`` table per dataset, stacked, with index 0 renamed to 'dataset {i}'.
    """
    tables = []
    for i in range(1, len(train_set)):
        set_name = f'set{i}'
        print('\n', f'Dataset {i}:')
        if isinstance(meta_config, dict):
            # per-dataset settings win; num_iter is only a fallback
            meta_param = {'num_iter': num_iter, **meta_config[set_name]}
        else:
            meta_param = {'num_iter': num_iter}
        BC = BalanceCascade(base_clf = base_clf, **meta_param)
        BC.training(train_set[set_name], base_config[set_name])
        result = BC.testing(test_data)
        table = cf_matrix(result, train_set[set_name].GB)
        tables.append(table.rename(index = {0: f'dataset {i}'}))

    # concatenate once instead of growing a DataFrame inside the loop
    return pd.concat(tables) if tables else pd.DataFrame()

### Optuna

In [3]:
# creator of optuna study for balance cascade
def BalanceCascade_creator(train_data, mode, num_valid = 3, label = 'GB') :
    """Build the Optuna objective for tuning a BalanceCascade.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training data containing the label column.
    mode : {1, 2}
        1 = undersampling only (scheme 1); 2 = oversampling + undersampling (scheme 2).
    num_valid : int
        Number of random 75/25 train/validation splits averaged per trial.
    label : str
        Label column used for the per-class split. NOTE(review): BalanceCascade
        itself reads the column 'GB', so other names only work if it is changed too.

    Returns
    -------
    callable
        ``objective(trial)`` returning the mean F1 score over the validation splits.

    Raises
    ------
    ValueError
        If mode is neither 1 nor 2 (previously an UnboundLocalError inside the trial).
    """
    if mode not in (1, 2):
        raise ValueError(f'mode must be 1 or 2, got {mode!r}')

    def objective(trial) :

        base_param = {
            'n_estimators': trial.suggest_categorical('n_estimators', [100, 300, 500, 1000]),
            'learning_rate': trial.suggest_float('learning_rate', 0.025, 0.325, step = 0.05),
            'max_depth': trial.suggest_int('max_depth', 3, 12, step = 3),
            'num_leaves': trial.suggest_int('num_leaves', 10, 130, step = 20),
            'min_child_samples': trial.suggest_categorical('min_child_samples', [10, 50, 100, 500, 1000, 5000]),
            'min_split_gain': trial.suggest_int('min_split_gain', 0, 12, step = 2),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.9, step = 0.2),
            'subsample': trial.suggest_float('subsample', 0.3, 0.9, step = 0.2),
            # suggest_loguniform is deprecated; suggest_float(log=True) is its replacement
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-2, 10, log = True), # alpha
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-2, 10, log = True) # lambda
        }

        # the ranges below are degenerate on purpose: these values are fixed but
        # still recorded among the trial parameters
        meta_param = {
            'num_iter': trial.suggest_int('num_iter', 10, 10, step = 5),
            'over_num': trial.suggest_int('over_num', 5, 5, step = 5),
            'over_method': trial.suggest_categorical('over_method', ['ADASYN']),
            'under_method': trial.suggest_categorical('under_method', ['NM'])
        }

        # the per-class partition does not change between validation rounds
        train_good = train_data[train_data[label] == 0]
        train_bad = train_data[train_data[label] == 1]
        train_good_x, train_good_y = label_divide(train_good, None, label, train_only = True)
        train_bad_x, train_bad_y = label_divide(train_bad, None, label, train_only = True)

        beta = 1
        result_list = []
        for _ in range(num_valid):

            # split each class separately so both sides keep the class ratio
            train_g_x, valid_g_x, train_g_y, valid_g_y = train_test_split(train_good_x, train_good_y, test_size = 0.25)
            train_b_x, valid_b_x, train_b_y, valid_b_y = train_test_split(train_bad_x, train_bad_y, test_size = 0.25)
            train_x = pd.concat([train_g_x, train_b_x], axis = 0)
            train_y = pd.concat([train_g_y, train_b_y], axis = 0)
            valid_x = pd.concat([valid_g_x, valid_b_x], axis = 0)
            valid_y = pd.concat([valid_g_y, valid_b_y], axis = 0)
            all_train = pd.concat([train_x, train_y], axis = 1)
            all_valid = pd.concat([valid_x, valid_y], axis = 1)

            if mode == 1:
                BC = BalanceCascade(num_iter = meta_param['num_iter'], under_method = meta_param['under_method'])
            else:
                BC = BalanceCascade(num_iter = meta_param['num_iter'], over_method = meta_param['over_method'], 
                                    under_method = meta_param['under_method'], over_num = meta_param['over_num'])
            BC.training(all_train, base_param)
            result = BC.testing(all_valid)
            table = cf_matrix(result, valid_y)
            # plain floats: mixing 1-element arrays with the scalar 0 below makes
            # np.mean fail on recent NumPy versions
            recall = float(table['Recall'].iloc[0])
            precision = float(table['Precision'].iloc[0])
            if recall > 0:
                fscore = ((1+beta**2)*recall*precision) / (recall+(beta**2)*precision) 
            else:
                fscore = 0.0
            result_list.append(fscore)

        return float(np.mean(result_list))
    return objective

## 

### Load Data

In [4]:
### training data ###
# Months 2, 3 and 4 are used for training (the end of range() is exclusive).
training_month = range(2, 5)

# Load the datasets of every training month through the project helper.
data_dict, trainset_x, trainset_y = multiple_month(training_month, num_set = 10, filename = 'dataset')

print('\nCombined training data:\n')
# NOTE(review): multiple_set receives no data argument, so it presumably relies on
# what multiple_month produced -- keep this call order; confirm in library.AdaBoost.
run_train = multiple_set(num_set = 10)
run_train_x, run_train_y = train_set(run_train, num_set = 10)

### testing data ###
# The first two columns of the test file are dropped.
# NOTE(review): presumably id / index columns -- confirm against the CSV.
run_test = pd.read_csv('test_runhist.csv').iloc[:, 2:]
run_test_x, run_test_y = label_divide(run_test, None, 'GB', train_only = True)
print('\n', 'Dimension of testing data:', run_test.shape)


Month 2:

Dimension of dataset 0 : (39009, 88)  balance ratio: 564.35
Dimension of dataset 1 : (8970, 88)  balance ratio: 25.0
Dimension of dataset 2 : (8892, 88)  balance ratio: 25.0
Dimension of dataset 3 : (8944, 88)  balance ratio: 25.0
Dimension of dataset 4 : (8944, 88)  balance ratio: 25.0
Dimension of dataset 5 : (8970, 88)  balance ratio: 25.0
Dimension of dataset 6 : (8969, 88)  balance ratio: 25.07
Dimension of dataset 7 : (8967, 88)  balance ratio: 25.22
Dimension of dataset 8 : (8970, 88)  balance ratio: 25.0
Dimension of dataset 9 : (1794, 88)  balance ratio: 25.0

 10 datasets are loaded.

Labels of  10 datasets are divided.

Month 3:

Dimension of dataset 0 : (60396, 97)  balance ratio: 533.48
Dimension of dataset 1 : (14924, 97)  balance ratio: 25.0
Dimension of dataset 2 : (14560, 97)  balance ratio: 25.0
Dimension of dataset 3 : (14664, 97)  balance ratio: 25.0
Dimension of dataset 4 : (14664, 97)  balance ratio: 25.0
Dimension of dataset 5 : (14674, 97)  balance ra

### Search for the Best Hyperparameters

In [None]:
# Tune every resampled dataset with Optuna (mode 2: oversampling + undersampling).
best_paramC, all_scoreC = all_optuna(
    all_data = run_train,
    mode = 2,
    TPE_multi = False,
    n_iter = 25,
    filename = 'runhist_array_m2m4_m5_3criteria_iter10_over5_nlast_BalanceCascade',
    creator = BalanceCascade_creator,
)

# Separate the tuned values into base-learner parameters and cascade (meta)
# parameters, one dict per dataset; 'set0' is skipped.
meta_item = ['num_iter', 'over_num', 'over_method', 'under_method']
base_paramC, meta_paramC = {}, {}
for i in range(1, len(run_train)):
    tuned = best_paramC[f'set{i}']
    base_paramC[f'set{i}'] = {name: value for name, value in tuned.items() if name not in meta_item}
    meta_paramC[f'set{i}'] = {name: value for name, value in tuned.items() if name in meta_item}

  0%|          | 0/9 [00:00<?, ?it/s]

[32m[I 2022-03-07 23:08:13,450][0m A new study created in memory with name: no-name-41fec6b0-e58d-4bb1-bfdc-7f965787c7de[0m


Dataset 1 :


  self._init_valid()


  0%|          | 0/25 [00:00<?, ?it/s]

Iteration 1:
Size before undersampling: 30068
Size after Undersampling: 6936
Size before Oversampling: 6936
Size after Oversampling: 11540
Iteration 2:
Size before undersampling: 25333
Size after Undersampling: 6936
Size before Oversampling: 6936
Size after Oversampling: 11590
Iteration 3:
Size before undersampling: 21374
Size after Undersampling: 6936
Size before Oversampling: 6936
Size after Oversampling: 11563
Iteration 4:
Size before undersampling: 18063
Size after Undersampling: 6936
Size before Oversampling: 6936
Size after Oversampling: 11565
Iteration 5:
Size before undersampling: 15294
Size after Undersampling: 6936
Size before Oversampling: 6936
Size after Oversampling: 11533
Iteration 6:
Size before undersampling: 12979
Size after Undersampling: 6936
Size before Oversampling: 6936
Size after Oversampling: 11550
Iteration 7:
Size before undersampling: 11043
Size after Undersampling: 6936
Size before Oversampling: 6936
Size after Oversampling: 11584
Iteration 8:
Size before un

  eff = r/ag


Precision: 0 
Recall: 0.0 
Aging Rate: 0.0
Iteration 1:
Size before undersampling: 30068
Size after Undersampling: 6936
Size before Oversampling: 6936
Size after Oversampling: 11577
Iteration 2:
Size before undersampling: 25333
Size after Undersampling: 6936
Size before Oversampling: 6936
Size after Oversampling: 11583
Iteration 3:
Size before undersampling: 21374
Size after Undersampling: 6936
Size before Oversampling: 6936
Size after Oversampling: 11552
Iteration 4:
Size before undersampling: 18063
Size after Undersampling: 6936
Size before Oversampling: 6936
Size after Oversampling: 11547
Iteration 5:
Size before undersampling: 15294
Size after Undersampling: 6936
Size before Oversampling: 6936
Size after Oversampling: 11615
Iteration 6:
Size before undersampling: 12979
Size after Undersampling: 6936
Size before Oversampling: 6936
Size after Oversampling: 11614
Iteration 7:
Size before undersampling: 11043
Size after Undersampling: 6936
Size before Oversampling: 6936
Size after Over

Precision: 0.045098039215686274 
Recall: 0.23469387755102042 
Aging Rate: 0.20174050632911392
Iteration 1:
Size before undersampling: 30068
Size after Undersampling: 6936
Size before Oversampling: 6936
Size after Oversampling: 11556
Iteration 2:
Size before undersampling: 25333
Size after Undersampling: 6936
Size before Oversampling: 6936
Size after Oversampling: 11580
Iteration 3:
Size before undersampling: 21374
Size after Undersampling: 6936
Size before Oversampling: 6936
Size after Oversampling: 11575
Iteration 4:
Size before undersampling: 18063
Size after Undersampling: 6936
Size before Oversampling: 6936
Size after Oversampling: 11534
Iteration 5:
Size before undersampling: 15294
Size after Undersampling: 6936
Size before Oversampling: 6936
Size after Oversampling: 11570
Iteration 6:
Size before undersampling: 12979
Size after Undersampling: 6936
Size before Oversampling: 6936
Size after Oversampling: 11526
Iteration 7:
Size before undersampling: 11043
Size after Undersampling: 

In [None]:
##### optimization history plot #####
# Plotted on a 3 x 3 grid.
# NOTE(review): presumably one panel per tuned dataset (set1-set9) -- confirm in optuna_history.
optuna_history(best_paramC, all_scoreC, num_row = 3, num_col = 3, model = 'Balance Cascade Classifier')
            
##### best hyperparameter table #####
# best_paramC maps 'set{i}' -> parameter dict; transposing puts one dataset per row.
param_table = pd.DataFrame(best_paramC).T
param_table

### Classifier

#### Scheme 1

In [None]:
# Scheme 1: no meta_config is passed, so each cascade runs without oversampling,
# with 5 iterations and the tuned base-learner parameters.
# NOTE: the Scheme 2 cell below reuses the name table_setC and overwrites this result.
table_setC = runall_cascade(run_train, run_test, base_paramC, num_iter = 5)
line_chart(table_setC, title = 'Balance Cascade Classifier Scheme 1 (LightGBM)')

In [None]:
# Display the metrics table produced by the most recent runall_cascade call.
table_setC

#### Scheme 2

In [None]:
# Scheme 2: the tuned meta parameters (num_iter, over_num, over_method, under_method)
# are passed per dataset together with the tuned base-learner parameters.
# NOTE: this overwrites the table_setC produced by the Scheme 1 cell.
table_setC = runall_cascade(run_train, run_test, base_config = base_paramC, meta_config = meta_paramC)
line_chart(table_setC, title = 'Balance Cascade Classifier Scheme 2 (LightGBM)')

In [None]:
# Display the metrics table produced by the most recent runall_cascade call.
table_setC

### Export

In [None]:
# Export the current table_setC to a sheet of the dated results workbook.
# NOTE: `scheme` must match the scheme cell that was run last, because both
# scheme cells write their result to table_setC.
savedate = '20220308'
TPE_multi = False
scheme = 2

model_tag = f'BC_scheme{scheme}_iter10_over5_nlast'
export_path = f'{savedate}_Classifier.xlsx'

table_setC['sampler'] = 'multivariate-TPE' if TPE_multi else 'univariate-TPE'
table_setC['model'] = model_tag

# Append mode fails with FileNotFoundError when the workbook does not exist yet,
# so create it on the first export and append afterwards. Appending is only
# supported by the openpyxl engine, hence the explicit engine.
writer_mode = 'a' if os.path.exists(export_path) else 'w'
with pd.ExcelWriter(export_path, mode = writer_mode, engine = 'openpyxl') as writer:
    table_setC.to_excel(writer, sheet_name = model_tag)