In [6]:
import os
import numpy as np
import pandas as pd
import pickle
import random
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

from lightgbm import LGBMClassifier
import optuna
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from library.Data_Preprocessing import Balance_Ratio
from library.Imbalance_Sampling import label_divide
from library.Aging_Score_Contour import score1
from library.AdaBoost import train_set, multiple_set, multiple_month, line_chart, cf_matrix

os.chdir('C:/Users/user/Desktop/Darui_R08621110') 
os.getcwd()

'C:\\Users\\user\\Desktop\\Darui_R08621110'

## 

### balance cascade

In [2]:
def cascade_training(train_data, clf_config, classifier = 'LightGBM', num_iter = 10):
  
    good = train_data[train_data.GB == 0]
    bad = train_data[train_data.GB == 1]
    br = len(bad)/len(good)
    false_rate = br**(1/(num_iter - 1))
    
    keep_good = {}
    clf_threshold = []
    clf_cascade = {}
    for j, i in enumerate(range(num_iter)):
        keep_good[j] = good
        if j != (num_iter - 1):
            draw = random.sample(good.index.to_list(), len(bad))
            train_g = good.loc[draw]
            train_b = bad.copy()
            train_combine = pd.concat([train_g, train_b], axis = 0)
        elif j == (num_iter - 1):
            train_combine = pd.concat([good, bad], axis = 0)

        valid_g = good.copy()
        train_x, train_y, valid_x, valid_y = label_divide(train_combine, valid_g, 'GB', train_only = False)
        if classifier == 'LightGBM':
            clf = LGBMClassifier(**clf_config)
        elif classifier == 'RandomForest':
            clf = RandomForestClassifier(**clf_config)
        clf.fit(train_x, train_y)
        predict = clf.predict_proba(valid_x)[:, 1]
        predict_df = pd.DataFrame(dict(predict = predict), index = valid_x.index)
        predict_df = predict_df.sort_values(by = 'predict', ascending = False)
        keep_num = int(len(predict_df)*false_rate)
        keep_index = predict_df.index[:keep_num]
        threshold = predict_df.loc[keep_index[-1]].values[0]
        clf_threshold.append(threshold)
        clf_cascade[j] = clf

        if j != (num_iter - 1):
            good = good.loc[keep_index]
        
        return clf_cascade, clf_threshold, keep_good
    
    
def cascade_testing(test_data, clf_cascade, clf_threshold):
    
    if isinstance(clf_threshold, int):
        clf_threshold = [clf_threshold]*len(clf_cascade)
    
    test_x, test_y = label_divide(test_data, 'GB', train_only = True)
    predict_df = pd.DataFrame()
    for i in range(len(clf_cascade)):
        clf = clf_cascade[i]
        predict = clf.predict_proba(test_x)[:, 1]
        answer = (predict > clf_threshold[i]).astype(int)
        predict = pd.DataFrame({str(i): answer})
        predict_df = pd.concat([predict_df, predict], axis = 1)
    predict_y = (predict_df.apply(sum, axis = 1) == len(clf_cascade)).astype(int)
    result = pd.DataFrame(dict(predict = predict_y, truth = test_y))
    
    return result


def runall_cascade(train_set, test_data, config, classifier = 'LightGBM', num_iter = 10):
    
    num_set = len(train_set)
    table_set = pd.DataFrame()
    for i in range(num_set):
        print('\n', f'Dataset {i}:')
        clf_cascade, clf_threshold, _ = cascade_training(train_set[f'set{i}'], config[f'set{i}'], classifier = classifier,
                                                        num_iter = num_iter)
        result = cascade_testing(test_data, clf_cascade, clf_threshold)
        table = cf_matrix(result, train_set[f'set{i}'].GB)
        table_set = pd.concat([table_set, table]).rename(index = {0: f'dataset {i}'})
    
    return table_set

## 

### loading hyperparameters & datasets

In [3]:
TPE_multi= False
base_learner = 'LightGBM'

iteration = 200 if base_learner == 'LightGBM' else 50
TPE = 'multivariate-TPE' if TPE_multi else 'univariate-TPE'
with open(f'hyperparameter/20211221/runhist_array_m2m4_m5_3criteria_{base_learner}C_{TPE}_{iteration}.data', 'rb') as f:
    best_paramC = pickle.load(f)

In [4]:
### training data ###
training_month = range(2, 5)

data_dict, trainset_x, trainset_y = multiple_month(training_month, num_set = 10, filename = 'dataset')

print('\nCombined training data:\n')
run_train = multiple_set(num_set = 10)
run_train_x, run_train_y = train_set(run_train, num_set = 10)

### testing data ###
run_test = pd.read_csv('test_runhist.csv').iloc[:, 2:]
run_test_x, run_test_y = label_divide(run_test, None, 'GB', train_only = True)
print('\n', 'Dimension of testing data:', run_test.shape)


Month 2:

Dimension of dataset 0 : (39009, 88)  balance ratio: 564.35
Dimension of dataset 1 : (1404, 88)  balance ratio: 1.0
Dimension of dataset 2 : (1928, 88)  balance ratio: 1.0
Dimension of dataset 3 : (1514, 88)  balance ratio: 1.0
Dimension of dataset 4 : (1378, 88)  balance ratio: 1.0
Dimension of dataset 5 : (1370, 88)  balance ratio: 1.01
Dimension of dataset 6 : (1659, 88)  balance ratio: 0.71
Dimension of dataset 7 : (1380, 88)  balance ratio: 1.0
Dimension of dataset 8 : (1380, 88)  balance ratio: 1.0
Dimension of dataset 9 : (759, 88)  balance ratio: 10.0

 10 datasets are loaded.

Labels of  10 datasets are divided.

Month 3:

Dimension of dataset 0 : (60396, 97)  balance ratio: 533.48
Dimension of dataset 1 : (2304, 97)  balance ratio: 1.0
Dimension of dataset 2 : (3132, 97)  balance ratio: 1.0
Dimension of dataset 3 : (2480, 97)  balance ratio: 1.0
Dimension of dataset 4 : (2258, 97)  balance ratio: 1.0
Dimension of dataset 5 : (2271, 97)  balance ratio: 0.99
Dimensio

### classifier

In [5]:
table_setC = runall_cascade(run_train, run_test, best_paramC, classifier = base_learner, num_iter = 10)
line_chart(table_setC, title = 'Balance Cascade Classifier (LightGBM)')


 Dataset 0:
Precision: 0.0010122277107458093 
Recall: 0.49019607843137253 
Aging Rate: 0.5076774445517893

 Dataset 1:
Precision: 0.0010483257620917182 
Recall: 1.0 
Aging Rate: 1.0

 Dataset 2:
Precision: 0.0010486706557275925 
Recall: 1.0 
Aging Rate: 0.9996711134864026

 Dataset 3:
Precision: 0.0010483257620917182 
Recall: 1.0 
Aging Rate: 1.0

 Dataset 4:
Precision: 0.0010484766251387689 
Recall: 1.0 
Aging Rate: 0.9998561121503011

 Dataset 5:
Precision: 0.0010488863295148386 
Recall: 1.0 
Aging Rate: 0.9994655594154043

 Dataset 6:


ValueError: Sample larger than population or is negative

In [None]:
table_setC

### export

In [None]:
savedate = '20211130'
TPE_multi = False

table_setC['sampler'] = 'multivariate-TPE' if TPE_multi else 'univariate-TPE'
table_setC['model'] = 'BalanceCascade'
with pd.ExcelWriter(f'{savedate}_Classifier.xlsx', mode = 'a') as writer:
    table_setC.to_excel(writer, sheet_name = 'BalanceCascade')