In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import pickle
from tqdm.auto import tqdm

from sklearn.linear_model import LogisticRegression, RidgeCV, Ridge
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor, RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRegressor
import optuna

from library.Data_Preprocessing import Balance_Ratio
from library.Imbalance_Sampling import label_divide
from library.Aging_Score_Contour import score1
from library.AdaBoost import train_set, multiple_set, multiple_month, line_chart, cf_matrix, AUC, PR_curve, \
     multiple_curve, PR_matrix, best_threshold, all_optuna, optuna_history, AdaBoost_creator 
from library.XGBoost import XGBoost_creator
from library.LightGBM import LightGBM_creator
from library.CatBoost import CatBoost_creator
from library.Random_Forest import RandomForest_creator
from library.Extra_Trees import ExtraTrees_creator

# Switch to the project directory so the relative data file read later
# ('test_runhist.csv') resolves against it.
# NOTE(review): this is a machine-specific absolute path; it has to be edited
# before the notebook can run on another machine.
os.chdir('C:/Users/user/Desktop/Darui_R08621110')  
os.getcwd()

## Function definitions

### optimize base learner

In [None]:
def optimize_base(num_set, train_data, mode, TPE_multi, base_list, iter_list, filename):
    """Run the hyperparameter search for every requested base learner.

    For each key of `train_data` and each supported learner named in
    `base_list`, `all_optuna` is called once and the first element of its
    return value is stored.

    Parameters
    ----------
    num_set, mode, TPE_multi
        Forwarded unchanged to `all_optuna`.
    train_data : dict
        One entry per search group; `train_data[f'{key}']` is passed to
        `all_optuna` as `all_data`, so keys are looked up by their string form.
    base_list : list of str
        Learners to tune. Recognised names are 'XGBoost', 'LightGBM',
        'AdaBoost', 'CatBoost', 'RandomForest' and 'ExtraTrees'; any other
        name is ignored.
    iter_list : list of int
        Parallel to `base_list`: `iter_list[k]` is the `n_iter` used for
        `base_list[k]`.
    filename : str
        Prefix of the name given to each search:
        `f'{filename}_{key}_{learner}'`.

    Returns
    -------
    dict
        `best_param[str(key)][learner]` holds the first value returned by
        `all_optuna` for that key/learner pair.
    """
    # Learners are always tuned in this fixed order, whatever the order of
    # `base_list`; the order also fixes the key order of each inner dict.
    creators = (
        ('XGBoost', XGBoost_creator),
        ('LightGBM', LightGBM_creator),
        ('AdaBoost', AdaBoost_creator),
        ('CatBoost', CatBoost_creator),
        ('RandomForest', RandomForest_creator),
        ('ExtraTrees', ExtraTrees_creator),
    )

    best_param = {}
    month_list = list(train_data.keys())

    for i in tqdm(month_list):

        month_key = f'{i}'
        best_param[month_key] = {}
        for name, creator in creators:
            if name not in base_list:
                continue

            # the iteration budget sits at the learner's position in base_list
            model_index = base_list.index(name)
            best_param[month_key][name], _ = all_optuna(num_set = num_set,
                                                        all_data = train_data[month_key],
                                                        mode = mode,
                                                        TPE_multi = TPE_multi,
                                                        n_iter = iter_list[model_index],
                                                        filename = f'{filename}_{i}_{name}',
                                                        creator = creator)

    return best_param

### transform data by base learner

In [None]:
def stratified_data(train_data, cv):
    """Split `train_data` into `cv` folds, stratified on the `GB` column.

    Rows with `GB == 0` and rows with `GB == 1` are shuffled separately and
    each group is cut into `cv` consecutive slices, so every fold holds about
    the same share of both groups. Rows with any other `GB` value are left out.
    The shuffle uses the module-level `random` state, so results change from
    call to call unless that state is seeded by the caller.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must expose a `GB` column.
    cv : int
        Number of folds.

    Returns
    -------
    tuple of four dict
        `(train_x_dict, train_y_dict, valid_x_dict, valid_y_dict)`, each keyed
        by fold number `0 .. cv-1` and holding the matching output of
        `label_divide(train, valid, train_only = False)`.
    """
    good = train_data[train_data.GB == 0]
    bad = train_data[train_data.GB == 1]
    # sampling every label without replacement is a random permutation
    good_index = random.sample(good.index.to_list(), k = len(good))
    bad_index = random.sample(bad.index.to_list(), k = len(bad))

    def fold_slice(labels, fold):
        # the last fold runs to the end so rounding never drops a row
        start = int(np.floor((fold/cv)*len(labels)))
        if (fold+1) == cv:
            return labels[start : ]
        return labels[start : int(np.floor(((fold+1)/cv)*len(labels)))]

    train_x_dict = {}
    train_y_dict = {}
    valid_x_dict = {}
    valid_y_dict = {}
    for i in range(cv):

        good_valid_index = fold_slice(good_index, i)
        bad_valid_index = fold_slice(bad_index, i)

        # set membership keeps this linear in the number of rows; the shuffled
        # order of the remaining labels is preserved
        good_held_out = set(good_valid_index)
        bad_held_out = set(bad_valid_index)
        good_train_index = [x for x in good_index if x not in good_held_out]
        bad_train_index = [x for x in bad_index if x not in bad_held_out]

        train = pd.concat([good.loc[good_train_index], bad.loc[bad_train_index]], axis = 0)
        valid = pd.concat([good.loc[good_valid_index], bad.loc[bad_valid_index]], axis = 0)
        train_x_dict[i], train_y_dict[i], valid_x_dict[i], valid_y_dict[i] = label_divide(train, valid,
                                                                                          train_only = False)

    return train_x_dict, train_y_dict, valid_x_dict, valid_y_dict


def transform_train(train_data, num_set, mode, base_param, cv):
    """Build the meta learner's training tables from out-of-fold base-learner outputs.

    For every month in `base_param` and every dataset `set0 .. set{num_set-1}`,
    the data is split with `stratified_data`; each fold's validation part is
    scored by base learners fitted on the fold's training part. The scores
    (column 'X' for XGBoost, 'L' for LightGBM) are placed next to the fold's
    labels and all folds and months are stacked row-wise per dataset.

    Parameters
    ----------
    train_data : dict
        `train_data[month][f'set{i}']` is the frame handed to `stratified_data`.
    num_set : int
        Number of datasets per month.
    mode : str
        'C' fits classifiers and keeps column 0 of `predict_proba`;
        'R' fits regressors and keeps `predict`. Any other value yields
        tables that contain the labels only.
    base_param : dict
        `base_param[month][learner][f'set{i}']` are the constructor keyword
        arguments; the learners present are read from the first month.
    cv : int
        Number of folds.

    Returns
    -------
    dict
        `f'set{i}'` -> stacked DataFrame of base-learner scores and labels.
    """
    months = list(base_param.keys())
    available = list(base_param[months[0]].keys())
    stacked = {f'set{k}': pd.DataFrame() for k in range(num_set)}

    # (learner name, output column, estimator class, extra constructor kwargs)
    if mode == 'C':
        learners = (('XGBoost', 'X', XGBClassifier, {'n_jobs': -1}),
                    ('LightGBM', 'L', LGBMClassifier, {}))
    elif mode == 'R':
        learners = (('XGBoost', 'X', XGBRegressor, {'n_jobs': -1}),
                    ('LightGBM', 'L', LGBMRegressor, {}))
    else:
        learners = ()

    for month in tqdm(months):

        for set_no in tqdm(range(num_set)):

            set_key = f'set{set_no}'
            fold_train_x, fold_train_y, fold_valid_x, fold_valid_y = stratified_data(train_data[month][set_key], cv = cv)
            folds_frame = pd.DataFrame()
            for fold in range(cv):

                features = pd.DataFrame()
                for name, column, estimator, extra in learners:
                    if name not in available:
                        continue

                    model = estimator(**base_param[month][name][set_key], **extra)
                    model.fit(fold_train_x[fold], fold_train_y[fold])
                    if mode == 'C':
                        scores = model.predict_proba(fold_valid_x[fold])[:, 0]
                    else:
                        scores = model.predict(fold_valid_x[fold])
                    features = pd.concat([features, pd.DataFrame({column: scores})], axis = 1)

                # scores carry a fresh 0..n-1 index, so the labels get the same
                labels = fold_valid_y[fold].reset_index(drop = True)
                fold_frame = pd.concat([features, labels], axis = 1)
                folds_frame = pd.concat([folds_frame, fold_frame], axis = 0)

            stacked[set_key] = pd.concat([stacked[set_key], folds_frame], axis = 0)

    return stacked


def transform_test(train_data, test_data, num_set, mode, base_param):
    """Build the meta learner's test tables from base-learner outputs.

    For every dataset `set0 .. set{num_set-1}` the base learners are fitted on
    the whole training set and used to score `test_data`. The scores (column
    'X' for XGBoost, 'L' for LightGBM) are placed next to the test labels.

    Parameters
    ----------
    train_data : dict
        `train_data[f'set{i}']` is the training frame of dataset i.
    test_data : pandas.DataFrame
        Test frame, split into features and label by `label_divide`.
    num_set : int
        Number of datasets.
    mode : str
        'C' fits classifiers and keeps column 0 of `predict_proba`;
        'R' fits regressors and keeps `predict`. Any other value yields
        tables that contain the labels only. It should match the mode the
        parameters in `base_param` were tuned for.
    base_param : dict
        `base_param['all'][learner][f'set{i}']` are the constructor keyword
        arguments.

    Returns
    -------
    dict
        `f'set{i}'` -> DataFrame of base-learner scores and test labels.
    """
    model_list = base_param['all'].keys()

    # (learner name, output column, estimator class, extra constructor kwargs)
    if mode == 'C':
        learners = (('XGBoost', 'X', XGBClassifier, {'n_jobs': -1}),
                    ('LightGBM', 'L', LGBMClassifier, {}))
    elif mode == 'R':
        learners = (('XGBoost', 'X', XGBRegressor, {'n_jobs': -1}),
                    ('LightGBM', 'L', LGBMRegressor, {}))
    else:
        learners = ()

    test_dict = {}
    for i in tqdm(range(num_set)):

        set_key = f'set{i}'
        train_x, train_y, test_x, test_y = label_divide(train_data[set_key], test_data, train_only = False)
        model_predict = pd.DataFrame()
        for name, column, estimator, extra in learners:
            if name not in model_list:
                continue

            model = estimator(**base_param['all'][name][set_key], **extra)
            model.fit(train_x, train_y)
            if mode == 'C':
                scores = model.predict_proba(test_x)[:, 0]
            else:
                scores = model.predict(test_x)
            model_predict = pd.concat([model_predict, pd.DataFrame({column: scores})], axis = 1)

        # The scores carry a fresh 0..n-1 index. Re-index the labels the same
        # way (as transform_train does) so rows are paired by position even
        # when test_data does not have a default index.
        test_label = test_y.reset_index(drop = True)
        test_dict[set_key] = pd.concat([model_predict, test_label], axis = 1)

    return test_dict

### meta learner

In [None]:
def LR(train_x, test_x, train_y, test_y, config):
    """Fit a logistic-regression meta learner and predict the test set.

    `config` holds the keyword arguments of `LogisticRegression`. Returns a
    DataFrame with the true labels in 'truth' and the predicted class labels
    in 'predict'.
    """
    model = LogisticRegression(**config)
    model.fit(train_x, train_y)

    return pd.DataFrame({'truth': test_y, 'predict': model.predict(test_x)})


def RidgeR(train_x, test_x, train_y, test_y, config):
    """Fit a ridge-regression meta learner and predict the test set.

    `config` holds the keyword arguments of `Ridge`. Returns a DataFrame with
    the true labels in 'truth' and the continuous predictions in 'predict'.
    """
    model = Ridge(**config)
    model.fit(train_x, train_y)

    return pd.DataFrame({'truth': test_y, 'predict': model.predict(test_x)})


def runall_LR(num_set, trainset_x, testset_x, trainset_y, testset_y, config):
    """Train and evaluate the logistic-regression meta learner on every dataset.

    Parameters
    ----------
    num_set : int
        Number of datasets; dataset i is read from key `f'set{i}'`.
    trainset_x, testset_x, trainset_y, testset_y : dict
        Features and labels per dataset.
    config : dict
        Either one set of `LogisticRegression` keyword arguments shared by all
        datasets, or a dict of such sets keyed by `f'set{i}'`. The two forms
        are told apart by whether the first value is itself a dict; an empty
        dict is treated as a shared config (estimator defaults).

    Returns
    -------
    pandas.DataFrame
        One row per dataset (index 'dataset i'), as produced by `cf_matrix`.
    """
    table_set = pd.DataFrame()
    # an empty config has no first value; fall back to the shared-config path
    per_set = isinstance(next(iter(config.values()), None), dict)

    for i in tqdm(range(num_set)):
        print('\n', f'Dataset {i}:')

        set_key = f'set{i}'
        best_config = config[set_key] if per_set else config

        result = LR(trainset_x[set_key], testset_x[set_key], trainset_y[set_key], testset_y[set_key], best_config)
        table = cf_matrix(result, trainset_y[set_key])
        table_set = pd.concat([table_set, table]).rename(index = {0: f'dataset {i}'})

    return table_set


def runall_RidgeR(num_set, trainset_x, testset_x, trainset_y, testset_y, config, thres_target = 'Recall', 
                    threshold = False):
    """Train and evaluate the ridge-regression meta learner on every dataset.

    Parameters
    ----------
    num_set : int
        Number of datasets; dataset i is read from key `f'set{i}'`.
    trainset_x, testset_x, trainset_y, testset_y : dict
        Features and labels per dataset.
    config : dict
        Either one set of `Ridge` keyword arguments shared by all datasets, or
        a dict of such sets keyed by `f'set{i}'`. The two forms are told apart
        by whether the first value is itself a dict; an empty dict is treated
        as a shared config (estimator defaults).
    thres_target, threshold
        Forwarded to `best_threshold` as `target` and `threshold`.

    Returns
    -------
    tuple
        `(pr_dict, table_set)`: `pr_dict[f'set{i}']` is the output of
        `PR_matrix` for dataset i, and `table_set` holds one row per dataset
        (index 'dataset i') taken from `best_threshold`.
    """
    table_set = pd.DataFrame()
    pr_dict = {}
    # an empty config has no first value; fall back to the shared-config path
    per_set = isinstance(next(iter(config.values()), None), dict)

    for i in range(num_set):
        print('\n', f'Dataset {i}:')

        set_key = f'set{i}'
        best_config = config[set_key] if per_set else config

        predict = RidgeR(trainset_x[set_key], testset_x[set_key], trainset_y[set_key], testset_y[set_key],
                         best_config)
        pr_matrix = PR_matrix(predict, trainset_y[set_key])
        pr_dict[set_key] = pr_matrix

        # only the selected row is kept; the threshold value itself is unused
        best_data, _ = best_threshold(pr_matrix, target = thres_target, threshold = threshold)
        table_set = pd.concat([table_set, best_data]).rename(index = {best_data.index.values[0]: f'dataset {i}'})

    return pr_dict, table_set

### optuna

In [None]:
def stackingCV_creator(train_data, mode, num_valid = 3) :
    """Create the Optuna objective used to tune the stacking meta learner.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Meta-level training table; split into features and label by
        `label_divide(train_data, None, 'GB', train_only = True)`.
    mode : str
        'C' tunes `LogisticRegression` (through `LR`), 'R' tunes `Ridge`
        (through `RidgeR`).
    num_valid : int
        Number of random 75/25 train/validation splits averaged per trial.

    Returns
    -------
    callable
        `objective(trial)` returning the mean score over the splits:
        `Recall - 0.1 * Aging Rate` for 'C', and the negated `AUC` of the
        Recall / Aging Rate curve for 'R'.
    """
    def objective(trial) :
        # hyperparameters randomize setting
        if mode == 'C' :
            # NOTE(review): scikit-learn 1.2 deprecated the string 'none' for
            # `penalty` in favour of None; confirm against the installed version.
            param = {
                'solver': 'lbfgs',
                'C': trial.suggest_categorical('C', [100, 10 ,1 ,0.1, 0.01]),
                'penalty': trial.suggest_categorical('penalty', ['none', 'l2']),
                'n_jobs': -1
            }

        elif mode == 'R' :
            param = {
                'alpha': trial.suggest_float('alpha', 0, 1, step = 0.1)
            }

        # the feature/label split does not depend on the validation round,
        # so it is done once per trial
        data_x, data_y = label_divide(train_data, None, 'GB', train_only = True)

        # objective function
        result_list = []
        for _ in range(num_valid):

            # unseeded split: every round (and every trial) sees a new partition
            train_x, valid_x, train_y, valid_y = train_test_split(data_x, data_y, test_size = 0.25)

            if mode == 'C':
                result = LR(train_x, valid_x, train_y, valid_y, param)
                table = cf_matrix(result, valid_y)
                recall = table['Recall']
                aging = table['Aging Rate']
                result_list.append(recall - 0.1*aging)

            elif mode == 'R':
                result = RidgeR(train_x, valid_x, train_y, valid_y, param)
                pr_matrix = PR_matrix(result, valid_y)
                auc = AUC(pr_matrix['Recall'], pr_matrix['Aging Rate'])
                result_list.append((-1)*auc)

        return np.mean(result_list)

    return objective

## Data

### loading training & testing data

In [None]:
### training data ###
# Months whose data form the training pool.
training_month = [2, 3, 4]

# data_dict is later indexed as data_dict[month][f'set{i}'] by transform_train.
# NOTE(review): the exact contents of trainset_x / trainset_y are defined by
# library.AdaBoost.multiple_month and are not visible here.
data_dict, trainset_x, trainset_y = multiple_month(training_month, num_set = 10, filename = 'dataset')

print('\nCombined training data:\n')
# run_train is later indexed as run_train[f'set{i}'] by transform_test.
run_train = multiple_set(num_set = 10)
run_train_x, run_train_y = train_set(run_train, num_set = 10)

### testing data ###
# The first two columns of the CSV are dropped before use.
run_test = pd.read_csv('test_runhist.csv').iloc[:, 2:]
# 'GB' is the label column that gets separated from the features.
run_test_x, run_test_y = label_divide(run_test, None, 'GB', train_only = True)
print('\n', 'Dimension of testing data:', run_test.shape)

## base learner

### optimize the base learners by one-month data

In [None]:
##### for training data transformation #####
# Tune XGBoost and LightGBM (200 iterations each) separately for every month
# in data_dict, once as classifiers ('C') and once as regressors ('R').
# NOTE(review): both calls use the same `filename` prefix; if all_optuna
# persists results under that name, the second call would overwrite the
# first -- confirm.
base_param_trainC = optimize_base(num_set = 10, 
                                 train_data = data_dict, 
                                 mode = 'C', 
                                 TPE_multi = False, 
                                 base_list = ['XGBoost', 'LightGBM'],
                                 iter_list = [200, 200],
                                 filename = 'runhist_array_4criteria_m2m5')

base_param_trainR = optimize_base(num_set = 10, 
                                 train_data = data_dict, 
                                 mode = 'R', 
                                 TPE_multi = False, 
                                 base_list = ['XGBoost', 'LightGBM'],
                                 iter_list = [200, 200],
                                 filename = 'runhist_array_4criteria_m2m5')

In [None]:
##### for testing data transformation #####
# Tune XGBoost and LightGBM (200 iterations each) on the combined training
# data. It is wrapped under the single key 'all', which is the key
# transform_test reads from base_param.
# NOTE(review): both calls use the same `filename` prefix; if all_optuna
# persists results under that name, the second call would overwrite the
# first -- confirm.
base_param_testC = optimize_base(num_set = 10, 
                                train_data = {'all': run_train}, 
                                mode = 'C', 
                                TPE_multi = False, 
                                base_list = ['XGBoost', 'LightGBM'], 
                                iter_list = [200, 200],
                                filename = 'runhist_array_4criteria_m2m5')

base_param_testR = optimize_base(num_set = 10, 
                                train_data = {'all': run_train}, 
                                mode = 'R', 
                                TPE_multi = False, 
                                base_list = ['XGBoost', 'LightGBM'], 
                                iter_list = [200, 200],
                                filename = 'runhist_array_4criteria_m2m5')

### data transform for scheme 2

### data transform for scheme 3

In [None]:
# Classifier track: out-of-fold base-classifier probabilities for training,
# and base classifiers refit on the combined training data for the test set.
train_firstC = transform_train(data_dict, num_set = 10, mode = 'C', base_param = base_param_trainC, cv = 5)
# The mode must match the training transform and the classifier-tuned
# parameters; with 'R' the test features would come from regressors while the
# meta learner was trained on classifier probabilities.
test_firstC = transform_test(run_train, run_test, num_set = 10, mode = 'C', base_param = base_param_testC)
train_firstC_x, train_firstC_y = train_set(train_firstC, num_set = 10)
test_firstC_x, test_firstC_y = train_set(test_firstC, num_set = 10)

# Regressor track: same procedure with regressors throughout.
train_firstR = transform_train(data_dict, num_set = 10, mode = 'R', base_param = base_param_trainR, cv = 5)
test_firstR = transform_test(run_train, run_test, num_set = 10, mode = 'R', base_param = base_param_testR)
train_firstR_x, train_firstR_y = train_set(train_firstR, num_set = 10)
test_firstR_x, test_firstR_y = train_set(test_firstR, num_set = 10)

## meta learner

### searching for best hyperparameters

In [None]:
# Tune the meta learner on the transformed training tables (10 iterations per
# dataset). The second return value is kept because the history plot in the
# next cell takes it as `all_scoreC`; it was previously discarded, leaving
# that name undefined on a fresh kernel.
# NOTE(review): both searches use the same `filename`; if all_optuna persists
# results under that name, the second call would overwrite the first -- confirm.
best_paramC, all_scoreC = all_optuna(num_set = 10, 
                                     all_data = train_firstC, 
                                     mode = 'C', 
                                     TPE_multi = False, 
                                     n_iter = 10,
                                     filename = f'runhist_array_4criteria_m2m5_StackingCV3',
                                     creator = stackingCV_creator
)

best_paramR, all_scoreR = all_optuna(num_set = 10, 
                                     all_data = train_firstR, 
                                     mode = 'R', 
                                     TPE_multi = False, 
                                     n_iter = 10,
                                     filename = f'runhist_array_4criteria_m2m5_StackingCV3',
                                     creator = stackingCV_creator
)

In [None]:
##### optimization history plot #####
# NOTE(review): `all_scoreC` is expected to be the second value returned by
# the all_optuna call that produced `best_paramC`; make sure it is bound
# before this cell runs.
optuna_history(best_paramC, all_scoreC, num_row = 4, num_col = 3, model = 'StackingCV Classifier (scheme 3)')
            
##### best hyperparameter table #####
# One row per dataset, one column per tuned hyperparameter.
param_table = pd.DataFrame(best_paramC).T
param_table

### classifier

In [None]:
# Fit the logistic-regression meta learner on each of the 10 transformed
# datasets with its tuned parameters, then plot the per-dataset results.
table_setC = runall_LR(10, train_firstC_x, test_firstC_x, train_firstC_y, test_firstC_y, best_paramC)
line_chart(table_setC, title = 'StackingCV Classifier (scheme 3)')

In [None]:
# Display the per-dataset classifier results.
table_setC

### regressor

In [None]:
# Fit the ridge meta learner on each of the 10 transformed datasets with its
# tuned parameters. thres_target / threshold are passed on to best_threshold
# to pick the reported row per dataset.
pr_dict, table_setR = runall_RidgeR(10, train_firstR_x, test_firstR_x, train_firstR_y, test_firstR_y, best_paramR,
                                                                  thres_target = 'Recall', threshold = 0.8)
line_chart(table_setR, title = 'StackingCV Regressor (scheme 3)')

In [None]:
# Display the per-dataset regressor results.
table_setR

### export

In [None]:
##### saving the output table for tableau #####
# Tag both result tables with the sampler and model they came from.
# These assignments add columns to the tables in place.
table_setC['sampler'] = 'univariate-TPE'
table_setR['sampler'] = 'univariate-TPE'
table_setC['model'] = 'StackingCV 3'
table_setR['model'] = 'StackingCV 3'


# The Excel export below is currently disabled; uncomment to write the tables.
# with pd.ExcelWriter('20211012_Classifier.xlsx') as writer:
#     table_setC.to_excel(writer, sheet_name = 'StackingCV_3')

# with pd.ExcelWriter('20211012_Regressor.xlsx') as writer:
#     table_setR.to_excel(writer, sheet_name = 'StackingCV_3')