In [None]:
import os
import time
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from tqdm.auto import tqdm
import openpyxl

from mlxtend.classifier import StackingCVClassifier
from mlxtend.regressor import StackingCVRegressor
from sklearn.linear_model import LogisticRegression, RidgeCV, Ridge
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor, RandomForestClassifier, RandomForestRegressor,\
                                AdaBoostClassifier, AdaBoostRegressor
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRegressor
import optuna

from Dataset_Construction import Balance_Ratio 
from Sampling import label_divide
from AdaClassifier import train_set, multiple_set, print_badC, bad_plot, line_chart, cf_matrix, runall_AdaBoostC
from AdaRegressor import AUC, PR_curve, multiple_curve, PR_matrix, best_threshold, runall_AdaBoostR
from Aging_Score import score1
from XGBoost import optuna_history, runall_XGBoostC, runall_XGBoostR
from CatBoost import runall_CatBoostC, runall_CatBoostR
from Light_GBM import runall_LightGBMC, runall_LightGBMR
from Random_Forest import runall_ForestC, runall_ForestR
from Extra_Trees import runall_ExtraTreesC, runall_ExtraTreesR

# Working directory containing the `event/` and `hyperparameter/` folders that the
# cells below read with relative paths. The original machine-specific location is
# kept as the default; set the DARUI_WORKDIR environment variable to run the
# notebook on another machine without editing this cell.
os.chdir(os.environ.get('DARUI_WORKDIR', 'C:/Users/user/Desktop/Darui_R08621110'))
os.getcwd()

### Load all hyperparameters

In [None]:
def load_hyper(num_set, date, model_list, iter_list, filename, mode, sampler_list) :
    """Collect the pickled hyperparameters of several models, grouped by dataset.

    Every file
    'hyperparameter/{date}/{filename}_{model}{mode}_{sampler}_{iteration}.data'
    is read exactly once (instead of once per dataset) and its 'set{j}' entries
    are distributed over the result.

    Parameters
    ----------
    num_set : int
        Number of datasets; the keys 'set0' ... 'set{num_set - 1}' are taken from every file.
    date : str
        Sub-folder of 'hyperparameter/' that holds the files.
    model_list : list of str
        Model names, used both in the file names and as keys of the result.
    iter_list : list
        Iteration tag of each model's file, aligned index-by-index with `model_list`.
    filename : str
        Common file-name prefix.
    mode : str
        Suffix written directly after the model name in the file name.
    sampler_list : list of str
        Sampler names, used both in the file names and as keys of the result.

    Returns
    -------
    dict
        `result['set{j}'][sampler][model]` is the object stored under 'set{j}'
        in the file belonging to that model and sampler.

    Raises
    ------
    FileNotFoundError
        If one of the expected files does not exist.
    KeyError
        If a file has no entry for one of the requested sets.
    """
    # nothing is requested, so no file has to be touched
    if num_set <= 0 :
        return {}

    model_list = list(model_list)
    sampler_list = list(sampler_list)

    # NOTE(review): pickle.load can execute arbitrary code; only load files you created yourself.
    loaded = {}
    for sampler in sampler_list :
        for i, model in enumerate(model_list) :
            with open(f'hyperparameter/{date}/{filename}_{model}{mode}_{sampler}_{iter_list[i]}.data', 'rb') as f :
                loaded[(sampler, model)] = pickle.load(f)

    # regroup: dataset -> sampler -> model
    return {
        f'set{j}': {
            sampler: {model: loaded[(sampler, model)][f'set{j}'] for model in model_list}
            for sampler in sampler_list
        }
        for j in range(num_set)
    }


def month_hyper(num_set, date, iteration, filename_list, sampler_list, mode):
    """Collect the pickled hyperparameters of several files, grouped by dataset.

    Every file 'hyperparameter/{date}/{filename}{mode}_{sampler}_{iteration}.data'
    is read exactly once (instead of once per dataset) and its 'set{i}' entries
    are distributed over the result.

    Parameters
    ----------
    num_set : int
        Number of datasets; the keys 'set0' ... 'set{num_set - 1}' are taken from every file.
    date : str
        Sub-folder of 'hyperparameter/' that holds the files.
    iteration : int or str
        Iteration tag shared by all file names.
    filename_list : list of str
        File-name prefixes; they are also the innermost keys of the result.
    sampler_list : list of str
        Sampler names, used both in the file names and as keys of the result.
    mode : str
        Suffix written directly after the file-name prefix.

    Returns
    -------
    dict
        `result['set{i}'][sampler][filename]` is the object stored under 'set{i}'
        in the file belonging to that prefix and sampler.

    Raises
    ------
    FileNotFoundError
        If one of the expected files does not exist.
    KeyError
        If a file has no entry for one of the requested sets.
    """
    # nothing is requested, so no file has to be touched
    if num_set <= 0:
        return {}

    filename_list = list(filename_list)
    sampler_list = list(sampler_list)

    # NOTE(review): pickle.load can execute arbitrary code; only load files you created yourself.
    loaded = {}
    for sampler in sampler_list:
        for filename in filename_list:
            with open(f'hyperparameter/{date}/{filename}{mode}_{sampler}_{iteration}.data', 'rb') as f:
                loaded[(sampler, filename)] = pickle.load(f)

    # regroup: dataset -> sampler -> file-name prefix
    return {
        f'set{i}': {
            sampler: {filename: loaded[(sampler, filename)][f'set{i}'] for filename in filename_list}
            for sampler in sampler_list
        }
        for i in range(num_set)
    }


def tableau_hyper(num_set, date, model_list, iter_list, filename, mode, sampler_list) :
    """Load whole pickled hyperparameter files, grouped by model and then by sampler.

    For every model/sampler pair the complete unpickled content of
    'hyperparameter/{date}/{filename}_{model}{mode}_{sampler}_{iteration}.data'
    is stored (no per-set selection takes place). `iter_list` is aligned
    index-by-index with `model_list`. `num_set` is accepted but not used.

    Returns a dict such that `result[model][sampler]` is the unpickled object.
    """
    def read_file(position, model, sampler) :
        # the iteration tag is looked up by the model's position in model_list
        with open(f'hyperparameter/{date}/{filename}_{model}{mode}_{sampler}_{iter_list[position]}.data', 'rb') as handle :
            return pickle.load(handle)

    return {
        model: {sampler: read_file(position, model, sampler) for sampler in sampler_list}
        for position, model in enumerate(model_list)
    }

### StackingCV

In [None]:
def month_stackingCVC(train_x, train_y, test_x, test_y, config, TPE_multi, meta_config) :
    """Fit a StackingCV classifier and predict the test set.

    Parameters
    ----------
    train_x, train_y : training features and labels passed to `fit`.
    test_x, test_y : test features passed to `predict` and the matching true labels.
    config : dict
        `config[sampler][name]` holds the keyword arguments of one base learner.
        A name containing 'LightGBM' builds an LGBMClassifier, a name containing
        'XGBoost' builds an XGBClassifier.
    TPE_multi : bool
        Selects the 'multivariate-TPE' (True) or 'univariate-TPE' (False) entry of `config`.
    meta_config : dict
        Must contain 'meta_learner' ('Logistic Regression' or 'Extra Trees');
        all remaining items are passed to the meta-learner's constructor.

    Returns
    -------
    pandas.DataFrame
        Columns 'truth' (test_y) and 'predict' (predicted labels).

    Raises
    ------
    ValueError
        If a base-learner name or the meta-learner name is not recognised.
        (Previously such a name hit an unbound variable, or silently appended
        the previous base learner a second time.)
    KeyError
        If `meta_config` has no 'meta_learner' entry.
    """
    sampler = 'multivariate-TPE' if TPE_multi else 'univariate-TPE'

    # first-level learners, one per entry of the chosen sampler's configuration
    clf_list = []
    for name, params in config[sampler].items() :
        if 'LightGBM' in name :
            clf_list.append(LGBMClassifier(**params))
        elif 'XGBoost' in name :
            clf_list.append(XGBClassifier(**params))
        else :
            raise ValueError(f"Unsupported base learner {name!r}: the name must contain 'LightGBM' or 'XGBoost'")

    # everything except the learner's name is a constructor argument of the meta-learner
    learner_name = meta_config['meta_learner']
    second_config = {key: value for key, value in meta_config.items() if key != 'meta_learner'}

    if learner_name == 'Logistic Regression' :
        meta_clf = LogisticRegression(**second_config)
    elif learner_name == 'Extra Trees' :
        meta_clf = ExtraTreesClassifier(**second_config)
    else :
        raise ValueError(f"Unsupported meta learner {learner_name!r}: expected 'Logistic Regression' or 'Extra Trees'")

    # the meta-learner is trained on out-of-fold class probabilities of the base learners;
    # no random_state is passed, so the shuffled folds are not fixed between runs
    sclf = StackingCVClassifier(classifiers = clf_list,
                                meta_classifier = meta_clf,
                                use_probas = True,
                                drop_proba_col = 'last',
                                cv = 5,
                                shuffle = True,
                                stratify = True,
                                n_jobs = -1)
    sclf.fit(train_x, train_y)
    predict_y = sclf.predict(test_x)
    result = pd.DataFrame({'truth': test_y, 'predict': predict_y})

    return result


def month_stackingCVR(train_x, train_y, test_x, test_y, config, TPE_multi, meta_config) :
    """Fit a StackingCV regressor and predict the test set.

    Parameters
    ----------
    train_x, train_y : training features and targets passed to `fit`.
    test_x, test_y : test features passed to `predict` and the matching true targets.
    config : dict
        `config[sampler][name]` holds the keyword arguments of one base learner.
        A name containing 'LightGBM' builds an LGBMRegressor, a name containing
        'XGBoost' builds an XGBRegressor.
    TPE_multi : bool
        Selects the 'multivariate-TPE' (True) or 'univariate-TPE' (False) entry of `config`.
    meta_config : dict
        Must contain 'meta_learner' ('Ridge Regression' or 'Extra Trees');
        all remaining items are passed to the meta-learner's constructor.

    Returns
    -------
    pandas.DataFrame
        Columns 'truth' (test_y) and 'predict' (predicted values).

    Raises
    ------
    ValueError
        If a base-learner name or the meta-learner name is not recognised.
        (Previously such a name hit an unbound variable, or silently appended
        the previous base learner a second time.)
    KeyError
        If `meta_config` has no 'meta_learner' entry.
    """
    sampler = 'multivariate-TPE' if TPE_multi else 'univariate-TPE'

    # first-level learners, one per entry of the chosen sampler's configuration
    reg_list = []
    for name, params in config[sampler].items() :
        if 'LightGBM' in name :
            reg_list.append(LGBMRegressor(**params))
        elif 'XGBoost' in name :
            reg_list.append(XGBRegressor(**params))
        else :
            raise ValueError(f"Unsupported base learner {name!r}: the name must contain 'LightGBM' or 'XGBoost'")

    # everything except the learner's name is a constructor argument of the meta-learner
    learner_name = meta_config['meta_learner']
    second_config = {key: value for key, value in meta_config.items() if key != 'meta_learner'}

    if learner_name == 'Ridge Regression' :
        meta_reg = Ridge(**second_config)
    elif learner_name == 'Extra Trees' :
        meta_reg = ExtraTreesRegressor(**second_config)
    else :
        raise ValueError(f"Unsupported meta learner {learner_name!r}: expected 'Ridge Regression' or 'Extra Trees'")

    # no random_state is passed, so the shuffled folds are not fixed between runs
    sreg = StackingCVRegressor(regressors = reg_list,
                               meta_regressor = meta_reg,
                               cv = 5,
                               shuffle = True,
                               n_jobs = -1)
    sreg.fit(train_x, train_y)
    predict_y = sreg.predict(test_x)
    result = pd.DataFrame({'truth': test_y, 'predict': predict_y})

    return result

In [None]:
def runall_stackingCVC(num_set, train_x, train_y, test_x, test_y, config, TPE_multi, meta_config) :
    """Train and evaluate the stacking classifier on every dataset.

    For each i in range(num_set) the entries 'set{i}' of `train_x`, `train_y`,
    `config` and `meta_config` are used to fit `month_stackingCVC`, which is
    evaluated on the shared `test_x` / `test_y`. The prediction is summarised
    by `cf_matrix(result, train_y['set{i}'])`.

    Returns
    -------
    pandas.DataFrame
        The per-dataset `cf_matrix` tables stacked vertically; the row labelled
        0 of each table is relabelled 'dataset {i}'. An empty DataFrame is
        returned when `num_set` is 0.
    """
    # collect the tables and concatenate once: avoids repeatedly copying the growing
    # frame and the deprecated concatenation with an empty DataFrame
    tables = []
    for i in tqdm(range(num_set)) :

        key = f'set{i}'
        print(f'Dataset {i}:\n')
        result = month_stackingCVC(train_x[key], train_y[key], test_x, test_y, config[key], TPE_multi,
                                   meta_config[key])
        # assumes cf_matrix returns a DataFrame whose row is labelled 0 - TODO confirm
        table = cf_matrix(result, train_y[key])
        tables.append(table.rename(index = {0: f'dataset {i}'}))

    return pd.concat(tables) if tables else pd.DataFrame()


def runall_stackingCVR(num_set, train_x, train_y, test_x, test_y, config, TPE_multi, meta_config, thres_target, threshold):
    """Train and evaluate the stacking regressor on every dataset.

    For each i in range(num_set) the entries 'set{i}' of `train_x`, `train_y`,
    `config` and `meta_config` are used to fit `month_stackingCVR`, which is
    evaluated on the shared `test_x` / `test_y`. `PR_matrix` turns the
    prediction into a per-dataset matrix, from which `best_threshold` (called
    with `target = thres_target`, `threshold = threshold`) picks the reported rows.

    Returns
    -------
    pr_dict : dict
        'set{i}' -> the `PR_matrix` result of that dataset.
    table_set : pandas.DataFrame
        The `best_threshold` tables stacked vertically; the first index label
        of each table is relabelled 'dataset {i}'. Empty when `num_set` is 0.
    """
    pr_dict = {}
    # collect the tables and concatenate once: avoids repeatedly copying the growing
    # frame and the deprecated concatenation with an empty DataFrame
    best_tables = []
    for i in tqdm(range(num_set)) :

        key = f'set{i}'
        print(f'Dataset {i}:\n')
        result = month_stackingCVR(train_x[key], train_y[key], test_x, test_y, config[key], TPE_multi,
                                   meta_config[key])
        pr_matrix = PR_matrix(result, train_y[key])
        pr_dict[key] = pr_matrix

        # the returned threshold itself is not needed here, only the selected rows
        best_data, _ = best_threshold(pr_matrix, target = thres_target, threshold = threshold)
        # relabel per table, so rows that belong to earlier datasets can never be renamed
        best_tables.append(best_data.rename(index = {best_data.index.values[0]: f'dataset {i}'}))

    table_set = pd.concat(best_tables) if best_tables else pd.DataFrame()

    return pr_dict, table_set

## Data Processing

### Runhist data

In [None]:
### bad types ###
# Read the bad-type table (first CSV column dropped) and map every value of its
# `cb` column to its row position.
bad = pd.read_csv('event/Bad_Types.csv').iloc[:, 1:]
Bad_Types = {bad.cb[i]:i for i in range (len(bad))}
print('Total bad types:', len(bad))

### single dataset ###
# The first two CSV columns are dropped from both files.
test = pd.read_csv('event/TestingSet_0.csv').iloc[:, 2:]
train = pd.read_csv('event/TrainingSet_new.csv').iloc[:, 2:]
print('\ntraining data:', train.shape, '\nBalance Ratio:', Balance_Ratio(train))
print('\ntesting data:', test.shape, '\nBalance Ratio:', Balance_Ratio(test))

# Split features from the 'GB' label column.
# NOTE: test_x / test_y are reassigned a few lines below.
train_x, train_y, test_x, test_y = label_divide(train, test, 'GB')

### multiple datasets ###
# data_dict and trainset_x / trainset_y are the per-set inputs ('set0' ... 'set9')
# used by the training and Optuna cells further down.
data_dict = multiple_set(num_set = 10)
trainset_x, trainset_y = train_set(data_dict, num_set = 10, label = 'GB')
test_x, test_y = label_divide(test, None, 'GB', train_only = True)


##### for runhist dataset #####
# Disabled alternative bad-type table for the runhist data:
# bad = pd.read_csv('run_bad_types.csv').iloc[:, 1:]
# Bad_Types = {bad.cb[i]:i for i in range (len(bad))}
# print('Total bad types:', len(bad))

# Runhist test set: this is the test data actually passed to the stacking runs below.
run_test = pd.read_csv('test_runhist.csv').iloc[:, 2:]
run_test_x, run_test_y = label_divide(run_test, None, 'GB', train_only = True)
print('\n', 'Dimension of run test:', run_test.shape)

In [None]:
# Keyword arguments for month_hyper (everything except `mode`).
# Each entry of 'filename_list' becomes the name of one base learner; the stacking
# functions choose LightGBM or XGBoost by looking for that substring in the name.
hyper_info = {
    'num_set': 10,
    'date': '20211005',
    'iteration': 200,
    'filename_list': ['runhist_array_m2_m3_4selection_XGBoost',
                      'runhist_array_m3_m4_4selection_XGBoost',
                      'runhist_array_m4_m5_4selection_XGBoost',
                      'runhist_array_m2_m3_4selection_LightGBM',
                      'runhist_array_m3_m4_4selection_LightGBM',
                      'runhist_array_m4_m5_4selection_LightGBM'],
    'sampler_list': ['univariate-TPE', 'multivariate-TPE']
}

# Base-learner hyperparameters for the classifier ('C') and regressor ('R') files.
month_hyperC = month_hyper(**hyper_info, mode = 'C')
month_hyperR = month_hyper(**hyper_info, mode = 'R')

### Classifier

In [None]:
# Train and evaluate the stacking classifier on all 10 datasets against the runhist test set.
# NOTE: `best_paramC` is only created by the all_optuna(mode = 'C') cell in the
# "Optuna" section further down; on a fresh kernel that cell must be executed
# before this one, otherwise this call raises NameError.
table_setC = runall_stackingCVC(10, 
                                trainset_x, 
                                trainset_y, 
                                run_test_x, 
                                run_test_y, 
                                month_hyperC, 
                                TPE_multi = True,  
                                meta_config = best_paramC)

In [None]:
# Plot the per-dataset classifier results, then display the table as the cell output.
line_chart(table_setC, title = 'StackingCV Classifier (by month)')
table_setC

### Regressor

In [None]:
# Train and evaluate the stacking regressor on all 10 datasets against the runhist test set;
# the reported rows are chosen by best_threshold with target 'Recall' and threshold 0.8.
# NOTE: `best_paramR` is only created by the all_optuna(mode = 'R') cell in the
# "Optuna" section further down; on a fresh kernel that cell must be executed
# before this one, otherwise this call raises NameError.
pr_dict, table_setR = runall_stackingCVR(10, 
                                         trainset_x, 
                                         trainset_y, 
                                         run_test_x,
                                         run_test_y, 
                                         month_hyperR, 
                                         TPE_multi = True,  
                                         meta_config = best_paramR,
                                         thres_target = 'Recall',
                                         threshold = 0.8
                                        )

In [None]:
# Per-dataset curves for the two targets, a summary line chart,
# then the table of selected thresholds as the cell output.
multiple_curve(4, 3, pr_dict, table_setR, target = 'Aging Rate')
multiple_curve(4, 3, pr_dict, table_setR, target = 'Precision')
line_chart(table_setR, title = 'StackingCV Regressor (by month)')
table_setR

## Optimization

### Optuna

In [None]:
def objective_creator(train_data, mode, TPE_multi, config, num_valid = 3) :
    """Build the Optuna objective that tunes the stacking meta-learner on one dataset.

    Parameters
    ----------
    train_data : the dataset to tune on; it is split into features and the 'GB'
        label by `label_divide`.
    mode : {'C', 'R'}
        'C' tunes the stacking classifier, 'R' the stacking regressor.
    TPE_multi : bool
        Forwarded to the stacking function to select the base-learner configuration.
    config : dict
        Base-learner hyperparameters of this dataset, forwarded to the stacking function.
    num_valid : int, default 3
        Number of random 75/25 train/validation splits that are averaged per trial.

    Returns
    -------
    callable
        `objective(trial) -> float`, to be maximised:
        mode 'C': mean of (Recall - 0.1 * Aging Rate);
        mode 'R': mean of the negated AUC computed from the 'Recall' and
        'Aging Rate' columns of the PR matrix.

    Raises
    ------
    ValueError
        If `mode` is neither 'C' nor 'R'. (Previously such a mode made every
        trial return the NaN mean of an empty list.)
    """
    if mode not in ('C', 'R') :
        raise ValueError(f"mode must be 'C' (classifier) or 'R' (regressor), got {mode!r}")

    def objective(trial) :
        # hyperparameters randomize setting
        if mode == 'C' :
            # only 'Logistic Regression' is offered, so the 'Extra Trees' branch below is
            # reached only if it is added to this list of choices
            meta_learner = trial.suggest_categorical('meta_learner', ['Logistic Regression'])

            if meta_learner == 'Logistic Regression' :

                param = {
                    'meta_learner': 'Logistic Regression',
                    'solver': trial.suggest_categorical('solver', ['newton-cg', 'lbfgs', 'sag', 'saga']),
                    'C': trial.suggest_categorical('C', [100, 10 ,1 ,0.1, 0.01]),
                    'penalty': trial.suggest_categorical('penalty', ['none', 'l2']),
                    'n_jobs': -1
                }

            elif meta_learner == 'Extra Trees' :

                param = {
                    'meta_learner': 'Extra Trees',
                    'n_estimators': trial.suggest_int('n_estimators', 100, 500, step = 100),
                    'min_samples_split': trial.suggest_int('min_samples_split', 2, 32, step = 5),
                    'max_depth': trial.suggest_int('max_depth', 3, 21, step = 3),
                    'n_jobs': -1
                }

        elif mode == 'R' :
            # only 'Ridge Regression' is offered, so the 'Extra Trees' branch below is
            # reached only if it is added to this list of choices
            meta_learner = trial.suggest_categorical('meta_learner', ['Ridge Regression'])

            if meta_learner == 'Ridge Regression' :
                param = {
                    'meta_learner': 'Ridge Regression',
                    'alpha': trial.suggest_float('alpha', 0, 1, step = 0.1)
                }

            elif meta_learner == 'Extra Trees' :

                param = {
                    'meta_learner': 'Extra Trees',
                    'n_estimators': trial.suggest_int('n_estimators', 100, 500, step = 100),
                    'min_samples_split': trial.suggest_int('min_samples_split', 2, 32, step = 5),
                    'max_depth': trial.suggest_int('max_depth', 3, 21, step = 3),
                    'n_jobs': -1
                }

        # objective function: average the score over num_valid random hold-out splits
        result_list = []
        for _ in range(num_valid):

            train_x, train_y = label_divide(train_data, None, 'GB', train_only = True)
            # no random_state: every repetition (and every trial) draws a different split
            train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size = 0.25)

            if mode == 'C':
                result = month_stackingCVC(train_x, train_y, valid_x, valid_y, config, TPE_multi, param)
                # NOTE(review): runall_stackingCVC passes the *training* labels as the second
                # argument of cf_matrix, here the validation labels are passed - confirm which is intended
                table = cf_matrix(result, valid_y)
                recall = table['Recall']
                aging = table['Aging Rate']

                # reward recall, lightly penalise the aging rate
                result_list.append(recall - 0.1*aging)

            elif mode == 'R':
                result = month_stackingCVR(train_x, train_y, valid_x, valid_y, config, TPE_multi, param)
                pr_matrix = PR_matrix(result, valid_y)
                auc = AUC(pr_matrix['Recall'], pr_matrix['Aging Rate'])

                # negated so that a smaller AUC gives a larger objective value
                result_list.append((-1)*auc)

        return np.mean(result_list)

    return objective


def all_optuna(num_set, all_data, mode, TPE_multi, config, n_iter, filename, creator, num_valid = 3) :
    """Run one Optuna study per dataset and store the best parameters.

    Parameters
    ----------
    num_set : int
        Number of datasets; 'set0' ... 'set{num_set - 1}' are read from `all_data` and `config`.
    all_data : dict
        'set{i}' -> data handed to `creator` as the tuning dataset.
    mode : str
        Forwarded to `creator`; also part of the output file name.
    TPE_multi : bool
        Whether the TPE sampler is multivariate; also forwarded to `creator`.
    config : dict
        'set{i}' -> base-learner hyperparameters forwarded to `creator`.
    n_iter : int
        Number of trials per study; also part of the output file name.
    filename : str
        Prefix of the output file.
    creator : callable
        `creator(data, mode, TPE_multi, config, num_valid = ...)` returning the
        objective function of one dataset.
    num_valid : int, default 3
        Forwarded to `creator`.

    Returns
    -------
    best_param : dict
        'set{i}' -> `study.best_trial.params` (the values suggested through the
        trial; constants added by the objective itself are not part of it).
    all_score : dict
        'set{i}' -> list with the `values` attribute of every trial of the study.

    Side effects
    ------------
    Pickles `best_param` to
    '{filename}{mode}_{multivariate-TPE|univariate-TPE}_{n_iter}.data', relative
    to the current working directory.
    NOTE(review): the loader functions of this notebook read from
    'hyperparameter/{date}/'; the file is not written there.
    """
    best_param = {}
    all_score = {}
    for i in tqdm(range(num_set)) :

        ##### define objective function and change optimized target dataset in each loop #####
        objective = creator(all_data[f'set{i}'], mode, TPE_multi, config[f'set{i}'], num_valid = num_valid)

        ##### optimize one dataset in each loop #####
        print(f'Dataset{i} :')

        study = optuna.create_study(sampler = optuna.samplers.TPESampler(multivariate = TPE_multi), direction = 'maximize')
        study.optimize(objective, n_trials = n_iter, show_progress_bar = True, gc_after_trial = True)
        best_param[f'set{i}'] = study.best_trial.params

        ##### return score and entire params for score plot or feature importance
        all_score[f'set{i}'] = [trial.values for trial in study.trials]

        print(f"Sampler is {study.sampler.__class__.__name__}")

    ##### store the best hyperparameters #####
    multi_mode = 'multivariate-TPE' if TPE_multi else 'univariate-TPE'
    with open(f'{filename}{mode}_{multi_mode}_{n_iter}.data', 'wb') as f:
        pickle.dump(best_param, f)

    return best_param, all_score

In [None]:
# Tune the classifier meta-learner on each of the 10 datasets (10 trials per study).
# best_paramC is the `meta_config` consumed by the runall_stackingCVC cell in the
# "Classifier" section above. all_optuna also pickles it to
# 'runhist_array_m2m5_4selection_stackingCV(LX)_bymonthC_multivariate-TPE_10.data'.
best_paramC, all_scoreC = all_optuna(num_set = 10, 
                                     all_data = data_dict, 
                                     mode = 'C', 
                                     TPE_multi = True, 
                                     config = month_hyperC, 
                                     n_iter = 10, 
                                     filename = 'runhist_array_m2m5_4selection_stackingCV(LX)_bymonth', 
                                     creator = objective_creator, 
                                     num_valid = 3
)

In [None]:
# Tune the regressor meta-learner on each of the 10 datasets (5 trials per study).
# best_paramR is the `meta_config` consumed by the runall_stackingCVR cell in the
# "Regressor" section above. all_optuna also pickles it to
# 'runhist_array_m2m5_4selection_stackingCV(LX)_bymonthR_multivariate-TPE_5.data'.
best_paramR, all_scoreR = all_optuna(num_set = 10, 
                                     all_data = data_dict, 
                                     mode = 'R', 
                                     TPE_multi = True, 
                                     config = month_hyperR, 
                                     n_iter = 5, 
                                     filename = 'runhist_array_m2m5_4selection_stackingCV(LX)_bymonth', 
                                     creator = objective_creator, 
                                     num_valid = 3
)