In [1]:
import os
import time
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from tqdm.auto import tqdm
import openpyxl

from sklearn.linear_model import LogisticRegression, RidgeCV, Ridge
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRegressor

from Dataset_Construction import Balance_Ratio 
from Sampling import label_divide
from AdaClassifier import train_set, multiple_set, print_badC, bad_plot, line_chart, cf_matrix, runall_AdaBoostC
from AdaRegressor import AUC, PR_curve, multiple_curve, PR_matrix, best_threshold, runall_AdaBoostR
from Aging_Score import score1
from XGBoost import optuna_history, runall_XGBoostC, runall_XGBoostR
from CatBoost import runall_CatBoostC, runall_CatBoostR
from Light_GBM import runall_LightGBMC, runall_LightGBMR

# Make the project folder the working directory: every data and hyperparameter
# file read later in this notebook is opened through a relative path
# ('event/...', 'hyperparameter/...', 'test_runhist.csv').
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# will fail here on any other machine — consider a configurable project root.
os.chdir('C:/Users/user/Desktop/Darui_R08621110')  
# Echo the resulting working directory as the cell output.
os.getcwd()

'C:\\Users\\user\\Desktop\\Darui_R08621110'

### Load all hyperparameters

In [2]:
def load_hyper(num_set, date, model_list, iter_list, filename, mode, sampler) :
    """Load the stored hyperparameters of several models, regrouped by dataset.

    One pickle file per model is read from
    ``hyperparameter/{date}/{filename}_{model}{mode}_{sampler}_{iterations}.data``
    (relative to the current working directory), where ``iterations`` is the
    entry of ``iter_list`` at the model's position in ``model_list``. Every file
    must unpickle to a mapping that has the keys ``'set0'`` ...
    ``f'set{num_set - 1}'``.

    Each file is opened and unpickled exactly once, no matter how many datasets
    are requested.

    Parameters
    ----------
    num_set : int
        Number of datasets; selects the keys ``'set0'`` ... ``'set{num_set-1}'``.
    date : str
        Name of the sub-folder of ``hyperparameter/`` that holds the files.
    model_list : sequence of str
        Model names as they appear in the file names.
    iter_list : sequence
        Per-model value used as the last component of the file name; indexed in
        parallel with ``model_list``.
    filename : str
        Common file-name prefix.
    mode : str
        Suffix appended directly after the model name in the file name.
    sampler : str
        Sampler name as it appears in the file name.

    Returns
    -------
    dict
        ``{'set0': {model: <entry 'set0' of that model's file>, ...}, ...}``;
        an empty dict when ``num_set`` yields no datasets (no file is opened
        in that case).

    Raises
    ------
    FileNotFoundError
        If one of the expected files does not exist.
    IndexError
        If ``iter_list`` is shorter than ``model_list``.
    KeyError
        If a file lacks one of the requested ``'set{j}'`` keys.
    """
    set_keys = [f'set{j}' for j in range(num_set)]
    if not set_keys :
        # Nothing requested: do not touch the file system at all.
        return {}

    # Read every model's file once, up front, instead of once per dataset.
    loaded = {}
    for i, model in enumerate(model_list) :
        with open(f'hyperparameter/{date}/{filename}_{model}{mode}_{sampler}_{iter_list[i]}.data', 'rb') as f :
            # NOTE: unpickling can execute arbitrary code; only load files that
            # were produced by this project.
            loaded[model] = pickle.load(f)

    # Pivot from {model: {set: params}} to {set: {model: params}}.
    return {
        key: {model: per_set[key] for model, per_set in loaded.items()}
        for key in set_keys
    }


def tableau_hyper(num_set, date, model_list, iter_list, filename, mode, sampler_list) :
    """Load the stored hyperparameters of every (model, sampler) combination.

    For each model and each sampler the file
    ``hyperparameter/{date}/{filename}_{model}{mode}_{sampler}_{iterations}.data``
    (relative to the current working directory) is unpickled, where
    ``iterations`` is the entry of ``iter_list`` at the model's position in
    ``model_list``.

    ``num_set`` is accepted so the function can be called with the same keyword
    dictionary as the other loaders, but it is not used here.

    Returns
    -------
    dict
        ``{model: {sampler: <unpickled file content>, ...}, ...}``
    """
    def read_stored(model, sampler, position) :
        # One file per (model, sampler); the model's position picks its iteration count.
        with open(f'hyperparameter/{date}/{filename}_{model}{mode}_{sampler}_{iter_list[position]}.data', 'rb') as handle :
            return pickle.load(handle)

    return {
        model: {
            sampler: read_stored(model, sampler, position)
            for sampler in sampler_list
        }
        for position, model in enumerate(model_list)
    }

## Data Processing

### Runhist data

In [4]:
### Bad types ###
# Read the bad-type table (first CSV column dropped) and map each value of its
# 'cb' column to its row position.
bad = pd.read_csv('event/Bad_Types.csv').iloc[:, 1:]
Bad_Types = {bad.cb[i]:i for i in range (len(bad))}
print('Total bad types:', len(bad))

### Single dataset ###
# Read one training and one testing table; the first two CSV columns are dropped.
test = pd.read_csv('event/TestingSet_0.csv').iloc[:, 2:]
train = pd.read_csv('event/TrainingSet_new.csv').iloc[:, 2:]
print('\ntraining data:', train.shape, '\nBalance Ratio:', Balance_Ratio(train))
print('\ntesting data:', test.shape, '\nBalance Ratio:', Balance_Ratio(test))

# Presumably splits both tables into features and the 'GB' label column —
# label_divide is a project helper; confirm against Sampling.py.
# NOTE(review): test_x / test_y assigned here are overwritten a few lines below.
train_x, train_y, test_x, test_y = label_divide(train, test, 'GB')

### Multiple datasets ###
# Load the 10 training datasets and split each into features / 'GB' label
# (multiple_set and train_set are project helpers from AdaClassifier).
data_dict = multiple_set(num_set = 10)
trainset_x, trainset_y = train_set(data_dict, num_set = 10, label = 'GB')
# Re-split the testing table alone; with train_only = True only two values are returned.
test_x, test_y = label_divide(test, None, 'GB', train_only = True)


### Runhist dataset ###
# Disabled alternative: load the bad types from 'run_bad_types.csv' instead.
# bad = pd.read_csv('run_bad_types.csv').iloc[:, 1:]
# Bad_Types = {bad.cb[i]:i for i in range (len(bad))}
# print('Total bad types:', len(bad))

# Runhist test table (first two CSV columns dropped); run_test_x / run_test_y
# are the test inputs passed to the Tableau output functions further down.
run_test = pd.read_csv('test_runhist.csv').iloc[:, 2:]
run_test_x, run_test_y = label_divide(run_test, None, 'GB', train_only = True)
print('\n', 'Dimension of run test:', run_test.shape)

Total bad types: 62

training data: (77138, 83) 
Balance Ratio: 18.17902

testing data: (55903, 83) 
Balance Ratio: 3104.72222
Dimension of dataset 0 : (80518, 141)  balance ratio: 1101.9863
Dimension of dataset 1 : (1634, 141)  balance ratio: 1.0
Dimension of dataset 2 : (1484, 141)  balance ratio: 1.0
Dimension of dataset 3 : (1752, 141)  balance ratio: 1.0
Dimension of dataset 4 : (1608, 141)  balance ratio: 1.0
Dimension of dataset 5 : (1618, 141)  balance ratio: 1.00496
Dimension of dataset 6 : (1555, 141)  balance ratio: 1.09005
Dimension of dataset 7 : (1622, 141)  balance ratio: 1.0
Dimension of dataset 8 : (1622, 141)  balance ratio: 1.0
Dimension of dataset 9 : (803, 141)  balance ratio: 10.0

 10 datasets are loaded.

Labels of  10 datasets are divided.

 Dimension of run test: (47725, 141)


## Output for Tableau

### Classifier

In [5]:
def tableau_classifier(hyper, num_set, trainset_x, trainset_y, test_x, test_y) :
    """Run every classifier with both samplers' hyperparameters and stack the results.

    For each model name in ``hyper`` the matching ``runall_*C`` runner is called
    twice: once with ``hyper[model]['univariate-TPE']`` and once with
    ``hyper[model]['multivariate-TPE']`` (always with ``record_bad = False``;
    the CatBoost runner also receives ``cat_feature = []``). Each result table
    gets two extra columns, ``Sampler`` and ``Model``, and all tables are
    concatenated row-wise.

    Parameters
    ----------
    hyper : dict
        ``{model: {'univariate-TPE': params, 'multivariate-TPE': params}}`` with
        model names among ``'XGBoost'``, ``'LightGBM'`` and ``'CatBoost'``.
    num_set, trainset_x, trainset_y, test_x, test_y
        Forwarded unchanged to the runners.

    Returns
    -------
    pandas.DataFrame
        The stacked results with the former index exposed as a column (named
        ``dataset`` when the runners return an unnamed index). An empty
        ``hyper`` yields an empty table.

    Raises
    ------
    ValueError
        If ``hyper`` contains a model name without a known runner. This is
        checked before any model is run.
    KeyError
        If a model's entry lacks one of the two sampler keys.
    """
    runners = {
        'XGBoost': runall_XGBoostC,
        'LightGBM': runall_LightGBMC,
        'CatBoost': runall_CatBoostC,
    }
    samplers = ('univariate-TPE', 'multivariate-TPE')

    # Fail fast: an unknown model used to be reported under the previous model's
    # results (or crash with an UnboundLocalError on the first iteration).
    unknown = [model for model in hyper.keys() if model not in runners]
    if unknown :
        raise ValueError(f'No classifier runner for model(s) {unknown}; expected one of {list(runners)}')

    frames = []
    for model in hyper.keys() :
        for sampler in samplers :
            # Only the CatBoost runner takes cat_feature; build a fresh list per call.
            extra = {'cat_feature': []} if model == 'CatBoost' else {}
            result = runners[model](num_set, trainset_x, test_x, trainset_y, test_y,
                                    hyper[model][sampler], record_bad = False, **extra)
            # The label now matches the sampler key exactly (it used to be
            # misspelled 'multivaraite-TPE' for the multivariate runs).
            frames.append(result.assign(Sampler = sampler, Model = model))

    # Concatenate once at the end instead of growing a frame inside the loop.
    combined = pd.concat(frames, axis = 0) if frames else pd.DataFrame()
    return combined.reset_index().rename(columns = {'index': 'dataset'})



# Which stored hyperparameter files to load for the classifiers; the keys match
# the parameters of tableau_hyper ('mode': 'C' selects the '{model}C' files).
hyper_info = {
    'num_set': 10,
    'date': '20210914',
    'model_list': ['LightGBM', 'XGBoost', 'CatBoost'],
    'iter_list': [250, 250, 250],
    'filename': 'runhist_array_m8m3_joinnewevent_eqp+rework',
    'mode': 'C',
    'sampler_list': ['univariate-TPE', 'multivariate-TPE']
}

output_hyper = tableau_hyper(**hyper_info)

# Take the number of datasets from hyper_info rather than repeating the literal,
# so the hyperparameter sets and the evaluated datasets cannot drift apart.
table_C = tableau_classifier(output_hyper, hyper_info['num_set'],
                             trainset_x, trainset_y, run_test_x, run_test_y)


 Dataset 0:




Precision: 0.0 
Recall: 0.0 
Aging Rate: 4.1906757464641176e-05

 Dataset 1:
Precision: 0.0006370921771786875 
Recall: 0.5588235294117647 
Aging Rate: 0.6248926139339969

 Dataset 2:




Precision: 0.0007584695769425249 
Recall: 0.7941176470588235 
Aging Rate: 0.7458983761131482

 Dataset 3:
Precision: 0.0008131805968038467 
Recall: 0.6764705882352942 
Aging Rate: 0.5926453640649555

 Dataset 4:




Precision: 0.0007052186177715092 
Recall: 0.6470588235294118 
Aging Rate: 0.653661602933473

 Dataset 5:
Precision: 0.0006040759228054558 
Recall: 0.5588235294117647 
Aging Rate: 0.6590466212676794

 Dataset 6:




Precision: 0.0007614454773311336 
Recall: 0.7058823529411765 
Aging Rate: 0.6604295442640126

 Dataset 7:
Precision: 0.000676328502415459 
Recall: 0.6176470588235294 
Aging Rate: 0.6506024096385542

 Dataset 8:




Precision: 0.0007656381594058648 
Recall: 0.5882352941176471 
Aging Rate: 0.5473441592456784

 Dataset 9:
Precision: 0.0007172929985966006 
Recall: 0.6764705882352942 
Aging Rate: 0.6718700890518596

 Dataset 0:




Precision: 0.0 
Recall: 0.0 
Aging Rate: 0.0022210581456259823

 Dataset 1:
Precision: 0.0006810829218457347 
Recall: 0.5882352941176471 
Aging Rate: 0.6152959664745941

 Dataset 2:




Precision: 0.0007140307033202428 
Recall: 0.7647058823529411 
Aging Rate: 0.7629753797799895

 Dataset 3:
Precision: 0.0007634912372028457 
Recall: 0.6470588235294118 
Aging Rate: 0.6037716081718177

 Dataset 4:




Precision: 0.0008231864174241125 
Recall: 0.7058823529411765 
Aging Rate: 0.6108957569408067

 Dataset 5:
Precision: 0.0006610009442870633 
Recall: 0.6176470588235294 
Aging Rate: 0.665688842325825





 Dataset 6:
Precision: 0.0006529972574115189 
Recall: 0.5882352941176471 
Aging Rate: 0.6417600838135149

 Dataset 7:




Precision: 0.0007427501130271912 
Recall: 0.6764705882352942 
Aging Rate: 0.6488423258250393

 Dataset 8:
Precision: 0.0006394270733422853 
Recall: 0.5882352941176471 
Aging Rate: 0.6553797799895233

 Dataset 9:




Precision: 0.0007075775119001673 
Recall: 0.6470588235294118 
Aging Rate: 0.6514824515453117

 Dataset 0:




Precision: 0.0 
Recall: 0.0 
Aging Rate: 6.286013619696175e-05

 Dataset 1:
Precision: 0.0006890441972635102 
Recall: 0.6176470588235294 
Aging Rate: 0.6385961236249346

 Dataset 2:
Precision: 0.000731858357259472 
Recall: 0.7647058823529411 
Aging Rate: 0.7443897328444211

 Dataset 3:
Precision: 0.0007877375521055569 
Recall: 0.7058823529411765 
Aging Rate: 0.6383865898376113

 Dataset 4:
Precision: 0.0006387939570091667 
Recall: 0.5882352941176471 
Aging Rate: 0.6560293347302253

 Dataset 5:
Precision: 0.0006944663657312415 
Recall: 0.6470588235294118 
Aging Rate: 0.6637820848611838

 Dataset 6:
Precision: 0.0006975269499048827 
Recall: 0.6470588235294118 
Aging Rate: 0.6608695652173913

 Dataset 7:
Precision: 0.0008229863670519197 
Recall: 0.6764705882352942 
Aging Rate: 0.5855840754321634

 Dataset 8:
Precision: 0.0007637643620907219 
Recall: 0.6764705882352942 
Aging Rate: 0.6309900471451021

 Dataset 9:
Precision: 0.0007360414987206897 
Recall: 0.6176470588235294 
Aging Rate: 0.5

Precision: 0.0007829738771442808 
Recall: 0.6470588235294118 
Aging Rate: 0.5887480356207438

 Dataset 4:
Precision: 0.0008110300081103001 
Recall: 0.5882352941176471 
Aging Rate: 0.5167103195390257

 Dataset 5:
Precision: 0.0006466678641997485 
Recall: 0.5294117647058824 
Aging Rate: 0.5832372970141435

 Dataset 6:
Precision: 0.0006537609414157557 
Recall: 0.5294117647058824 
Aging Rate: 0.5769093766369827

 Dataset 7:
Precision: 0.0008170490910328862 
Recall: 0.7058823529411765 
Aging Rate: 0.6154845468831849

 Dataset 8:
Precision: 0.0007540394973070018 
Recall: 0.6176470588235294 
Aging Rate: 0.5835515976951283

 Dataset 9:
Precision: 0.0008249852681202121 
Recall: 0.6176470588235294 
Aging Rate: 0.5333682556312206

 Dataset 0:
Precision: 0.0 
Recall: 0.0 
Aging Rate: 0.00020953378732320587

 Dataset 1:
Precision: 0.0006217101172959755 
Recall: 0.4411764705882353 
Aging Rate: 0.5055421686746988

 Dataset 2:
Precision: 0.0006132147784761613 
Recall: 0.5882352941176471 
Aging Rate: 0

### Regressor

In [6]:
def tableau_regressor(hyper, num_set, trainset_x, trainset_y, test_x, test_y, thres_target, threshold) :
    """Run every regressor with both samplers' hyperparameters and stack the results.

    For each model name in ``hyper`` the matching ``runall_*R`` runner is called
    twice: once with ``hyper[model]['univariate-TPE']`` and once with
    ``hyper[model]['multivariate-TPE']``. ``thres_target`` and ``threshold`` are
    forwarded as keyword arguments, ``record_bad`` is always ``False`` and the
    CatBoost runner also receives ``cat_feature = []``. Every runner returns a
    pair; only its second element (a result table) is kept. Each table gets two
    extra columns, ``Sampler`` and ``Model``, and all tables are concatenated
    row-wise.

    Parameters
    ----------
    hyper : dict
        ``{model: {'univariate-TPE': params, 'multivariate-TPE': params}}`` with
        model names among ``'XGBoost'``, ``'LightGBM'`` and ``'CatBoost'``.
    num_set, trainset_x, trainset_y, test_x, test_y
        Forwarded unchanged to the runners.
    thres_target, threshold
        Forwarded unchanged to the runners as keyword arguments.

    Returns
    -------
    pandas.DataFrame
        The stacked results with the former index exposed as a column (named
        ``Dataset`` when the runners return an unnamed index). An empty
        ``hyper`` yields an empty table.

    Raises
    ------
    ValueError
        If ``hyper`` contains a model name without a known runner. This is
        checked before any model is run.
    KeyError
        If a model's entry lacks one of the two sampler keys.
    """
    runners = {
        'XGBoost': runall_XGBoostR,
        'LightGBM': runall_LightGBMR,
        'CatBoost': runall_CatBoostR,
    }
    samplers = ('univariate-TPE', 'multivariate-TPE')

    # Fail fast: an unknown model used to be reported under the previous model's
    # results (or crash with an UnboundLocalError on the first iteration).
    unknown = [model for model in hyper.keys() if model not in runners]
    if unknown :
        raise ValueError(f'No regressor runner for model(s) {unknown}; expected one of {list(runners)}')

    frames = []
    for model in hyper.keys() :
        for sampler in samplers :
            # Only the CatBoost runner takes cat_feature; build a fresh list per call.
            extra = {'cat_feature': []} if model == 'CatBoost' else {}
            _, result = runners[model](num_set, trainset_x, test_x, trainset_y, test_y,
                                       hyper[model][sampler], thres_target = thres_target,
                                       threshold = threshold, record_bad = False, **extra)
            # The label now matches the sampler key exactly (it used to be
            # misspelled 'multivaraite-TPE' for the multivariate runs).
            frames.append(result.assign(Sampler = sampler, Model = model))

    # Concatenate once at the end instead of growing a frame inside the loop.
    combined = pd.concat(frames, axis = 0) if frames else pd.DataFrame()
    return combined.reset_index().rename(columns = {'index': 'Dataset'})



# Which stored hyperparameter files to load for the regressors; the keys match
# the parameters of tableau_hyper ('mode': 'R' selects the '{model}R' files).
hyper_info = {
    'num_set': 10,
    'date': '20210914',
    'model_list': ['LightGBM', 'XGBoost', 'CatBoost'],
    'iter_list': [250, 250, 250],
    'filename': 'runhist_array_m8m3_joinnewevent_eqp+rework',
    'mode': 'R',
    'sampler_list': ['univariate-TPE', 'multivariate-TPE']
}

output_hyper = tableau_hyper(**hyper_info)

# Take the number of datasets from hyper_info rather than repeating the literal,
# so the hyperparameter sets and the evaluated datasets cannot drift apart.
# One result table per threshold value (0.7 and 0.8), both with thres_target 'Recall'.
table_7 = tableau_regressor(output_hyper, hyper_info['num_set'], trainset_x, trainset_y,
                            run_test_x, run_test_y, 'Recall', threshold = 0.7)
table_8 = tableau_regressor(output_hyper, hyper_info['num_set'], trainset_x, trainset_y,
                            run_test_x, run_test_y, 'Recall', threshold = 0.8)


 Dataset 0:




Best Threshold: 8.60987633180383e-05 

Recall: [0.70588235] ,   Precision: [0.00067991] ,   Aging Rate: [0.73963332]

 Dataset 1:
Best Threshold: 0.2354142175051204 

Recall: [0.70588235] ,   Precision: [0.00072734] ,   Aging Rate: [0.69139864]

 Dataset 2:
Best Threshold: 0.6546842580783218 

Recall: [0.70588235] ,   Precision: [0.00071835] ,   Aging Rate: [0.70005238]

 Dataset 3:
Best Threshold: 0.2452390864930486 

Recall: [0.70588235] ,   Precision: [0.00073329] ,   Aging Rate: [0.68578313]

 Dataset 4:
Best Threshold: 0.37404649794061257 

Recall: [0.70588235] ,   Precision: [0.00074377] ,   Aging Rate: [0.67612362]

 Dataset 5:
Best Threshold: 0.1640264323601627 

Recall: [0.70588235] ,   Precision: [0.00068292] ,   Aging Rate: [0.73636459]

 Dataset 6:
Best Threshold: 0.5505206546869672 

Recall: [0.70588235] ,   Precision: [0.00087114] ,   Aging Rate: [0.57726558]

 Dataset 7:
Best Threshold: 0.22228451542768662 

Recall: [0.70588235] ,   Precision: [0.00070786] ,   Aging Rate

Best Threshold: 0.03683226577238504 

Recall: [0.82352941] ,   Precision: [0.00074182] ,   Aging Rate: [0.79088528]

 Dataset 5:
Best Threshold: 0.090878813575646 

Recall: [0.82352941] ,   Precision: [0.00076411] ,   Aging Rate: [0.76781561]

 Dataset 6:
Best Threshold: 0.12229180193182403 

Recall: [0.82352941] ,   Precision: [0.00077095] ,   Aging Rate: [0.76100576]

 Dataset 7:
Best Threshold: 0.13986249223211952 

Recall: [0.82352941] ,   Precision: [0.00075417] ,   Aging Rate: [0.77793609]

 Dataset 8:
Best Threshold: 0.006200695344266687 

Recall: [0.82352941] ,   Precision: [0.00072376] ,   Aging Rate: [0.81062336]

 Dataset 9:
Best Threshold: 0.020660312907986127 

Recall: [0.82352941] ,   Precision: [0.00075372] ,   Aging Rate: [0.77839707]

 Dataset 0:
Best Threshold: 6.857744625792842e-08 

Recall: [0.82352941] ,   Precision: [0.00067797] ,   Aging Rate: [0.86537454]

 Dataset 1:
Best Threshold: 0.015317000415591496 

Recall: [0.82352941] ,   Precision: [0.00070605] ,   Agi

### Combine & output

In [7]:
# Write the three result tables into one workbook, one sheet per table.
result_sheets = {
    'classifier': table_C,
    'regressor_7': table_7,
    'regressor_8': table_8,
}
with pd.ExcelWriter(f'{hyper_info["filename"]}_result_table.xlsx') as writer :
    for sheet_title, result_table in result_sheets.items() :
        result_table.to_excel(writer, sheet_name = sheet_title)