In [2]:
import os
import time
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from tqdm.auto import tqdm

from sklearn.linear_model import LogisticRegression, RidgeCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
import optuna

from Dataset_Construction import Balance_Ratio 
from Sampling import label_divide
from AdaClassifier import train_set, multiple_set, print_badC, bad_plot, line_chart, cf_matrix, runall_AdaBoostC
from AdaRegressor import AUC, PR_curve, multiple_curve, PR_matrix, best_threshold, runall_AdaBoostR
from Aging_Score import score1
from XGBoost import optuna_history, all_optuna, runall_XGBoostC, runall_XGBoostR
from CatBoost import runall_CatBoostC, runall_CatBoostR
from Light_GBM import runall_LightGBMC, runall_LightGBMR

# Every relative path used below ('hyperparameter/...', '*.csv') is resolved against this directory.
# NOTE(review): this is a hard-coded absolute path on one machine; the notebook will not run elsewhere
# unless this line is edited first.
os.chdir('C:/Users/user/Desktop/Darui_R08621110')  
os.getcwd()

'C:\\Users\\user\\Desktop\\Darui_R08621110'

### Load all hyperparameters

In [3]:
def load_hyper(num_set, date, model_list, iter_list, filename, mode, sampler) :
    """Collect the stored hyperparameters of several models, grouped by dataset.

    For every model the file
    ``hyperparameter/{date}/{filename}_{model}{mode}_{sampler}_{iterations}.data``
    (relative to the current working directory) is unpickled, where
    ``iterations`` is the entry of ``iter_list`` at the model's position in
    ``model_list``. Each file must unpickle to a mapping that contains the keys
    ``'set0'`` ... ``f'set{num_set - 1}'``.

    Parameters
    ----------
    num_set : int
        Number of datasets; sets ``0 .. num_set - 1`` are collected.
    date : str
        Name of the sub-folder of ``hyperparameter/`` holding the files.
    model_list : sequence of str
        Model names as they appear in the file names.
    iter_list : sequence
        One entry per model, used as the last component of the file name.
    filename : str
        Common prefix of the file names.
    mode : str
        Suffix appended directly after the model name in the file name.
    sampler : str
        Sampler name as it appears in the file names.

    Returns
    -------
    dict
        ``{'set0': {model: hyperparameters, ...}, 'set1': {...}, ...}``;
        empty when ``num_set`` is not positive.

    Raises
    ------
    FileNotFoundError
        If a hyperparameter file does not exist.
    IndexError
        If ``iter_list`` is shorter than ``model_list``.
    KeyError
        If a file has no entry for one of the requested sets.
    """
    # Nothing requested: do not touch the file system at all.
    if num_set <= 0 :
        return {}

    # Read each model's file exactly once instead of once per dataset.
    # NOTE: pickle.load can execute arbitrary code; only use it on files you created yourself.
    model_hyper = {}
    for i, model in enumerate(model_list) :
        path = f'hyperparameter/{date}/{filename}_{model}{mode}_{sampler}_{iter_list[i]}.data'
        with open(path, 'rb') as f :
            model_hyper[model] = pickle.load(f)

    # Regroup from {model: {set: params}} to {set: {model: params}}.
    allset_dict = {}
    for j in range(num_set) :
        set_key = f'set{j}'
        allset_dict[set_key] = {model: model_hyper[model][set_key] for model in model_list}

    return allset_dict

In [10]:
# Arguments for load_hyper(). The dict is unpacked with ** below, so every key
# must be spelled exactly like the corresponding load_hyper parameter.
# NOTE(review): the cell that loads the LightGBMR file copies 'set8' into 'set9' by hand;
# if the stored files have no 'set9' entry, num_set = 10 raises KeyError here — confirm.
hyper_info = {
    'num_set': 10,
    'date': '20210802',
    'model_list': ['LightGBM', 'XGBoost'],
    'iter_list': [1000, 1000],
    'filename': 'runhist_array_label',
    'mode': 'R',
    # must match the spelling used in the stored file names
    'sampler': 'multivariate-TPE'
}

all_hyper = load_hyper(**hyper_info)

In [284]:
# Load the stored LightGBM regressor ('LightGBMR') hyperparameters, keyed by 'set0', 'set1', ...
# NOTE: pickle.load can execute arbitrary code; only use it on files you created yourself.
with open('hyperparameter/20210802/runhist_array_label_LightGBMR_multivariate-TPE_1000.data', 'rb') as f :
    temp_dict = pickle.load(f)
# 'set9' reuses the hyperparameters of 'set8'. Both keys now refer to the SAME dict object,
# so an in-place change to one is visible through the other.
# NOTE(review): presumably the file has no 'set9' entry of its own — confirm.
temp_dict['set9'] = temp_dict['set8']
temp_dict

{'set0': {'boosting_type': 'gbdt',
  'num_iterations': 150,
  'subsample': 0.7,
  'num_leaves': 10,
  'min_child_samples': 6,
  'max_depth': 15,
  'learning_rate': 0.125,
  'lambda_l1': 0.0002429249667917133,
  'lambda_l2': 0.1068127782917783},
 'set1': {'boosting_type': 'goss',
  'num_iterations': 150,
  'subsample': 0.5,
  'num_leaves': 25,
  'min_child_samples': 3,
  'max_depth': 11,
  'learning_rate': 0.425,
  'lambda_l1': 0.00023689666839152366,
  'lambda_l2': 0.11260668391396508},
 'set2': {'boosting_type': 'goss',
  'num_iterations': 300,
  'subsample': 0.9,
  'num_leaves': 30,
  'min_child_samples': 12,
  'max_depth': 5,
  'learning_rate': 0.37500000000000006,
  'lambda_l1': 1.3617102824283553,
  'lambda_l2': 0.1530279986362752},
 'set3': {'boosting_type': 'goss',
  'num_iterations': 150,
  'subsample': 0.5,
  'num_leaves': 15,
  'min_child_samples': 6,
  'max_depth': 13,
  'learning_rate': 0.425,
  'lambda_l1': 0.7320668921292203,
  'lambda_l2': 0.0693182530107957},
 'set4': {

## Data Processing

In [167]:
##### test different time series data on boosting models #####
#####for runhist dataset#####
# bad = pd.read_csv('run_bad_types.csv').iloc[:, 1:]
# Bad_Types = {bad.cb[i]:i for i in range (len(bad))}
# print('Total bad types:', len(bad))

def _load_test_set(csv_name) :
    """Read one test CSV, drop its first two columns and split it on the 'GB' label.

    Returns the trimmed frame followed by the two objects produced by
    label_divide (used below as features and labels).
    """
    frame = pd.read_csv(csv_name).iloc[:, 2:]
    features, labels = label_divide(frame, None, 'GB', train_only = True)
    return frame, features, labels

data_dict = multiple_set(num_set = 10)
trainset_x, trainset_y = train_set(data_dict, num_set = 10, label = 'GB')

# merge with module
test_m23, testm23_x, testm23_y = _load_test_set('array_m2_m3.csv')
test_m45, testm45_x, testm45_y = _load_test_set('array_m4_m5.csv')
test_m67, testm67_x, testm67_y = _load_test_set('array_m6_m7.csv')

# merge with DataSet
test_m23_dset, dsetm23_x, dsetm23_y = _load_test_set('array_m23_dset.csv')
test_m45_dset, dsetm45_x, dsetm45_y = _load_test_set('array_m45_dset.csv')
test_m67_dset, dsetm67_x, dsetm67_y = _load_test_set('array_m67_dset.csv')

# Report the feature-matrix shape of every test set, numbered in loading order.
_test_features = [testm23_x, testm45_x, testm67_x, dsetm23_x, dsetm45_x, dsetm67_x]
for _idx, _features in enumerate(_test_features, start = 1) :
    print(f'Dimension of run test_{_idx}:', _features.shape)

Dimension of dataset 0 : (120255, 176)  balance ratio: 1063.20354
Dimension of dataset 1 : (2446, 176)  balance ratio: 1.0
Dimension of dataset 2 : (2158, 176)  balance ratio: 1.0
Dimension of dataset 3 : (2626, 176)  balance ratio: 1.0
Dimension of dataset 4 : (2402, 176)  balance ratio: 1.0
Dimension of dataset 5 : (2242, 176)  balance ratio: 1.01619
Dimension of dataset 6 : (2210, 176)  balance ratio: 1.0463
Dimension of dataset 7 : (2260, 176)  balance ratio: 1.0
Dimension of dataset 8 : (2260, 176)  balance ratio: 1.0
Dimension of dataset 9 : (1243, 176)  balance ratio: 10.0

 10 datasets are loaded.

Labels of  10 datasets are divided.
Dimension of run test_1: (99400, 175)
Dimension of run test_2: (106392, 175)
Dimension of run test_3: (51035, 175)
Dimension of run test_4: (47725, 175)
Dimension of run test_5: (67071, 175)
Dimension of run test_6: (33409, 175)


In [171]:
# Load the 'ohno.csv' evaluation data, dropping its first two columns,
# then split it on the 'GB' label into ohno_x / ohno_y (used as the test set by the model runs below).
ohno = pd.read_csv('ohno.csv').iloc[:, 2:]
ohno_x, ohno_y = label_divide(ohno, None, 'GB', train_only = True)

In [259]:
# Run the XGBoost classifier over the 10 training sets, evaluated on the ohno data;
# the cell output reports precision, recall and aging rate per dataset.
# NOTE(review): the only temp_dict definition in this notebook loads the LightGBMR hyperparameter file,
# and it was executed after this cell (In [284] vs. In [259]). On a fresh Restart & Run All this call would
# receive LightGBM regressor hyperparameters — confirm which hyperparameters were intended here.
xgb_tableC = runall_XGBoostC(10, trainset_x, ohno_x, trainset_y, ohno_y, temp_dict, record_bad = False)


 Dataset 0:




Precision: 0.0 
Recall: 0.0 
Aging Rate: 1.7596959245442388e-05

 Dataset 1:
Precision: 0.00036735443580481234 
Recall: 0.7058823529411765 
Aging Rate: 0.5748222707116211

 Dataset 2:
Precision: 0.00025687130747495504 
Recall: 0.7058823529411765 
Aging Rate: 0.8220595481100865

 Dataset 3:
Precision: 0.0003358038905279317 
Recall: 0.8235294117647058 
Aging Rate: 0.7336348279017386

 Dataset 4:
Precision: 0.00034277879341864715 
Recall: 0.7058823529411765 
Aging Rate: 0.6160343492644471

 Dataset 5:
Precision: 0.00037288587028898654 
Recall: 0.8235294117647058 
Aging Rate: 0.6606778348701344

 Dataset 6:
Precision: 0.000301229564312512 
Recall: 0.6470588235294118 
Aging Rate: 0.6425881607658197

 Dataset 7:
Precision: 0.00036263636638360884 
Recall: 0.7058823529411765 
Aging Rate: 0.5823009783909341

 Dataset 8:
Precision: 0.0003639315808627978 
Recall: 0.7647058823529411 
Aging Rate: 0.6285809812064476

 Dataset 9:
Precision: 0.0003161737230893047 
Recall: 0.6470588235294118 
Aging Rat

In [269]:
# Run the XGBoost regressor over the 10 training sets, evaluated on the ohno data; the cell output
# reports a "Best Threshold" plus recall, precision and aging rate per dataset.
# Presumably the threshold is chosen so that recall reaches at least 0.8 — verify in runall_XGBoostR.
# NOTE(review): the only temp_dict definition in this notebook loads the LightGBMR hyperparameter file,
# and it was executed after this cell (In [284] vs. In [269]) — confirm which hyperparameters were intended here.
xgb_pr, xgb_tableR = runall_XGBoostR(10, trainset_x, ohno_x, trainset_y, ohno_y, temp_dict, thres_target = 'Recall', 
                                      threshold = 0.8, record_bad = False)


 Dataset 0:
Best Threshold: -0.00752921961247921 

Recall: [0.82352941] ,   Precision: [0.00026384] ,   Aging Rate: [0.93374745]

 Dataset 1:
Best Threshold: 0.465803861618042 

Recall: [0.82352941] ,   Precision: [0.00044536] ,   Aging Rate: [0.55316041]

 Dataset 2:
Best Threshold: 0.474698930978775 

Recall: [0.82352941] ,   Precision: [0.00033404] ,   Aging Rate: [0.73750616]

 Dataset 3:
Best Threshold: 0.29873254895210266 

Recall: [0.82352941] ,   Precision: [0.00029693] ,   Aging Rate: [0.82967903]

 Dataset 4:
Best Threshold: 0.416493684053421 

Recall: [0.82352941] ,   Precision: [0.00041382] ,   Aging Rate: [0.59532273]

 Dataset 5:
Best Threshold: 0.400826632976532 

Recall: [0.82352941] ,   Precision: [0.00040252] ,   Aging Rate: [0.61203984]

 Dataset 6:
Best Threshold: 0.32208555936813354 

Recall: [0.82352941] ,   Precision: [0.00032903] ,   Aging Rate: [0.74873302]

 Dataset 7:
Best Threshold: 0.4086665213108063 

Recall: [0.82352941] ,   Precision: [0.0003482] ,   Ag

In [277]:
# Run the LightGBM classifier over the 10 training sets, evaluated on the ohno data;
# the cell output reports precision, recall and aging rate per dataset.
# NOTE(review): the only temp_dict definition in this notebook loads the LightGBMR (regressor) file,
# and it was executed after this cell (In [284] vs. In [277]) — confirm that classifier hyperparameters
# are what gets passed on a fresh Restart & Run All.
lgb_tableC = runall_LightGBMC(10, trainset_x, ohno_x, trainset_y, ohno_y, temp_dict, record_bad = False)


 Dataset 0:




Precision: 0.0 
Recall: 0.0 
Aging Rate: 3.5193918490884776e-05

 Dataset 1:
Precision: 0.00033466268789915496 
Recall: 0.7058823529411765 
Aging Rate: 0.6309741676638277

 Dataset 2:




Precision: 0.0002897387893376126 
Recall: 0.7647058823529411 
Aging Rate: 0.7895403674245091

 Dataset 3:




Precision: 0.00032256463699072006 
Recall: 0.7647058823529411 
Aging Rate: 0.7091926515098191

 Dataset 4:




Precision: 0.0004124683283247893 
Recall: 0.8235294117647058 
Aging Rate: 0.5972759907088055

 Dataset 5:




Precision: 0.0003954011801204453 
Recall: 0.7647058823529411 
Aging Rate: 0.5785528260716548

 Dataset 6:
Precision: 0.00034027850486860017 
Recall: 0.7647058823529411 
Aging Rate: 0.672274231012881

 Dataset 7:




Precision: 0.0003411275817155616 
Recall: 0.6470588235294118 
Aging Rate: 0.5674315478285352

 Dataset 8:
Precision: 0.000362844702467344 
Recall: 0.7058823529411765 
Aging Rate: 0.5819666361652707

 Dataset 9:




Precision: 0.0003239041243791838 
Recall: 0.7058823529411765 
Aging Rate: 0.6519321461251496


In [287]:
# Run the LightGBM regressor over the 10 training sets, evaluated on the ohno data; the cell output
# reports a "Best Threshold" plus recall, precision and aging rate per dataset.
# Presumably the threshold is chosen so that recall reaches at least 0.8 — verify in runall_LightGBMR.
# temp_dict is the LightGBMR hyperparameter dict loaded earlier, in which 'set9' is a copy of 'set8'.
lgb_pr, lgb_tableR = runall_LightGBMR(10, trainset_x, ohno_x, trainset_y, ohno_y, temp_dict, thres_target = 'Recall', 
                                      threshold = 0.8, record_bad = False)


 Dataset 0:




Best Threshold: -0.00010362633828748315 

Recall: [0.82352941] ,   Precision: [0.00031831] ,   Aging Rate: [0.77394946]

 Dataset 1:
Best Threshold: 0.41300879302096627 

Recall: [0.82352941] ,   Precision: [0.00042091] ,   Aging Rate: [0.58529246]

 Dataset 2:
Best Threshold: 0.3919229333414923 

Recall: [0.82352941] ,   Precision: [0.00032627] ,   Aging Rate: [0.75506792]

 Dataset 3:
Best Threshold: 0.17065852609249646 

Recall: [0.82352941] ,   Precision: [0.00027855] ,   Aging Rate: [0.88444077]

 Dataset 4:
Best Threshold: 0.5682245108103473 

Recall: [0.82352941] ,   Precision: [0.00052323] ,   Aging Rate: [0.47084184]

 Dataset 5:
Best Threshold: 0.22101899277914808 

Recall: [0.82352941] ,   Precision: [0.00032504] ,   Aging Rate: [0.75793623]

 Dataset 6:
Best Threshold: 0.3397129800365034 

Recall: [0.82352941] ,   Precision: [0.00035681] ,   Aging Rate: [0.69045189]

 Dataset 7:
Best Threshold: 0.29657056403514664 

Recall: [0.82352941] ,   Precision: [0.00035257] ,   Aging

In [289]:
# Bare name: only displays the imported function's signature; nothing is computed.
# NOTE(review): looks like leftover introspection — consider removing this cell.
Balance_Ratio

<function Dataset_Construction.Balance_Ratio(data, label='GB', n=5)>