In [1]:
import os
import time
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import pickle

from sklearn.ensemble import AdaBoostClassifier 
from sklearn.tree import DecisionTreeClassifier
import optuna
from sklearn.model_selection import train_test_split

from Dataset_Construction import Balance_Ratio
from Sampling import label_divide
from Aging_Score import score1
from XGBoost import optuna_history

os.chdir('C:/Users/user/Desktop/Darui_R08621110')  
os.getcwd()

'C:\\Users\\user\\Desktop\\Darui_R08621110'

### Load multiple datasets

In [2]:
def multiple_set(num_set):
    """Read `dataset_0.csv` ... `dataset_{num_set-1}.csv` into a dict keyed 'set0', 'set1', ...

    The first column of every CSV is dropped. The shape and the value returned by
    `Balance_Ratio` are printed for each dataset as it is loaded.

    Parameters
    ----------
    num_set : int
        Number of consecutive dataset files to read from the working directory.

    Returns
    -------
    dict
        Maps f'set{i}' to the loaded DataFrame.
    """
    data_dict = {}
    for idx in range(num_set):
        frame = pd.read_csv(f'dataset_{idx}.csv').iloc[:, 1:]
        data_dict[f'set{idx}'] = frame
        print('Dimension of dataset', idx, ':', frame.shape, ' balance ratio:', Balance_Ratio(frame))

    print('\n', num_set, 'datasets are loaded.')
    return data_dict


def train_set(data_dict, num_set, label = 'GB'):
    """Split every dataset in `data_dict` into features and labels via `label_divide`.

    Parameters
    ----------
    data_dict : dict
        Datasets keyed 'set0' ... f'set{num_set-1}'.
    num_set : int
        Number of datasets to process.
    label : str
        Label column name forwarded to `label_divide`.

    Returns
    -------
    (dict, dict)
        Feature and label containers, both keyed like `data_dict`.
    """
    trainset_x, trainset_y = {}, {}

    for key in (f'set{k}' for k in range(num_set)):
        features, labels = label_divide(data_dict[key], None, label, train_only = True)
        trainset_x[key] = features
        trainset_y[key] = labels

    print('\nLabels of ', num_set, 'datasets are divided.')
    return trainset_x, trainset_y

### Boosting Model

In [3]:
def AdaBoostC(train_x, test_x, train_y, test_y, config):
    """Fit an AdaBoost classifier and return its test predictions next to the true labels.

    Parameters
    ----------
    train_x, train_y
        Training features and labels passed to `fit`.
    test_x, test_y
        Test features passed to `predict`, and the matching true labels.
    config : dict
        Keyword arguments for `AdaBoostClassifier`.

    Returns
    -------
    pd.DataFrame
        Columns 'truth' (test_y) and 'predict' (model output).
    """
    booster = AdaBoostClassifier(**config)
    booster.fit(train_x, train_y)

    return pd.DataFrame({'truth': test_y, 'predict': booster.predict(test_x)})

### Recall & Precision for Classifier

In [4]:
def cf_matrix(predict, train_y):
    """Summarise binary predictions as a one-row table of confusion-matrix metrics.

    Parameters
    ----------
    predict : pd.DataFrame
        Must contain the columns 'truth' and 'predict' holding 0/1 labels.
    train_y : pd.Series or np.ndarray
        Labels used only for the 'Balance Ratio', 'Train_OK' and 'Train_NG' columns;
        values below 0.5 are counted as OK, everything else as NG.

    Returns
    -------
    pd.DataFrame
        A single row (index 0) with the columns Balance Ratio, Train_OK, Train_NG,
        TP, FP, FN, TN, Precision, Recall, Aging Rate, Efficiency and Score.

    Notes
    -----
    Ratios whose denominator is zero are reported as 0 (precision, recall, aging rate,
    efficiency) instead of producing nan/inf with a RuntimeWarning; the balance ratio
    is reported as inf when `train_y` contains no NG sample.
    """
    # confusion matrix
    truth = predict['truth']
    guess = predict['predict']
    agree = guess == truth
    TP = (agree & (guess == 1)).sum()
    TN = (agree & (guess == 0)).sum()
    FP = (guess > truth).sum()
    FN = (guess < truth).sum()

    #balance ratio, train OK & NG
    train_OK = (train_y < 0.5).sum()
    train_NG = len(train_y) - train_OK
    br = train_OK / train_NG if train_NG != 0 else float('inf')

    #precision, recall, aging rate, efficiency, score
    num_pd = TP + FP          # samples flagged as bad
    num_bad = TP + FN         # samples that really are bad
    num_all = num_pd + FN + TN

    precision = TP / num_pd if num_pd != 0 else 0
    recall = TP / num_bad if num_bad != 0 else 0.0
    ar = num_pd / num_all if num_all != 0 else 0.0
    eff = recall / ar if ar != 0 else 0.0
    score = score1(recall, ar)

    table = pd.Series({'Balance Ratio': br, 'Train_OK': train_OK, 'Train_NG': train_NG, 'TP': TP, 'FP': FP, 'FN': FN, \
                       'TN': TN, 'Precision': precision, 'Recall': recall, 'Aging Rate': ar, 'Efficiency': eff, 'Score': score})
    table = pd.DataFrame(table).T

    print('Precision:', precision, '\nRecall:', recall, '\nAging Rate:', ar)
    return  table


def print_badC(predict, test_x, Bad_Types, threshold = 1):
    """Report which bad types were detected and which were missed.

    Parameters
    ----------
    predict : pd.DataFrame
        Columns 'truth' and 'predict'; row k must describe row k of `test_x`.
    test_x : pd.DataFrame
        Test features. A row's values, cast to str and joined with '_', form the
        lookup key into `Bad_Types`.
    Bad_Types : dict
        Maps the joined feature key to a bad-type ID.
    threshold : number
        A truly bad row counts as found when its prediction is >= threshold.

    Returns
    -------
    pd.DataFrame
        One row (index 0) with 'Bad_Found' and 'Bad_Missed' (sets of IDs) and
        'Detect Ratio' = found / (found + missed), nan when there is no bad row.

    Raises
    ------
    KeyError
        If a bad row's feature key is not present in `Bad_Types`.
    """
    features = test_x.values
    is_bad = (predict['truth'] == 1).values
    is_found = is_bad & (predict['predict'] >= threshold).values
    is_missed = is_bad & (predict['predict'] < threshold).values

    def bad_type_ids(row_mask):
        # Rows are addressed by position, so `predict` may carry any index labels.
        ids = []
        for position in row_mask.nonzero()[0]:
            key = '_'.join(pd.Series(features[position]).astype(str))
            ids.append(Bad_Types[key])
        ids.sort()
        return ids

    Bad = bad_type_ids(is_found)
    print('Types of Bad found:', Bad)

    Bad_miss = bad_type_ids(is_missed)
    print('Types of Bad not found:', Bad_miss)

    bad_table = pd.Series({'Bad_Found': set(Bad), 'Bad_Missed': set(Bad_miss)})
    bad_table = pd.DataFrame(bad_table).T
    num_bad = len(Bad) + len(Bad_miss)
    # the ratio is undefined when the test set holds no bad sample
    bad_table['Detect Ratio'] = len(Bad) / num_bad if num_bad else float('nan')

    return bad_table

### Run all datasets

In [5]:
def runall_AdaBoostC(num_set, trainset_x, test_x, trainset_y, test_y, config, record_bad = True, bad_types = None):
    """Train and evaluate one AdaBoost classifier per training set on a shared test set.

    Parameters
    ----------
    num_set : int
        Number of training sets; keys 'set0' ... f'set{num_set-1}' are used.
    trainset_x, trainset_y : dict
        Training features and labels per dataset key.
    test_x, test_y
        Test features and labels shared by all runs.
    config : dict
        Either one hyperparameter dict applied to every dataset, or a dict of such
        dicts keyed by dataset key. 'max_depth' is required; 'learning_rate' and
        'n_estimators' are forwarded when present.
    record_bad : bool
        When True, also collect the bad-type table from `print_badC`.
    bad_types : dict, optional
        Key-to-ID mapping for `print_badC`. Defaults to the module-level `Bad_Types`.

    Returns
    -------
    pd.DataFrame or (pd.DataFrame, pd.DataFrame)
        The metric table (one row per dataset, labelled 'dataset i') and, when
        `record_bad` is True, the bad-type table.
    """
    if record_bad and bad_types is None:
        # fall back to the notebook-level mapping, as the original implementation did
        bad_types = Bad_Types

    # a nested config holds one hyperparameter dict per dataset key
    judge = list(config.keys())[0]
    per_dataset = isinstance(config[judge], dict)

    tables = []
    bad_tables = []
    for i in tqdm(range(num_set)):
        print('\n', f'Dataset {i}:')

        best_config = config[f'set{i}'] if per_dataset else config

        # separate the decision tree hyperparameter and adaboost hyperparameter
        # NOTE(review): newer scikit-learn releases name this argument `estimator` - confirm against the installed version
        boost_param = {key: best_config[key] for key in ['learning_rate', 'n_estimators'] if key in best_config}
        boost_param['base_estimator'] = DecisionTreeClassifier(max_depth = best_config['max_depth'])

        result = AdaBoostC(trainset_x[f'set{i}'], test_x, trainset_y[f'set{i}'], test_y, boost_param)
        table = cf_matrix(result, trainset_y[f'set{i}'])
        tables.append(table.rename(index = {0: f'dataset {i}'}))

        if record_bad:
            bad_table = print_badC(result, test_x, bad_types)
            bad_tables.append(bad_table.rename(index = {0: f'dataset {i}'}))

    # concatenate once instead of growing the frames inside the loop
    table_set = pd.concat(tables) if tables else pd.DataFrame()
    bad_set = pd.concat(bad_tables) if bad_tables else pd.DataFrame()

    if record_bad:
        return table_set, bad_set
    else:
        return table_set

### Plot all datasets

In [6]:
def bad_plot(bad_set):
    """Draw and save a heat map of which bad types each dataset's model detected.

    Parameters
    ----------
    bad_set : pd.DataFrame
        One row per dataset; the first column holds the set of bad-type IDs found
        and the second column the set of IDs missed (as built by `print_badC`).

    Side effects
    ------------
    Saves 'Bad Types Detection across All Datasets.jpg' in the working directory
    and shows the figure.
    """
    found_sets = bad_set.iloc[:, 0]
    missed_sets = bad_set.iloc[:, 1]

    # record all bad types: union over every row, so no particular row label is
    # required and a type that is both found and missed appears only once
    bad_list = sorted(set().union(*found_sets, *missed_sets))

    # 1 where the dataset's model found the bad type, 0 otherwise
    bad_array = np.zeros([len(bad_set), len(bad_list)])
    for j, found in enumerate(found_sets):
        for i, bad_type in enumerate(bad_list):
            if bad_type in found:
                bad_array[j, i] = 1

    bad_df = pd.DataFrame(bad_array)
    bad_df.columns = bad_list

    plt.pcolor(bad_df, cmap = 'Reds')
    plt.title("Bad Types Detection across All Datasets")
    plt.yticks(np.arange(0.5, len(bad_df.index), 1), bad_df.index)
    plt.xticks(np.arange(0.5, len(bad_df.columns), 1), bad_df.columns.astype(int))
    plt.xlabel("ID of Bad Types", size = 12)
    plt.ylabel("Dataset", size = 12)

    plt.savefig('Bad Types Detection across All Datasets.jpg')
    plt.show()
    
    
def line_chart(table_set, title):
    """Plot aging rate, recall and precision of every dataset in one chart.

    Aging rate and recall share the left axis; precision uses a twin right axis.

    Parameters
    ----------
    table_set : pd.DataFrame
        One row per dataset with the columns 'Aging Rate', 'Recall' and 'Precision'.
    title : str
        Figure title.
    """
    # Newer matplotlib releases ship the seaborn sheets under a 'seaborn-v0_8-' prefix;
    # use whichever name this installation provides.
    for style_name in ('seaborn-v0_8-dark-palette', 'seaborn-dark-palette'):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break

    x = list(range(len(table_set)))
    fig, ax1 = plt.subplots(figsize = (15,8))
    ax2 = ax1.twinx()

    plt.title(title, fontsize = 16)
    # one tick per plotted dataset (was hard-coded to 1..12 while the data sits at 0..len-1)
    plt.xticks(x)
    ax1.plot(x, table_set['Aging Rate'], 'b--', linewidth = 1, label = 'Aging Rate')
    ax1.plot(x, table_set['Aging Rate'], 'b.', markersize = 15)
    ax1.plot(x, table_set['Recall'], 'r-', linewidth = 1, label = 'Recall')
    ax1.plot(x, table_set['Recall'], 'r.', markersize = 15)
    ax2.plot(x, table_set['Precision'], 'g--', linewidth = 1, label = 'Precision')
    ax2.plot(x, table_set['Precision'], 'g.', markersize = 15)
    ax1.set_xlabel('\nDataset', fontsize = 12)
    ax1.set_ylabel('Recall & Aging Rate', color = 'b')
    ax2.set_ylabel('Precision', color = 'g')

    ax1.legend(loc = 'upper left', frameon = False)
    ax2.legend(loc = 'upper right', frameon = False)

    #plt.savefig(f'{title}.jpg')
    plt.show()

## Data Processing


In [7]:
### bad types ###
# Build the lookup from each value of the `cb` column to its row position,
# which serves as the bad-type ID passed to print_badC.
bad = pd.read_csv('event/Bad_Types.csv').iloc[:, 1:]
Bad_Types = {bad.cb[i]:i for i in range (len(bad))}
print('Total bad types:', len(bad))

### single dataset ###
# The first two CSV columns are dropped (presumably index/identifier columns - TODO confirm).
test = pd.read_csv('event/TestingSet_0.csv').iloc[:, 2:]
train = pd.read_csv('event/TrainingSet_new.csv').iloc[:, 2:]
print('\ntraining data:', train.shape, '\nBalance Ratio:', Balance_Ratio(train))
print('\ntesting data:', test.shape, '\nBalance Ratio:', Balance_Ratio(test), '\n')

# Split both frames into features and the 'GB' label.
train_x, train_y, test_x, test_y = label_divide(train, test, 'GB')

### multiple dataset ###
# Load dataset_0.csv ... dataset_9.csv and split each into features / labels.
data_dict = multiple_set(num_set = 10)
trainset_x, trainset_y = train_set(data_dict, num_set = 10, label = 'GB')
# NOTE: test_x / test_y are re-assigned here, replacing the values from the single-dataset split above.
test_x, test_y = label_divide(test, None, 'GB', train_only = True)


##### for runhist dataset #####
# Alternative bad-type mapping for the runhist data (disabled):
# bad = pd.read_csv('run_bad_types.csv').iloc[:, 1:]
# Bad_Types = {bad.cb[i]:i for i in range (len(bad))}
# print('Total bad types:', len(bad))

# Test set used by the classifier run below.
run_test = pd.read_csv('test_runhist.csv').iloc[:, 2:]
run_test_x, run_test_y = label_divide(run_test, None, 'GB', train_only = True)
print('\n', 'Dimension of run test:', run_test.shape)

Total bad types: 62

training data: (77138, 83) 
Balance Ratio: 18.17902

testing data: (55903, 83) 
Balance Ratio: 3104.72222 

Dimension of dataset 0 : (157140, 234)  balance ratio: 630.08434
Dimension of dataset 1 : (4904, 234)  balance ratio: 1.0
Dimension of dataset 2 : (4648, 234)  balance ratio: 1.0
Dimension of dataset 3 : (5196, 234)  balance ratio: 1.0
Dimension of dataset 4 : (4706, 234)  balance ratio: 1.0
Dimension of dataset 5 : (4985, 234)  balance ratio: 0.998
Dimension of dataset 6 : (4816, 234)  balance ratio: 1.07051
Dimension of dataset 7 : (4980, 234)  balance ratio: 1.0
Dimension of dataset 8 : (4980, 234)  balance ratio: 1.0
Dimension of dataset 9 : (1079, 234)  balance ratio: 3.33333

 10 datasets are loaded.

Labels of  10 datasets are divided.

 Dimension of run test: (48650, 234)


### Classifier

In [None]:
# NOTE(review): `best_paramC` is only assigned by the all_optuna(...) cell in the
# Optimization section further down, so this cell fails on a fresh kernel run top to
# bottom. Run the tuning cell (or load the pickled parameters it writes) first.
# Evaluate all 10 training sets on the runhist test set without bad-type recording.
#table_set, bad_set = runall_AdaBoostC(9, trainset_x, test_x, trainset_y, test_y)
table_set = runall_AdaBoostC(10, trainset_x, run_test_x, trainset_y, run_test_y, best_paramC, record_bad = False)
line_chart(table_set, title = 'AdaBoost Classifier')
#bad_plot(bad_set)

In [None]:
# Display the per-dataset metric table produced by runall_AdaBoostC.
table_set

## Optimization

### Optuna

In [8]:
def objective_creator(train_data, mode, num_valid = 3) :
    """Build an Optuna objective that tunes AdaBoost on `train_data`.

    Parameters
    ----------
    train_data : pd.DataFrame
        Dataset containing the 'GB' label column; split by `label_divide`.
    mode : str
        Only 'C' (classifier) is supported.
    num_valid : int
        Number of random 75/25 train/validation splits averaged per trial.

    Returns
    -------
    callable
        `objective(trial)` returning the mean of `recall - 0.1 * aging rate`
        over the validation splits (to be maximised).

    Raises
    ------
    ValueError
        If `mode` is not 'C'. Previously such a mode evaluated nothing and the
        objective returned the mean of an empty list (nan).
    """
    if mode != 'C':
        raise ValueError(f"unsupported mode {mode!r}; only 'C' is implemented")

    def objective(trial) :

        tree_param = {
            'max_depth': trial.suggest_int('max_depth', 1, 3)
        }

        param = {
            'base_estimator': DecisionTreeClassifier(**tree_param),
            'n_estimators': trial.suggest_int('n_estimators', 100, 300, step = 50),
            'learning_rate': trial.suggest_float('learning_rate', 0.025, 0.825, step = 0.05),
        }

        # the feature/label split does not depend on the fold, so do it once per trial
        data_x, data_y = label_divide(train_data, None, 'GB', train_only = True)

        result_list = []
        for _ in range(num_valid):

            # unseeded on purpose: every fold draws a fresh random split
            train_x, valid_x, train_y, valid_y = train_test_split(data_x, data_y, test_size = 0.25)

            result = AdaBoostC(train_x, valid_x, train_y, valid_y, param)
            # pass the training labels so the table's balance-ratio columns describe the training fold
            table = cf_matrix(result, train_y)
            recall = table['Recall'].iloc[0]
            aging = table['Aging Rate'].iloc[0]

            result_list.append(recall - 0.1*aging)

        return np.mean(result_list)

    return objective


def all_optuna(num_set, all_data, mode, TPE_multi, n_iter, num_valid = 3, return_addition = True) :
    """Run one Optuna study per dataset and pickle the best hyperparameters.

    Parameters
    ----------
    num_set : int
        Number of datasets; keys 'set0' ... f'set{num_set-1}' of `all_data` are tuned.
    all_data : dict
        Datasets keyed by f'set{i}'.
    mode : str
        Forwarded to `objective_creator` and used in the output file name.
    TPE_multi : bool
        Whether the TPE sampler runs in multivariate mode.
    n_iter : int
        Number of trials per study.
    num_valid : int
        Validation splits per trial, forwarded to `objective_creator`.
    return_addition : bool
        When True, also return the recorded values of every trial.

    Returns
    -------
    dict or (dict, dict)
        Best parameters per dataset key and, when `return_addition` is True, the list
        of `trial.values` of each study per dataset key.

    Side effects
    ------------
    Writes the best parameters with pickle to
    'runhist_array_m2m5_AdaBoost{mode}_{multi_mode}-TPE_{n_iter}.data'.
    """
    best_param = {}
    #all_study = {}
    all_score = {}
    for i in tqdm(range(num_set)) :

        ##### define objective function and change optimized target dataset in each loop #####
        # read from the `all_data` argument (was the notebook-level global `data_dict`)
        objective = objective_creator(train_data = all_data[f'set{i}'], mode = mode, num_valid = num_valid)

        ##### optimize one dataset in each loop #####
        print(f'Dataset{i} :')

        study = optuna.create_study(sampler = optuna.samplers.TPESampler(multivariate = TPE_multi),
                                       direction = 'maximize')
        study.optimize(objective, n_trials = n_iter, show_progress_bar = True, gc_after_trial = True)
        #n_trials or timeout
        best_param[f'set{i}'] = study.best_trial.params

        ##### return score and entire params for score plot or feature importance
        if return_addition :
            #all_study[f'set{i}'] = study
            all_score[f'set{i}'] = [trial.values for trial in study.trials]

        print(f"Sampler is {study.sampler.__class__.__name__}")

    ##### store the best hyperparameters #####
    multi_mode = 'multivariate' if TPE_multi else 'univariate'
    with open(f'runhist_array_m2m5_AdaBoost{mode}_{multi_mode}-TPE_{n_iter}.data', 'wb') as f:
        pickle.dump(best_param, f)

    if return_addition :
        return best_param, all_score#, all_study
    else :
        return best_param

In [None]:
# Tune all 10 datasets with 50 univariate-TPE trials each; all_optuna also pickles the
# best parameters. Long-running: the log timestamps below show individual trials
# taking from several minutes up to about half an hour on dataset 0.
best_paramC, all_scoreC = all_optuna(num_set = 10, all_data = data_dict, mode = 'C', TPE_multi = False, n_iter = 50)

  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2021-08-15 22:30:03,929][0m A new study created in memory with name: no-name-1f76f70d-b888-4932-a6da-a6bab957deb7[0m


Dataset0 :


  self._init_valid()


  0%|          | 0/50 [00:00<?, ?it/s]

  eff = r/ag


Precision: 0 
Recall: 0.0 
Aging Rate: 0.0
Precision: 0.0 
Recall: 0.0 
Aging Rate: 2.545500827287769e-05


  eff = r/ag


Precision: 0 
Recall: 0.0 
Aging Rate: 0.0
[32m[I 2021-08-15 22:35:35,737][0m Trial 0 finished with value: -8.485002757625896e-07 and parameters: {'max_depth': 1, 'n_estimators': 250, 'learning_rate': 0.825}. Best is trial 0 with value: -8.485002757625896e-07.[0m
Precision: 0.8813559322033898 
Recall: 0.7323943661971831 
Aging Rate: 0.0015018454880997836
Precision: 0.9361702127659575 
Recall: 0.7333333333333333 
Aging Rate: 0.0011963853888252514
Precision: 0.8 
Recall: 0.6896551724137931 
Aging Rate: 0.0012727504136438843
[32m[I 2021-08-15 22:48:32,648][0m Trial 1 finished with value: 0.7183285912717509 and parameters: {'max_depth': 3, 'n_estimators': 300, 'learning_rate': 0.675}. Best is trial 1 with value: 0.7183285912717509.[0m


  eff = r/ag


Precision: 0 
Recall: 0.0 
Aging Rate: 0.0


  eff = r/ag


Precision: 0 
Recall: 0.0 
Aging Rate: 0.0


  eff = r/ag


Precision: 0 
Recall: 0.0 
Aging Rate: 0.0
[32m[I 2021-08-15 22:55:36,105][0m Trial 2 finished with value: 0.0 and parameters: {'max_depth': 1, 'n_estimators': 300, 'learning_rate': 0.525}. Best is trial 1 with value: 0.7183285912717509.[0m
Precision: 0.7391304347826086 
Recall: 0.25757575757575757 
Aging Rate: 0.0005854651902761868
Precision: 0.9722222222222222 
Recall: 0.4861111111111111 
Aging Rate: 0.0009163802978235968
Precision: 0.75 
Recall: 0.3 
Aging Rate: 0.0006109201985490645
[32m[I 2021-08-15 23:11:56,325][0m Trial 3 finished with value: 0.3478251973727346 and parameters: {'max_depth': 3, 'n_estimators': 150, 'learning_rate': 0.37500000000000006}. Best is trial 1 with value: 0.7183285912717509.[0m
Precision: 0.7647058823529411 
Recall: 0.19402985074626866 
Aging Rate: 0.0004327351406389207
Precision: 0.875 
Recall: 0.14893617021276595 
Aging Rate: 0.00020364006618302151
Precision: 0.9 
Recall: 0.14285714285714285 
Aging Rate: 0.0002545500827287769
[32m[I 2021-08-15 2

  eff = r/ag


Precision: 0 
Recall: 0.0 
Aging Rate: 0.0


  eff = r/ag


Precision: 0 
Recall: 0.0 
Aging Rate: 0.0


  eff = r/ag


Precision: 0 
Recall: 0.0 
Aging Rate: 0.0
[32m[I 2021-08-16 00:10:17,742][0m Trial 9 finished with value: 0.0 and parameters: {'max_depth': 1, 'n_estimators': 150, 'learning_rate': 0.6250000000000001}. Best is trial 1 with value: 0.7183285912717509.[0m
Precision: 1.0 
Recall: 0.11764705882352941 
Aging Rate: 0.00020364006618302151
Precision: 1.0 
Recall: 0.07692307692307693 
Aging Rate: 0.00012727504136438844
Precision: 1.0 
Recall: 0.09230769230769231 
Aging Rate: 0.00015273004963726612
[32m[I 2021-08-16 00:35:50,251][0m Trial 10 finished with value: 0.09560982117952672 and parameters: {'max_depth': 3, 'n_estimators': 250, 'learning_rate': 0.07500000000000001}. Best is trial 1 with value: 0.7183285912717509.[0m
Precision: 0.9183673469387755 
Recall: 0.8035714285714286 
Aging Rate: 0.0012472954053710068
Precision: 0.8125 
Recall: 0.65 
Aging Rate: 0.001221840397098129
Precision: 0.9130434782608695 
Recall: 0.75 
Aging Rate: 0.0011709303805523736
[32m[I 2021-08-16 01:06:25,262]

Precision: 0.8421052631578947 
Recall: 0.2807017543859649 
Aging Rate: 0.0004836451571846761
Precision: 0.52 
Recall: 0.25 
Aging Rate: 0.0006363752068219422
[32m[I 2021-08-16 06:22:05,443][0m Trial 26 finished with value: 0.26538285644268494 and parameters: {'max_depth': 2, 'n_estimators': 300, 'learning_rate': 0.675}. Best is trial 18 with value: 0.7802305954460881.[0m
Precision: 0.8604651162790697 
Recall: 0.6065573770491803 
Aging Rate: 0.0010945653557337407
Precision: 0.9130434782608695 
Recall: 0.6666666666666666 
Aging Rate: 0.0011709303805523736
Precision: 1.0 
Recall: 0.7333333333333333 
Aging Rate: 0.0011200203640066183
[32m[I 2021-08-16 06:48:30,707][0m Trial 27 finished with value: 0.6687396084797169 and parameters: {'max_depth': 3, 'n_estimators': 250, 'learning_rate': 0.775}. Best is trial 18 with value: 0.7802305954460881.[0m
Precision: 0.8372093023255814 
Recall: 0.5901639344262295 
Aging Rate: 0.0010945653557337407
Precision: 0.8727272727272727 
Recall: 0.7164179

  eff = r/ag


Precision: 0 
Recall: 0.0 
Aging Rate: 0.0


  eff = r/ag


Precision: 0 
Recall: 0.0 
Aging Rate: 0.0
[32m[I 2021-08-16 10:19:05,312][0m Trial 35 finished with value: -1.6970005515251793e-06 and parameters: {'max_depth': 1, 'n_estimators': 300, 'learning_rate': 0.825}. Best is trial 18 with value: 0.7802305954460881.[0m
Precision: 0.8297872340425532 
Recall: 0.609375 
Aging Rate: 0.0011963853888252514
Precision: 0.9272727272727272 
Recall: 0.7183098591549296 
Aging Rate: 0.0014000254550082729
Precision: 0.8305084745762712 
Recall: 0.8166666666666667 
Aging Rate: 0.0015018454880997836
[32m[I 2021-08-16 10:38:44,491][0m Trial 36 finished with value: 0.7146472333961343 and parameters: {'max_depth': 3, 'n_estimators': 300, 'learning_rate': 0.675}. Best is trial 18 with value: 0.7802305954460881.[0m
Precision: 0.875 
Recall: 0.3442622950819672 
Aging Rate: 0.0006109201985490645
Precision: 0.5714285714285714 
Recall: 0.2711864406779661 
Aging Rate: 0.0007127402316405753
Precision: 0.8095238095238095 
Recall: 0.2786885245901639 
Aging Rate: 0.0

  eff = r/ag


Precision: 0 
Recall: 0.0 
Aging Rate: 0.0


  eff = r/ag


Precision: 0 
Recall: 0.0 
Aging Rate: 0.0


  eff = r/ag


Precision: 0 
Recall: 0.0 
Aging Rate: 0.0
[32m[I 2021-08-16 11:17:11,680][0m Trial 40 finished with value: 0.0 and parameters: {'max_depth': 2, 'n_estimators': 300, 'learning_rate': 0.025}. Best is trial 18 with value: 0.7802305954460881.[0m


In [None]:
##### optimization history plot #####
# Requires best_paramC / all_scoreC from the all_optuna(...) cell above.
optuna_history(best_paramC, all_scoreC, model = 'AdaBoost Classifier')
            
##### best hyperparameter table #####
# One row per dataset key, one column per tuned hyperparameter.
param_table = pd.DataFrame(best_paramC).T
param_table