In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import pickle
import plotly

from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
import optuna
from sklearn.model_selection import train_test_split

from library.Data_Preprocessing import Balance_Ratio
from library.Imbalance_Sampling import label_divide
from library.Aging_Score_Contour import score1

# NOTE(review): hard-coded, machine-specific working directory. The CSV files read
# later in this notebook use relative paths, so they are resolved against this folder.
os.chdir('C:/Users/user/Desktop/Darui_R08621110')  
# Echo the working directory as the cell output to confirm the change took effect.
os.getcwd()

'C:\\Users\\user\\Desktop\\Darui_R08621110'

## 

### load multiple datasets

In [3]:
def multiple_month(month_list, num_set, filename = 'dataset'):
    """Load the resampled datasets of several months and split off their labels.

    For every month ``m`` in ``month_list`` the files ``m{m}_{filename}_{k}.csv``
    (k = 0 .. num_set-1) are read through ``multiple_set`` and then divided into
    features and labels through ``train_set``.

    Returns three dicts keyed by ``'m{m}'``: the loaded datasets, the feature sets
    and the label sets of each month.
    """
    datasets_by_month, features_by_month, labels_by_month = {}, {}, {}

    for month in month_list:
        key = f'm{month}'
        print(f'\nMonth {month}:\n')

        loaded = multiple_set(num_set = num_set, filename = f'{key}_{filename}')
        features, labels = train_set(loaded, num_set = num_set)

        datasets_by_month[key] = loaded
        features_by_month[key] = features
        labels_by_month[key] = labels

    return datasets_by_month, features_by_month, labels_by_month


def multiple_set(num_set, filename = 'dataset'):
    """Read ``{filename}_0.csv`` .. ``{filename}_{num_set-1}.csv`` into a dict.

    The first column of every file is dropped. The shape and the value returned by
    ``Balance_Ratio`` are printed for each dataset as it is loaded.

    Returns a dict mapping ``'set{k}'`` to the k-th DataFrame.
    """
    loaded = {}

    for set_id in range(num_set):
        frame = pd.read_csv(f'{filename}_{set_id}.csv').iloc[:, 1:]
        loaded[f'set{set_id}'] = frame
        print('Dimension of dataset', set_id, ':', frame.shape, ' balance ratio:', Balance_Ratio(frame))

    print('\n', num_set, 'datasets are loaded.')
    return loaded


def train_set(data_dict, num_set, label = 'GB'):
    """Divide every dataset in ``data_dict`` into a feature part and a label part.

    ``data_dict`` must hold the keys ``'set0'`` .. ``'set{num_set-1}'``. Each entry is
    passed to the project helper ``label_divide`` with ``train_only = True``; the two
    values it returns are stored as the features and the labels of that dataset.

    Returns two dicts (features, labels) using the same ``'set{k}'`` keys.
    """
    features, labels = {}, {}

    for key in (f'set{set_id}' for set_id in range(num_set)):
        features[key], labels[key] = label_divide(data_dict[key], None, label, train_only = True)

    print('\nLabels of ', num_set, 'datasets are divided.')
    return features, labels

### for classifier

In [4]:
def cf_matrix(predict, train_y):
    """Summarise binary predictions as a one-row table of confusion-matrix metrics.

    Parameters
    ----------
    predict : DataFrame with a 'predict' and a 'truth' column (0/1 values).
    train_y : array-like supporting ``< 0.5`` and ``.sum()`` (Series / ndarray);
        values below 0.5 are counted as OK, the rest as NG.

    Returns
    -------
    One-row DataFrame with the columns Balance Ratio, Train_OK, Train_NG, TP, FP,
    FN, TN, Precision, Recall, Aging Rate, Efficiency and Score.

    Every ratio is guarded against an empty denominator: precision, recall, aging
    rate and efficiency fall back to 0, the balance ratio to infinity.
    """
    truth = predict['truth']
    guess = predict['predict']

    # confusion matrix
    agree = guess == truth
    TP = (agree & (guess == 1)).sum()
    TN = (agree & (guess == 0)).sum()
    FP = (guess > truth).sum()
    FN = (guess < truth).sum()

    # balance ratio, train OK & NG (vectorised count instead of the builtin sum)
    train_OK = int((train_y < 0.5).sum())
    train_NG = len(train_y) - train_OK
    br = train_OK / train_NG if train_NG != 0 else np.inf

    # precision, recall, aging rate, efficiency, score
    num_pd = TP + FP                      # samples flagged as positive
    num_bad = TP + FN                     # samples that really are positive
    total = TP + FP + FN + TN

    precision = TP / num_pd if num_pd != 0 else 0
    # same guard as precision: no positive sample means recall is reported as 0, not NaN
    recall = TP / num_bad if num_bad != 0 else 0.0
    ar = num_pd / total if total != 0 else 0.0
    eff = recall / ar if ar != 0 else 0
    score = score1(recall, ar)

    table = pd.Series({'Balance Ratio': br, 'Train_OK': train_OK, 'Train_NG': train_NG, 'TP': TP, 'FP': FP, 'FN': FN, \
                       'TN': TN, 'Precision': precision, 'Recall': recall, 'Aging Rate': ar, 'Efficiency': eff, \
                       'Score': score})
    table = pd.DataFrame(table).T

    print('Precision:', precision, '\nRecall:', recall, '\nAging Rate:', ar)
    return  table

### for regressor

In [5]:
def PR_matrix(predict, train_y):
    """Build a threshold table from continuous predictions.

    ``predict`` is a DataFrame with a 'predict' (score) and a 'truth' column. Every
    distinct score becomes one row, ordered from the highest score to the lowest, and
    the metrics of a row are those obtained when all samples scoring at least that
    value are flagged as positive. Truth values other than 1 are treated as 0.

    ``train_y`` is only used for the train_OK / train_NG / Balance Ratio columns
    (values below 0.5 count as OK).

    Returns a DataFrame with the columns Class_Prob, train_OK, train_NG,
    Balance Ratio, TP, FP, FN, TN, Precision, Recall, Aging Rate, Efficiency, Score.
    """
    ranked = predict.sort_values(['predict', 'truth'], ascending = [False, True]).reset_index(drop = True)
    not_bad = ranked['truth'] != 1
    ranked.loc[not_bad, 'truth'] = 0

    # one row per distinct predicted score, highest score first
    per_score = ranked.groupby('predict')
    matrix = pd.DataFrame(per_score.sum()).rename(columns = {'truth': 'Bad_Count'})
    matrix = matrix.sort_index(ascending = False)
    matrix['All_Count'] = per_score.count()
    matrix['Class_Prob'] = matrix.index

    # training-set composition, repeated on every row
    matrix['train_OK'] = sum(train_y < 0.5)
    matrix['train_NG'] = len(train_y) - matrix['train_OK'].values[0]
    matrix['Balance Ratio'] = matrix['train_OK'] / matrix['train_NG']

    # cumulative counts: everything at or above the row's score is flagged positive
    true_pos = matrix['Bad_Count'].cumsum()
    false_pos = matrix['All_Count'].cumsum() - true_pos
    matrix['TP'] = true_pos
    matrix['FP'] = false_pos
    matrix['FN'] = true_pos.values[-1] - true_pos
    matrix['TN'] = false_pos.values[-1] - false_pos

    flagged = matrix['TP'] + matrix['FP']
    matrix['Precision'] = matrix['TP'] / flagged
    matrix['Recall'] = matrix['TP'] / (matrix['TP'] + matrix['FN'])
    matrix['Aging Rate'] = flagged / (flagged + matrix['FN'] + matrix['TN'])
    matrix['Efficiency'] = matrix['Recall'] / matrix['Aging Rate']
    matrix['Score'] = score1(matrix['Recall'], matrix['Aging Rate'])

    return matrix.drop(columns = ['Bad_Count', 'All_Count']).reset_index(drop = True)


def best_threshold(pr_matrix, target, threshold = False):
    """Pick one operating point (row) from a threshold table.

    With a truthy ``threshold`` the first row whose ``target`` column is at least
    ``threshold`` is chosen; otherwise the row maximising ``target`` is chosen.

    Returns ``(best_data, best_thres)``: the chosen row as a one-row DataFrame and
    its 'Class_Prob' value. The choice is also printed.
    """
    if not threshold:
        index = pr_matrix[target].idxmax()
    else:
        qualifying = pr_matrix[pr_matrix[target] >= threshold]
        index = qualifying.index.values[0]

    chosen_row = pr_matrix.loc[index]
    best_thres = chosen_row['Class_Prob']
    best_data = chosen_row.to_frame().T

    print('Best Threshold:', best_thres, '\n')
    print('Recall:', best_data['Recall'].values, ',   Precision:', best_data['Precision'].values, \
          ',   Aging Rate:', best_data['Aging Rate'].values)

    return best_data, best_thres

### plot

In [6]:
def line_chart(table_set, title):
    """Plot Recall, Aging Rate (left axis) and Precision (right axis) per dataset.

    ``table_set`` is a DataFrame with one row per dataset and the columns
    'Aging Rate', 'Recall' and 'Precision'; rows are drawn at x = 0 .. n-1.
    """
    # matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
    # releases dropped the old names, so use whichever name this installation provides.
    for style_name in ('seaborn-v0_8-dark-palette', 'seaborn-dark-palette'):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break

    x = list(range(len(table_set)))
    fig, ax1 = plt.subplots(figsize = (15,8))
    ax2 = ax1.twinx()

    plt.title(title, fontsize = 16)
    # one tick per plotted dataset (the x axis is shared by both y axes)
    ax1.set_xticks(x)
    ax1.plot(x, table_set['Aging Rate'], 'b--', linewidth = 1, label = 'Aging Rate')
    ax1.plot(x, table_set['Aging Rate'], 'b.', markersize = 15)
    ax1.plot(x, table_set['Recall'], 'r-', linewidth = 1, label = 'Recall')
    ax1.plot(x, table_set['Recall'], 'r.', markersize = 15)
    ax2.plot(x, table_set['Precision'], 'g--', linewidth = 1, label = 'Precision')
    ax2.plot(x, table_set['Precision'], 'g.', markersize = 15)
    ax1.set_xlabel('\nDataset', fontsize = 12)
    ax1.set_ylabel('Recall & Aging Rate', color = 'b')
    ax2.set_ylabel('Precision', color = 'g')

    ax1.legend(loc = 'upper left', frameon = False)
    ax2.legend(loc = 'upper right', frameon = False)

    plt.show()
    
    
def AUC(x, y):
    """Area under the curve ``y(x)`` on [0, 1] by the trapezoidal rule.

    The curve is extended horizontally to the borders: a rectangle of height
    ``y[0]`` from 0 to ``x[0]`` and one of height ``y[-1]`` from ``x[-1]`` to 1.

    ``x`` and ``y`` may be lists, arrays or pandas Series. They are read by position,
    so a Series whose index is not 0..n-1 is handled correctly as well.
    """
    # positional access: plain x[0] on a Series would be a label lookup
    xs = np.asarray(x, dtype = float)
    ys = np.asarray(y, dtype = float)
    last = len(xs) - 1

    left = xs[0]*ys[0]
    right = (1 - xs[last])*ys[last]

    area = 0
    for i in range(1, len(xs)):
        wide = xs[i] - xs[i-1]
        height = (ys[i-1] + ys[i])/2
        area = area + wide*height

    return left + area + right


def PR_curve(pr_matrix, best_data, title = 'PR_curve'):
    """Draw the precision-recall curve of ``pr_matrix`` and mark the chosen point.

    ``pr_matrix`` supplies the 'Recall' / 'Precision' columns of the curve and
    ``best_data`` the highlighted operating point (it also needs 'Aging Rate').
    The metrics of that point and the area under the curve are printed.
    """
    recall_curve = pr_matrix['Recall']
    precision_curve = pr_matrix['Precision']

    # line first, then the individual thresholds as dots on top of it
    for line_format in ('b-', 'r.'):
        plt.plot(recall_curve, precision_curve, line_format)
    plt.plot(best_data['Recall'], best_data['Precision'], 'go', markersize = 10)

    print('Precision, Recall, Aging Rate:', best_data['Precision'].values, best_data['Recall'].values, 
          best_data['Aging Rate'].values)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'{title}')
    plt.show()

    auc = AUC(recall_curve.values, precision_curve.values)
    print('AUC: ', auc, '\n')
    
    
def multiple_curve(row_num, col_num, pr_dict, table_set, target = 'Aging Rate'):
    """Draw one ``target``-vs-Recall curve per dataset on a ``row_num`` x ``col_num`` grid.

    Parameters
    ----------
    row_num, col_num : grid shape; datasets fill it row by row.
    pr_dict : dict mapping 'set{k}' to a threshold table with 'Recall' and ``target``.
    table_set : DataFrame whose k-th row is the chosen operating point of dataset k
        (columns 'Aging Rate', 'Recall', 'Precision' and ``target``).
    target : column drawn on the y axis ('Aging Rate' or 'Precision' get a title).
    """
    # squeeze = False keeps `axs` two-dimensional even for a single row or column;
    # figsize is (width, height), so the width follows the columns and the height the rows.
    fig, axs = plt.subplots(row_num, col_num, sharex = False, sharey = False, squeeze = False,
                            figsize = (col_num*8 + 1, row_num*6))
    plt.suptitle(f'{target} & Recall Curve of Dataset 0 - {len(table_set) - 1}', y = 0.94, fontsize = 30)

    for row in range(row_num):
        for col in range(col_num):

            index = col_num*row + col
            if index >= len(table_set):
                continue        # more grid cells than datasets: leave the cell empty

            ax = axs[row, col]
            curve = pr_dict[f'set{index}']

            # positional row access: table_set may be labelled 'dataset 0', 'dataset 1', ...
            auc = round(AUC(curve['Recall'].values, curve[target].values), 5)
            ar = table_set['Aging Rate'].iloc[index].round(3)
            recall = table_set['Recall'].iloc[index].round(3)
            precision = table_set['Precision'].iloc[index].round(5)

            ax.plot(curve['Recall'], curve[target], 'b-')
            ax.plot(curve['Recall'], curve[target], 'r.', markersize = 10)
            ax.plot(table_set['Recall'].iloc[index], table_set[target].iloc[index], 'go', markersize = 15)
            ax.set_xlabel('Recall')
            ax.set_ylabel(target)

            if target == 'Aging Rate':
                ax.set_title(f'dataset {index}, AUC = {auc}, Aging Rate = {ar}, Recall = {recall}, Precision = {precision}')
            elif target == 'Precision':
                ax.set_title(f'dataset {index}, AUC = {auc}, Aging Rate = {ar}, Recall = {recall}')

### adaboost 

In [7]:
def AdaBoostC(train_x, test_x, train_y, test_y, config):
    """Fit an AdaBoostClassifier built from ``config`` and predict ``test_x``.

    Returns a DataFrame with the columns 'truth' (``test_y``) and 'predict'
    (the predicted class of every test sample).
    """
    model = AdaBoostClassifier(**config).fit(train_x, train_y)
    return pd.DataFrame({'truth': test_y, 'predict': model.predict(test_x)})


def AdaBoostR(train_x, test_x, train_y, test_y, config) :
    """Fit an AdaBoostRegressor built from ``config`` and predict ``test_x``.

    Returns a DataFrame with the columns 'truth' (``test_y``) and 'predict'
    (the regression output for every test sample).
    """
    model = AdaBoostRegressor(**config).fit(train_x, train_y)
    return pd.DataFrame({'truth': test_y, 'predict': model.predict(test_x)})


def runall_AdaBoostC(num_set, trainset_x, test_x, trainset_y, test_y, config):
    """Train an AdaBoost classifier on every training set and score it on the test data.

    Parameters
    ----------
    num_set : number of training sets; keys 'set0' .. 'set{num_set-1}' are used.
    trainset_x, trainset_y : dicts of features / labels keyed by 'set{k}'.
    test_x, test_y : common test features / labels.
    config : either one hyperparameter dict shared by all sets, or a dict of such
        dicts keyed by 'set{k}'. 'max_depth' is required; 'learning_rate' and
        'n_estimators' are forwarded when present.

    Returns
    -------
    DataFrame with one ``cf_matrix`` row per dataset, indexed 'dataset {k}'.
    """
    tables = []
    judge = list(config.keys())[0]
    # a dict under the first key means `config` holds one configuration per dataset
    per_dataset_config = isinstance(config[judge], dict)

    # scikit-learn 1.2 renamed `base_estimator` to `estimator` (old name removed in 1.4);
    # pass the tree under whichever keyword the installed version accepts.
    accepted = AdaBoostClassifier().get_params(deep = False)
    tree_key = 'estimator' if 'estimator' in accepted else 'base_estimator'

    for i in tqdm(range(num_set)):
        print('\n', f'Dataset {i}:')

        best_config = config[f'set{i}'] if per_dataset_config else config

        # separate the decision tree hyperparameter and adaboost hyperparameter
        boost_param = {key: best_config[key] for key in ['learning_rate', 'n_estimators'] if key in best_config}
        boost_param[tree_key] = DecisionTreeClassifier(max_depth = best_config['max_depth'])

        result = AdaBoostC(trainset_x[f'set{i}'], test_x, trainset_y[f'set{i}'], test_y, boost_param)
        table = cf_matrix(result, trainset_y[f'set{i}'])
        tables.append(table.rename(index = {0: f'dataset {i}'}))

    # concatenate once instead of growing a DataFrame inside the loop
    return pd.concat(tables) if tables else pd.DataFrame()


def runall_AdaBoostR(num_set, trainset_x, test_x, trainset_y, test_y, config, thres_target = 'Recall', threshold = False):
    """Train an AdaBoost regressor on every training set and pick an operating point.

    Parameters
    ----------
    num_set : number of training sets; keys 'set0' .. 'set{num_set-1}' are used.
    trainset_x, trainset_y : dicts of features / labels keyed by 'set{k}'.
    test_x, test_y : common test features / labels.
    config : either one hyperparameter dict shared by all sets, or a dict of such
        dicts keyed by 'set{k}'. 'max_depth' is required; 'learning_rate' and
        'n_estimators' are forwarded when present.
    thres_target, threshold : forwarded to ``best_threshold`` to choose the row of
        the threshold table that represents each dataset.

    Returns
    -------
    (pr_dict, table_set): the full threshold table of every dataset keyed by
    'set{k}', and a DataFrame with the chosen row per dataset, indexed 'dataset {k}'.
    """
    chosen_rows = []
    pr_dict = {}
    judge = list(config.keys())[0]
    # a dict under the first key means `config` holds one configuration per dataset
    per_dataset_config = isinstance(config[judge], dict)

    # scikit-learn 1.2 renamed `base_estimator` to `estimator` (old name removed in 1.4);
    # pass the tree under whichever keyword the installed version accepts.
    accepted = AdaBoostRegressor().get_params(deep = False)
    tree_key = 'estimator' if 'estimator' in accepted else 'base_estimator'

    for i in range(num_set):
        print('\n', f'Dataset {i}:')

        best_config = config[f'set{i}'] if per_dataset_config else config

        # separate the decision tree hyperparameter and adaboost hyperparameter
        boost_param = {key: best_config[key] for key in ['learning_rate', 'n_estimators'] if key in best_config}
        boost_param[tree_key] = DecisionTreeRegressor(max_depth = best_config['max_depth'])

        predict = AdaBoostR(trainset_x[f'set{i}'], test_x, trainset_y[f'set{i}'], test_y, boost_param)
        pr_matrix = PR_matrix(predict, trainset_y[f'set{i}'])
        pr_dict[f'set{i}'] = pr_matrix

        best_data, best_thres = best_threshold(pr_matrix, target = thres_target, threshold = threshold)
        # the chosen row still carries its position in pr_matrix as label; relabel it
        chosen_rows.append(best_data.rename(index = {best_data.index.values[0]: f'dataset {i}'}))

    # concatenate once instead of growing a DataFrame inside the loop
    table_set = pd.concat(chosen_rows) if chosen_rows else pd.DataFrame()

    return pr_dict, table_set

### optuna

In [8]:
def AdaBoost_creator(train_data, mode, num_valid = 3):
    """Create the Optuna objective that tunes AdaBoost on ``train_data``.

    Parameters
    ----------
    train_data : DataFrame holding features and the 'GB' label column.
    mode : 'C' for the classifier, 'R' for the regressor.
    num_valid : number of random 75/25 train/validation splits averaged per trial.

    Returns
    -------
    ``objective(trial)``, meant to be maximised. For mode 'C' it is the mean F1 score
    on the validation splits; for mode 'R' it is the mean of the negated area under
    the Aging Rate-vs-Recall curve.

    Raises
    ------
    ValueError if ``mode`` is neither 'C' nor 'R'.
    """
    if mode not in ('C', 'R'):
        raise ValueError(f"mode must be 'C' or 'R', got {mode!r}")

    if mode == 'C':
        tree_class, boost_class = DecisionTreeClassifier, AdaBoostClassifier
    else:
        tree_class, boost_class = DecisionTreeRegressor, AdaBoostRegressor

    # scikit-learn 1.2 renamed `base_estimator` to `estimator` (old name removed in 1.4);
    # pass the tree under whichever keyword the installed version accepts.
    accepted = boost_class().get_params(deep = False)
    tree_key = 'estimator' if 'estimator' in accepted else 'base_estimator'

    def objective(trial) :

        tree_param = {
            'max_depth': trial.suggest_int('max_depth', 1, 3)
        }

        param = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 300, step = 50),
            'learning_rate': trial.suggest_float('learning_rate', 0.025, 0.825, step = 0.05),
        }
        param[tree_key] = tree_class(**tree_param)

        result_list = []
        for i in range(num_valid):

            train_x, train_y = label_divide(train_data, None, 'GB', train_only = True)
            train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size = 0.25)

            if mode == 'C':
                result = AdaBoostC(train_x, valid_x, train_y, valid_y, param)
                table = cf_matrix(result, valid_y)
                # cf_matrix returns a one-row table; take plain scalars out of it
                recall = float(table['Recall'].iloc[0])
                precision = float(table['Precision'].iloc[0])
                # F1 = 2PR / (P + R); defined as 0 when nothing was detected
                # (the comparison is also False for NaN, which then scores 0)
                denominator = recall + precision
                f1 = 2*recall*precision / denominator if denominator > 0 else 0.0
                result_list.append(f1)

            else:
                result = AdaBoostR(train_x, valid_x, train_y, valid_y, param)
                pr_matrix = PR_matrix(result, valid_y)
                auc = AUC(pr_matrix['Recall'].values, pr_matrix['Aging Rate'].values)
                # negated so that a smaller area gives a larger objective value
                result_list.append((-1)*auc)

        return np.mean(result_list)

    return objective


def all_optuna(num_set, all_data, mode, TPE_multi, n_iter, filename, creator, num_valid = 3, return_addition = True):
    """Run one Optuna study per dataset and pickle the best hyperparameters.

    Parameters
    ----------
    num_set : number of datasets; ``all_data`` is read at 'set0' .. 'set{num_set-1}'.
    all_data : dict of training DataFrames keyed by 'set{k}'.
    mode : forwarded to ``creator`` and used in the output file name.
    TPE_multi : whether the TPE sampler is multivariate.
    n_iter : number of trials per study.
    filename : prefix of the pickle file written at the end.
    creator : callable ``creator(train_data, mode, num_valid)`` returning an objective.
    num_valid : forwarded to ``creator``.
    return_addition : also collect and return the values of every trial.

    Returns
    -------
    ``best_param`` (dict of best parameters keyed by 'set{k}'), or
    ``(best_param, all_score)`` when ``return_addition`` is true.
    """
    best_param, all_score = {}, {}

    for i in tqdm(range(num_set)) :
        key = f'set{i}'

        ##### define objective function and change optimized target dataset in each loop #####
        objective = creator(train_data = all_data[key], mode = mode, num_valid = num_valid)

        ##### optimize one dataset in each loop #####
        print(f'Dataset{i} :')

        sampler = optuna.samplers.TPESampler(multivariate = TPE_multi)
        study = optuna.create_study(sampler = sampler, direction = 'maximize')
        #n_trials or timeout
        study.optimize(objective, n_trials = n_iter, show_progress_bar = True, gc_after_trial = True)
        best_param[key] = study.best_trial.params

        ##### return score and entire params for score plot or feature importance
        if return_addition :
            all_score[key] = [trial.values for trial in study.trials]

        print(f"Sampler is {study.sampler.__class__.__name__}")

    ##### store the best hyperparameters #####
    multi_mode = 'univariate-TPE' if not TPE_multi else 'multivariate-TPE'
    with open(f'{filename}{mode}_{multi_mode}_{n_iter}.data', 'wb') as f:
        pickle.dump(best_param, f)

    return (best_param, all_score) if return_addition else best_param
    

def optuna_history(best_param, all_score, num_row, num_col, model):
    """Plot the objective value of every trial, one panel per dataset.

    Parameters
    ----------
    best_param : dict keyed by 'set{k}'; only its length (number of datasets) is used.
    all_score : dict mapping 'set{k}' to the sequence of trial values of that study.
    num_row, num_col : grid shape; datasets fill it row by row.
    model : model name shown in the figure title.
    """
    # squeeze = False keeps `axs` two-dimensional even for a single row or column;
    # figsize is (width, height), so the width follows the columns and the height the rows.
    fig, axs = plt.subplots(num_row, num_col, squeeze = False, figsize = (num_col*10, num_row*5))
    plt.suptitle(f'Optimization History of {model}', y = 0.94, fontsize = 25)

    # axs.flat walks the grid row by row, i.e. index == num_col*row + col
    for index, ax in enumerate(axs.flat):
        if index >= len(best_param):
            break       # remaining cells have no dataset

        scores = all_score[f'set{index}']
        ax.plot(range(len(scores)), scores, 'r-', linewidth = 1)
        ax.set_title(f'Dataset {index}')
        ax.set_xlabel('Iterations')
        ax.set_ylabel('Values')

## 

### loading training & testing data

In [9]:
### training data ### 
# range(2, 5) yields months 2, 3 and 4.
# NOTE(review): the output file name used for the hyperparameter search contains
# 'm2m5' - confirm whether month 5 is meant to be included here.
training_month = range(2, 5)

# Per-month data: reads m{month}_dataset_{0..9}.csv and splits features from labels.
data_dict, trainset_x, trainset_y = multiple_month(training_month, num_set = 10, filename = 'dataset')

print('\nCombined training data:\n')
# Combined data: reads dataset_0.csv .. dataset_9.csv (default file prefix).
run_train = multiple_set(num_set = 10)
run_train_x, run_train_y = train_set(run_train, num_set = 10)

### testing data ###
# The first two columns of the test file are dropped before the 'GB' label is split off.
run_test = pd.read_csv('test_runhist.csv').iloc[:, 2:]
run_test_x, run_test_y = label_divide(run_test, None, 'GB', train_only = True)
print('\n', 'Dimension of testing data:', run_test.shape)


Month 2:

Dimension of dataset 0 : (39009, 90)  balance ratio: 442.0
Dimension of dataset 1 : (1750, 90)  balance ratio: 1.0
Dimension of dataset 2 : (2238, 90)  balance ratio: 1.0
Dimension of dataset 3 : (1932, 90)  balance ratio: 1.0
Dimension of dataset 4 : (1760, 90)  balance ratio: 1.0
Dimension of dataset 5 : (1756, 90)  balance ratio: 1.0
Dimension of dataset 6 : (1986, 90)  balance ratio: 1.0
Dimension of dataset 7 : (1760, 90)  balance ratio: 1.0
Dimension of dataset 8 : (1760, 90)  balance ratio: 1.0
Dimension of dataset 9 : (968, 90)  balance ratio: 10.0

 10 datasets are loaded.

Labels of  10 datasets are divided.

Month 3:

Dimension of dataset 0 : (60396, 104)  balance ratio: 553.0
Dimension of dataset 1 : (2218, 104)  balance ratio: 1.0
Dimension of dataset 2 : (2712, 104)  balance ratio: 1.0
Dimension of dataset 3 : (2394, 104)  balance ratio: 1.0
Dimension of dataset 4 : (2180, 104)  balance ratio: 1.0
Dimension of dataset 5 : (2183, 104)  balance ratio: 1.0
Dimensi

### search for the best hyperparameters

In [10]:
# Tune the AdaBoost classifier (mode 'C') on each of the 10 combined training sets:
# 25 Optuna trials per set with the univariate TPE sampler. all_optuna also pickles
# the best parameters to a file whose name starts with `filename`.
best_paramC, all_scoreC = all_optuna(num_set = 10, 
                                     all_data = run_train, 
                                     mode = 'C', 
                                     TPE_multi = False, 
                                     n_iter = 25, 
                                     filename = 'runhist_array_m2m5_4selection_AdaBoost',
                                     creator = AdaBoost_creator
                                    )

  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2021-11-25 23:38:37,892][0m A new study created in memory with name: no-name-1f68d855-1612-42ae-aa01-fc5530109367[0m


Dataset0 :


  self._init_valid()


  0%|          | 0/2 [00:00<?, ?it/s]

  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 1.0 
Recall: 0.41025641025641024 
Aging Rate: 0.0008145187975666251


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.7666666666666667 
Recall: 0.3709677419354839 
Aging Rate: 0.000763611372718711


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.8333333333333334 
Recall: 0.33783783783783783 
Aging Rate: 0.000763611372718711
[32m[I 2021-11-25 23:41:54,742][0m Trial 0 finished with value: 0.3729426052918105 and parameters: {'max_depth': 3, 'n_estimators': 200, 'learning_rate': 0.32500000000000007}. Best is trial 0 with value: 0.3729426052918105.[0m


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "
  eff = r/ag


Precision: 0 
Recall: 0.0 
Aging Rate: 0.0


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.0 
Recall: 0.0 
Aging Rate: 2.5453712423957035e-05


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "
  eff = r/ag
[32m[I 2021-11-25 23:44:16,197][0m A new study created in memory with name: no-name-b0c8d454-28dd-47fb-9500-4e0301e6f707[0m


Precision: 0 
Recall: 0.0 
Aging Rate: 0.0
[32m[I 2021-11-25 23:44:16,143][0m Trial 1 finished with value: -8.484570807985679e-07 and parameters: {'max_depth': 1, 'n_estimators': 300, 'learning_rate': 0.675}. Best is trial 0 with value: 0.3729426052918105.[0m
Sampler is TPESampler
Dataset1 :


  self._init_valid()


  0%|          | 0/2 [00:00<?, ?it/s]

  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.9133425034387895 
Recall: 0.8877005347593583 
Aging Rate: 0.4745430809399478


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.9293333333333333 
Recall: 0.8958868894601543 
Aging Rate: 0.489556135770235


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.9218967921896792 
Recall: 0.8529032258064516 
Aging Rate: 0.4680156657963446
[32m[I 2021-11-25 23:44:22,846][0m Trial 0 finished with value: 0.8310930539251039 and parameters: {'max_depth': 2, 'n_estimators': 300, 'learning_rate': 0.025}. Best is trial 0 with value: 0.8310930539251039.[0m


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.9572192513368984 
Recall: 0.9546666666666667 
Aging Rate: 0.48825065274151436


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.9622641509433962 
Recall: 0.9610552763819096 
Aging Rate: 0.5189295039164491


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "
[32m[I 2021-11-25 23:44:25,158][0m A new study created in memory with name: no-name-1c2a18d6-0779-46a2-917d-2634327378f8[0m


Precision: 0.9586666666666667 
Recall: 0.952317880794702 
Aging Rate: 0.489556135770235
[32m[I 2021-11-25 23:44:25,111][0m Trial 1 finished with value: 0.9061220648668195 and parameters: {'max_depth': 2, 'n_estimators': 100, 'learning_rate': 0.275}. Best is trial 1 with value: 0.9061220648668195.[0m
Sampler is TPESampler
Dataset2 :


  self._init_valid()


  0%|          | 0/2 [00:00<?, ?it/s]

  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.8230184581976113 
Recall: 0.8284153005464481 
Aging Rate: 0.5


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.8230683090705487 
Recall: 0.8157602663706992 
Aging Rate: 0.48479913137893593


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.8370044052863436 
Recall: 0.819848975188781 
Aging Rate: 0.49294245385450597
[32m[I 2021-11-25 23:44:29,203][0m Trial 0 finished with value: 0.7720834611941947 and parameters: {'max_depth': 1, 'n_estimators': 200, 'learning_rate': 0.825}. Best is trial 0 with value: 0.7720834611941947.[0m


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.8401360544217688 
Recall: 0.7908217716115261 
Aging Rate: 0.4788273615635179


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.8316939890710382 
Recall: 0.8262757871878393 
Aging Rate: 0.496742671009772


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "
[32m[I 2021-11-25 23:44:33,457][0m A new study created in memory with name: no-name-9ea612c7-6691-42e3-bece-dc6482636281[0m


Precision: 0.8544444444444445 
Recall: 0.8146186440677966 
Aging Rate: 0.48859934853420195
[32m[I 2021-11-25 23:44:33,424][0m Trial 1 finished with value: 0.761766421585471 and parameters: {'max_depth': 1, 'n_estimators': 200, 'learning_rate': 0.47500000000000003}. Best is trial 0 with value: 0.7720834611941947.[0m
Sampler is TPESampler
Dataset3 :


  self._init_valid()


  0%|          | 0/2 [00:00<?, ?it/s]

  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.9974358974358974 
Recall: 0.9099415204678363 
Aging Rate: 0.4642857142857143


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 1.0 
Recall: 0.9087635054021609 
Aging Rate: 0.4505952380952381


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 1.0 
Recall: 0.9325443786982248 
Aging Rate: 0.46904761904761905
[32m[I 2021-11-25 23:44:42,958][0m Trial 0 finished with value: 0.8709521824751216 and parameters: {'max_depth': 2, 'n_estimators': 100, 'learning_rate': 0.07500000000000001}. Best is trial 0 with value: 0.8709521824751216.[0m


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.9949685534591195 
Recall: 0.9349881796690307 
Aging Rate: 0.4732142857142857


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 1.0 
Recall: 0.9230769230769231 
Aging Rate: 0.4714285714285714


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "
[32m[I 2021-11-25 23:45:08,876][0m A new study created in memory with name: no-name-d2ecd9be-5641-4821-bcf7-69e8b0792458[0m


Precision: 1.0 
Recall: 0.9290780141843972 
Aging Rate: 0.46785714285714286
[32m[I 2021-11-25 23:45:08,815][0m Trial 1 finished with value: 0.8819643723101169 and parameters: {'max_depth': 3, 'n_estimators': 200, 'learning_rate': 0.025}. Best is trial 1 with value: 0.8819643723101169.[0m
Sampler is TPESampler
Dataset4 :


  self._init_valid()


  0%|          | 0/2 [00:00<?, ?it/s]

  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.9560283687943263 
Recall: 0.9182561307901907 
Aging Rate: 0.46078431372549017


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.9556135770234987 
Recall: 0.9606299212598425 
Aging Rate: 0.5006535947712418


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.9562982005141388 
Recall: 0.96 
Aging Rate: 0.5084967320261438
[32m[I 2021-11-25 23:45:17,362][0m Trial 0 finished with value: 0.8972975293325819 and parameters: {'max_depth': 3, 'n_estimators': 300, 'learning_rate': 0.025}. Best is trial 0 with value: 0.8972975293325819.[0m


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.8728813559322034 
Recall: 0.8284182305630027 
Aging Rate: 0.4627450980392157


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.8717948717948718 
Recall: 0.8324742268041238 
Aging Rate: 0.4843137254901961


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "
[32m[I 2021-11-25 23:45:20,866][0m A new study created in memory with name: no-name-d292f9e7-488b-450b-bc8f-dbb8e0b79d4b[0m


Precision: 0.8732212160413971 
Recall: 0.8642765685019206 
Aging Rate: 0.5052287581699346
[32m[I 2021-11-25 23:45:20,823][0m Trial 1 finished with value: 0.7933134225663707 and parameters: {'max_depth': 1, 'n_estimators': 200, 'learning_rate': 0.6250000000000001}. Best is trial 0 with value: 0.8972975293325819.[0m
Sampler is TPESampler
Dataset5 :


  self._init_valid()


  0%|          | 0/2 [00:00<?, ?it/s]

  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.9187675070028011 
Recall: 0.8631578947368421 
Aging Rate: 0.4654498044328553


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.857916102841678 
Recall: 0.8521505376344086 
Aging Rate: 0.4817470664928292


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.8938992042440318 
Recall: 0.849936948297604 
Aging Rate: 0.4915254237288136
[32m[I 2021-11-25 23:45:24,314][0m Trial 0 finished with value: 0.8071243837344683 and parameters: {'max_depth': 2, 'n_estimators': 150, 'learning_rate': 0.025}. Best is trial 0 with value: 0.8071243837344683.[0m


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.9759797724399494 
Recall: 0.9759797724399494 
Aging Rate: 0.515645371577575


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.9933774834437086 
Recall: 0.985545335085414 
Aging Rate: 0.4921773142112125


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "
[32m[I 2021-11-25 23:45:28,877][0m A new study created in memory with name: no-name-e71fe397-e2b8-49ed-a068-ceaa841c73e9[0m


Precision: 0.9743260590500642 
Recall: 0.9869960988296489 
Aging Rate: 0.5078226857887875
[32m[I 2021-11-25 23:45:28,846][0m Trial 1 finished with value: 0.9323188897324183 and parameters: {'max_depth': 2, 'n_estimators': 200, 'learning_rate': 0.37500000000000006}. Best is trial 1 with value: 0.9323188897324183.[0m
Sampler is TPESampler
Dataset6 :


  self._init_valid()


  0%|          | 0/2 [00:00<?, ?it/s]

  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.8566844919786096 
Recall: 0.8640776699029126 
Aging Rate: 0.5548961424332344


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.837160751565762 
Recall: 0.8614393125671321 
Aging Rate: 0.568545994065282


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.8620309050772627 
Recall: 0.8591859185918592 
Aging Rate: 0.5376854599406529
[32m[I 2021-11-25 23:45:30,768][0m Trial 0 finished with value: 0.8061967138059957 and parameters: {'max_depth': 1, 'n_estimators': 100, 'learning_rate': 0.525}. Best is trial 0 with value: 0.8061967138059957.[0m


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.8625541125541125 
Recall: 0.8588362068965517 
Aging Rate: 0.5483679525222552


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.8398220244716351 
Recall: 0.8435754189944135 
Aging Rate: 0.5335311572700296


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "
[32m[I 2021-11-25 23:45:33,644][0m A new study created in memory with name: no-name-bf3f9846-247f-45bd-8b05-922442b88550[0m


Precision: 0.8530701754385965 
Recall: 0.8521358159912377 
Aging Rate: 0.5412462908011869
[32m[I 2021-11-25 23:45:33,612][0m Trial 1 finished with value: 0.7974109672742852 and parameters: {'max_depth': 1, 'n_estimators': 150, 'learning_rate': 0.37500000000000006}. Best is trial 0 with value: 0.8061967138059957.[0m
Sampler is TPESampler
Dataset7 :


  self._init_valid()


  0%|          | 0/2 [00:00<?, ?it/s]

  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.9972451790633609 
Recall: 0.9823609226594301 
Aging Rate: 0.4745098039215686


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.9946737683089214 
Recall: 0.9828947368421053 
Aging Rate: 0.49084967320261436


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.9879356568364611 
Recall: 0.9787516600265604 
Aging Rate: 0.4875816993464052
[32m[I 2021-11-25 23:45:59,018][0m Trial 0 finished with value: 0.9329044006270123 and parameters: {'max_depth': 2, 'n_estimators': 300, 'learning_rate': 0.825}. Best is trial 0 with value: 0.9329044006270123.[0m


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.9957983193277311 
Recall: 0.9162371134020618 
Aging Rate: 0.4666666666666667


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.9889196675900277 
Recall: 0.9431968295904888 
Aging Rate: 0.4718954248366013


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "
[32m[I 2021-11-25 23:46:08,878][0m A new study created in memory with name: no-name-c5157699-8070-44d8-9dbe-761aa7cee36a[0m


Precision: 0.9985795454545454 
Recall: 0.9262187088274044 
Aging Rate: 0.46013071895424834
[32m[I 2021-11-25 23:46:08,832][0m Trial 1 finished with value: 0.8819277902580679 and parameters: {'max_depth': 1, 'n_estimators': 200, 'learning_rate': 0.32500000000000007}. Best is trial 0 with value: 0.9329044006270123.[0m
Sampler is TPESampler
Dataset8 :


  self._init_valid()


  0%|          | 0/2 [00:00<?, ?it/s]

  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.9896640826873385 
Recall: 0.9745547073791349 
Aging Rate: 0.5058823529411764


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.9891156462585035 
Recall: 0.9771505376344086 
Aging Rate: 0.4803921568627451


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.9907894736842106 
Recall: 0.984313725490196 
Aging Rate: 0.49673202614379086
[32m[I 2021-11-25 23:46:15,915][0m Trial 0 finished with value: 0.9292394389696561 and parameters: {'max_depth': 3, 'n_estimators': 250, 'learning_rate': 0.07500000000000001}. Best is trial 0 with value: 0.9292394389696561.[0m


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.9884615384615385 
Recall: 0.9871959026888605 
Aging Rate: 0.5098039215686274


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.9857512953367875 
Recall: 0.9806701030927835 
Aging Rate: 0.5045751633986928


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "
[32m[I 2021-11-25 23:46:22,874][0m A new study created in memory with name: no-name-7dd938cb-3f09-4e54-81e7-5e102e149bfe[0m


Precision: 0.9917355371900827 
Recall: 0.975609756097561 
Aging Rate: 0.4745098039215686
[32m[I 2021-11-25 23:46:22,830][0m Trial 1 finished with value: 0.9315289576634386 and parameters: {'max_depth': 3, 'n_estimators': 250, 'learning_rate': 0.17500000000000002}. Best is trial 1 with value: 0.9315289576634386.[0m
Sampler is TPESampler
Dataset9 :


  self._init_valid()


  0%|          | 0/2 [00:00<?, ?it/s]

  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.8524590163934426 
Recall: 0.8387096774193549 
Aging Rate: 0.07244655581947744


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.8918918918918919 
Recall: 0.7586206896551724 
Aging Rate: 0.08788598574821853


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.8805970149253731 
Recall: 0.7564102564102564 
Aging Rate: 0.07957244655581948
[32m[I 2021-11-25 23:46:24,549][0m Trial 0 finished with value: 0.7765833748908108 and parameters: {'max_depth': 3, 'n_estimators': 100, 'learning_rate': 0.675}. Best is trial 0 with value: 0.7765833748908108.[0m


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.9423076923076923 
Recall: 0.5568181818181818 
Aging Rate: 0.06175771971496437


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.9230769230769231 
Recall: 0.6075949367088608 
Aging Rate: 0.06175771971496437


  f"evaluating in Python space because the {repr(op_str)} "
  f"evaluating in Python space because the {repr(op_str)} "


Precision: 0.8548387096774194 
Recall: 0.6091954022988506 
Aging Rate: 0.07363420427553444
[32m[I 2021-11-25 23:46:28,591][0m Trial 1 finished with value: 0.5846311854851156 and parameters: {'max_depth': 2, 'n_estimators': 300, 'learning_rate': 0.125}. Best is trial 0 with value: 0.7765833748908108.[0m
Sampler is TPESampler


In [None]:
##### hyperparameter search (mode = 'R') #####
# All search settings are collected in one mapping and unpacked into the call,
# so the configuration can be read (and tweaked) in a single place.
regressor_search_kwargs = dict(
    num_set = 10,
    all_data = run_train,
    mode = 'R',
    TPE_multi = True,
    n_iter = 25,
    filename = 'runhist_array_m2m5_4selection_AdaBoost',
    creator = AdaBoost_creator,
)
best_paramR, all_scoreR = all_optuna(**regressor_search_kwargs)

In [None]:
##### optimization history plot #####
optuna_history(
    best_paramC,
    all_scoreC,
    num_row = 4,
    num_col = 3,
    model = 'AdaBoost Classifier'
)

##### best hyperparameter table #####
# Transposed so that each top-level key of best_paramC becomes a row of the table.
param_table = pd.DataFrame(best_paramC).transpose()
param_table

In [None]:
##### constructing ......... #####
# NOTE(review): objective_creator and data_dict are not defined in the cells reviewed
# here -- confirm both exist in the kernel before running this cell.
# Univariate TPE sampler; other samplers to try: Random, Grid, CmaEs.
univariate_tpe = optuna.samplers.TPESampler(multivariate = False)
study = optuna.create_study(sampler = univariate_tpe, direction = 'maximize')
objective = objective_creator(train_data = data_dict['set6'], mode = 'C', num_valid = 3)
study.optimize(objective, n_trials = 5, show_progress_bar = True, gc_after_trial = True)


##### hyperparameter importance #####
# Optional: raw importance values instead of the figure.
#importances = optuna.importance.get_param_importances(study)
#importances = optuna.importance.get_param_importances(study, evaluator = optuna.importance.FanovaImportanceEvaluator())
importance_fig = optuna.visualization.plot_param_importances(study)
slice_fig = optuna.visualization.plot_slice(study)
# Both figures are built first and then rendered, importance plot before slice plot.
importance_fig.show()
slice_fig.show()

## Train and evaluate with the tuned hyperparameters

### classifier

In [11]:
# Run the AdaBoost classifier over the 10 datasets with the tuned hyperparameters in
# best_paramC, then chart the returned score table.
# NOTE(review): the saved output of this cell is
#   "ValueError: X has 130 features, but DecisionTreeClassifier is expecting 129 features as input."
# i.e. the data used for prediction carries one more column than the data used for
# fitting. Presumably run_test_x and run_train_x were built with different column sets --
# confirm they share exactly the same feature columns before re-running.
table_setC = runall_AdaBoostC(10, run_train_x, run_test_x, run_train_y, run_test_y, best_paramC)
line_chart(table_setC, title = 'AdaBoost Classifier')

  0%|          | 0/10 [00:00<?, ?it/s]


 Dataset 0:


ValueError: X has 130 features, but DecisionTreeClassifier is expecting 129 features as input.

In [None]:
# Display the classifier score table (table_setC is assigned by the runall_AdaBoostC cell).
table_setC

### regressor

In [None]:
# Regressor counterpart of the classifier run, using the tuned hyperparameters in best_paramR.
# thres_target / threshold presumably select the cut-off at Recall = 0.7 -- confirm in runall_AdaBoostR.
pr_dict, table_setR = runall_AdaBoostR(
    10,
    run_train_x,
    run_test_x,
    run_train_y,
    run_test_y,
    best_paramR,
    thres_target = 'Recall',
    threshold = 0.7
)
line_chart(table_setR, title = 'AdaBoost Regressor')

In [None]:
# Draw the same set of curves once per target metric, in the original order.
for curve_target in ('Aging Rate', 'Precision'):
    multiple_curve(4, 3, pr_dict, table_setR, target = curve_target)
# Last expression of the cell: display the regressor score table.
table_setR

### export

In [None]:
# Export the classifier score table to '<savedate>_Classifier.xlsx', sheet 'AdaBoost'.
# NOTE(review): savedate and TPE_multi are hard-coded here -- confirm they match the date
# of this run and the TPE_multi value actually passed to the classifier search.
savedate = '20211019'
TPE_multi = False

# Tag the table (in place) with the sampler and model that produced it.
table_setC['sampler'] = 'multivariate-TPE' if TPE_multi else 'univariate-TPE'
table_setC['model'] = 'AdaBoost'

export_path = f'{savedate}_Classifier.xlsx'
# Append mode fails with FileNotFoundError when the workbook does not exist yet,
# so create the file on the first export and append on later ones.
writer_mode = 'a' if os.path.exists(export_path) else 'w'
# Append mode on .xlsx is only supported by the openpyxl engine; pinning it keeps
# both the create and the append path on the same writer.
with pd.ExcelWriter(export_path, engine = 'openpyxl', mode = writer_mode) as writer:
    table_setC.to_excel(writer, sheet_name = 'AdaBoost')