In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from tqdm.auto import tqdm

from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor, RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRegressor
import torch
from torch import nn
from torch.utils.data import DataLoader
import optuna

from library.Data_Preprocessing import Balance_Ratio, train_col
from library.Imbalance_Sampling import label_divide, resampling_dataset
from library.Aging_Score_Contour import score1
from library.AdaBoost import train_set, multiple_set, multiple_month, line_chart, cf_matrix, AUC, PR_curve, \
     multiple_curve, PR_matrix, best_threshold, all_optuna, optuna_history
from library.XGBoost import XGBoost_creator
from library.LightGBM import LightGBM_creator
from library.CatBoost import CatBoost_creator
from library.RandomForest import RandomForest_creator
from library.ExtraTrees import ExtraTrees_creator
from library.NeuralNetwork import RunhistSet, NeuralNetworkC, trainingC
from library.StackingCV_Scheme3 import optimize_base, stratified_data, runall_LR, runall_RidgeR, stackingCV_creator, \
    correlation_plot, vif, rank_importance, month_param, LR

# The notebook reads its CSV inputs through relative paths, so it must run from
# the project folder. The folder defaults to the original location but can be
# overridden with the STACKINGCV_WORKDIR environment variable, so the notebook
# is not tied to one machine.
WORK_DIR = os.environ.get('STACKINGCV_WORKDIR', 'C:/Users/user/Desktop/Darui_R08621110')
os.chdir(WORK_DIR)
os.getcwd()

## Function Definition

### Transform Data by Base Learners

In [None]:
# fit the requested base learners on one training fold and predict the held-out fold
def _fold_base_predictions(mode, model_list, learner_param, set_name, month, fit_x, fit_y, hold_x, hold_y):
    """Train every base learner named in `model_list` and predict `hold_x`.

    Parameters
    ----------
    mode : str
        'C' fits classifiers (plus the neural network when requested),
        'R' fits regressors. Any other value fits nothing.
    model_list : list of str
        Names of the base learners to use ('NeuralNetwork', 'XGBoost',
        'LightGBM', 'CatBoost', 'RandomForest', 'ExtraTrees').
    learner_param : dict
        `{model name: {set name: hyperparameter dict}}` for one month.
    set_name : hashable
        Key of the resampled dataset whose hyperparameters are used.
    month : hashable
        Month label; only used as the suffix of the output column names.
    fit_x, fit_y : training features / labels of the fold.
    hold_x, hold_y : held-out features / labels of the fold.

    Returns
    -------
    pandas.DataFrame
        One column per fitted learner, named '<initial>_<month>', on a fresh
        RangeIndex. Column order is N, X, L, C, R, E. Empty when nothing was fitted.
    """
    outputs = []

    if mode == 'C':

        if 'NeuralNetwork' in model_list:
            nn_param = learner_param['NeuralNetwork'][set_name]
            fit_loader = DataLoader(RunhistSet(fit_x, fit_y), batch_size = nn_param['batch_size'], shuffle = True)
            # a single batch holding the whole held-out fold
            hold_loader = DataLoader(RunhistSet(hold_x, hold_y), batch_size = len(hold_x), shuffle = False)
            nn_model = NeuralNetworkC(dim = fit_x.shape[1])
            optimizer = torch.optim.Adam(nn_model.parameters(),
                                         lr = nn_param['learning_rate'],
                                         weight_decay = nn_param['weight_decay'])
            criterion = nn.CrossEntropyLoss(
                weight = torch.tensor([1 - nn_param['bad_weight'], nn_param['bad_weight']])).to('cpu')
            # the training loader is also passed as the second loader, so the
            # held-out fold is not handed to trainingC
            network, _, _ = trainingC(nn_model, fit_loader, fit_loader, optimizer, criterion, epoch = 100,
                                      early_stop = 10)
            for features, _labels in hold_loader:
                # cast to float like transform_test does before calling the network
                predict_y = network(features.float()).data[:, 1]
            outputs.append(pd.DataFrame({f'N_{month}': predict_y.numpy()}))

        # (model name, column initial, estimator class, extra constructor arguments)
        classifiers = (
            ('XGBoost', 'X', XGBClassifier, {'n_jobs': -1}),
            ('LightGBM', 'L', LGBMClassifier, {}),
            ('CatBoost', 'C', CatBoostClassifier, {}),
            ('RandomForest', 'R', RandomForestClassifier, {}),
            ('ExtraTrees', 'E', ExtraTreesClassifier, {}),
        )
        for name, initial, estimator, extra in classifiers:
            if name in model_list:
                clf = estimator(**learner_param[name][set_name], **extra)
                clf.fit(fit_x, fit_y)
                # NOTE(review): column 0 of predict_proba is exported here while the
                # network exports column 1 -- confirm this asymmetry is intended
                outputs.append(pd.DataFrame({f'{initial}_{month}': clf.predict_proba(hold_x)[:, 0]}))

    elif mode == 'R':

        regressors = (
            ('XGBoost', 'X', XGBRegressor, {'n_jobs': -1}),
            ('LightGBM', 'L', LGBMRegressor, {}),
            ('CatBoost', 'C', CatBoostRegressor, {}),
            ('RandomForest', 'R', RandomForestRegressor, {}),
            ('ExtraTrees', 'E', ExtraTreesRegressor, {}),
        )
        for name, initial, estimator, extra in regressors:
            if name in model_list:
                reg = estimator(**learner_param[name][set_name], **extra)
                reg.fit(fit_x, fit_y)
                outputs.append(pd.DataFrame({f'{initial}_{month}': reg.predict(hold_x)}))

    return pd.concat(outputs, axis = 1) if outputs else pd.DataFrame()


# input training data to the base learners and output the outcome
def transform_train(train_data, mode, base_param, cv, add_origin = False):
    """Build the meta-learner training table from out-of-fold base-learner predictions.

    For every dataset key found in `base_param`, `train_data[key]` is split into
    `cv` folds by `stratified_data`. For each month in `base_param`, the base
    learners are refitted on the training part of every fold with that month's
    hyperparameters and predict the held-out part. The same folds are reused
    for every month; only the hyperparameters differ.

    Parameters
    ----------
    train_data : mapping
        `{set name: dataset}`; each dataset is handed to `stratified_data`.
    mode : str
        'C' for classification, 'R' for regression.
    base_param : dict
        `{month: {model name: {set name: hyperparameter dict}}}`. The model and
        set names are read from the first month / first model.
    cv : int
        Number of folds.
    add_origin : bool, default False
        When True the held-out original features are appended after the
        prediction columns.

    Returns
    -------
    dict
        `{set name: DataFrame}`. Columns are the prediction columns of every
        month (in `base_param` order), then the original features when
        `add_origin` is set, then the label. Rows are the held-out folds stacked
        in fold order; the index restarts at 0 for each fold.
    """
    month_list = list(base_param.keys())
    model_list = list(base_param[month_list[0]].keys())
    set_list = list(base_param[month_list[0]][model_list[0]].keys())
    set_dict = {}
    for i in tqdm(set_list):

        train_x_dict, train_y_dict, valid_x_dict, valid_y_dict = stratified_data(train_data[i], cv = cv)
        month_frames = []
        for month in tqdm(month_list):

            # features and label are attached once, next to the last month's predictions
            is_last_month = month == month_list[-1]
            fold_frames = []
            for j in range(cv):

                done_cv = _fold_base_predictions(mode, model_list, base_param[month], i, month,
                                                 train_x_dict[j], train_y_dict[j],
                                                 valid_x_dict[j], valid_y_dict[j])
                if is_last_month:
                    # predictions sit on a fresh RangeIndex, so align positionally
                    pieces = [done_cv]
                    if add_origin:
                        pieces.append(valid_x_dict[j].reset_index(drop = True))
                    pieces.append(valid_y_dict[j].reset_index(drop = True))
                    done_cv = pd.concat(pieces, axis = 1)
                fold_frames.append(done_cv)

            # concatenate once per level instead of growing a frame inside the loop
            month_frames.append(pd.concat(fold_frames, axis = 0) if fold_frames else pd.DataFrame())
        set_dict[i] = pd.concat(month_frames, axis = 1) if month_frames else pd.DataFrame()

    return set_dict


# fit the requested base learners on one month of training data and predict the test data
def _holdout_base_predictions(mode, model_list, learner_param, set_name, month, train_x, train_y, test_x, test_y):
    """Train every base learner named in `model_list` and predict `test_x`.

    Parameters
    ----------
    mode : str
        'C' fits classifiers (plus the neural network when requested),
        'R' fits regressors. Any other value fits nothing.
    model_list : list of str
        Names of the base learners to use ('NeuralNetwork', 'XGBoost',
        'LightGBM', 'CatBoost', 'RandomForest', 'ExtraTrees').
    learner_param : dict
        `{model name: {set name: hyperparameter dict}}` for one month.
    set_name : hashable
        Key of the resampled dataset whose hyperparameters are used.
    month : hashable
        Month label; only used as the suffix of the output column names.
    train_x, train_y : training features / labels.
    test_x, test_y : test features / labels (labels are only needed to build
        the network's dataset object).

    Returns
    -------
    pandas.DataFrame
        One column per fitted learner, named '<initial>_<month>', on a fresh
        RangeIndex. Column order is N, X, L, C, R, E. Empty when nothing was fitted.
    """
    outputs = []

    if mode == 'C':

        if 'NeuralNetwork' in model_list:
            nn_param = learner_param['NeuralNetwork'][set_name]
            train_loader = DataLoader(RunhistSet(train_x, train_y), batch_size = nn_param['batch_size'],
                                      shuffle = True)
            # a single batch holding the whole test set
            test_loader = DataLoader(RunhistSet(test_x, test_y), batch_size = len(test_x), shuffle = False)
            nn_model = NeuralNetworkC(dim = train_x.shape[1])
            optimizer = torch.optim.Adam(nn_model.parameters(),
                                         lr = nn_param['learning_rate'],
                                         weight_decay = nn_param['weight_decay'])
            criterion = nn.CrossEntropyLoss(
                weight = torch.tensor([1 - nn_param['bad_weight'], nn_param['bad_weight']])).to('cpu')
            # the training loader is also passed as the second loader, so the
            # test data is not handed to trainingC
            network, _, _ = trainingC(nn_model, train_loader, train_loader, optimizer, criterion, epoch = 100,
                                      early_stop = 10)
            for features, _labels in test_loader:
                predict_y = network(features.float()).data[:, 1]
            outputs.append(pd.DataFrame({f'N_{month}': predict_y.numpy()}))

        # (model name, column initial, estimator class, extra constructor arguments)
        classifiers = (
            ('XGBoost', 'X', XGBClassifier, {'n_jobs': -1}),
            ('LightGBM', 'L', LGBMClassifier, {}),
            ('CatBoost', 'C', CatBoostClassifier, {}),
            ('RandomForest', 'R', RandomForestClassifier, {}),
            ('ExtraTrees', 'E', ExtraTreesClassifier, {}),
        )
        for name, initial, estimator, extra in classifiers:
            if name in model_list:
                clf = estimator(**learner_param[name][set_name], **extra)
                clf.fit(train_x, train_y)
                # NOTE(review): column 0 of predict_proba is exported here while the
                # network exports column 1 -- confirm this asymmetry is intended
                outputs.append(pd.DataFrame({f'{initial}_{month}': clf.predict_proba(test_x)[:, 0]}))

    elif mode == 'R':

        regressors = (
            ('XGBoost', 'X', XGBRegressor, {'n_jobs': -1}),
            ('LightGBM', 'L', LGBMRegressor, {}),
            ('CatBoost', 'C', CatBoostRegressor, {}),
            ('RandomForest', 'R', RandomForestRegressor, {}),
            ('ExtraTrees', 'E', ExtraTreesRegressor, {}),
        )
        for name, initial, estimator, extra in regressors:
            if name in model_list:
                reg = estimator(**learner_param[name][set_name], **extra)
                reg.fit(train_x, train_y)
                outputs.append(pd.DataFrame({f'{initial}_{month}': reg.predict(test_x)}))

    return pd.concat(outputs, axis = 1) if outputs else pd.DataFrame()


# input testing data to the base learners and output the outcome
def transform_test(train_data, test_data, mode, base_param, add_origin = False):
    """Build the meta-learner test table from base-learner predictions on the test data.

    For every dataset key and every month found in `base_param`, the base
    learners are fitted on `train_data[month][key]` with that month's
    hyperparameters and predict `test_data` (restricted by `train_col` to match
    the training data).

    Parameters
    ----------
    train_data : mapping
        `{month: {set name: training dataset}}`.
    test_data : pandas.DataFrame
        Test dataset, passed to `train_col` / `label_divide` for every month.
    mode : str
        'C' for classification, 'R' for regression.
    base_param : dict
        `{month: {model name: {set name: hyperparameter dict}}}`. The model and
        set names are read from the first month / first model.
    add_origin : bool, default False
        When True the original test features are appended after the
        prediction columns.

    Returns
    -------
    dict
        `{set name: DataFrame}`. Columns are the prediction columns of every
        month (in `base_param` order), then the original features when
        `add_origin` is set, then the label. The index is a fresh RangeIndex.
    """
    month_list = list(base_param.keys())
    model_list = list(base_param[month_list[0]].keys())
    set_list = list(base_param[month_list[0]][model_list[0]].keys())
    test_dict = {}
    for i in tqdm(set_list):

        month_frames = []
        for month in tqdm(month_list):

            select_test = train_col(train_data[month][i], test_data)
            train_x, train_y, test_x, test_y = label_divide(train_data[month][i], select_test, train_only = False)
            month_frames.append(_holdout_base_predictions(mode, model_list, base_param[month], i, month,
                                                          train_x, train_y, test_x, test_y))

        # concatenate once instead of growing a frame inside the loop
        month_test = pd.concat(month_frames, axis = 1) if month_frames else pd.DataFrame()

        # Predictions sit on a fresh RangeIndex. Reset the features / label (taken
        # from the last month's split) so the column-wise concat aligns by position
        # even when the test data carries a non-default index.
        test_label = test_y.reset_index(drop = True)
        if add_origin:
            test_dict[i] = pd.concat([month_test, test_x.reset_index(drop = True), test_label], axis = 1)
        else:
            test_dict[i] = pd.concat([month_test, test_label], axis = 1)

    return test_dict

### Full Experiment

In [None]:
def full_stackingcv2(train_month, times):
    """Run the whole StackingCV experiment `times` times and score the averaged prediction.

    Each repetition resamples the training data, tunes the base learners,
    turns train / test data into base-learner predictions, tunes the
    meta-learner and stores its test-set output. The outputs of all
    repetitions are then averaged per dataset and thresholded at 0.5.

    Parameters
    ----------
    train_month : iterable of int
        Month numbers; `relabel_runhist_m<k>.csv` and `kind_m<k>.csv` are read
        from the working directory for each of them.
    times : int
        Number of repetitions (at least 1), used to average out the randomness
        of resampling.

    Returns
    -------
    pandas.DataFrame
        The `cf_matrix` tables of every dataset stacked row-wise; each row
        originally labelled 0 is relabelled 'data<dataset key>'.

    Raises
    ------
    ValueError
        If `times` is smaller than 1 (there would be nothing to average).
    """
    if times < 1:
        raise ValueError('times must be at least 1')

    # load relabel datasets
    runhist = {}
    kinds = {}
    for month in train_month:
        runhist[f'm{month}'] = pd.read_csv(f'relabel_runhist_m{month}.csv', index_col = 'id').iloc[:, 1:]
        kinds[f'm{month}'] = pd.read_csv(f'kind_m{month}.csv').iloc[:, 2:-3]

    # per dataset: one 'predict' column for every repetition
    prob_dict = {}

    # do several times to average the random effect of resampling
    for run in tqdm(range(times)):
        # generate resampled datasets
        resampling_dataset(runhist = runhist, kinds = kinds, train_month = train_month, final_br = 1, num_os = 10)

        # load & prepare the resampled datasets
        data_dict, _, _ = multiple_month(train_month, num_set = 10, filename = 'dataset')
        all_train = multiple_set(num_set = 10)
        _, all_train_y = train_set(all_train)
        all_test = pd.read_csv('test_runhist.csv').iloc[:, 2:]
        _, all_test_y = label_divide(all_test, None, 'GB', train_only = True)

        # optimization for each month of data
        base_param = optimize_base(train_data = data_dict,
                                   mode = 'C',
                                   TPE_multi = False,
                                   base_list = ['LightGBM', 'XGBoost', 'NeuralNetwork'],
                                   iter_dict = {'LightGBM': 25, 'NeuralNetwork': 10, 'XGBoost': 25, 'CatBoost': 25,
                                                'RandomForest': 20, 'ExtraTrees': 20},
                                   filename = f'runhist_array_m2m4_m5_3criteria_scheme2-{run}')

        # data transformation
        trans_train = transform_train(all_train, mode = 'C', base_param = base_param, cv = 5, add_origin = True)
        trans_test = transform_test(data_dict, all_test, mode = 'C', base_param = base_param, add_origin = True)
        for k in trans_train.keys():
            trans_train[k] = train_col(trans_test[k], trans_train[k])
        trans_train_x, trans_train_y = train_set(trans_train)
        trans_test_x, trans_test_y = train_set(trans_test)
        # NOTE(review): an empty 'set0' entry is added only after the x / y split;
        # presumably all_optuna expects that key -- confirm
        trans_train['set0'] = {}

        # searching for hyperparameters
        best_param, _ = all_optuna(all_data = trans_train,
                                   mode = 'C',
                                   TPE_multi = False,
                                   n_iter = 10,
                                   filename = f'runhist_array_m2m4_m5_3criteria_StackingCV2-{run}',
                                   creator = stackingCV_creator)

        # store the probability predicted by the classifier
        for j in best_param.keys():
            table, _ = LR(trans_train_x[j], trans_test_x[j], trans_train_y[j], trans_test_y[j], best_param[j],
                          return_prob = True)
            prob_dict.setdefault(j, []).append(table[['predict']])

    # average to get final prediction
    result_df = pd.DataFrame()
    for j in best_param.keys():
        # Use the mean, not the sum: summing `times` probabilities before comparing
        # with 0.5 would shrink the effective threshold to 0.5 / times.
        mean_prob = pd.concat(prob_dict[j], axis = 1).mean(axis = 1)
        prediction = (mean_prob >= 0.5).astype(int)
        result = pd.DataFrame(dict(truth = all_test_y, predict = prediction))
        table = cf_matrix(result, all_train_y[j])
        result_df = pd.concat([result_df, table]).rename(index = {0: f'data{j}'})

    return result_df

## Prediction

### Full Experiment

In [None]:
# range(2, 5) selects months 2, 3 and 4 as the training window;
# the whole experiment is repeated 3 times.
training_month = range(2, 5)
table_setC = full_stackingcv2(train_month = training_month, times = 3)

In [None]:
# plot the result table returned by full_stackingcv2
line_chart(table_setC, title = 'StackingCV Scheme 2 Classifier')
# leave the table as the last expression so the notebook renders it below the chart
table_setC

### Export

In [None]:
savedate = '20220601'
# Only used to label the exported rows; full_stackingcv2 passes TPE_multi = False
# to the optimizers itself, so keep the two in sync by hand.
TPE_multi = False

table_setC['sampler'] = 'multivariate-TPE' if TPE_multi else 'univariate-TPE'
table_setC['model'] = 'StackingCV2'

export_path = f'{savedate}_Classifier.xlsx'
# Append mode needs an existing workbook, so create the file on the first export
# instead of failing because it is missing.
writer_mode = 'a' if os.path.exists(export_path) else 'w'
with pd.ExcelWriter(export_path, mode = writer_mode) as writer:
    table_setC.to_excel(writer, sheet_name = 'StackingCV2')