# 1. Import, Setting

## (1) Libraries

In [None]:
# Basic Library
import os
import pandas as pd
import numpy as np
import pickle
import warnings
warnings.filterwarnings('ignore')

# TensorFlow picks up TF_CPP_MIN_LOG_LEVEL when its native library is loaded,
# so the variable must be set BEFORE `import tensorflow` to silence C++ logs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Preprocessing
from sklearn.preprocessing import OneHotEncoder

# HP Tuning
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from optuna.visualization import plot_contour, plot_optimization_history
from optuna.visualization import plot_parallel_coordinate, plot_slice, plot_param_importances

# Modeling
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, log_loss
from lightgbm import LGBMClassifier
import xgboost as xgb
import tensorflow as tf
import keras
from tensorflow.keras.layers import Conv2D, MaxPooling2D, BatchNormalization, Dropout, Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

## (2) Data

In [None]:
# Load the competition files from ../data: training set, test set, and the
# sample-submission template that is filled in at the end of the notebook.
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")
submission = pd.read_csv("../data/sample_submission.csv")

## (3) Variables

In [None]:
# Per-run prediction stores, keyed by '<model><seed>' (filled by the training loops below).
pred_dict = {}  # out-of-fold predictions on the training rows
pred_test_dict = {}  # fold-averaged predictions on the test rows

In [None]:
rows_train = 2335 # number of rows in the provided train data
rows_test = 9555 # number of rows in the provided test data
classes = 4 # number of classes in the provided data
num_trial = 20 # number of trials per hyperparameter-tuning study
splits_hp = 5 # number of k-fold splits used during hyperparameter tuning
splits_tr = 15 # number of k-fold splits used during model training
basic_seed = 42 # default seed
num_seed = 10 # number of training seeds
sel_seed = 3 # number of seeds to select

## (4) Folder

In [None]:
def create_dir(dir):
    """Create directory `dir` (including parents) unless it already exists.

    Prints a one-line status message in either case and returns None.
    """
    if os.path.exists(dir):
        print("Directory already existed :", dir)
        return
    os.makedirs(dir)
    print("Created Directory :", dir)
# Create the output folders next to the notebook's parent directory
# (only prints a message when a folder is already there).
create_dir("../pkl")
create_dir("../model")
create_dir("../submission")

# 2. Modeling

## (1) LightGBM

In [None]:
# Positional feature/label split: the first column is dropped from both frames
# (presumably an id column — TODO confirm) and the last train column is the label.
train_x = train.iloc[:, 1:-1]
train_y = train.iloc[:, -1:]  # kept as a one-column DataFrame
test_x = test.iloc[:, 1:]

### a. Hyperparameter Tuning

In [None]:
def lgb_objective(trial: Trial) -> float:
    """Optuna objective for LightGBM: out-of-fold accuracy of one trial.

    Samples a hyperparameter set from `trial`, fits one LGBMClassifier per
    stratified fold (`splits_hp` folds, early stopping on the held-out fold),
    and scores the stitched out-of-fold predictions against `train_y`.

    Reads the notebook globals `train_x`, `train_y`, `rows_train`, `classes`,
    `splits_hp` and `basic_seed`.

    Returns:
        Accuracy of the arg-max out-of-fold predictions (to be maximised).
    """
    # The suggest_* calls are kept in their original order.
    search_space = {
        "random_state": basic_seed,
        "verbosity": -1,
        "n_estimators": 10000,
        "objective": "multiclass",
        "metric": "multi_logloss",
        'learning_rate': trial.suggest_uniform("learning_rate", 0.005, 0.05),
        "reg_alpha": trial.suggest_uniform("reg_alpha", 0.0, 1),
        "reg_lambda": trial.suggest_uniform("reg_lambda", 0.0, 1),
        "max_depth": trial.suggest_int("max_depth", 5, 8),
        "num_leaves": trial.suggest_int("num_leaves", 200, 1200),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0), # feature_fraction
        "subsample": trial.suggest_uniform("subsample", 0.0, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 10),
        "max_bin": trial.suggest_int("max_bin", 150, 400),
    }

    splitter = StratifiedKFold(n_splits=splits_hp, random_state=basic_seed, shuffle = True)
    oof_proba = np.zeros((rows_train, classes))

    for fit_rows, holdout_rows in splitter.split(train_x, train_y):
        fold_x = train_x.iloc[fit_rows]
        holdout_x = train_x.iloc[holdout_rows]
        fold_y = train_y.iloc[fit_rows].values.ravel()
        holdout_y = train_y.iloc[holdout_rows].values.ravel()

        booster = LGBMClassifier(**search_space)
        # Change verbose from -1 to 100 to watch training progress.
        booster.fit(fold_x, fold_y, eval_set=[(holdout_x, holdout_y)], early_stopping_rounds=30, verbose=-1)
        oof_proba[holdout_rows, :] = booster.predict_proba(holdout_x)

    oof_accuracy = accuracy_score(train_y, np.argmax(oof_proba, axis=1))
    print('accuracy_score:', oof_accuracy)
    return oof_accuracy

In [None]:
# Run the LightGBM hyperparameter search (TPE sampler seeded with basic_seed).
sampler = TPESampler(seed=basic_seed)
lgb_study = optuna.create_study(study_name="lgb_parameter_opt", direction="maximize", sampler=sampler)
lgb_study.optimize(lgb_objective, n_trials=num_trial)

# Start from the best trial's sampled values, then add the fixed settings.
# The tuned reg_alpha/reg_lambda values are re-keyed as lambda_l1/lambda_l2
# and the original keys are set to None.
lgb_best_hyperparams = lgb_study.best_trial.params
lgb_base_hyperparams = {'objective':'multiclass', 'n_estimators':10000,
                        'lambda_l1':lgb_best_hyperparams['reg_alpha'],
                        'lambda_l2':lgb_best_hyperparams['reg_lambda'],
                        'reg_alpha':None, 'reg_lambda':None
                       }
lgb_best_hyperparams.update(lgb_base_hyperparams)
print("The best hyperparameters are:\n", lgb_best_hyperparams)

In [None]:
# Hyperparameter importances for the LightGBM study (matplotlib backend).
optuna.visualization.matplotlib.plot_param_importances(lgb_study);

In [None]:
# Per-parameter slice plot of the LightGBM study's trials.
optuna.visualization.matplotlib.plot_slice(lgb_study);

### b. Training

In [None]:
# Draw the per-run seeds; each seed identifies one run in pred_dict / pred_test_dict.
lucky_seeds = np.random.randint(0, 1000, num_seed)

for i, seed in enumerate(lucky_seeds):

    # The fold assignment stays fixed (basic_seed); only the model's randomness
    # varies with the run's seed. Increase splits_tr to use more folds.
    kfold = StratifiedKFold(n_splits=splits_tr, random_state=basic_seed, shuffle=True)
    cv = np.zeros((rows_train, classes))
    pred_test = np.zeros((rows_test, classes), dtype=float)

    # Apply the run's seed to the model; without it every run would train
    # the exact same models and the seed-based selection would be meaningless.
    seeded_params = {**lgb_best_hyperparams, 'random_state': int(seed)}

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx].values.ravel(), train_y.iloc[val_idx].values.ravel()

        lgbmodel = LGBMClassifier(**seeded_params)
        lgbmodel.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=None)

        # Out-of-fold predictions for the held-out rows; the test prediction is
        # the average over the splits_tr fold models.
        cv[val_idx, :] = lgbmodel.predict_proba(x_val)
        pred_test += lgbmodel.predict_proba(test_x) / splits_tr

    pred_dict['lgb'+str(seed)] = cv
    pred_test_dict['lgb'+str(seed)] = pred_test
    print(f'seed {seed}', 'accuracy_score :', accuracy_score(train_y, np.argmax(cv, axis=1)))

## (2) XGBoost

In [None]:
# Re-derive the tabular feature/label frames for XGBoost
# (same positional column slices as in the LightGBM section).
train_x = train.iloc[:, 1:-1]
train_y = train.iloc[:, -1:]
test_x = test.iloc[:, 1:]

### a. Hyperparameter Tuning

In [None]:
def xgb_objective(trial: Trial) -> float:
    """Optuna objective for XGBoost: out-of-fold accuracy of one trial.

    Samples a hyperparameter set from `trial`, trains one booster per
    stratified fold (`splits_hp` folds, early stopping on the held-out fold),
    and scores the stitched out-of-fold predictions against `train_y`.

    Reads the notebook globals `train_x`, `train_y`, `rows_train`, `classes`,
    `splits_hp` and `basic_seed`.

    Returns:
        Accuracy of the arg-max out-of-fold predictions (to be maximised).
    """
    params_xgb = {
        "random_state": basic_seed,
        "verbose": None,
        "num_class": classes,
        "objective": "multi:softprob",
        "eval_metric": "mlogloss",
        "learning_rate": trial.suggest_uniform("learning_rate", 0.005, 0.05),
        "reg_alpha": trial.suggest_uniform("reg_alpha", 0.0, 0.1), # default=0
        "reg_lambda": trial.suggest_uniform("reg_lambda", 0.0, 0.1), # default=1
        "max_depth": trial.suggest_int("max_depth", 8, 15),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0), # default=1
        "colsample_bylevel": trial.suggest_uniform("colsample_bylevel", 0.5, 1.0),
        "subsample": trial.suggest_uniform("subsample", 0.5, 1.0), # default=1
        "min_child_weight": trial.suggest_uniform("min_child_weight", 1, 5), # default=1
        "max_bin": trial.suggest_int("max_bin", 200, 500),
    }

    kfold = StratifiedKFold(n_splits=splits_hp, random_state=basic_seed, shuffle=True)
    # The out-of-fold buffer needs one row per TRAINING sample; sizing it with
    # rows_test made accuracy_score fail on mismatched lengths.
    cv = np.zeros((rows_train, classes))

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

        dtrain = xgb.DMatrix(x_train, label=y_train)
        dvalid = xgb.DMatrix(x_val, label=y_val)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
        # Change verbose_eval from None to 100 to watch training progress.
        xgbmodel = xgb.train(params_xgb, dtrain, 100000, watchlist, early_stopping_rounds=30, verbose_eval=None)
        cv[val_idx, :] = xgbmodel.predict(dvalid)

    score = accuracy_score(train_y, np.argmax(cv, axis=1))
    print('accuracy_score:', score)

    return score

In [None]:
# Run the XGBoost hyperparameter search (TPE sampler seeded with basic_seed).
sampler = TPESampler(seed=basic_seed)
xgb_study = optuna.create_study(study_name="xgb_parameter_opt", direction="maximize", sampler=sampler)
xgb_study.optimize(xgb_objective, n_trials=num_trial)

# Best sampled values plus the fixed (non-tuned) settings used for training.
xgb_best_hyperparams = xgb_study.best_trial.params
xgb_base_hyperparams = {'objective':'multi:softprob', "num_class": classes, "eval_metric": "mlogloss", "random_state": basic_seed}
xgb_best_hyperparams.update(xgb_base_hyperparams)
print("The best hyperparameters are:\n", xgb_best_hyperparams)

In [None]:
# Hyperparameter importances for the XGBoost study (matplotlib backend).
optuna.visualization.matplotlib.plot_param_importances(xgb_study);

In [None]:
# Per-parameter slice plot of the XGBoost study's trials.
optuna.visualization.matplotlib.plot_slice(xgb_study);

In [None]:
# Draw the per-run seeds; each seed identifies one run in pred_dict / pred_test_dict.
lucky_seeds = np.random.randint(0, 100, num_seed)
xgtest = xgb.DMatrix(test_x)
for i, seed in enumerate(lucky_seeds):

    # The fold assignment stays fixed (basic_seed); only the booster's randomness
    # varies with the run's seed. Increase splits_tr to use more folds.
    kfold = StratifiedKFold(n_splits=splits_tr, random_state=basic_seed, shuffle = True)
    cv = np.zeros((rows_train, classes))
    pred_test = np.zeros((rows_test, classes), dtype=float)

    # Override the tuned params' fixed random_state with this run's seed;
    # otherwise all runs train identical boosters and the seed loop is a no-op.
    seeded_params = {**xgb_best_hyperparams, "random_state": int(seed)}

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx].values.ravel(), train_y.iloc[val_idx].values.ravel()

        dtrain = xgb.DMatrix(x_train, label=y_train)
        dvalid = xgb.DMatrix(x_val, label=y_val)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

        # Change verbose_eval from None to 100 to watch training progress.
        xgbmodel = xgb.train(seeded_params, dtrain, 100000, watchlist, early_stopping_rounds=30, verbose_eval=None)

        cv[val_idx, :] = xgbmodel.predict(dvalid)
        # Averaged over the fold models, so the divisor must match n_splits.
        pred_test += xgbmodel.predict(xgtest) / splits_tr

    pred_dict['xgb'+str(seed)] = cv
    pred_test_dict['xgb'+str(seed)] = pred_test
    print(f'seed {seed}', 'accuracy_score :', accuracy_score(train_y, np.argmax(cv, axis=1)))

## (3) CNN

### a. Hyperparameter Tuning

In [None]:
# Rebuild the feature matrices for the CNN (same positional column slices as before).
train_x = train.iloc[:, 1:-1]
test_x = test.iloc[:, 1:]

# Reshape every row into an 8x4 single-channel grid; this requires exactly
# 32 feature values per row.
train_x = np.array(train_x).reshape(-1, 8, 4, 1)
test_x = np.array(test_x).reshape(-1, 8, 4, 1)

# One-hot encode the target for the categorical_crossentropy loss.
# NOTE: from here on train_y is a dense one-hot array, not a DataFrame.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output` —
# confirm against the installed version.
ohe = OneHotEncoder(sparse = False)
train_y = ohe.fit_transform(train[['target']])

In [None]:
def create_model(num_layer, mid_units, num_filters):
    """Build an uncompiled Sequential CNN for inputs of shape (8, 4, 1).

    Architecture: `num_layer` Conv2D(2x2, elu) + BatchNormalization pairs
    (the first convolution uses default padding, the rest use "same"),
    global average pooling, a Dense(relu) layer, and a softmax output with
    `classes` units.

    Args:
        num_layer: total number of convolution layers.
        mid_units: width of the hidden Dense layer.
        num_filters: filter count per convolution layer, indexed by layer.

    Returns:
        The assembled (not yet compiled) Sequential model.
    """
    # First conv block carries the input shape.
    stack = [
        Conv2D(filters=num_filters[0], kernel_size=(2, 2),
               activation="elu",
               input_shape=(8, 4, 1)),
        BatchNormalization(),
    ]

    # Remaining conv blocks keep the spatial size via "same" padding.
    for depth in range(1, num_layer):
        stack.append(Conv2D(filters=num_filters[depth], kernel_size=(2, 2), padding="same", activation="elu"))
        stack.append(BatchNormalization())

    # Classification head.
    stack.append(GlobalAveragePooling2D())
    stack.append(Dense(mid_units, activation='relu'))
    stack.append(Dense(classes, activation='softmax'))

    model = Sequential()
    for layer in stack:
        model.add(layer)

    return model

In [None]:
def cnn_objective(trial: Trial) -> float:
    """Optuna objective for the CNN: out-of-fold accuracy of one trial.

    Samples the architecture (layer count, dense width, per-layer filters)
    and the Adam learning rate from `trial`, trains one model per stratified
    fold (`splits_hp` folds) with early stopping and best-epoch checkpointing
    on `val_acc`, and scores the stitched out-of-fold predictions.

    Reads the notebook globals `train`, `train_x`, `train_y` (one-hot),
    `rows_train`, `classes`, `splits_hp` and `basic_seed`.

    Returns:
        Accuracy of the arg-max out-of-fold predictions (to be maximised).
    """
    # Drop models left over from previous trials.
    keras.backend.clear_session()

    # Architecture search space: number of conv layers, dense width, filters per layer.
    num_layer = trial.suggest_int("num_layer", 2, 3)
    mid_units = int(trial.suggest_discrete_uniform("mid_units", 30, 150, 10))
    num_filters = [
        int(trial.suggest_discrete_uniform("num_filter_"+str(i), 16, 256, 16))
        for i in range(num_layer)
    ]

    # Adam learning rate; a trial returns the same value for a repeated name,
    # so sampling it once here matches sampling it inside every fold.
    learning_rate = trial.suggest_uniform("learning_rate", 0.0005, 0.005)

    splitter = StratifiedKFold(n_splits=splits_hp, random_state = basic_seed, shuffle = True)
    es = EarlyStopping(monitor="val_acc", patience=5, mode="max", verbose=0)
    oof_proba = np.zeros((rows_train, classes))

    for fold_no, (fit_rows, holdout_rows) in enumerate(splitter.split(train_x, train.target)):
        fold_x, fold_y = train_x[fit_rows], train_y[fit_rows]
        holdout_x, holdout_y = train_x[holdout_rows], train_y[holdout_rows]

        # Keep only the best epoch (by val_acc) of this fold on disk.
        checkpoint_path = f"../model_{fold_no+1}.h5"
        mc = ModelCheckpoint(checkpoint_path, save_best_only=True, monitor="val_acc", mode="max", verbose=0)

        model = create_model(num_layer, mid_units, num_filters)
        model.compile(optimizer=Adam(learning_rate=learning_rate),
                      loss="categorical_crossentropy",
                      metrics=["acc"])
        model.fit(fold_x, fold_y, validation_data=(holdout_x, holdout_y), epochs=100, batch_size=32,
                  callbacks=[es,mc], verbose=None)

        # Predict the held-out rows with the checkpointed best model.
        best = load_model(checkpoint_path)
        oof_proba[holdout_rows, :] = best.predict(holdout_x)

    oof_accuracy = accuracy_score(np.argmax(train_y, axis=1), np.argmax(oof_proba, axis=1))
    print('accuracy_score:', oof_accuracy)

    return oof_accuracy

In [None]:
# Run the CNN hyperparameter search (TPE sampler seeded with basic_seed)
# and keep the best trial's sampled values.
sampler = TPESampler(seed=basic_seed)
cnn_study = optuna.create_study(study_name="cnn_parameter_opt", direction="maximize", sampler=sampler)
cnn_study.optimize(cnn_objective, n_trials=num_trial)
cnn_best_hyperparams = cnn_study.best_trial.params
print("The best hyperparameters are:\n", cnn_best_hyperparams)

In [None]:
# Hyperparameter importances for the CNN study (matplotlib backend).
optuna.visualization.matplotlib.plot_param_importances(cnn_study);

In [None]:
# Per-parameter slice plot of the CNN study's trials.
optuna.visualization.matplotlib.plot_slice(cnn_study);

In [None]:
# Draw the per-run seeds; each seed identifies one run in pred_dict / pred_test_dict.
lucky_seeds = np.random.randint(0, 100, num_seed)

# suggest_discrete_uniform stores floats in best_params; layer sizes must be
# integers, so cast them once here exactly as cnn_objective does.
best_cnn_params = cnn_study.best_params
best_num_layer = int(best_cnn_params['num_layer'])
best_mid_units = int(best_cnn_params['mid_units'])
best_num_filters = [int(best_cnn_params[f'num_filter_{layer}']) for layer in range(best_num_layer)]

for i, seed in enumerate(lucky_seeds):

    # Release the models built for the previous run, then apply this run's
    # seed so the seed in the dictionary key actually drives the run.
    tf.keras.backend.clear_session()
    tf.random.set_seed(int(seed))

    # The fold assignment stays fixed (basic_seed). Increase splits_tr to use more folds.
    kfold = StratifiedKFold(n_splits=splits_tr, random_state=basic_seed, shuffle = True)
    cv = np.zeros((rows_train, classes))
    pred_test = np.zeros((rows_test, classes), dtype=float)
    es = EarlyStopping(monitor="val_acc", patience=5, mode="max", verbose=0)

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train.target)):

        x_train, y_train = train_x[train_idx], train_y[train_idx]
        x_val, y_val = train_x[val_idx], train_y[val_idx]

        cnn = create_model(best_num_layer, best_mid_units, best_num_filters)

        # A fresh ModelCheckpoint per fold; the file is overwritten each fold
        # and read back immediately after fitting.
        mc = ModelCheckpoint(f"../model_{i+1}.h5", save_best_only=True, monitor="val_acc", mode="max", verbose=0)

        # Compile and fit the model.
        optimizer = Adam(learning_rate=best_cnn_params['learning_rate'])
        cnn.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["acc"])
        cnn.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=100, batch_size=32,
                callbacks=[es,mc], verbose=0)

        # Predict with the best checkpointed model of this fold.
        best = load_model(f"../model_{i+1}.h5")
        cv[val_idx, :] = best.predict(x_val)
        pred_test += best.predict(test_x) / splits_tr

    pred_dict['cnn'+str(seed)] = cv
    pred_test_dict['cnn'+str(seed)] = pred_test
    print(f'seed {seed}', 'accuracy_score :', accuracy_score(np.argmax(train_y, axis=1), np.argmax(cv, axis=1)))

## 4. Stacking (XGB)

In [None]:
def sort_dict(model, pred_dict, pred_test_dict):
    """Pick the `sel_seed` best runs of one model family by out-of-fold accuracy.

    Args:
        model: family prefix of the dictionary keys ('lgb', 'xgb', 'cnn', ...).
        pred_dict: mapping '<family><seed>' -> out-of-fold probability array.
        pred_test_dict: mapping with the same keys -> test probability array.

    Returns:
        Two dicts restricted to the selected runs, in descending accuracy
        order: (out-of-fold predictions, test predictions). Both are empty
        when no key belongs to `model`.

    Raises:
        KeyError: if a selected run has no entry in `pred_test_dict`.
    """
    # Keys are built as '<family><seed>'. Match on the prefix: a substring test
    # would make 'cnn' also select every 'rcnn…' run.
    family_oof = {key: value for key, value in pred_dict.items() if key.startswith(model)}

    # Take the labels straight from `train`, not from the global `train_y`,
    # which is a one-hot array or a DataFrame depending on the cells run so far.
    true_labels = train['target'].to_numpy()

    ranked = sorted(
        family_oof.items(),
        key=lambda item: accuracy_score(true_labels, np.argmax(item[1], axis=1)),
        reverse=True)
    pred_dict_new_local = dict(ranked[:sel_seed])

    # Test predictions for exactly the selected runs, in the same order.
    pred_test_dict_new_local = {key: pred_test_dict[key] for key in pred_dict_new_local}

    return pred_dict_new_local, pred_test_dict_new_local

In [None]:
# Keep the top `sel_seed` runs of each model family.
# NOTE(review): none of the cells shown in this notebook add 'rcnn…' keys to
# pred_dict — confirm where the RCNN predictions are produced.
pred_dict_lgb, pred_test_dict_lgb = sort_dict('lgb', pred_dict, pred_test_dict)
pred_dict_xgb, pred_test_dict_xgb = sort_dict('xgb', pred_dict, pred_test_dict)
pred_dict_cnn, pred_test_dict_cnn = sort_dict('cnn', pred_dict, pred_test_dict)
pred_dict_rcnn, pred_test_dict_rcnn = sort_dict('rcnn', pred_dict, pred_test_dict)

In [None]:
def save_dict(model, pred_dict, pred_test_dict):
    """Pickle one family's selected predictions under ./pkl.

    Writes `pred_dict` to ./pkl/pred_dict_<model>.pickle and `pred_test_dict`
    to ./pkl/pred_test_dict_<model>.pickle; the ./pkl folder must already exist.
    """
    outputs = (
        ('./pkl/pred_dict_'+model+'.pickle', pred_dict),
        ('./pkl/pred_test_dict_'+model+'.pickle', pred_test_dict),
    )
    for path, payload in outputs:
        with open(path, 'wb') as sink:
            pickle.dump(payload, sink)

In [None]:
# create_dir is already defined in the "(4) Folder" section; reuse it rather
# than redefining an identical copy here.
# This creates the ./pkl folder that save_dict / load_dict read and write.
create_dir("pkl")

In [None]:
# Persist the selected runs of every model family to ./pkl.
save_dict('lgb', pred_dict_lgb, pred_test_dict_lgb)
save_dict('xgb', pred_dict_xgb, pred_test_dict_xgb)
save_dict('cnn', pred_dict_cnn, pred_test_dict_cnn)
save_dict('rcnn', pred_dict_rcnn, pred_test_dict_rcnn)

In [None]:
def load_dict(model):
    """Load one family's pickled predictions from ./pkl.

    Reads ./pkl/pred_dict_<model>.pickle and ./pkl/pred_test_dict_<model>.pickle
    and returns them as (out-of-fold predictions, test predictions).

    Only use this on pickle files you produced yourself: unpickling untrusted
    files can execute arbitrary code.
    """
    def _read(prefix):
        with open(prefix+model+'.pickle', 'rb') as source:
            return pickle.load(source)

    return _read('./pkl/pred_dict_'), _read('./pkl/pred_test_dict_')

In [None]:
# Reload the selected runs of every model family from ./pkl.
pred_dict_lgb, pred_test_dict_lgb = load_dict('lgb')
pred_dict_xgb, pred_test_dict_xgb = load_dict('xgb')
pred_dict_cnn, pred_test_dict_cnn = load_dict('cnn')
pred_dict_rcnn, pred_test_dict_rcnn = load_dict('rcnn')

In [None]:
# Merge all families into one mapping each; every entry becomes a group of
# input columns for the stacking model.
pred_dict_total = {**pred_dict_lgb, **pred_dict_xgb, **pred_dict_cnn, **pred_dict_rcnn}
pred_test_dict_total = {**pred_test_dict_lgb, **pred_test_dict_xgb, **pred_test_dict_cnn, **pred_test_dict_rcnn}

In [None]:
# Switch train_y back from the CNN section's one-hot array to the one-column
# label DataFrame that the stacking cells index with .iloc.
train_y = train.iloc[:, -1:]

In [None]:
def stack_objective(trial: Trial) -> float:
    """Optuna objective for the XGBoost stacker: out-of-fold accuracy of one trial.

    Samples a hyperparameter set from `trial`, trains one booster per
    stratified fold (`splits_hp` folds, early stopping on the held-out fold)
    on the stacked base-model predictions `X_train`, and scores the stitched
    out-of-fold predictions against `train_y`.

    Reads the notebook globals `X_train`, `train_y`, `rows_train`, `classes`,
    `splits_hp` and `basic_seed`; `X_train` must be defined before the study
    is run.

    Returns:
        Accuracy of the arg-max out-of-fold predictions (to be maximised).
    """
    params_xgb = {
        "random_state": basic_seed,
        "verbose": None,
        "num_class": classes,
        "objective": "multi:softprob",
        "eval_metric": "mlogloss",
        "learning_rate": trial.suggest_uniform("learning_rate", 0.0005, 0.05),
        "reg_alpha": trial.suggest_uniform("reg_alpha", 0.1, 1.0),
        "reg_lambda": trial.suggest_uniform("reg_lambda", 0.1, 1.0),
        "max_depth": trial.suggest_int("max_depth", 6, 10),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.3, 1.0),
        "colsample_bylevel": trial.suggest_uniform("colsample_bylevel", 0.3, 1.0),
        "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
        "gamma": trial.suggest_uniform("gamma", 0.0, 0.5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "max_bin": trial.suggest_int("max_bin", 100, 400),
    }

    kfold = StratifiedKFold(n_splits=splits_hp, random_state=basic_seed, shuffle=True)
    cv = np.zeros((rows_train, classes))

    # Split on X_train, the matrix that is actually indexed below, instead of
    # the unrelated global `train_x` (at this point the CNN input tensor).
    for n, (train_idx, val_idx) in enumerate(kfold.split(X_train, train_y)):

        x_train, x_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

        dtrain = xgb.DMatrix(x_train, label=y_train)
        dvalid = xgb.DMatrix(x_val, label=y_val)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
        # Change verbose_eval from None to 100 to watch training progress.
        stack_xgbmodel = xgb.train(params_xgb, dtrain, 100000, watchlist, early_stopping_rounds=30, verbose_eval=None)
        cv[val_idx, :] = stack_xgbmodel.predict(dvalid)

    score = accuracy_score(train_y, np.argmax(cv, axis=1))
    print('accuracy_score:', score)

    return score

In [None]:
# Stacking inputs: the selected base-model probability arrays placed side by
# side (train = out-of-fold predictions, test = fold-averaged predictions).
X_train = pd.DataFrame(np.hstack([x for _, x in pred_dict_total.items()]))
X_test = pd.DataFrame(np.hstack([x for _, x in pred_test_dict_total.items()]))

# Run the stacker's hyperparameter search (TPE sampler seeded with basic_seed).
sampler = TPESampler(seed=basic_seed)
stack_study = optuna.create_study(study_name="stack_parameter_opt", direction="maximize", sampler=sampler)
stack_study.optimize(stack_objective, n_trials=num_trial)

# Best sampled values plus the fixed (non-tuned) settings used for training.
stack_best_hyperparams = stack_study.best_trial.params
stack_base_hyperparams = {'objective':'multi:softprob', "num_class": classes, "eval_metric": "mlogloss", "random_state": basic_seed}
stack_best_hyperparams.update(stack_base_hyperparams)
print("The best hyperparameters are:\n", stack_best_hyperparams)

In [None]:
# Hyperparameter importances for the stacking study (matplotlib backend).
optuna.visualization.matplotlib.plot_param_importances(stack_study);

In [None]:
# Per-parameter slice plot of the stacking study's trials.
optuna.visualization.matplotlib.plot_slice(stack_study);

In [None]:
# Train the stacker with the tuned hyperparameters: out-of-fold predictions go
# into `pred`, the fold-averaged test predictions into `pred_test`.
pred = np.zeros((rows_train, classes), dtype=float)
pred_test = np.zeros((rows_test, classes), dtype=float)
kfold = StratifiedKFold(n_splits=splits_tr, random_state=basic_seed, shuffle = True)

# The test matrix is the same for every fold, so build its DMatrix once.
dtest = xgb.DMatrix(X_test)

for n, (train_idx, val_idx) in enumerate(kfold.split(X_train, train_y)):
    x_train, x_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

    dtrain = xgb.DMatrix(x_train, label=y_train)
    dvalid = xgb.DMatrix(x_val, label=y_val)
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
    # Change verbose_eval from None to 100 to watch training progress.
    stack_xgbmodel = xgb.train(stack_best_hyperparams, dtrain, 100000, watchlist, early_stopping_rounds=30, verbose_eval=None)

    pred[val_idx, :] = stack_xgbmodel.predict(dvalid)
    pred_test += stack_xgbmodel.predict(dtest) / splits_tr

In [None]:
# Cross-validated quality of the stacker, measured on its out-of-fold predictions.
print(f'CV Log Loss: {log_loss(train_y, pred):.6f}')
print(f'CV Accuracy Score: {accuracy_score(train_y, np.argmax(pred, axis=1)):.6f}')

## 5. Blending

In [None]:
# Keep copies of the stacker's outputs under blending-specific names.
stack_train = pred.copy()
stack_test = pred_test.copy()

In [None]:
from itertools import permutations

# Candidate integer weights. permutations() only yields tuples of five
# DISTINCT weights, so equal weights for two sources are never tried.
candidate = [0,1,2,3,4,5,6,7,8,9,10]

# The seed-averaged out-of-fold predictions do not depend on the weights, so
# compute them once instead of in each of the 55,440 iterations.
# An empty family dict yields 0 here and simply contributes nothing.
mean_oof_lgb = sum(pred_dict_lgb.values())/sel_seed
mean_oof_xgb = sum(pred_dict_xgb.values())/sel_seed
mean_oof_cnn = sum(pred_dict_cnn.values())/sel_seed
mean_oof_rcnn = sum(pred_dict_rcnn.values())/sel_seed

# Weight order: (lgb, xgb, cnn, rcnn, stack).
score = {}
for weights in permutations(candidate, 5):
    pred_final = (mean_oof_lgb * weights[0] +
                  mean_oof_xgb * weights[1] +
                  mean_oof_cnn * weights[2] +
                  mean_oof_rcnn * weights[3] +
                  stack_train * weights[4])
    score[weights] = accuracy_score(train_y, np.argmax(pred_final, axis=1))

# Show the five weight tuples with the highest out-of-fold accuracy.
score = dict(sorted(score.items(), key=lambda x: x[1], reverse=True)[:5])
score

In [None]:
# Blend the seed-averaged out-of-fold predictions of each family with the
# stacker's out-of-fold predictions using fixed weights, then report accuracy.
# NOTE(review): the stack weight (15) lies outside the 0-10 candidate range
# searched above — presumably chosen by hand; confirm.
pred_final = (sum(pred_dict_lgb.values())/sel_seed * 0 +
              sum(pred_dict_xgb.values())/sel_seed * 1 +
              sum(pred_dict_cnn.values())/sel_seed * 3 +
              sum(pred_dict_rcnn.values())/sel_seed * 2 +
                               stack_train * 15)
accuracy_score(train_y, np.argmax(pred_final, axis=1))

In [None]:
# Apply the same blend weights to the TEST predictions.
# The rcnn term must use pred_test_dict_rcnn: the train-side pred_dict_rcnn has
# rows_train rows and cannot be added to test-sized arrays.
# The per-family average divides by sel_seed, as the out-of-fold blend does.
pred_test_final = (sum(pred_test_dict_lgb.values())/sel_seed * 0 +
                   sum(pred_test_dict_xgb.values())/sel_seed * 1 +
                   sum(pred_test_dict_cnn.values())/sel_seed * 3 +
                   sum(pred_test_dict_rcnn.values())/sel_seed * 2 +
                   stack_test * 15)

## 6. Submission

In [None]:
# Write the submission file: the predicted class is the arg-max of the blended
# test probabilities; the file name is '<submission_name>-<submission_number>.csv'.
submission_name = '20220318'
submission_number = '3'
submission['target'] = np.argmax(pred_test_final, axis=1)
submission.to_csv(f'../submission/{submission_name}-{submission_number}.csv', index = False)
# Sanity check: distribution of the predicted classes.
submission.target.value_counts()