# Import

In [None]:
import pandas as pd
import numpy as np
import os
import pickle
from glob import glob
from itertools import permutations
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, log_loss

import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from optuna.visualization import plot_contour, plot_optimization_history
from optuna.visualization import plot_parallel_coordinate, plot_slice, plot_param_importances

from sklearn.neural_network import MLPClassifier
from catboost import Pool, CatBoostClassifier
from lightgbm import LGBMClassifier
import xgboost as xgb

In [None]:
# Load the competition data; the "id" column becomes the index of train and test.
train = pd.read_csv("../data/train.csv", index_col="id")
test = pd.read_csv("../data/test.csv", index_col="id")
# Submission template; its "target" column is overwritten with predictions further down.
submission = pd.read_csv("../data/sample_submission.csv")

In [None]:
def create_dir(dir):
    """Make sure the directory ``dir`` exists and print what happened.

    Missing parent directories are created as well. Nothing is returned;
    a one-line status message is printed in both cases.
    """
    if os.path.exists(dir):
        print("Directory already existed :", dir)
        return
    os.makedirs(dir)
    print("Created Directory :", dir)
# Create the output folders one level above the working directory.
for output_dir in ("../pickle", "../model", "../submission"):
    create_dir(output_dir)

In [None]:
# Raw text is the only feature; "target" holds the class label.
train_x = train['text']
train_y = train['target']
test_x = test['text']

In [None]:
rows_train = train.shape[0] # number of rows in the given train data
rows_test = test.shape[0] # number of rows in the given test data
num_classes = len(train_y.unique())
num_trial = 100 # number of hyperparameter-tuning trials
splits_hp = 3 # number of k-fold splits used during hyperparameter tuning
splits_tr = 15 # number of k-fold splits used during model training
basic_seed = 42 # default seed
num_seed_tr = 5 # number of training seeds
sel_seed = 4 # number of seeds to select

In [None]:
# Out-of-fold train predictions, keyed by model name + seed (filled by the training cells).
pred_dict = {}
# Fold-averaged test predictions, stored under the same keys.
pred_test_dict = {}

# LGB

In [None]:
def lgb_objective(trial: Trial) -> float:
    """Optuna objective: hold-out log-loss of LightGBM trained on TF-IDF text features.

    For every tuning seed the training data is shuffled into ``splits_hp``
    stratified folds, but only the first fold is evaluated, which keeps each
    trial to a single model fit. The TF-IDF vocabulary is fitted on the
    training part of the split only, so the hold-out fold does not leak into
    the features.

    Args:
        trial: Optuna trial used to sample the LightGBM hyperparameters.

    Returns:
        Mean multi-class log-loss over the evaluated hold-out folds
        (lower is better).
    """
    params_lgb = {
        "random_state": basic_seed,
        "verbosity": -1,
        "n_estimators": 10000,
        "objective": "multiclass",
        "metric": "multi_logloss",
        # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform.
        "learning_rate": trial.suggest_float("learning_rate", 4e-2, 1e-1),  # default=0.1, range=[0,1]
        "max_depth": trial.suggest_int("max_depth", 5, 12),  # default=-1
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-2, 1e+0, log=True),  # default=0
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-2, 1e+1, log=True),  # default=0
        "num_leaves": trial.suggest_int("num_leaves", 31, 4000),  # default=31, range=(1,131072]
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 0.8),  # feature_fraction, default=1
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),  # bagging_fraction, default=1, range=[0,1]
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 20),  # bagging_freq, default=0
        "min_child_samples": trial.suggest_int("min_child_samples", 20, 30),  # min_data_in_leaf, default=20
        "max_bin": trial.suggest_int("max_bin", 100, 400),
    }

    score_hp = []
    for seed_hp in [0]:
        kfold = StratifiedKFold(n_splits=splits_hp, random_state=seed_hp, shuffle=True)

        # Only the first fold of the split is used as the hold-out set.
        train_idx, val_idx = next(iter(kfold.split(train_x, train_y)))

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train = train_y.iloc[train_idx].values.ravel()
        y_val = train_y.iloc[val_idx].values.ravel()

        # Fit the vocabulary on the training part only. The test set is not
        # needed for tuning, so it is no longer transformed here.
        vectorizer = TfidfVectorizer(ngram_range=(1, 2))
        x_train = vectorizer.fit_transform(x_train)
        x_val = vectorizer.transform(x_val)

        lgbmodel = LGBMClassifier(**params_lgb)
        # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords are
        # not accepted by LightGBM >= 4.0 (callbacks are used there) - confirm
        # the pinned LightGBM version before upgrading.
        lgbmodel.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=-1)

        score_hp.append(log_loss(y_val, lgbmodel.predict_proba(x_val)))

    return float(np.mean(score_hp))

In [None]:
# Tune LightGBM with a seeded TPE sampler, minimising the objective's log-loss.
sampler = TPESampler(seed=basic_seed)
lgb_study = optuna.create_study(
    study_name="lgb_parameter_opt",
    direction="minimize",
    sampler=sampler,
)
lgb_study.optimize(lgb_objective, n_trials=num_trial)

# Fixed (non-tuned) settings are layered on top of the best sampled parameters.
lgb_base_hyperparams = dict(
    random_state=basic_seed,
    verbosity=-1,
    n_estimators=10000,
    objective="multiclass",
    metric="multi_logloss",
)
lgb_best_hyperparams = {**lgb_study.best_trial.params, **lgb_base_hyperparams}

# Persist the final parameter set so later sessions can skip the search.
with open('../pickle/lgb_best_hyperparams.pickle', 'wb') as fw:
    pickle.dump(lgb_best_hyperparams, fw)
print("The best hyperparameters are:\n", lgb_best_hyperparams)

In [None]:
# with open('../pickle/lgb_best_hyperparams.pickle', 'rb') as fw:
#     lgb_best_hyperparams = pickle.load(fw)

In [None]:
# Draw the fold-shuffling seeds WITHOUT replacement: a repeated seed would reuse
# the same 'lgb<seed>' key below and silently overwrite an earlier run.
# The seeds come from NumPy's global RNG, which the visible cells never seed,
# so they differ from run to run.
lucky_seeds = np.random.choice(1000, size=num_seed_tr, replace=False)

for seed in lucky_seeds:

    # The seed only changes how rows are assigned to folds.
    kfold = StratifiedKFold(n_splits=splits_tr, random_state=seed, shuffle=True)
    cv = np.zeros((rows_train, num_classes))  # out-of-fold probabilities for the train rows
    pred_test = np.zeros((rows_test, num_classes))  # fold-averaged test probabilities

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx].values.ravel(), train_y.iloc[val_idx].values.ravel()

        # Fit the TF-IDF vocabulary on the training part of the fold only.
        vectorizer = TfidfVectorizer(ngram_range=(1, 2))
        x_train = vectorizer.fit_transform(x_train)
        x_val = vectorizer.transform(x_val)
        x_test = vectorizer.transform(test_x)

        lgbmodel = LGBMClassifier(**lgb_best_hyperparams)

        # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords are
        # not accepted by LightGBM >= 4.0 - confirm the pinned LightGBM version.
        lgbmodel.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=-1)
        cv[val_idx, :] = lgbmodel.predict_proba(x_val)
        # Each fold contributes 1/splits_tr of the final test prediction.
        pred_test += lgbmodel.predict_proba(x_test) / splits_tr

        print(f'fold {n+1}', 'log_loss :', log_loss(y_val, cv[val_idx]))
        print(f'fold {n+1}', 'accuracy_score :', accuracy_score(y_val, np.argmax(cv[val_idx], axis=1)))

    pred_dict['lgb'+str(seed)] = cv
    pred_test_dict['lgb'+str(seed)] = pred_test
    print(f'seed {seed}', 'log_loss :', log_loss(train_y, cv))
    print(f'seed {seed}', 'accuracy_score :', accuracy_score(train_y, np.argmax(cv, axis=1)))

# MLP1

In [None]:
# Draw the fold-shuffling seeds WITHOUT replacement: a repeated seed would reuse
# the same 'mlp1<seed>' key below and silently overwrite an earlier run.
# The seeds come from NumPy's global RNG, which the visible cells never seed,
# so they differ from run to run.
lucky_seeds = np.random.choice(1000, size=num_seed_tr, replace=False)

for seed in lucky_seeds:

    # The seed only changes the fold assignment; the network itself always
    # uses random_state=basic_seed.
    kfold = StratifiedKFold(n_splits=splits_tr, random_state=seed, shuffle=True)
    cv = np.zeros((rows_train, num_classes))  # out-of-fold probabilities for the train rows
    pred_test = np.zeros((rows_test, num_classes))  # fold-averaged test probabilities

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx].values.ravel(), train_y.iloc[val_idx].values.ravel()

        # Fit the TF-IDF vocabulary on the training part of the fold only.
        vectorizer = TfidfVectorizer(ngram_range=(1, 2))
        x_train = vectorizer.fit_transform(x_train)
        x_val = vectorizer.transform(x_val)
        x_test = vectorizer.transform(test_x)

        # MLP1: one hidden layer of 100 units, capped at 12 training iterations.
        MLPModel = MLPClassifier(max_iter=12, hidden_layer_sizes=100, random_state=basic_seed, verbose=False)
        MLPModel.fit(x_train, y_train)

        cv[val_idx, :] = MLPModel.predict_proba(x_val)

        # Each fold contributes 1/splits_tr of the final test prediction.
        pred_test += MLPModel.predict_proba(x_test) / splits_tr

        print(f'fold {n+1}', 'log_loss :', log_loss(y_val, cv[val_idx]))
        print(f'fold {n+1}', 'accuracy_score :', accuracy_score(y_val, np.argmax(cv[val_idx], axis=1)))

    pred_dict['mlp1'+str(seed)] = cv
    pred_test_dict['mlp1'+str(seed)] = pred_test
    print(f'seed {seed}', 'log_loss :', log_loss(train_y, cv))
    print(f'seed {seed}', 'accuracy_score :', accuracy_score(train_y, np.argmax(cv, axis=1)))

# MLP2

In [None]:
# Draw the fold-shuffling seeds WITHOUT replacement: a repeated seed would reuse
# the same 'mlp2<seed>' key below and silently overwrite an earlier run.
# The seeds come from NumPy's global RNG, which the visible cells never seed,
# so they differ from run to run.
lucky_seeds = np.random.choice(1000, size=num_seed_tr, replace=False)

for seed in lucky_seeds:

    # The seed only changes the fold assignment; the network itself always
    # uses random_state=basic_seed.
    kfold = StratifiedKFold(n_splits=splits_tr, random_state=seed, shuffle=True)
    cv = np.zeros((rows_train, num_classes))  # out-of-fold probabilities for the train rows
    pred_test = np.zeros((rows_test, num_classes))  # fold-averaged test probabilities

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx].values.ravel(), train_y.iloc[val_idx].values.ravel()

        # Fit the TF-IDF vocabulary on the training part of the fold only.
        vectorizer = TfidfVectorizer(ngram_range=(1, 2))
        x_train = vectorizer.fit_transform(x_train)
        x_val = vectorizer.transform(x_val)
        x_test = vectorizer.transform(test_x)

        # MLP2: one hidden layer of 250 units, capped at 3 training iterations.
        MLPModel = MLPClassifier(max_iter=3, hidden_layer_sizes=250, random_state=basic_seed, verbose=False)
        MLPModel.fit(x_train, y_train)

        cv[val_idx, :] = MLPModel.predict_proba(x_val)

        # Each fold contributes 1/splits_tr of the final test prediction.
        pred_test += MLPModel.predict_proba(x_test) / splits_tr

        print(f'fold {n+1}', 'log_loss :', log_loss(y_val, cv[val_idx]))
        print(f'fold {n+1}', 'accuracy_score :', accuracy_score(y_val, np.argmax(cv[val_idx], axis=1)))

    pred_dict['mlp2'+str(seed)] = cv
    pred_test_dict['mlp2'+str(seed)] = pred_test
    print(f'seed {seed}', 'log_loss :', log_loss(train_y, cv))
    print(f'seed {seed}', 'accuracy_score :', accuracy_score(train_y, np.argmax(cv, axis=1)))

# Ensemble

In [None]:
def sort_dict(model, pred_dict, pred_test_dict, top_k=None, y_true=None):
    """Keep the most accurate runs of one model family.

    Entries whose key contains ``model`` are ranked by the accuracy of their
    out-of-fold predictions, best first, and the top ``top_k`` are kept.
    The previous version sorted ascending (worst first) and always kept five
    entries, which disagreed with the ``sel_seed`` divisor used when the
    predictions are averaged.

    Args:
        model: Substring identifying the model family in the dict keys
            (for example ``'lgb'``).
        pred_dict: Mapping key -> out-of-fold class probabilities, one row per
            training sample and one column per class.
        pred_test_dict: Mapping key -> test predictions under the same keys.
        top_k: Number of runs to keep. Defaults to the module-level ``sel_seed``.
        y_true: True class indices of the training rows. Defaults to the
            module-level ``train_y``.

    Returns:
        Tuple ``(selected_pred_dict, selected_pred_test_dict)``; both contain
        the same keys, ordered from most to least accurate.

    Raises:
        KeyError: If a selected key is missing from ``pred_test_dict``.
    """
    if top_k is None:
        top_k = sel_seed
    if y_true is None:
        y_true = train_y
    labels = np.asarray(y_true)

    def _accuracy(item):
        # Fraction of rows whose arg-max class equals the true label.
        return np.mean(np.argmax(np.asarray(item[1]), axis=1) == labels)

    candidates = [(key, value) for key, value in pred_dict.items() if model in key]
    # sorted() is stable, so runs with equal accuracy keep their insertion order.
    best_runs = sorted(candidates, key=_accuracy, reverse=True)[:top_k]

    pred_dict_new_local = dict(best_runs)
    pred_test_dict_new_local = {key: pred_test_dict[key] for key in pred_dict_new_local}

    return pred_dict_new_local, pred_test_dict_new_local

In [None]:
def save_dict(model, pred_dict, pred_test_dict):
    """Pickle both prediction dicts into ``../pickle`` using ``model`` as the file tag.

    ``pred_dict`` is written to ``pred_dict_<model>.pickle`` and
    ``pred_test_dict`` to ``pred_test_dict_<model>.pickle``; existing files
    with the same name are overwritten.
    """
    targets = (
        ('../pickle/pred_dict_', pred_dict),
        ('../pickle/pred_test_dict_', pred_test_dict),
    )
    for prefix, payload in targets:
        with open(prefix + model + '.pickle', 'wb') as sink:
            pickle.dump(payload, sink)

In [None]:
# Select the LightGBM runs and persist them under the 'lgb' file tag.
pred_dict_lgb, pred_test_dict_lgb = sort_dict('lgb', pred_dict, pred_test_dict)
save_dict('lgb', pred_dict_lgb, pred_test_dict_lgb)

In [None]:
# Select the MLP1 runs (max_iter=12, hidden_layer_sizes=100) and persist them.
# The file tag must be MLP1's own tag: the previous 'mlp_epoch3_layer250_cv15'
# tag is the one MLP2 is saved under, so these files were overwritten by the
# MLP2 save and the 'mlp_epoch12_layer100_cv15' pickles loaded later were
# never written.
pred_dict_mlp1, pred_test_dict_mlp1 = sort_dict('mlp1', pred_dict, pred_test_dict)
save_dict('mlp_epoch12_layer100_cv15', pred_dict_mlp1, pred_test_dict_mlp1)

In [None]:
# Select the MLP2 runs (max_iter=3, hidden_layer_sizes=250) and persist them.
pred_dict_mlp2, pred_test_dict_mlp2 = sort_dict('mlp2', pred_dict, pred_test_dict)
save_dict('mlp_epoch3_layer250_cv15', pred_dict_mlp2, pred_test_dict_mlp2)

In [None]:
def load_dict(model):
    """Read back the two prediction dicts pickled in ``../pickle`` for file tag ``model``.

    Returns a tuple ``(pred_dict, pred_test_dict)`` loaded from
    ``pred_dict_<model>.pickle`` and ``pred_test_dict_<model>.pickle``.
    Only use this on pickles produced by this project: unpickling runs
    arbitrary code from the file.
    """
    loaded = []
    for prefix in ('../pickle/pred_dict_', '../pickle/pred_test_dict_'):
        with open(prefix + model + '.pickle', 'rb') as source:
            loaded.append(pickle.load(source))
    oof_preds, test_preds = loaded
    return oof_preds, test_preds

In [None]:
# Reload the LightGBM predictions stored under the 'lgb' file tag.
pred_dict_lgb, pred_test_dict_lgb = load_dict('lgb')

In [None]:
# Reload the MLP1 predictions.
# NOTE(review): this tag must match the one used when the MLP1 predictions were saved.
pred_dict_mlp1, pred_test_dict_mlp1 = load_dict('mlp_epoch12_layer100_cv15')

In [None]:
# Reload the MLP2 predictions stored under the 'mlp_epoch3_layer250_cv15' file tag.
pred_dict_mlp2, pred_test_dict_mlp2 = load_dict('mlp_epoch3_layer250_cv15')

In [None]:
# Exhaustive search over integer blend weights for (lgb, mlp1, mlp2).
# permutations() only yields triples of three DISTINCT weights from 0..14, so
# equal weights such as (1, 1, 1) are never tried, and the weight sum is at
# least 0 + 1 + 2 = 3 (no division by zero below).
candidate = np.arange(0, 15)
permute = permutations(candidate, 3)

# The per-model averages do not depend on the weights, so they are computed
# once here instead of on every one of the 15 * 14 * 13 iterations.
oof_lgb = sum(pred_dict_lgb.values())/sel_seed
oof_mlp1 = sum(pred_dict_mlp1.values())/sel_seed
oof_mlp2 = sum(pred_dict_mlp2.values())/sel_seed

score = {}
for i in tqdm(list(permute)):
    pred_permute = oof_lgb * i[0] + oof_mlp1 * i[1] + oof_mlp2 * i[2]
    score[i] = accuracy_score(train_y, np.argmax(pred_permute/sum(i), axis=1))

# Keep the five weight triples with the highest out-of-fold accuracy, best first.
score = dict(sorted(score.items(), key=lambda x: x[1], reverse=True)[:5])
score

In [None]:
# Rebuild the out-of-fold blend with the first (best-scoring) weight triple in `score`.
best_weights = list(score.keys())[0]
w_lgb, w_mlp1, w_mlp2 = best_weights
pred = (
    sum(pred_dict_lgb.values())/sel_seed * w_lgb
    + sum(pred_dict_mlp1.values())/sel_seed * w_mlp1
    + sum(pred_dict_mlp2.values())/sel_seed * w_mlp2
) / sum(best_weights)
accuracy_score(train_y, np.argmax(pred, axis=1))

In [None]:
# Apply the first (best-scoring) weight triple in `score` to the test predictions.
best_weights = list(score.keys())[0]
w_lgb, w_mlp1, w_mlp2 = best_weights
pred_test = (
    sum(pred_test_dict_lgb.values())/sel_seed * w_lgb
    + sum(pred_test_dict_mlp1.values())/sel_seed * w_mlp1
    + sum(pred_test_dict_mlp2.values())/sel_seed * w_mlp2
) / sum(best_weights)

In [None]:
# Collapse the blended scores to one class index per test row (arg-max over columns).
pred_test = np.argmax(pred_test, axis=1)

In [None]:
# Write the predicted class indices into the submission frame.
submission["target"] = pred_test

In [None]:
# Show how many test rows were assigned to each predicted class.
submission['target'].value_counts()

In [None]:
# The output file is named ../submission/<submission_name>-<submission_number>.csv.
submission_name = '20220415'
submission_number = '3'
# index=False keeps the DataFrame index out of the CSV.
submission.to_csv(f'../submission/{submission_name}-{submission_number}.csv', index = False)

In [None]:
# Display the final submission frame.
submission