In [1]:
import pandas as pd
import numpy as np
import os
import pickle
from glob import glob
import warnings
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, log_loss

import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from optuna.visualization import plot_contour, plot_optimization_history
from optuna.visualization import plot_parallel_coordinate, plot_slice, plot_param_importances

from sklearn.neural_network import MLPClassifier
from catboost import Pool, CatBoostClassifier
import xgboost as xgb

In [2]:
# Load the competition data; the "id" column becomes the index of the train and test frames.
train = pd.read_csv("../data/train.csv", index_col="id")
test = pd.read_csv("../data/test.csv", index_col="id")
# Submission template; its "target" column is filled in near the end of the notebook.
submission = pd.read_csv("../data/sample_submission.csv")

In [3]:
def create_dir(dir):
    """Create the directory ``dir`` (including parents) unless the path already exists.

    Prints one line saying whether the directory was created or was already
    there. Returns None.
    """
    if os.path.exists(dir):
        print("Directory already existed :", dir)
        return
    os.makedirs(dir)
    print("Created Directory :", dir)
# Make sure the output folders exist; pickles and submission files are written there later on.
create_dir("../pickle")
create_dir("../model")
create_dir("../submission")

Directory already existed : ../pickle
Directory already existed : ../model
Directory already existed : ../submission


In [4]:
# The "text" column is the only model input; "target" is used as the class label.
train_x = train['text']
train_y = train['target']
test_x = test['text']

In [39]:
rows_train = train.shape[0] # number of rows in the provided train data
rows_test = test.shape[0] # number of rows in the provided test data
num_classes = len(train_y.unique()) # number of distinct target labels
num_trial = 100 # number of hyperparameter-tuning trials
splits_hp = 3 # number of k-fold splits used during hyperparameter tuning
splits_tr = 3 # number of k-fold splits used during model training
basic_seed = 42 # default seed
num_seed_tr = 5 # number of training seeds
sel_seed = 3 # number of seeds to select

In [16]:
# Out-of-fold (train) prediction matrices, one entry per run, keyed by "<model><seed>".
pred_dict = {}
# Fold-averaged test prediction matrices, stored under the same keys as pred_dict.
pred_test_dict = {}

# XGB

In [26]:
def xgb_objective(trial: Trial) -> float:
    """Optuna objective: validation log loss of an XGBoost model on TF-IDF features.

    For every tuning seed (currently only 0) one set of hyperparameters is
    sampled from ``trial``, a booster is trained with early stopping on the
    first stratified fold only, and the multi-class log loss on that fold's
    validation part is recorded.

    Reads the notebook globals ``train_x``, ``train_y``, ``rows_train``,
    ``num_classes`` and ``splits_hp``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean of the recorded validation log losses (lower is better).
    """
    score_hp = []
    for seed_hp in [0]:
        params_xgb = {
            "random_state": seed_hp,
            # NOTE(review): XGBoost documents "verbosity" as its logging parameter;
            # "verbose" is kept as found - confirm which one was intended.
            "verbose": None,
            "num_class": num_classes,
            "objective": "multi:softprob",
            "eval_metric": "mlogloss",
            # "tree_method": "gpu_hist",
            "learning_rate": trial.suggest_uniform("learning_rate", 5e-2, 1e-1), # eta, default=0.3, range=[0,1]
            "gamma": trial.suggest_loguniform("gamma", 1e-2, 1e+2), # min_split_loss, default=0, range=[0,∞]
            "max_depth": trial.suggest_int("max_depth", 4, 12), # default=5, range=[0,∞]
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 10), # default=1
            "max_delta_step": trial.suggest_int("max_delta_step", 0, 10), # default=0
            # The valid range is (0,1]: a lower bound of 0.0 lets the sampler propose
            # (near-)empty row subsamples, so start at 0.2 like the colsample_* ranges.
            "subsample": trial.suggest_uniform("subsample", 0.2, 1.0), # default=1, range=(0,1]
            "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.2, 1.0), # default=1, range=(0,1]
            "colsample_bylevel": trial.suggest_uniform("colsample_bylevel", 0.2, 1.0), # default=1, range=(0,1]
            "colsample_bynode": trial.suggest_uniform("colsample_bynode", 0.2, 1.0), # default=1, range=(0,1]
            "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-1, 1e+1), # default=0, range=[0,∞]
            "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-1, 1e+1), # default=1, range=[0,∞]
            "max_bin": trial.suggest_int("max_bin", 100, 400),
        }

        kfold = StratifiedKFold(n_splits=splits_hp, random_state=seed_hp, shuffle=True)
        cv = np.zeros((rows_train, num_classes))

        for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

            x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
            y_train, y_val = train_y.iloc[train_idx].values.ravel(), train_y.iloc[val_idx].values.ravel()

            # Fit TF-IDF (unigrams + bigrams) on the training part of the fold only.
            # The test set is not needed to score a trial, so it is not vectorised here.
            vectorizer = TfidfVectorizer(ngram_range=(1, 2))
            vectorizer.fit(x_train)
            x_train = vectorizer.transform(x_train)
            x_val = vectorizer.transform(x_val)

            dtrain = xgb.DMatrix(x_train, label=y_train)
            dvalid = xgb.DMatrix(x_val, label=y_val)

            watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
            XGBModel = xgb.train(params_xgb, dtrain, 10000, watchlist, early_stopping_rounds=50, verbose_eval=None)

            cv[val_idx, :] = XGBModel.predict(dvalid)

            score_hp.append(log_loss(y_val, cv[val_idx]))
            # Only the first fold is evaluated per trial; the remaining folds are skipped.
            break

    return np.mean(score_hp)

In [None]:
# Search the XGBoost hyperparameters with a seeded TPE sampler, minimising the objective.
sampler = TPESampler(seed=basic_seed)
xgb_study = optuna.create_study(
    study_name="xgb_parameter_opt",
    direction="minimize",
    sampler=sampler,
)
xgb_study.optimize(xgb_objective, n_trials=num_trial)

# Fixed (non-tuned) settings; on a key clash they take precedence over the tuned values.
xgb_base_hyperparams = {
    "random_state": basic_seed,
    "verbose": None,
    "num_class": num_classes,
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
}
xgb_best_hyperparams = {**xgb_study.best_trial.params, **xgb_base_hyperparams}

# Persist the merged parameter set so it can be reused without repeating the search.
with open('../pickle/xgb_best_hyperparams.pickle', 'wb') as fw:
    pickle.dump(xgb_best_hyperparams, fw)
print("The best hyperparameters are:\n", xgb_best_hyperparams)

In [None]:
# Hard-coded XGBoost parameter set. Running this cell rebinds xgb_best_hyperparams,
# replacing whatever the tuning cell produced.
xgb_best_hyperparams = {
    # tuned values
    "learning_rate": 0.08827098485602952,
    "gamma": 0.07068974950624607,
    "max_depth": 5,
    "min_child_weight": 2,
    "max_delta_step": 3,
    "subsample": 0.5247564316322378,
    "colsample_bytree": 0.5455560149136927,
    "colsample_bylevel": 0.43298331215843355,
    "colsample_bynode": 0.6894823157779035,
    "reg_alpha": 0.19010245319870356,
    "reg_lambda": 0.3839629299804172,
    "max_bin": 210,
    # fixed settings
    "random_state": 42,
    "verbose": None,
    "num_class": 20,
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
}

In [None]:
# Draw the CV seeds from a generator seeded with basic_seed so a rerun reproduces the
# same folds, and sample without replacement so two runs can never share a seed
# (a repeated seed would silently overwrite its 'xgb<seed>' entry in the result dicts).
lucky_seeds = np.random.RandomState(basic_seed).choice(1000, size=num_seed_tr, replace=False)

for seed in lucky_seeds:

    kfold = StratifiedKFold(n_splits=splits_tr, random_state=seed, shuffle=True)
    cv = np.zeros((rows_train, num_classes))  # out-of-fold probabilities for this seed
    pred_test = np.zeros((rows_test, num_classes))  # fold-averaged test probabilities

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx].values.ravel(), train_y.iloc[val_idx].values.ravel()

        # Fit TF-IDF (unigrams + bigrams) on the training part of the fold only.
        vectorizer = TfidfVectorizer(ngram_range=(1, 2))
        vectorizer.fit(x_train)
        x_train = vectorizer.transform(x_train)
        x_val = vectorizer.transform(x_val)
        x_test = vectorizer.transform(test_x)

        dtrain = xgb.DMatrix(x_train, label=y_train)
        dvalid = xgb.DMatrix(x_val, label=y_val)
        xgtest = xgb.DMatrix(x_test)

        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

        print(f'fold {n+1} start')

        XGBModel = xgb.train(xgb_best_hyperparams, dtrain, 1000, watchlist, early_stopping_rounds=50, verbose_eval=100)

        cv[val_idx, :] = XGBModel.predict(dvalid)

        # Each fold contributes 1/splits_tr of the test prediction.
        pred_test += XGBModel.predict(xgtest) / splits_tr

    pred_dict['xgb'+str(seed)] = cv
    pred_test_dict['xgb'+str(seed)] = pred_test
    print(f'seed {seed}', 'log_loss :', log_loss(train_y, cv))
    print(f'seed {seed}', 'accuracy_score :', accuracy_score(train_y, np.argmax(cv, axis=1)))

In [None]:
# lucky_seeds = np.random.randint(0, 1000, num_seed_tr)

# for i, seed in enumerate(lucky_seeds):

#     kfold = StratifiedKFold(n_splits=splits_tr, random_state=seed, shuffle=True)
#     cv = np.zeros((rows_train, num_classes))
#     pred_test = np.zeros((rows_test, num_classes))

#     for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
        
#         x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
#         y_train, y_val = train_y.iloc[train_idx].values.ravel(), train_y.iloc[val_idx].values.ravel()
        
# #         print(f'fold {n+1} start')
        
#         BERTModel = BertClassifier(bert_model="bert-base-cased", random_state=basic_seed,
#                                    epochs=5, validation_fraction=0, train_batch_size=8, eval_batch_size=2)
#         BERTModel.fit(x_train, y_train)
        
#         cv[val_idx, :] = BERTModel.predict_proba(x_val)
#         pred_test += BERTModel.predict_proba(test_x) / splits_tr
        
#         print(f'fold {n+1}', 'log_loss :', log_loss(y_val, cv[val_idx]))
#         print(f'fold {n+1}', 'accuracy_score :', accuracy_score(y_val, np.argmax(cv[val_idx], axis=1)))
        
#     pred_dict['bert'+str(seed)] = cv
#     pred_test_dict['bert'+str(seed)] = pred_test
#     print(f'seed {seed}', 'log_loss :', log_loss(train_y, cv))
#     print(f'seed {seed}', 'accuracy_score :', accuracy_score(train_y, np.argmax(cv, axis=1)))

In [None]:
def load_dict(model):
    """Load the pickled train and test prediction dicts saved under the tag ``model``.

    Reads ``../pickle/pred_dict_<model>.pickle`` and
    ``../pickle/pred_test_dict_<model>.pickle`` and returns the two unpickled
    objects as a ``(train, test)`` tuple.
    """
    loaded = []
    for prefix in ('../pickle/pred_dict_', '../pickle/pred_test_dict_'):
        # pickle.load can execute arbitrary code: only read files this project wrote.
        with open(prefix+model+'.pickle', 'rb') as handle:
            loaded.append(pickle.load(handle))
    return loaded[0], loaded[1]

In [None]:
# Load the MLP train/test prediction dicts that were pickled under the 'mlp_cv15' tag.
pred_dict_mlp, pred_test_dict_mlp = load_dict('mlp_cv15')

In [None]:
# Mean of the out-of-fold prediction matrices over every run stored in pred_dict_mlp.
pred = sum(pred_dict_mlp.values(), np.zeros((rows_train, num_classes)))
pred = pred / len(pred_dict_mlp)

In [None]:
# Mean of the test prediction matrices over every run stored in pred_test_dict_mlp.
pred_test = sum(pred_test_dict_mlp.values(), np.zeros((rows_test, num_classes)))
pred_test = pred_test / len(pred_test_dict_mlp)

In [None]:
# Accuracy of the averaged predictions (arg-max class per row) against the training labels.
print(f'accuracy_score: {accuracy_score(train_y, np.argmax(pred, axis=1)):.6f}')

In [None]:
# Accuracy of a 0.95 / 0.05 blend of pred and pred2.
# NOTE(review): pred2 is first assigned in a later cell of this notebook, so this cell
# raises NameError when the notebook is run top to bottom on a fresh kernel.
print(f'accuracy_score: {accuracy_score(train_y, np.argmax(pred*0.95+pred2*0.05, axis=1)):.6f}')

In [None]:
# Blend the two test prediction matrices (0.86 / 0.14) and keep the arg-max class per row.
# NOTE(review): pred_test2 is first assigned in a later cell, so this fails on a fresh
# top-to-bottom run. The cell also rebinds pred_test from a probability matrix to a
# 1-D array of class indices, while training cells below accumulate into pred_test with `+=`.
pred_test = np.argmax(pred_test*0.86+pred_test2*0.14, axis=1)

In [None]:
# NOTE(review): the CV seeds come from NumPy's global RNG and no np.random.seed call is
# visible in this notebook, so the folds change on every run - confirm that is intended.
lucky_seeds = np.random.randint(0, 1000, num_seed_tr)

for seed in lucky_seeds:

    kfold = StratifiedKFold(n_splits=splits_tr, random_state=seed, shuffle=True)
    cv = np.zeros((rows_train, num_classes))  # out-of-fold probabilities for this seed
    # Start a fresh test accumulator for every seed. Without it, `pred_test +=` below
    # keeps adding onto whatever an earlier cell left in `pred_test`, and every
    # pred_test_dict entry ends up referencing that one shared array.
    pred_test = np.zeros((rows_test, num_classes))

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx].values.ravel(), train_y.iloc[val_idx].values.ravel()

        # Fit TF-IDF (unigrams + bigrams) on the training part of the fold only.
        vectorizer = TfidfVectorizer(ngram_range=(1, 2))
        vectorizer.fit(x_train)
        x_train = vectorizer.transform(x_train)
        x_val = vectorizer.transform(x_val)
        x_test = vectorizer.transform(test_x)

        print(f'fold {n+1} start')

        MLPModel = MLPClassifier(max_iter=12, hidden_layer_sizes=250, random_state=basic_seed, verbose=1)
        MLPModel.fit(x_train, y_train)

        cv[val_idx, :] = MLPModel.predict_proba(x_val)
        # Each fold contributes 1/splits_tr of the test prediction.
        pred_test += MLPModel.predict_proba(x_test) / splits_tr

        print(f'fold {n+1}', 'log_loss :', log_loss(y_val, cv[val_idx]))
        print(f'fold {n+1}', 'accuracy_score :', accuracy_score(y_val, np.argmax(cv[val_idx], axis=1)))

    pred_dict['mlp'+str(seed)] = cv
    pred_test_dict['mlp'+str(seed)] = pred_test
    print(f'seed {seed}', 'log_loss :', log_loss(train_y, cv))
    print(f'seed {seed}', 'accuracy_score :', accuracy_score(train_y, np.argmax(cv, axis=1)))

fold 1 start
Iteration 1, loss = 2.88587743
Iteration 2, loss = 1.93844120
Iteration 3, loss = 0.85989388
Iteration 4, loss = 0.31230709


In [49]:
# NOTE(review): the CV seeds come from NumPy's global RNG and no np.random.seed call is
# visible in this notebook, so the folds change on every run - confirm that is intended.
lucky_seeds = np.random.randint(0, 1000, num_seed_tr)

for seed in lucky_seeds:

    kfold = StratifiedKFold(n_splits=splits_tr, random_state=seed, shuffle=True)
    cv = np.zeros((rows_train, num_classes))  # out-of-fold probabilities for this seed
    # Start a fresh test accumulator for every seed. Without it, `pred_test +=` below
    # keeps adding onto whatever an earlier cell left in `pred_test`, and every
    # pred_test_dict entry ends up referencing that one shared array.
    pred_test = np.zeros((rows_test, num_classes))

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx].values.ravel(), train_y.iloc[val_idx].values.ravel()

        # Fit TF-IDF (unigrams + bigrams) on the training part of the fold only.
        vectorizer = TfidfVectorizer(ngram_range=(1, 2))
        vectorizer.fit(x_train)
        x_train = vectorizer.transform(x_train)
        x_val = vectorizer.transform(x_val)
        x_test = vectorizer.transform(test_x)

        print(f'fold {n+1} start')

        MLPModel = MLPClassifier(max_iter=12, hidden_layer_sizes=250, random_state=basic_seed, verbose=1)
        MLPModel.fit(x_train, y_train)

        cv[val_idx, :] = MLPModel.predict_proba(x_val)
        # Each fold contributes 1/splits_tr of the test prediction.
        pred_test += MLPModel.predict_proba(x_test) / splits_tr

        print(f'fold {n+1}', 'log_loss :', log_loss(y_val, cv[val_idx]))
        print(f'fold {n+1}', 'accuracy_score :', accuracy_score(y_val, np.argmax(cv[val_idx], axis=1)))

    pred_dict['mlp'+str(seed)] = cv
    pred_test_dict['mlp'+str(seed)] = pred_test
    print(f'seed {seed}', 'log_loss :', log_loss(train_y, cv))
    print(f'seed {seed}', 'accuracy_score :', accuracy_score(train_y, np.argmax(cv, axis=1)))

fold 1 start
Iteration 1, loss = 2.87701584
Iteration 2, loss = 1.82490883
Iteration 3, loss = 0.71171896
Iteration 4, loss = 0.24221684
Iteration 5, loss = 0.12334927
Iteration 6, loss = 0.08526924
Iteration 7, loss = 0.06854069
Iteration 8, loss = 0.05963819
Iteration 9, loss = 0.05408119
Iteration 10, loss = 0.05068640
Iteration 11, loss = 0.04802125
Iteration 12, loss = 0.04613148
fold 1 log_loss : 1.1809629445839431
fold 1 accuracy_score : 0.7426900584795322


KeyboardInterrupt: 

In [48]:
# NOTE(review): the CV seeds come from NumPy's global RNG and no np.random.seed call is
# visible in this notebook, so the folds change on every run - confirm that is intended.
lucky_seeds = np.random.randint(0, 1000, num_seed_tr)

for seed in lucky_seeds:

    kfold = StratifiedKFold(n_splits=splits_tr, random_state=seed, shuffle=True)
    cv = np.zeros((rows_train, num_classes))  # out-of-fold probabilities for this seed
    # Start a fresh test accumulator for every seed. Without it, `pred_test +=` below
    # keeps adding onto whatever an earlier cell left in `pred_test`, and every
    # pred_test_dict entry ends up referencing that one shared array.
    pred_test = np.zeros((rows_test, num_classes))

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx].values.ravel(), train_y.iloc[val_idx].values.ravel()

        # Fit TF-IDF (unigrams + bigrams) on the training part of the fold only.
        vectorizer = TfidfVectorizer(ngram_range=(1, 2))
        vectorizer.fit(x_train)
        x_train = vectorizer.transform(x_train)
        x_val = vectorizer.transform(x_val)
        x_test = vectorizer.transform(test_x)

        print(f'fold {n+1} start')

        MLPModel = MLPClassifier(max_iter=12, hidden_layer_sizes=200, random_state=basic_seed, verbose=1)
        MLPModel.fit(x_train, y_train)

        cv[val_idx, :] = MLPModel.predict_proba(x_val)
        # Each fold contributes 1/splits_tr of the test prediction.
        pred_test += MLPModel.predict_proba(x_test) / splits_tr

        print(f'fold {n+1}', 'log_loss :', log_loss(y_val, cv[val_idx]))
        print(f'fold {n+1}', 'accuracy_score :', accuracy_score(y_val, np.argmax(cv[val_idx], axis=1)))

    pred_dict['mlp'+str(seed)] = cv
    pred_test_dict['mlp'+str(seed)] = pred_test
    print(f'seed {seed}', 'log_loss :', log_loss(train_y, cv))
    print(f'seed {seed}', 'accuracy_score :', accuracy_score(train_y, np.argmax(cv, axis=1)))

fold 1 start
Iteration 1, loss = 2.90069585
Iteration 2, loss = 2.08529755
Iteration 3, loss = 1.04899716
Iteration 4, loss = 0.41136879
Iteration 5, loss = 0.19385638
Iteration 6, loss = 0.11965508
Iteration 7, loss = 0.08817034
Iteration 8, loss = 0.07188408
Iteration 9, loss = 0.06239249
Iteration 10, loss = 0.05615692
Iteration 11, loss = 0.05192312
Iteration 12, loss = 0.04885101
fold 1 log_loss : 1.2182042496686905
fold 1 accuracy_score : 0.7469135802469136
fold 2 start


KeyboardInterrupt: 

In [47]:
# NOTE(review): the CV seeds come from NumPy's global RNG and no np.random.seed call is
# visible in this notebook, so the folds change on every run - confirm that is intended.
lucky_seeds = np.random.randint(0, 1000, num_seed_tr)

for seed in lucky_seeds:

    kfold = StratifiedKFold(n_splits=splits_tr, random_state=seed, shuffle=True)
    cv = np.zeros((rows_train, num_classes))  # out-of-fold probabilities for this seed
    # Start a fresh test accumulator for every seed. Without it, `pred_test +=` below
    # keeps adding onto whatever an earlier cell left in `pred_test`, and every
    # pred_test_dict entry ends up referencing that one shared array.
    pred_test = np.zeros((rows_test, num_classes))

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx].values.ravel(), train_y.iloc[val_idx].values.ravel()

        # Fit TF-IDF (unigrams + bigrams) on the training part of the fold only.
        vectorizer = TfidfVectorizer(ngram_range=(1, 2))
        vectorizer.fit(x_train)
        x_train = vectorizer.transform(x_train)
        x_val = vectorizer.transform(x_val)
        x_test = vectorizer.transform(test_x)

        print(f'fold {n+1} start')

        MLPModel = MLPClassifier(max_iter=12, hidden_layer_sizes=150, random_state=basic_seed, verbose=1)
        MLPModel.fit(x_train, y_train)

        cv[val_idx, :] = MLPModel.predict_proba(x_val)
        # Each fold contributes 1/splits_tr of the test prediction.
        pred_test += MLPModel.predict_proba(x_test) / splits_tr

        print(f'fold {n+1}', 'log_loss :', log_loss(y_val, cv[val_idx]))
        print(f'fold {n+1}', 'accuracy_score :', accuracy_score(y_val, np.argmax(cv[val_idx], axis=1)))

    pred_dict['mlp'+str(seed)] = cv
    pred_test_dict['mlp'+str(seed)] = pred_test
    print(f'seed {seed}', 'log_loss :', log_loss(train_y, cv))
    print(f'seed {seed}', 'accuracy_score :', accuracy_score(train_y, np.argmax(cv, axis=1)))

fold 1 start
Iteration 1, loss = 2.92646606
Iteration 2, loss = 2.25876659
Iteration 3, loss = 1.33687210
Iteration 4, loss = 0.60712800
Iteration 5, loss = 0.28498176
Iteration 6, loss = 0.16487697
Iteration 7, loss = 0.11366884
Iteration 8, loss = 0.08830485
Iteration 9, loss = 0.07381324
Iteration 10, loss = 0.06490315
Iteration 11, loss = 0.05833006
Iteration 12, loss = 0.05413481
fold 1 log_loss : 1.250585081660447
fold 1 accuracy_score : 0.7378167641325536


KeyboardInterrupt: 

In [46]:
# NOTE(review): the CV seeds come from NumPy's global RNG and no np.random.seed call is
# visible in this notebook, so the folds change on every run - confirm that is intended.
lucky_seeds = np.random.randint(0, 1000, num_seed_tr)

for seed in lucky_seeds:

    kfold = StratifiedKFold(n_splits=splits_tr, random_state=seed, shuffle=True)
    cv = np.zeros((rows_train, num_classes))  # out-of-fold probabilities for this seed
    # Start a fresh test accumulator for every seed. Without it, `pred_test +=` below
    # keeps adding onto whatever an earlier cell left in `pred_test`, and every
    # pred_test_dict entry ends up referencing that one shared array.
    pred_test = np.zeros((rows_test, num_classes))

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx].values.ravel(), train_y.iloc[val_idx].values.ravel()

        # Fit TF-IDF (unigrams + bigrams) on the training part of the fold only.
        vectorizer = TfidfVectorizer(ngram_range=(1, 2))
        vectorizer.fit(x_train)
        x_train = vectorizer.transform(x_train)
        x_val = vectorizer.transform(x_val)
        x_test = vectorizer.transform(test_x)

        print(f'fold {n+1} start')

        MLPModel = MLPClassifier(max_iter=12, hidden_layer_sizes=120, random_state=basic_seed, verbose=1)
        MLPModel.fit(x_train, y_train)

        cv[val_idx, :] = MLPModel.predict_proba(x_val)
        # Each fold contributes 1/splits_tr of the test prediction.
        pred_test += MLPModel.predict_proba(x_test) / splits_tr

        print(f'fold {n+1}', 'log_loss :', log_loss(y_val, cv[val_idx]))
        print(f'fold {n+1}', 'accuracy_score :', accuracy_score(y_val, np.argmax(cv[val_idx], axis=1)))

    pred_dict['mlp'+str(seed)] = cv
    pred_test_dict['mlp'+str(seed)] = pred_test
    print(f'seed {seed}', 'log_loss :', log_loss(train_y, cv))
    print(f'seed {seed}', 'accuracy_score :', accuracy_score(train_y, np.argmax(cv, axis=1)))

fold 1 start
Iteration 1, loss = 2.94078559
Iteration 2, loss = 2.34423701
Iteration 3, loss = 1.49642476
Iteration 4, loss = 0.74847547
Iteration 5, loss = 0.36632014
Iteration 6, loss = 0.20955085
Iteration 7, loss = 0.14038125
Iteration 8, loss = 0.10584494
Iteration 9, loss = 0.08586610
Iteration 10, loss = 0.07332111
Iteration 11, loss = 0.06517238
Iteration 12, loss = 0.05923650
fold 1 log_loss : 1.2876162641670004
fold 1 accuracy_score : 0.7368421052631579
fold 2 start
fold 2 log_loss : 2.9940261436997018
fold 2 accuracy_score : 0.08771929824561403
fold 3 start
fold 3 log_loss : 3.0055652886960416
fold 3 accuracy_score : 0.05004874878128047
seed 75 log_loss : 2.4290067935374666
seed 75 accuracy_score : 0.2915628723058594


KeyboardInterrupt: 

In [44]:
# NOTE(review): the CV seeds come from NumPy's global RNG and no np.random.seed call is
# visible in this notebook, so the folds change on every run - confirm that is intended.
lucky_seeds = np.random.randint(0, 1000, num_seed_tr)

for seed in lucky_seeds:

    kfold = StratifiedKFold(n_splits=splits_tr, random_state=seed, shuffle=True)
    cv = np.zeros((rows_train, num_classes))  # out-of-fold probabilities for this seed
    # Start a fresh test accumulator for every seed. Without it, `pred_test +=` below
    # keeps adding onto whatever an earlier cell left in `pred_test`, and every
    # pred_test_dict entry ends up referencing that one shared array.
    pred_test = np.zeros((rows_test, num_classes))

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx].values.ravel(), train_y.iloc[val_idx].values.ravel()

        # Fit TF-IDF (unigrams + bigrams) on the training part of the fold only.
        vectorizer = TfidfVectorizer(ngram_range=(1, 2))
        vectorizer.fit(x_train)
        x_train = vectorizer.transform(x_train)
        x_val = vectorizer.transform(x_val)
        x_test = vectorizer.transform(test_x)

        print(f'fold {n+1} start')

        MLPModel = MLPClassifier(max_iter=12, hidden_layer_sizes=100, random_state=basic_seed, verbose=1)
        MLPModel.fit(x_train, y_train)

        cv[val_idx, :] = MLPModel.predict_proba(x_val)
        # Each fold contributes 1/splits_tr of the test prediction.
        pred_test += MLPModel.predict_proba(x_test) / splits_tr

        print(f'fold {n+1}', 'log_loss :', log_loss(y_val, cv[val_idx]))
        print(f'fold {n+1}', 'accuracy_score :', accuracy_score(y_val, np.argmax(cv[val_idx], axis=1)))

    pred_dict['mlp'+str(seed)] = cv
    pred_test_dict['mlp'+str(seed)] = pred_test
    print(f'seed {seed}', 'log_loss :', log_loss(train_y, cv))
    print(f'seed {seed}', 'accuracy_score :', accuracy_score(train_y, np.argmax(cv, axis=1)))

fold 1 start
Iteration 1, loss = 2.93844791
Iteration 2, loss = 2.42155991
Iteration 3, loss = 1.67550687
Iteration 4, loss = 0.92869411
Iteration 5, loss = 0.47811938
Iteration 6, loss = 0.27241151
Iteration 7, loss = 0.17891459
Iteration 8, loss = 0.13192336
Iteration 9, loss = 0.10524688
Iteration 10, loss = 0.08863415
Iteration 11, loss = 0.07761108
Iteration 12, loss = 0.06997654
fold 1 log_loss : 1.3087164072900916
fold 1 accuracy_score : 0.7309941520467836


KeyboardInterrupt: 

In [41]:
# Repeated stratified CV of a TF-IDF + MLP classifier, one run per randomly drawn seed.
lucky_seeds = np.random.randint(0, 1000, num_seed_tr)

for seed in lucky_seeds:

    cv = np.zeros((rows_train, num_classes))        # out-of-fold probabilities for this seed
    pred_test = np.zeros((rows_test, num_classes))  # fold-averaged test probabilities
    kfold = StratifiedKFold(n_splits=splits_tr, random_state=seed, shuffle=True)

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        y_train = train_y.iloc[train_idx].values.ravel()
        y_val = train_y.iloc[val_idx].values.ravel()

        # Learn the TF-IDF vocabulary (unigrams + bigrams) on the training part of the
        # fold, then project the validation and test texts onto it.
        vectorizer = TfidfVectorizer(ngram_range=(1, 2))
        x_train = vectorizer.fit_transform(train_x.iloc[train_idx])
        x_val = vectorizer.transform(train_x.iloc[val_idx])
        x_test = vectorizer.transform(test_x)

        print(f'fold {n+1} start')

        MLPModel = MLPClassifier(max_iter=12, hidden_layer_sizes=100, random_state=basic_seed, verbose=1)
        MLPModel.fit(x_train, y_train)

        cv[val_idx] = MLPModel.predict_proba(x_val)
        # Each fold contributes 1/splits_tr of the test prediction.
        pred_test += MLPModel.predict_proba(x_test) / splits_tr

        fold_pred = cv[val_idx]
        print(f'fold {n+1}', 'log_loss :', log_loss(y_val, fold_pred))
        print(f'fold {n+1}', 'accuracy_score :', accuracy_score(y_val, np.argmax(fold_pred, axis=1)))

    pred_dict['mlp'+str(seed)] = cv
    pred_test_dict['mlp'+str(seed)] = pred_test
    print(f'seed {seed}', 'log_loss :', log_loss(train_y, cv))
    print(f'seed {seed}', 'accuracy_score :', accuracy_score(train_y, np.argmax(cv, axis=1)))

fold 1 start
Iteration 1, loss = 2.95933919
Iteration 2, loss = 2.62792367
Iteration 3, loss = 2.13510445
Iteration 4, loss = 1.53656763
Iteration 5, loss = 1.00797684
Iteration 6, loss = 0.63954663
Iteration 7, loss = 0.41913100
Iteration 8, loss = 0.29241655
Iteration 9, loss = 0.21752007
Iteration 10, loss = 0.17064270
Iteration 11, loss = 0.13973926
Iteration 12, loss = 0.11871930
fold 1 log_loss : 1.4402451900990898
fold 1 accuracy_score : 0.7170240415854451


KeyboardInterrupt: 

In [None]:
def sort_dict(model, pred_dict, pred_test_dict, top_k=5, y_true=None):
    """Select the most accurate runs of one model family.

    Runs whose key contains the substring ``model`` are ranked by the accuracy
    of their arg-max class against the true labels, best first, and the first
    ``top_k`` of them are kept. The ranking used to be ascending, which kept
    the least accurate runs.

    Args:
        model: Substring identifying the model family in the dict keys (e.g. 'mlp').
        pred_dict: Mapping run key -> (n_samples, n_classes) train prediction matrix.
        pred_test_dict: Mapping run key -> test prediction object for the same run.
        top_k: Maximum number of runs to keep.
        y_true: True labels aligned with the rows of the train predictions;
            defaults to the notebook-level ``train_y``.

    Returns:
        Tuple ``(selected_pred_dict, selected_pred_test_dict)``; both are ordered
        from most to least accurate and share the same keys.

    Raises:
        KeyError: If a selected run has no entry in ``pred_test_dict``.
    """
    labels = np.asarray(train_y if y_true is None else y_true)

    def _accuracy(probabilities):
        # Fraction of rows whose arg-max class equals the true label.
        return float(np.mean(np.argmax(np.asarray(probabilities), axis=1) == labels))

    candidates = {key: value for key, value in pred_dict.items() if model in key}
    # sorted() is stable, so equally accurate runs keep their insertion order.
    ranked = sorted(candidates.items(), key=lambda item: _accuracy(item[1]), reverse=True)

    pred_dict_new_local = dict(ranked[:top_k])
    pred_test_dict_new_local = {key: pred_test_dict[key] for key in pred_dict_new_local}

    return pred_dict_new_local, pred_test_dict_new_local

In [None]:
def save_dict(model, pred_dict, pred_test_dict):
    """Pickle the train and test prediction dicts under the tag ``model``.

    Writes ``../pickle/pred_dict_<model>.pickle`` and
    ``../pickle/pred_test_dict_<model>.pickle``, replacing existing files.
    Returns None.
    """
    targets = (
        ('../pickle/pred_dict_'+model+'.pickle', pred_dict),
        ('../pickle/pred_test_dict_'+model+'.pickle', pred_test_dict),
    )
    for path, payload in targets:
        with open(path, 'wb') as handle:
            pickle.dump(payload, handle)

In [None]:
# Select the runs whose key contains 'bert' and pickle them under the 'bert_epoch5_cv15' tag.
# NOTE(review): the BERT training cell in this notebook is commented out, so pred_dict may
# hold no 'bert' keys - confirm before overwriting the pickles with empty selections.
pred_dict_bert, pred_test_dict_bert = sort_dict('bert', pred_dict, pred_test_dict)
save_dict('bert_epoch5_cv15', pred_dict_bert, pred_test_dict_bert)

In [None]:
# Select the runs whose key contains 'bert' and pickle them under the 'bert' tag.
# NOTE(review): the BERT training cell in this notebook is commented out, so pred_dict may
# hold no 'bert' keys - confirm before overwriting the pickles with empty selections.
pred_dict_bert, pred_test_dict_bert = sort_dict('bert', pred_dict, pred_test_dict)
save_dict('bert', pred_dict_bert, pred_test_dict_bert)

In [None]:
# Select the runs whose key contains 'mlp' and pickle them under the 'mlp' tag.
# This rebinds pred_dict_mlp / pred_test_dict_mlp, replacing any value loaded earlier.
pred_dict_mlp, pred_test_dict_mlp = sort_dict('mlp', pred_dict, pred_test_dict)
save_dict('mlp', pred_dict_mlp, pred_test_dict_mlp)

In [None]:
# Mean of the out-of-fold prediction matrices over every run stored in pred_dict_mlp.
pred = sum(pred_dict_mlp.values(), np.zeros((rows_train, num_classes)))
pred = pred / len(pred_dict_mlp)

In [None]:
# Accuracy of the averaged predictions (arg-max class per row) against the training labels.
print(f'accuracy_score: {accuracy_score(train_y, np.argmax(pred, axis=1)):.6f}')

In [None]:
# Mean of the test prediction matrices over every run stored in pred_test_dict_mlp.
pred_test = sum(pred_test_dict_mlp.values(), np.zeros((rows_test, num_classes)))
pred_test = pred_test / len(pred_test_dict_mlp)

In [None]:
# NOTE(review): this re-defines load_dict, which an earlier cell of this notebook already
# defines with the same behaviour; one of the two definitions can be dropped.
def load_dict(model):
    """Load the pickled train and test prediction dicts saved under the tag ``model``.

    Reads ``../pickle/pred_dict_<model>.pickle`` and
    ``../pickle/pred_test_dict_<model>.pickle`` and returns the two unpickled
    objects as a ``(train, test)`` tuple.
    """
    loaded = []
    for prefix in ('../pickle/pred_dict_', '../pickle/pred_test_dict_'):
        # pickle.load can execute arbitrary code: only read files this project wrote.
        with open(prefix+model+'.pickle', 'rb') as handle:
            loaded.append(pickle.load(handle))
    return loaded[0], loaded[1]

In [None]:
# Load the prediction dicts pickled under the 'mlp' tag (the tag the save cell above writes).
pred_dict_mlp2, pred_test_dict_mlp2 = load_dict('mlp')

In [None]:
# Mean of the out-of-fold prediction matrices over every run stored in pred_dict_mlp2.
pred2 = sum(pred_dict_mlp2.values(), np.zeros((rows_train, num_classes)))
pred2 = pred2 / len(pred_dict_mlp2)

In [None]:
# Mean of the test prediction matrices over every run stored in pred_test_dict_mlp2.
pred_test2 = sum(pred_test_dict_mlp2.values(), np.zeros((rows_test, num_classes)))
pred_test2 = pred_test2 / len(pred_test_dict_mlp2)

In [None]:
# Accuracy of a 0.6 / 0.4 blend of the two averaged prediction matrices pred and pred2.
print(f'accuracy_score: {accuracy_score(train_y, np.argmax(pred*0.6+pred2*0.4, axis=1)):.6f}')

In [None]:
# Turn the averaged test probabilities into class indices (arg-max per row).
# NOTE(review): this rebinds pred_test, so the cell is not idempotent - re-running it
# without recomputing the averaged matrix applies argmax to the 1-D label array.
pred_test = np.argmax(pred_test, axis=1)

In [None]:
# Write the predicted class indices into the submission template.
submission["target"] = pred_test

In [None]:
# Sanity check: how many test rows were assigned to each predicted class.
submission['target'].value_counts()

In [None]:
# Save the submission as ../submission/<submission_name>-<submission_number>.csv, without the index.
submission_name = '20220410'
submission_number = '1'
submission.to_csv(f'../submission/{submission_name}-{submission_number}.csv', index = False)

In [None]:
# Display the final submission frame.
submission