<a href="https://colab.research.google.com/github/Joonyoung-Song/DACON-NLP_competition/blob/main/STACKING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Mount Google Drive at /gdrive; force_remount=True re-mounts even when a
# previous mount is still active, so re-running this cell is safe.
drive.mount(mountpoint='/gdrive', force_remount=True)

In [None]:
import pandas as pd
import numpy as np
np.random.seed(42)
import re
import os
import tqdm
from pathlib import Path

import lightgbm as lgb
import xgboost as xgb

from scipy import sparse
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, log_loss
from sklearn.decomposition import TruncatedSVD
from sklearn import ensemble, metrics, model_selection, naive_bayes


In [None]:
# Directories holding each base model's saved predictions:
# `<model>.val.csv` files are stacked into the training matrix and
# `<model>.tst.csv` files into the test matrix.
val_dir = Path('/gdrive/My Drive/dacon_nlp_competition/build/val')
tst_dir = Path('/gdrive/My Drive/dacon_nlp_competition/build/tst')

# Base models to stack. Each entry is the exact file stem on disk
# (including the embedded double spaces), so do not normalise these strings.
model_names = [
    '+gru15_emb16_  0.6000',
    'LGB_tfidf',
    'XGB_tfidf',
    'lstm15+gru15_emb_  0.6041',
    'CNN_emb_  0.6982',
]

val_blocks = []     # one loaded array per model, from val_dir
tst_blocks = []     # one loaded array per model, from tst_dir
feature_names = []  # '<model>_class<k>' labels, five per model
for model in model_names:
    val_blocks.append(np.loadtxt(val_dir / (model + '.val.csv'), delimiter=','))
    tst_blocks.append(np.loadtxt(tst_dir / (model + '.tst.csv'), delimiter=','))
    # NOTE(review): assumes every prediction file has exactly 5 columns
    # (one per class) — confirm against the files that produced them.
    feature_names.extend(f'{model}_class{k}' for k in range(5))

# Concatenate the per-model blocks column-wise into the level-2 feature matrices.
stk_trn = np.hstack(val_blocks)
stk_tst = np.hstack(tst_blocks)
feature_names

In [None]:
# Input data location and the directory the submission will be written to.
data_dir = Path('/gdrive/My Drive/dacon_nlp_competition/data')
sub_dir = Path('/gdrive/My Drive/dacon_nlp_competition/build/sub')

sample_file = data_dir.joinpath('sample_submission.csv')
trn_file = data_dir.joinpath('train.csv')

# Target labels: the 'author' column of the training file, indexed by the
# file's first column. Only this column is kept; the rest of the frame is discarded.
y = pd.read_csv(trn_file, index_col=0, encoding='utf-8').loc[:, 'author']

In [None]:
# Cross-validation settings shared by the stacking model below.
n_fold, n_class = 5, 5  # number of CV folds / number of target classes
seed = 42               # seeds both the fold shuffling and the classifier

# Shuffled, label-stratified folds; fixing random_state makes the split reproducible.
cv = StratifiedKFold(
    n_splits=n_fold,
    random_state=seed,
    shuffle=True,
)

In [None]:
# Level-2 (stacking) model: a LightGBM multiclass classifier fitted on the
# stacked base-model predictions, evaluated with the stratified folds in `cv`.
#
# Outputs:
#   p_val - out-of-fold class probabilities for every training row
#   p_tst - test-set class probabilities averaged over the n_fold models

# Hyper-parameters are identical for every fold, so build them once.
lgb_params = dict(
    objective='multiclass',
    n_estimators=10000,   # upper bound only; early stopping picks the real count
    learning_rate=0.01,
    boosting_type='gbdt',
    max_depth=5,
    feature_fraction=0.4,
    num_leaves=30,
    random_state=seed,
    n_jobs=-1,
    verbose=100,
)

p_val = np.zeros((stk_trn.shape[0], n_class))
p_tst = np.zeros((stk_tst.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(stk_trn, y), 1):
    print(f'training model for CV #{i}')
    clf = lgb.LGBMClassifier(**lgb_params)
    # cv.split yields *positional* indices, so the label Series must be sliced
    # with .iloc; plain y[...] is label-based and only matches when the index
    # happens to be 0..n-1.
    # Early stopping and periodic logging go through callbacks: the
    # `early_stopping_rounds` / `verbose` keyword arguments of fit() were
    # removed in LightGBM 4.0 (callbacks require LightGBM >= 3.3).
    clf.fit(stk_trn[i_trn], y.iloc[i_trn],
            eval_set=[(stk_trn[i_val], y.iloc[i_val])],
            eval_metric='multi_logloss',
            callbacks=[lgb.early_stopping(stopping_rounds=100),
                       lgb.log_evaluation(period=100)])
    p_val[i_val, :] = clf.predict_proba(stk_trn[i_val])
    p_tst += clf.predict_proba(stk_tst) / n_fold

# Map the arg-max probability column back to its class label before scoring.
val_pred_labels = clf.classes_[np.argmax(p_val, axis=1)]

print()
print('models:', model_names)
print(clf)
# The original line was labelled "accuracy" but printed the log loss;
# report both metrics under their correct names.
print(f'valid cv accuracy : {accuracy_score(y, val_pred_labels) :.6f}')
print(f'valid cv logloss : {log_loss(y, p_val) :.6f}')

In [None]:
# Log loss of the submitted file: 0.4389
val_logloss = log_loss(y, p_val)
print(f'valid cv logloss : {val_logloss :.6f}')

## 제출 파일 생성


In [None]:
# Use the sample submission as a template: its first column becomes the index
# and the remaining columns are overwritten with predictions in a later cell.
sub = pd.read_csv(
    filepath_or_buffer=sample_file,
    index_col=0,
)
print(sub.shape)
sub.head()

In [None]:
# Overwrite every column of the template with the fold-averaged test probabilities.
# NOTE(review): assumes the column order of `sub` matches the class order of
# `p_tst` — confirm against the sample submission header.
pred_cols = sub.columns
sub[pred_cols] = p_tst
sub.head()

In [None]:
# Build the submission file name as '<algo_name>_<feature_name>.csv' inside sub_dir.
algo_name = 'gru+lgb+xgb+(lstm+gru)+cnn'
feature_name = 'stk'
model_name = '_'.join([algo_name, feature_name])

sub_file = sub_dir / (model_name + '.csv')

In [None]:

# Make sure the output directory exists; to_csv does not create missing
# parent directories and would otherwise fail on a fresh Drive layout.
sub_file.parent.mkdir(parents=True, exist_ok=True)
# Write the filled-in submission (index included) to Drive.
sub.to_csv(sub_file)