In [None]:
# List the contents of the meta-bagging input directory (tsne_2dim.csv is read from it further down).
%ls ../input/otto-meta-bagging/

In [None]:
import gc
import numpy as np
import pandas as pd
import lightgbm as lgb
# import optuna.integration.lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from pathlib import Path

# Dataset directories; all three are siblings under ../input.
INPUT_ROOT = Path("../input")
INPUT_PATH = INPUT_ROOT / "otto-group-product-classification-challenge"
LEVEL0_PATH = INPUT_ROOT / "otto-stacking-level0"
META_PATH = INPUT_ROOT / "otto-meta-bagging"

In [None]:
# Read the train and test tables from INPUT_PATH.
train, test = (pd.read_csv(INPUT_PATH / fname) for fname in ("train.csv", "test.csv"))

# Show both shapes as a quick sanity check.
(train.shape, test.shape)

In [None]:
# Load the precomputed table stored as tsne_2dim.csv under META_PATH.
tsne_csv = META_PATH.joinpath("tsne_2dim.csv")
tsne_2dim = pd.read_csv(tsne_csv)

tsne_2dim.shape

In [None]:
drop_cols = ["id"]
target_col = "target"
# Columns of `train` other than the id and the label.
feat_cols = [col for col in train.columns if col not in drop_cols + [target_col]]

# Encode labels of the form "Class_<k>" as the integer k - 1, in place on `train`.
# The dtype guard keeps this cell re-runnable: once the column is already
# integer, `.str` would raise, and a second `- 1` would shift every label.
if not pd.api.types.is_integer_dtype(train[target_col]):
    train[target_col] = (
        train[target_col]
        .str.replace("Class_", "", regex=False)  # literal prefix, not a regex
        .astype(int)
        - 1
    )
target = train[target_col]

In [None]:
# Split the t-SNE table positionally: the first len(train) rows go to the
# train slice, the following len(test) rows to the test slice.
n_train, n_test = train.shape[0], test.shape[0]
tr_tsne_2dim = tsne_2dim.iloc[:n_train]
te_tsne_2dim = tsne_2dim.iloc[n_train:n_train + n_test]

# Drop the combined frame's name and run a garbage-collection pass.
del tsne_2dim
gc.collect()

In [None]:
# Shapes of the two t-SNE slices and of the label vector.
tuple(part.shape for part in (tr_tsne_2dim, te_tsne_2dim, target))

In [None]:
# Attach the t-SNE columns to train and test row by row.
#
# DataFrame.join aligns on index labels. te_tsne_2dim is a slice of the
# combined t-SNE table, so its labels start at len(train), while `test`
# carries the default 0-based index from read_csv. Joining as-is would give
# test rows NaN or another row's t-SNE values without changing the row
# count. Re-indexing both slices from 0 makes the join positional.
assert len(tr_tsne_2dim) == len(train), "t-SNE/train row count mismatch"
assert len(te_tsne_2dim) == len(test), "t-SNE/test row count mismatch"

train = train.join(tr_tsne_2dim.reset_index(drop=True))
test = test.join(te_tsne_2dim.reset_index(drop=True))

train.shape, test.shape

In [None]:
# Preview the first five rows of the training frame after the join.
train.head(n=5)

In [None]:
# Cross-validation settings
NFOLDS = 5
RANDOM_STATE = 871972

# Model inputs: every column of `train` apart from the label and the row id.
excluded_column = ['target', 'id']
cols = list(filter(lambda name: name not in excluded_column, train.columns))

# Shuffled, stratified splitter with a fixed seed.
folds = StratifiedKFold(
    n_splits=NFOLDS,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# LightGBM settings: 9-class objective scored with multi-class log loss
# (original note: parameters calculated by LGBtuner).
params = dict(
    metric='multi_logloss',
    objective='multiclass',
    num_class=9,
    verbosity=1,
)

In [None]:
# Out-of-fold training with LightGBM: each fold's model predicts its held-out
# rows (stored in `oof`) and the test set (averaged into `y_pred_test`).
n_classes = params['num_class']
class_labels = np.arange(n_classes)

# LightGBM 4.0 removed the `verbose_eval` / `early_stopping_rounds` keyword
# arguments of lgb.train; callbacks are the supported replacement.
# `print_evaluation` is the pre-3.3 name of `log_evaluation`.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

y_pred_test = np.zeros((len(test), n_classes))
oof = np.zeros((len(train), n_classes))
score = 0

for fold_n, (train_index, valid_index) in enumerate(folds.split(train, y=target)):
    print('Fold', fold_n)
    X_train, X_valid = train.iloc[train_index], train.iloc[valid_index]
    # split() yields positional indices, so index the labels positionally too.
    y_train = target.iloc[train_index].astype(int)
    y_valid = target.iloc[valid_index].astype(int)

    train_data = lgb.Dataset(X_train[cols], label=y_train)
    valid_data = lgb.Dataset(X_valid[cols], label=y_valid)

    # Up to 30000 rounds; stop after 300 rounds without improvement on the
    # validation set and log the metric every 300 rounds.
    lgb_model = lgb.train(params, train_data, num_boost_round=30000,
                          valid_sets=[train_data, valid_data],
                          callbacks=[lgb.early_stopping(stopping_rounds=300),
                                     log_evaluation(period=300)])

    y_pred_valid = lgb_model.predict(X_valid[cols],
                                     num_iteration=lgb_model.best_iteration)
    oof[valid_index] = y_pred_valid
    # Pass the full label set so the score is defined even if a fold lacks a class.
    score += log_loss(y_valid, y_pred_valid, labels=class_labels)

    # Average the per-fold test predictions.
    y_pred_test += lgb_model.predict(test[cols], num_iteration=lgb_model.best_iteration)/NFOLDS
print('valid logloss average:', score/NFOLDS, log_loss(target, oof, labels=class_labels))

In [None]:
# Read the sample submission; its `id` column and header are reused for the output file.
sample_submit = pd.read_csv(INPUT_PATH.joinpath("sampleSubmission.csv"))

In [None]:
# Submission = ids from the sample file followed by the averaged test
# probabilities, relabelled with the sample file's header.
proba_frame = pd.DataFrame(y_pred_test)
id_frame = sample_submit[['id']]
submit = pd.concat([id_frame, proba_frame], axis=1)
submit.columns = sample_submit.columns
submit.to_csv('submit.csv', index=False)

In [None]:
# Write the out-of-fold and test probabilities as CSVs with lgb_0..lgb_8 headers.
column_name = [f'lgb_{i}' for i in range(9)]
for array, filename in ((oof, 'oof_lgb.csv'), (y_pred_test, 'submit_lgb.csv')):
    pd.DataFrame(array, columns=column_name).to_csv(filename, index=False)

In [None]:
# Also keep the raw out-of-fold matrix in NumPy's binary format.
np.save(file="lgb_oof.npy", arr=oof)