# Submit用のテンプレート

特徴量を読み込み、`LightGBM` のモデルで判別(二値分類)を行います。

## 使用方法

1. 特徴量のデータ(CSV)をあらかじめ用意しておく。
2. 用意したデータのパスを、2番目のコードセルにある `train_data_list` と `test_data_list` に追加する。
3. このノートを実行する。

## 読み込むデータの形式について

以下のように `ID_code` を主キーとしたCSVデータを用意してください。学習用データには、正解ラベルの `target` 列も含めてください。

| ID_code | feature1 | feature2 |
|:-----------|------------:|:------------:|
| value | value | value |

In [None]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
from IPython.core.display import display
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings('ignore')

In [None]:
# CSV files that hold the training features. Add further files as extra
# entries, e.g. '../input/sample_train.csv'.
train_data_list = [
    '../input/train.csv',
]

# CSV files that hold the test features. Add further files as extra
# entries, e.g. '../input/sample_test.csv'.
test_data_list = [
    '../input/test.csv',
]

In [None]:
# Sanity check: show the (rows, columns) shape of every configured input
# file, training files first. Iterating over the configured lists (instead of
# hard-coded paths) keeps this check in sync when the lists are edited.
for path in train_data_list + test_data_list:
    display(pd.read_csv(path).shape)

In [None]:
def _load_and_merge(paths):
    """Read every CSV in *paths* and combine them into one DataFrame.

    The first file becomes the base frame; each following file is
    left-joined onto it on the ``ID_code`` column, so the rows of the first
    file are always kept. An empty *paths* yields an empty DataFrame.
    """
    merged = None
    for path in paths:
        frame = pd.read_csv(path)
        if merged is None:
            # Use an explicit "nothing loaded yet" sentinel rather than
            # len(merged) == 0, which would also be true for a first file
            # that happens to contain no rows.
            merged = frame
        else:
            merged = pd.merge(merged, frame, on='ID_code', how='left')
    return pd.DataFrame() if merged is None else merged


train = _load_and_merge(train_data_list)
test = _load_and_merge(test_data_list)

In [None]:
# Preview the first rows of the merged frames. The frames built by the
# loading cell are named `train` and `test`; the previous `train_df` /
# `test_df` names were never defined and raised NameError.
display(train.head())
display(test.head())

In [None]:
# Every column except the row identifier and the label is used as a model
# input; the original column order of `train` is preserved.
non_feature_columns = ['ID_code', 'target']
features = [column for column in train.columns if column not in non_feature_columns]

# Label column of the training frame.
target = train['target']

In [None]:
# Parameter dictionary handed to lgb.train(). The entries are grouped by
# theme for readability; the key/value pairs themselves are unchanged.
param = {
    # Objective, booster and evaluation metric
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'is_unbalance': True,
    'boost_from_average': False,

    # Tree shape, binning and learning rate
    'num_leaves': 6,
    'max_depth': 14,
    'max_bin': 63,
    'min_data_in_leaf': 45,
    'min_sum_hessian_in_leaf': 0.000446,
    'learning_rate': 0.01,

    # Row / column subsampling
    'bagging_fraction': 0.55,
    'bagging_freq': 5,
    'feature_fraction': 0.51,

    # Random seeds
    'seed': 31452,
    'feature_fraction_seed': 31415,
    'bagging_seed': 31415,
    'drop_seed': 31415,
    'data_random_seed': 31415,

    # Dataset caching and logging
    'save_binary': True,
    'verbose': 1,
}

In [None]:
# Stratified K-fold training.
# For each fold a LightGBM model is trained on the training split and
# evaluated on the validation split. Outputs of this cell:
#   oof                - out-of-fold prediction for every training row
#   predictions        - test predictions averaged over all folds
#   feature_importance - per-fold importances (columns: Feature, importance, fold)
folds = StratifiedKFold(n_splits=9, shuffle=True, random_state=2019)
num_round = 8000  # upper bound on boosting rounds (loop-invariant, so set once)

oof = np.zeros(len(train))
predictions = np.zeros(len(test))
fold_importances = []  # collected per fold and concatenated once after the loop

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print("Fold {}".format(fold_))
    val_features = train.iloc[val_idx][features]
    trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(val_features, label=target.iloc[val_idx])

    # NOTE(review): the `verbose_eval` and `early_stopping_rounds` keyword
    # arguments are believed to be unavailable in LightGBM >= 4.0, where the
    # equivalent is passed via `callbacks=`; confirm against the installed
    # version before upgrading.
    clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data],
                    verbose_eval=1000, early_stopping_rounds=250)
    oof[val_idx] = clf.predict(val_features, num_iteration=clf.best_iteration)

    fold_importances.append(pd.DataFrame({
        "Feature": features,
        "importance": clf.feature_importance(),
        "fold": fold_ + 1,
    }))

    # Each fold contributes an equal share to the averaged test prediction.
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

# A single concat avoids repeatedly copying the growing frame and avoids
# concatenating onto an empty DataFrame.
feature_importance = pd.concat(fold_importances, axis=0)

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

In [None]:
# Bar chart of feature importance averaged over the folds, saved to FI.png.
top_n = 150  # maximum number of features shown in the chart

# Rank features by their mean importance across folds and keep the top ones.
mean_importance = (feature_importance[["Feature", "importance"]]
                   .groupby("Feature")
                   .mean()
                   .sort_values(by="importance", ascending=False))
cols = mean_importance.head(top_n).index
best_features = feature_importance.loc[feature_importance.Feature.isin(cols)]

plt.figure(figsize=(14, 28))
# Pass the averaged ranking as `order` so the bars are sorted by the same
# fold-averaged value that the chart displays; sorting the per-fold rows
# instead would order the bars by individual fold values.
sns.barplot(x="importance", y="Feature", data=best_features, order=list(cols))
plt.title('Features importance (averaged/folds)')
plt.tight_layout()
plt.savefig('FI.png')

In [None]:
# Write the fold-averaged test predictions to a submission CSV whose file
# name carries the current local time (minute resolution).
now = datetime.datetime.now()
date = now.strftime('%Y%m%d%H%M')
sub_df = pd.DataFrame({
    "ID_code": test["ID_code"].values,
    "target": predictions,
})
submission_path = f"submission-{date}.csv"
sub_df.to_csv(submission_path, index=False)