In [None]:
""" やること
1. LightGBMのトレーニング
2. モデルの精度検証
3. モデルを保存(pickle)
4. テストデータの予測を保存(csv)
"""
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score

In [None]:
"""
各種設定
"""
# Notebook-wide settings: the seed reused by later cells, and a wider column
# limit so DataFrames with up to 100 columns are displayed without truncation.
RANDOM_STATE = 2020
pd.options.display.max_columns = 100

In [None]:
"""
データ読み込み
train_X, train_y
test_X
"""
# Read the training table (first CSV column is used as the index) and order
# the rows by that index before splitting it into features and label.
train = pd.read_csv("../../../cat-in-the-dat-ii/output/datas/train_tree.csv", index_col=0)
train = train.sort_index()
train_y = train['target']
train_X = train.drop(columns=['target'])

# Test features get the same treatment: index from the first column, sorted.
test_X = pd.read_csv("../../../cat-in-the-dat-ii/output/datas/test_tree.csv", index_col=0)
test_X = test_X.sort_index()

# Submission template; its index is reused when the predictions are written out.
sample_submission = pd.read_csv("../../../cat-in-the-dat-ii/input/sample_submission.csv", index_col=0)

# The combined frame is no longer needed once train_X / train_y exist.
del train

In [None]:
"""
KFoldでvalidation
"""
# Stratified 5-fold cross-validation. The folds are intentionally not shuffled,
# so no random_state is passed: scikit-learn >= 0.24 raises ValueError when a
# seed is combined with shuffle=False (the seed had no effect in that case).
# Unstratified alternative: kf = KFold(n_splits=5, shuffle=False)
kf = StratifiedKFold(n_splits=5, shuffle=False)

EnsembleModels = []  # one (name, fitted model) tuple per fold
RocScores = {}       # fold number as str -> [validation ROC AUC]

# Per-iteration evaluation logging is called lgb.log_evaluation in newer
# LightGBM releases and lgb.print_evaluation in older ones; use whichever exists.
_log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

# Wrap the split generator (not enumerate) and pass the total so tqdm can show
# real progress instead of a bare counter.
fold_splits = tqdm(kf.split(train_X, train_y), total=kf.get_n_splits())
for i, (tr_index, val_index) in enumerate(fold_splits):
    # Slice the validation fold once; it is used for early stopping and scoring.
    val_X = train_X.iloc[val_index, :]
    val_y = train_y.iloc[val_index]

    model = lgb.LGBMClassifier(
        learning_rate=0.05,
        feature_fraction=0.1,
        min_data_in_leaf=12,
        max_depth=3,
        reg_alpha=1,
        reg_lambda=1,
        objective='binary',
        metric='auc',
        n_jobs=-1,
        n_estimators=5000,
        feature_fraction_seed=42,
        bagging_seed=42,
        boosting_type='gbdt',
        verbose=1,
        is_unbalance=True,
        boost_from_average=False,
        random_state=RANDOM_STATE)

    # Early stopping (100 rounds without AUC improvement on the validation
    # fold) and per-iteration logging are supplied as callbacks: the
    # early_stopping_rounds / verbose keyword arguments of fit() were removed
    # in LightGBM 4.0, while callbacks are accepted by older releases too.
    TempModel = model.fit(
        train_X.iloc[tr_index, :], train_y.iloc[tr_index],
        eval_set=[(val_X, val_y)],
        eval_metric='auc',
        callbacks=[lgb.early_stopping(stopping_rounds=100), _log_evaluation(period=1)])
    EnsembleModels.append(
        (f'LightGbm_{i}', TempModel))

    # Validation: ROC AUC of the positive-class probability on the held-out
    # fold, stored as a one-element list keyed by the fold number.
    RocScores[f'{i}'] = [
        roc_auc_score(val_y, TempModel.predict_proba(val_X)[:, 1]),
    ]

In [None]:
"""
モデルを保存
"""
def _save_pickle(obj, path, message):
    """Pickle `obj` to `path` (binary write) and print `message` after the dump."""
    with open(path, mode="wb") as out_file:
        pickle.dump(obj, out_file)
        print(message)


# Save the per-fold models.
_save_pickle(EnsembleModels, "../../../cat-in-the-dat-ii/output/models/LightGBM.pkl", 'モデル保存しました')

# Save the local CV results.
_save_pickle(RocScores, "../../../cat-in-the-dat-ii/output/results/LightGBM_localCV.pkl", 'ローカルCVを保存しました')


In [None]:
"""
テストデータの予測と保存
"""
# Growing a DataFrame with dataframe.append() is very slow; collecting the
# columns in a dict and building the DataFrame once is much faster.
fold_predictions = {
    model_name: pd.Series(
        data=fitted_model.predict_proba(test_X)[:, 1],  # positive-class probability
        index=test_X.index,
        name=model_name,
    )
    for model_name, fitted_model in tqdm(EnsembleModels)
}

# One column per fold model; the row labels are then replaced (by position)
# with the submission template's index.
EnsembleDf = pd.DataFrame(fold_predictions)
EnsembleDf = EnsembleDf.set_index(keys=sample_submission.index)

# The submitted value is the plain average of the fold models' predictions.
sample_submission['target'] = EnsembleDf.mean(axis=1)
sample_submission.to_csv("../../../cat-in-the-dat-ii/output/datas/sample_submission_lgb.csv")
print('sample_submission_lgbを保存しました')

In [None]:
"""
特徴重要度
"""
# Plot the gain-based feature importance of the first fold's model and save it.
# The Axes is created explicitly and handed to plot_importance: with a bare
# plt.figure() the plot went into a second figure opened by plot_importance,
# leaving an empty figure behind and making savefig depend on which figure
# happened to be current.
fig, ax = plt.subplots()
img = lgb.plot_importance(EnsembleModels[0][1], ax=ax,
                          importance_type='gain', title='LightGBM feature importance gain')
fig.savefig("../../../cat-in-the-dat-ii/output/results/lgb_feature_importance", bbox_inches='tight')

# The disabled block below plots frequency-based importance instead of gain.
# feature_imp = pd.DataFrame(sorted(zip(EnsembleModels[0][1].feature_importances_,train_X.columns)), columns=['Value','Feature'])
# 
# plt.figure(figsize=(20, 10))
# sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
# plt.title('LightGBM Features (avg over folds)')
# plt.tight_layout()
# plt.show()


In [None]:
# Show the per-fold validation ROC AUC scores collected in the CV loop
# (fold number as str -> one-element list).
print(RocScores)

In [None]:
# Hard-coded per-fold ROC AUC scores in the same layout as RocScores
# (fold number as str -> one-element list); used as the "old" side of the
# comparison. Presumably copied from an earlier run -- TODO confirm which one.
a = {
    '0': [0.7103774122037242],
    '1': [0.7295190845480759],
    '2': [0.7171733903991969],
    '3': [0.7080964594871795],
    '4': [0.7273620376188681],
}


In [None]:
def _mean_fold_score(scores_by_fold):
    """Return the mean of every score stored in a {fold: [score, ...]} mapping."""
    return np.asarray(list(scores_by_fold.values())).ravel().mean()


# Mean CV score of the current run.
new = _mean_fold_score(RocScores)
print(new)

# Mean CV score of the hard-coded baseline.
old = _mean_fold_score(a)
print(old)

# A positive difference means the current run scored higher than the baseline.
print(f'new - old = {new - old}')