In [1]:
import gc
import os
import logging
import datetime
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
# sklearn.grid_search was removed in scikit-learn 0.20; RandomizedSearchCV
# is provided by sklearn.model_selection (already required by the imports above).
from sklearn.model_selection import RandomizedSearchCV

warnings.filterwarnings('ignore')




基本変数定義

In [2]:
# ---- Run switches ----
sampling_flg = 0  # 1 = run on a random subsample, 0 = use every row
submit_flg = 1    # 1 = write prediction files (only takes effect when not sampling)

# ---- Reproducibility / cross-validation ----
SEED = 12345
sample_num = 10000
fold_num = 5

# ---- Train data ----
train_dir = '../../02_feature/113_train_qcut.csv'
train_drop_col = ['ID_code', 'target']
train_label = 'target'

# ---- Test data ----
test_dir = '../../02_feature/113_test_qcut.csv'
test_drop_col = ['ID_code']

# ---- Result files (change the name prefix in the paths to your own name) ----
train_preds_dir = '../../03_predict_train/oka_218_LightGBM_qcut_train.csv'
test_preds_dir = '../../04_predict_test/oka_218_LightGBM_qcut_submission.csv'
save_col_name = 'oof_xgb'

# ---- Submission template ----
sample_submission_dir = '../../01_input/sample_submission.csv'
submission_target_col_name = 'target'
submission_id_col_name = 'ID_code'

前処理

In [3]:
# Load the train and test CSV files from the configured paths.
train_df = pd.read_csv(train_dir)
test_df = pd.read_csv(test_dir)

In [4]:
# Optional down-sampling: when enabled, keep `sample_num` random rows of each frame.
if sampling_flg == 1:
    sample_kwargs = dict(n=sample_num, random_state=SEED)
    train_df = train_df.sample(**sample_kwargs)
    test_df = test_df.sample(**sample_kwargs)

In [5]:
# Build the label vector and the train/test feature frames
# by removing the configured non-feature columns.
y_train = train_df[train_label]
x_train = train_df.drop(labels=train_drop_col, axis='columns')
x_test = test_df.drop(labels=test_drop_col, axis='columns')

モデル実行

In [6]:
# LightGBM parameters passed to lgb.train (binary objective, AUC metric).
param = dict(
    num_leaves=2,  # two leaves per tree, i.e. a single split
    max_bin=63,
    min_data_in_leaf=45,
    learning_rate=0.01,
    min_sum_hessian_in_leaf=0.000446,
    bagging_fraction=0.55,
    bagging_freq=5,
    max_depth=14,  # NOTE(review): presumably never binding while num_leaves=2 -- confirm intent
    save_binary=True,
    seed=31452,  # NOTE(review): differs from the other seeds (31415) -- confirm this is intended
    feature_fraction_seed=31415,
    feature_fraction=0.51,
    bagging_seed=31415,
    drop_seed=31415,  # NOTE(review): looks relevant only to dart boosting; boosting_type is gbdt -- verify
    data_random_seed=31415,
    objective='binary',
    boosting_type='gbdt',
    verbose=1,
    metric='auc',
    is_unbalance=True,
    boost_from_average=False,
)

In [7]:
# Stratified K-fold cross-validation.
# For every fold a booster is trained on the training part; its predictions on the
# held-out part are stored as out-of-fold (OOF) predictions, and its predictions on
# the test set are accumulated and later averaged over all folds.
folds = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=SEED)
oof_preds = np.zeros((len(x_train), 1))
test_preds = np.zeros((len(x_test), 1))
num_round = 50000  # upper bound on boosting rounds per fold (early stopping may end training sooner)

for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train.values, y_train.values)):
    print("\n")
    print("Fold {}".format(fold_))

    # Positional (iloc) selection, because the split indices are positions.
    trn_x, val_x = x_train.iloc[trn_idx], x_train.iloc[val_idx]
    trn_y, val_y = y_train.iloc[trn_idx], y_train.iloc[val_idx]
    trn_data = lgb.Dataset(trn_x, trn_y)
    val_data = lgb.Dataset(val_x, val_y)

    clf = lgb.train(
        param,
        trn_data,
        num_round,
        valid_sets=[trn_data, val_data],
        verbose_eval=1000,
        early_stopping_rounds=2000,
    )

    # Predict with the iteration count recorded as best by the booster.
    val_pred = clf.predict(val_x, num_iteration=clf.best_iteration)
    test_pred = clf.predict(x_test, num_iteration=clf.best_iteration)

    print("AUC = {}".format(roc_auc_score(val_y, val_pred)))
    oof_preds[val_idx, 0] = val_pred.ravel()
    test_preds[:, 0] += test_pred.ravel()

# Turn the accumulated test predictions into a per-fold average.
test_preds /= fold_num
roc_score = roc_auc_score(y_train, oof_preds.ravel())
print("Overall AUC = {}".format(roc_score))



Fold 0
Training until validation scores don't improve for 2000 rounds.
[1000]	training's auc: 0.791293	valid_1's auc: 0.786103
[2000]	training's auc: 0.827794	valid_1's auc: 0.823577
[3000]	training's auc: 0.847561	valid_1's auc: 0.843215
[4000]	training's auc: 0.859268	valid_1's auc: 0.854495
[5000]	training's auc: 0.868616	valid_1's auc: 0.863182
[6000]	training's auc: 0.875614	valid_1's auc: 0.869505
[7000]	training's auc: 0.880959	valid_1's auc: 0.874179
[8000]	training's auc: 0.884844	valid_1's auc: 0.877496
[9000]	training's auc: 0.888523	valid_1's auc: 0.880578
[10000]	training's auc: 0.891467	valid_1's auc: 0.883076
[11000]	training's auc: 0.893597	valid_1's auc: 0.884927
[12000]	training's auc: 0.895642	valid_1's auc: 0.886612
[13000]	training's auc: 0.897548	valid_1's auc: 0.888268
[14000]	training's auc: 0.898988	valid_1's auc: 0.889287
[15000]	training's auc: 0.900409	valid_1's auc: 0.890522
[16000]	training's auc: 0.90165	valid_1's auc: 0.891646
[17000]	training's auc: 0

[11000]	training's auc: 0.893316	valid_1's auc: 0.883747
[12000]	training's auc: 0.895416	valid_1's auc: 0.885566
[13000]	training's auc: 0.897157	valid_1's auc: 0.88718
[14000]	training's auc: 0.898741	valid_1's auc: 0.88846
[15000]	training's auc: 0.900143	valid_1's auc: 0.889774
[16000]	training's auc: 0.901396	valid_1's auc: 0.891005
[17000]	training's auc: 0.902487	valid_1's auc: 0.891828
[18000]	training's auc: 0.903539	valid_1's auc: 0.892699
[19000]	training's auc: 0.904459	valid_1's auc: 0.893416
[20000]	training's auc: 0.905233	valid_1's auc: 0.894055
[21000]	training's auc: 0.905885	valid_1's auc: 0.894514
[22000]	training's auc: 0.906564	valid_1's auc: 0.895064
[23000]	training's auc: 0.907121	valid_1's auc: 0.89541
[24000]	training's auc: 0.907628	valid_1's auc: 0.895652
[25000]	training's auc: 0.908115	valid_1's auc: 0.895982
[26000]	training's auc: 0.908584	valid_1's auc: 0.896273
[27000]	training's auc: 0.909	valid_1's auc: 0.896532
[28000]	training's auc: 0.909377	vali

In [8]:
# Save results -- only for a full (non-sampled) run with saving enabled.
if submit_flg == 1 and sampling_flg == 0:
    # Out-of-fold train predictions: one column named `save_col_name`, no index.
    series_oof_preds = pd.Series(data=oof_preds[:, 0], name=save_col_name, dtype='float')
    series_oof_preds.to_csv(train_preds_dir, header=True, index=False)

    # Fill in the sample-submission template and write the test predictions.
    # Columns are assigned by key using the configured column names instead of
    # attribute access (`sample.target = ...`): attribute assignment does not
    # create a column when the name is missing from the template, which would
    # leave the written file without the predictions.
    sample = pd.read_csv(sample_submission_dir)
    sample[submission_target_col_name] = test_preds[:, 0].astype(float)
    # `.values` makes the copy positional, matching the row order of `test_preds`,
    # rather than relying on index alignment between the two frames.
    # Assumes the test file uses the same ID column name as the submission -- TODO confirm.
    sample[submission_id_col_name] = test_df[submission_id_col_name].values
    sample.to_csv(test_preds_dir, index=False)
    