In [1]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit

from sklearn.model_selection import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV

warnings.filterwarnings('ignore')




基本変数定義

In [2]:
# --- Run-mode switches -------------------------------------------------------
sampling_flg = 0  # 1: subsample train/test before modelling, anything else: use all rows
submit_flg = 1    # 1: write prediction files (only takes effect when sampling_flg == 0)

# No single fixed SEED here; the model cell loops over several seeds itself.
sample_num = 5000  # number of rows drawn from each frame when sampling is enabled
fold_num = 5       # number of stratified CV folds

# --- Train data --------------------------------------------------------------
train_dir = '../../02_feature/101_train.csv'
train_label = 'target'
train_drop_col = ['ID_code', 'target']  # columns removed to obtain the feature matrix

# --- Test data ---------------------------------------------------------------
test_dir = '../../02_feature/101_test.csv'
test_drop_col = ['ID_code']

# --- Output files (change the name prefix to your own) -----------------------
train_preds_dir = '../../03_predict_train/oka_219_LightGBM_seed_train.csv'
test_preds_dir = '../../04_predict_test/oka_219_LightGBM_seed_submission.csv'
save_col_name = 'oof_xgb'  # column header used for the saved out-of-fold predictions

# --- Submission template -----------------------------------------------------
sample_submission_dir = '../../01_input/sample_submission.csv'
submission_id_col_name = 'ID_code'
submission_target_col_name = 'target'

前処理

In [3]:
# Load: read the train and test feature tables from their configured CSV paths.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_dir, test_dir))

In [4]:
# Sampling: when enabled, shrink both frames to `sample_num` rows each.
if sampling_flg == 1:
    # random_state is fixed so the same rows are drawn on every run.
    train_df, test_df = (
        frame.sample(n=sample_num, random_state=1)
        for frame in (train_df, test_df)
    )

In [5]:
# Build x / y: the label column becomes y_train; the configured non-feature
# columns are dropped from each frame to form the feature matrices.
y_train = train_df[train_label]
x_train = train_df.drop(columns=train_drop_col)
x_test = test_df.drop(columns=test_drop_col)

モデル実行

In [None]:
#model_param
# Seed-averaged LightGBM training.
#
# For every seed a fresh StratifiedKFold split is drawn and one model is
# trained per fold. Each seed produces:
#   * oof_preds  - out-of-fold predictions for every training row
#   * test_preds - test predictions averaged over the folds
# These are accumulated into all_oof_preds / all_test_preds and finally
# divided by the number of seeds to obtain the seed-averaged predictions.
seed_list = list(range(0, 9))  # same 9 seeds (0..8) the loop has always used
num_round = 50000              # upper bound on boosting rounds; early stopping cuts it short

all_oof_preds = np.zeros((len(x_train), 1))
all_test_preds = np.zeros((len(x_test), 1))
for SEED in seed_list:
    print(SEED)
    # Every source of randomness in LightGBM is tied to the current seed so
    # that each pass of the outer loop yields a differently-randomised model.
    param = {
            'num_leaves': 2,
            'max_bin': 63,
            'min_data_in_leaf': 45,
            'learning_rate': 0.01,
            'min_sum_hessian_in_leaf': 0.000446,
            'bagging_fraction': 0.55, 
            'bagging_freq': 5, 
            'max_depth': 14,
            'save_binary': True,
            'seed': SEED,
            'feature_fraction_seed': SEED,
            'feature_fraction': 0.51,
            'bagging_seed': SEED,
            'drop_seed': SEED,
            'data_random_seed': SEED,
            'objective': 'binary',
            'boosting_type': 'gbdt',
            'verbose': 1,
            'metric': 'auc',
            'is_unbalance': True,
            'boost_from_average': False,
        }

    # The fold assignment is reshuffled with the same seed as the model.
    folds = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=SEED)
    oof_preds = np.zeros((len(x_train), 1))
    test_preds = np.zeros((len(x_test), 1))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train.values, y_train.values)):
        print("\n")
        print("Fold {}".format(fold_))
        trn_x,trn_y = x_train.iloc[trn_idx], y_train.iloc[trn_idx]
        val_x,val_y = x_train.iloc[val_idx], y_train.iloc[val_idx]
        trn_data = lgb.Dataset(trn_x,trn_y)
        val_data = lgb.Dataset(val_x,val_y)

        # NOTE(review): the verbose_eval / early_stopping_rounds keyword
        # arguments are kept for the LightGBM version this notebook was run
        # with; newer releases expect callbacks instead - confirm before upgrading.
        clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 2000)
        val_pred = clf.predict(val_x, num_iteration=clf.best_iteration)
        test_pred = clf.predict(x_test, num_iteration=clf.best_iteration)

        print("AUC = {}".format(roc_auc_score(val_y, val_pred)))
        # Each training row is in exactly one validation fold, so this fills
        # the out-of-fold vector without overlap.
        oof_preds[val_idx, :] = val_pred.reshape((-1, 1))
        test_preds += test_pred.reshape((-1, 1))

    # Average the per-fold test predictions for this seed.
    test_preds /= fold_num
    roc_score = roc_auc_score(y_train, oof_preds.ravel())
    print("Overall AUC = {}".format(roc_score))
    
    all_oof_preds+=oof_preds
    all_test_preds+=test_preds
    
# Divide by the number of seeds actually run. The previous hard-coded 10 did
# not match the 9 iterations of the loop and scaled every averaged
# probability by 0.9.
all_oof_preds/=len(seed_list)
all_test_preds/=len(seed_list)

0


Fold 0
Training until validation scores don't improve for 2000 rounds.
[1000]	training's auc: 0.794203	valid_1's auc: 0.786541
[2000]	training's auc: 0.829256	valid_1's auc: 0.81924
[3000]	training's auc: 0.848083	valid_1's auc: 0.83773
[4000]	training's auc: 0.860627	valid_1's auc: 0.849917
[5000]	training's auc: 0.86973	valid_1's auc: 0.858279
[6000]	training's auc: 0.87633	valid_1's auc: 0.864682
[7000]	training's auc: 0.881283	valid_1's auc: 0.869606
[8000]	training's auc: 0.88546	valid_1's auc: 0.873361
[9000]	training's auc: 0.888742	valid_1's auc: 0.876347
[10000]	training's auc: 0.89159	valid_1's auc: 0.8791
[11000]	training's auc: 0.893955	valid_1's auc: 0.881466
[12000]	training's auc: 0.896026	valid_1's auc: 0.883303
[13000]	training's auc: 0.897843	valid_1's auc: 0.885152
[14000]	training's auc: 0.899444	valid_1's auc: 0.886622
[15000]	training's auc: 0.90077	valid_1's auc: 0.887653
[16000]	training's auc: 0.901987	valid_1's auc: 0.88876
[17000]	training's auc: 0.903079

[11000]	training's auc: 0.891924	valid_1's auc: 0.891512
[12000]	training's auc: 0.8943	valid_1's auc: 0.893635
[13000]	training's auc: 0.896048	valid_1's auc: 0.894971
[14000]	training's auc: 0.897566	valid_1's auc: 0.896153
[15000]	training's auc: 0.899094	valid_1's auc: 0.897454
[16000]	training's auc: 0.900294	valid_1's auc: 0.898424
[17000]	training's auc: 0.901396	valid_1's auc: 0.899134
[18000]	training's auc: 0.902402	valid_1's auc: 0.899796
[19000]	training's auc: 0.903235	valid_1's auc: 0.90038
[20000]	training's auc: 0.903961	valid_1's auc: 0.90091
[21000]	training's auc: 0.904613	valid_1's auc: 0.901259
[22000]	training's auc: 0.905284	valid_1's auc: 0.901657
[23000]	training's auc: 0.905902	valid_1's auc: 0.901987
[24000]	training's auc: 0.906424	valid_1's auc: 0.90233
[25000]	training's auc: 0.906944	valid_1's auc: 0.902614
[26000]	training's auc: 0.907386	valid_1's auc: 0.902722
[27000]	training's auc: 0.907787	valid_1's auc: 0.902939
[28000]	training's auc: 0.908139	val

In [None]:
# AUC of the seed-accumulated out-of-fold predictions against the training labels.
roc_score = roc_auc_score(y_train, np.ravel(all_oof_preds))
print("Overall AUC = {}".format(roc_score))


In [None]:
# Save results: written only for a full (non-sampled) run with submit_flg on.
if submit_flg == 1 and sampling_flg == 0:
    # Persist the seed-averaged predictions (all_oof_preds / all_test_preds).
    # oof_preds / test_preds only hold the output of the LAST seed of the
    # training loop, so saving them would discard the seed averaging.
    series_oof_preds = pd.Series(data=all_oof_preds[:, 0], name=save_col_name, dtype='float')
    series_oof_preds.to_csv(train_preds_dir, header=True, index=False)

    # Fill the submission template using the configured column names rather
    # than hard-coded attribute access.
    sample = pd.read_csv(sample_submission_dir)
    sample[submission_target_col_name] = all_test_preds[:, 0].astype(float)
    # Assign IDs positionally, like the predictions, so both columns are
    # guaranteed to refer to the same test rows.
    sample[submission_id_col_name] = test_df[submission_id_col_name].values
    sample.to_csv(test_preds_dir, index=False)
    