In [1]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit

from sklearn.model_selection import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV

warnings.filterwarnings('ignore')




基本変数定義

In [2]:
sampling_flg=0 #サンプリング有無をコントロール
submit_flg=1 #保存するかをコントロール（サンプリングしない時のみ）

SEED=12345
sample_num=10000
fold_num=5

#train関連
train_dir='../../02_feature/101_train.csv'
train_feature_dir='../../02_feature/113_train_NegativeCount.csv'
train_drop_col=['ID_code', 'target']
train_label='target'

#test関連
test_dir='../../02_feature/101_test.csv'
test_feature_dir='../../02_feature/113_test_NegativeCount.csv'
test_drop_col=['ID_code']

#結果ファイル関連　nameは自分の名前に変更する
train_preds_dir='../../03_predict_train/oka_220_LightGBM_NegCount_train.csv'
test_preds_dir='../../04_predict_test/oka_220_LightGBM_NegCount_submission.csv'
save_col_name='oof_xgb'

sample_submission_dir='../../01_input/sample_submission.csv'
submission_target_col_name='target'
submission_id_col_name='ID_code'

前処理

In [3]:
#ロード
train_df=pd.read_csv(train_dir)
test_df=pd.read_csv(test_dir)
train_feature_df=pd.read_csv(train_feature_dir)
test_feature_df=pd.read_csv(test_feature_dir)
train_df=pd.concat([train_df,train_feature_df],axis=1)
test_df=pd.concat([test_df,test_feature_df],axis=1)

In [4]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,...,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199,NegCount
0,0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,...,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914,36
1,1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,...,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518,33
2,2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,...,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965,42
3,3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,...,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996,39
4,4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,...,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104,39


In [5]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,ID_code,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199,NegCount
0,0,test_0,11.0656,7.7798,12.9536,9.4292,11.4327,-2.3805,5.8493,18.2675,...,11.8495,-1.43,2.4508,13.7112,2.4669,4.3654,10.72,15.4722,-8.7197,42
1,1,test_1,8.5304,1.2543,11.3047,5.1858,9.1974,-4.0117,6.0196,18.6316,...,8.8349,0.9403,10.1282,15.5765,0.4773,-1.4852,9.8714,19.1293,-20.976,36
2,2,test_2,5.4827,-10.3581,10.1407,7.0479,10.2628,9.8052,4.895,20.2537,...,10.9935,1.9803,2.18,12.9813,2.1281,-7.1086,7.0618,19.8956,-23.1794,38
3,3,test_3,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,...,9.0766,1.658,3.5813,15.1874,3.1656,3.9567,9.2295,13.0168,-4.2108,23
4,4,test_4,11.7058,-0.1327,14.1295,7.7506,9.1035,-8.5848,6.8595,10.6048,...,9.1723,1.2835,3.3778,19.5542,-0.286,-5.1612,7.2882,13.926,-9.1846,40


In [6]:
#サンプリング
if sampling_flg ==1:
    train_df=train_df.sample(n=sample_num,random_state=SEED)
    test_df=test_df.sample(n=sample_num,random_state=SEED)

In [7]:
#x,y作成
x_train=train_df.drop(train_drop_col,axis=1)
y_train=train_df[train_label]
x_test=test_df.drop(test_drop_col,axis=1)

モデル実行

In [8]:
#model_param
param = {
        'num_leaves': 2,
        'max_bin': 63,
        'min_data_in_leaf': 45,
        'learning_rate': 0.01,
        'min_sum_hessian_in_leaf': 0.000446,
        'bagging_fraction': 0.55, 
        'bagging_freq': 5, 
        'max_depth': 14,
        'save_binary': True,
        'seed': 31452,
        'feature_fraction_seed': 31415,
        'feature_fraction': 0.51,
        'bagging_seed': 31415,
        'drop_seed': 31415,
        'data_random_seed': 31415,
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'verbose': 1,
        'metric': 'auc',
        'is_unbalance': True,
        'boost_from_average': False,
    }

In [9]:
folds = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=SEED)
oof_preds = np.zeros((len(x_train), 1))
test_preds = np.zeros((len(x_test), 1))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train.values, y_train.values)):
    print("\n")
    print("Fold {}".format(fold_))
    trn_x,trn_y = x_train.iloc[trn_idx], y_train.iloc[trn_idx]
    val_x,val_y = x_train.iloc[val_idx], y_train.iloc[val_idx]
    trn_data = lgb.Dataset(trn_x,trn_y)
    val_data = lgb.Dataset(val_x,val_y)
    
    num_round = 50000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 2000)
    val_pred = clf.predict(val_x, num_iteration=clf.best_iteration)
    test_pred = clf.predict(x_test, num_iteration=clf.best_iteration)
    
    print("AUC = {}".format(roc_auc_score(val_y, val_pred)))
    #print("val = {}".format(val_pred)
    oof_preds[val_idx, :] = val_pred.reshape((-1, 1))
    test_preds += test_pred.reshape((-1, 1))
    
test_preds /= fold_num
roc_score = roc_auc_score(y_train, oof_preds.ravel())
print("Overall AUC = {}".format(roc_score))



Fold 0
Training until validation scores don't improve for 2000 rounds.
[1000]	training's auc: 0.790731	valid_1's auc: 0.785243
[2000]	training's auc: 0.827752	valid_1's auc: 0.824177
[3000]	training's auc: 0.846947	valid_1's auc: 0.842507
[4000]	training's auc: 0.8596	valid_1's auc: 0.854427
[5000]	training's auc: 0.868846	valid_1's auc: 0.863453
[6000]	training's auc: 0.875844	valid_1's auc: 0.869471
[7000]	training's auc: 0.881053	valid_1's auc: 0.874165
[8000]	training's auc: 0.885232	valid_1's auc: 0.877704
[9000]	training's auc: 0.888408	valid_1's auc: 0.880445
[10000]	training's auc: 0.891327	valid_1's auc: 0.882815
[11000]	training's auc: 0.893667	valid_1's auc: 0.884863
[12000]	training's auc: 0.895701	valid_1's auc: 0.88656
[13000]	training's auc: 0.897472	valid_1's auc: 0.888006
[14000]	training's auc: 0.898992	valid_1's auc: 0.889132
[15000]	training's auc: 0.900366	valid_1's auc: 0.890254
[16000]	training's auc: 0.901644	valid_1's auc: 0.891154
[17000]	training's auc: 0.9

[12000]	training's auc: 0.895512	valid_1's auc: 0.885677
[13000]	training's auc: 0.897271	valid_1's auc: 0.887385
[14000]	training's auc: 0.898876	valid_1's auc: 0.888707
[15000]	training's auc: 0.900318	valid_1's auc: 0.889933
[16000]	training's auc: 0.901565	valid_1's auc: 0.891033
[17000]	training's auc: 0.902644	valid_1's auc: 0.891955
[18000]	training's auc: 0.90368	valid_1's auc: 0.892792
[19000]	training's auc: 0.904456	valid_1's auc: 0.893482
[20000]	training's auc: 0.905258	valid_1's auc: 0.894097
[21000]	training's auc: 0.905976	valid_1's auc: 0.894637
[22000]	training's auc: 0.906597	valid_1's auc: 0.895082
[23000]	training's auc: 0.907168	valid_1's auc: 0.895408
[24000]	training's auc: 0.907676	valid_1's auc: 0.895767
[25000]	training's auc: 0.908154	valid_1's auc: 0.896075
[26000]	training's auc: 0.90862	valid_1's auc: 0.896391
[27000]	training's auc: 0.909022	valid_1's auc: 0.896669
[28000]	training's auc: 0.909422	valid_1's auc: 0.896849
[29000]	training's auc: 0.909784	

In [10]:
#結果保存
if (submit_flg ==1 and sampling_flg==0):
    series_oof_preds = pd.Series(data=oof_preds[:,0], name=save_col_name, dtype='float')
    series_oof_preds.to_csv(train_preds_dir,header=True, index=False)

    sample = pd.read_csv(sample_submission_dir)
    sample.target = test_preds[:,0].astype(float)
    sample.ID_code = test_df['ID_code']
    sample.to_csv(test_preds_dir, index=False)
    