In [34]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

warnings.filterwarnings('ignore')


基本変数定義

In [35]:
sampling_flg=0 #サンプリング有無をコントロール
submit_flg=1 #保存するかをコントロール（サンプリングしない時のみ）

SEED=12345
sample_num=10000
fold_num=5

#train関連
train_dir='../../01_input/train.csv'
train_FN_dir='../../03_predict_train/FalseNegative_train.csv'
train_drop_col=['ID_code', 'target']
train_label='target'

#test関連
test_dir='../../01_input/test.csv'
test_drop_col=['ID_code']

#結果ファイル関連　nameは自分の名前に変更する
fin_train_preds_dir='../../03_predict_train/oka_252-2_LightGBM_train.csv'
fin_test_preds_dir='../../04_predict_test/oka_252-2_LightGBM_submission.csv'
save_col_name='oof_xgb'

sample_submission_dir='../../01_input/sample_submission.csv'
submission_target_col_name='target'
submission_id_col_name='ID_code'

前処理

In [36]:
#ロード
train_df=pd.read_csv(train_dir)
test_df=pd.read_csv(test_dir)
#train_FN_df=pd.read_csv(train_FN_dir)

In [37]:
#サンプリング
if sampling_flg ==1:
    train_df=train_df.sample(n=sample_num,random_state=SEED)
    test_df=test_df.sample(n=sample_num,random_state=SEED)

In [38]:
#x,y作成
x_train=train_df.drop(train_drop_col,axis=1)
y_train=train_df[train_label]
x_train_base=train_df.drop(train_drop_col,axis=1)
y_train_base=train_df[train_label]
x_test=test_df.drop(test_drop_col,axis=1)
x_train_col=x_train.columns

モデル実行

In [39]:
def lgbm(train_x,train_y,test_x,test_y):
    #model_param
    param = {
        "objective" : "binary", 
        "boost":"gbdt",
        "metric":"auc",
        "boost_from_average":"false",
        "num_threads":28,
        "learning_rate" : 0.01,
        "num_leaves" : 13,
        "max_depth":-1,
        "tree_learner" : "serial",
        "feature_fraction" : 0.05,
        "bagging_freq" : 5,
        "bagging_fraction" : 0.4,
        "min_data_in_leaf" : 80,
        "min_sum_hessian_in_leaf" : 10.0,
        "verbosity" : 1,
        'seed': 44000,
        }

    trn_data = lgb.Dataset(train_x,train_y)
    val_data = lgb.Dataset(test_x,test_y)
    
    num_round = 1000000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3000)
    
    val_pred = clf.predict(test_x, num_iteration=clf.best_iteration)
    auc_score=roc_auc_score(test_y, val_pred)
    print("")
    print("AUC = {}".format(auc_score))
    print(confusion_matrix(test_y, pd.DataFrame(np.round(val_pred))))
    
    return clf

In [40]:
def split_func(df,SEED):
    ss=ShuffleSplit(n_splits=1,train_size=0.8,test_size=0.2,random_state=SEED)
    x=df.drop(train_drop_col,axis=1)
    y=df[train_label]
    train_index,test_index=next(ss.split(x,y))
    train_df_sample=df.iloc[train_index]
    test_df_sample=df.iloc[test_index]
    return train_df_sample,test_df_sample

In [41]:
train_df_sample,test_df_sample=split_func(train_df,1111)


In [42]:
j=0
for i in range(10,15):
    print(i)
    folds = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=i)
    oof_preds = np.zeros((len(x_train), 1))
    test_preds = np.zeros((len(x_test), 1))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train.values, y_train.values)):
        print("\n")
        print("Fold {}".format(fold_))
        j+=1
        train_df_sample=train_df.iloc[trn_idx]
        sumple_num=round((len(train_df_sample[train_df_sample['target']==1])))
        train_df_sample_1=train_df_sample[train_df_sample['target']==1].sample(n=sumple_num,random_state=j)
        train_df_sample_0=train_df_sample[train_df_sample['target']==0].sample(n=round(sumple_num),random_state=j+100)
        train_df_sample=pd.concat([train_df_sample_0,train_df_sample_1],axis=0)
        print('train 1:'+str(len(train_df_sample[train_df_sample['target']==1])))
        print('train 0:'+str(len(train_df_sample[train_df_sample['target']==0])))
        print('test 1:'+str(len(test_df_sample[test_df_sample['target']==1])))
        print('test 0:'+str(len(test_df_sample[test_df_sample['target']==0])))

        trn_x=train_df_sample.drop(train_drop_col,axis=1)
        trn_y=train_df_sample[train_label]
        val_x,val_y = x_train.iloc[val_idx], y_train.iloc[val_idx]

        trn_data = lgb.Dataset(trn_x,trn_y)
        val_data = lgb.Dataset(val_x,val_y)

        param = {
        "objective" : "binary", 
        "boost":"gbdt",
        "metric":"auc",
        "boost_from_average":"false",
        "num_threads":28,
        "learning_rate" : 0.01,
        "num_leaves" : 13,
        "max_depth":2,
        "tree_learner" : "serial",
        "feature_fraction" : 0.05,
        "bagging_freq" : 5,
        "bagging_fraction" : 0.4,
        "min_data_in_leaf" : 80,
        "min_sum_hessian_in_leaf" : 10.0,
        "verbosity" : 1,
        'seed': i,
        'scale_pos_weight':0.35
        }
        num_round = 1000000
        clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3000)
        val_pred = clf.predict(val_x, num_iteration=clf.best_iteration)
        test_pred = clf.predict(x_test, num_iteration=clf.best_iteration)

        print("AUC = {}".format(roc_auc_score(val_y, val_pred)))
        print(confusion_matrix(val_y, pd.DataFrame(np.round(val_pred))))
        #print("val = {}".format(val_pred)
        oof_preds[val_idx, :] = val_pred.reshape((-1, 1))
        test_preds += test_pred.reshape((-1, 1))

    test_preds /= fold_num
    roc_score = roc_auc_score(y_train, oof_preds.ravel())
    print("Overall AUC = {}".format(roc_score))
    print(confusion_matrix(y_train, pd.DataFrame(np.round(oof_preds))))

    train_preds_dir='../../03_predict_train/US4_train_'+str(i)+'.csv'
    test_preds_dir='../../04_predict_test/US4_test_'+str(i)+'.csv'
    
    pd.DataFrame(oof_preds).to_csv(train_preds_dir, index=False)
    pd.DataFrame(test_preds).to_csv(test_preds_dir, index=False)
    print('save done')


10


Fold 0
train 1:16078
train 0:16078
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.8644	valid_1's auc: 0.843348
[2000]	training's auc: 0.882437	valid_1's auc: 0.859345
[3000]	training's auc: 0.895111	valid_1's auc: 0.870655
[4000]	training's auc: 0.903661	valid_1's auc: 0.877649
[5000]	training's auc: 0.910015	valid_1's auc: 0.882821
[6000]	training's auc: 0.914634	valid_1's auc: 0.886322
[7000]	training's auc: 0.918218	valid_1's auc: 0.888665
[8000]	training's auc: 0.921242	valid_1's auc: 0.890398
[9000]	training's auc: 0.923673	valid_1's auc: 0.891635
[10000]	training's auc: 0.925854	valid_1's auc: 0.892655
[11000]	training's auc: 0.927715	valid_1's auc: 0.893223
[12000]	training's auc: 0.929463	valid_1's auc: 0.893665
[13000]	training's auc: 0.930974	valid_1's auc: 0.894013
[14000]	training's auc: 0.932351	valid_1's auc: 0.894175
[15000]	training's auc: 0.933713	valid_1's auc: 0.894242
[16000]	training's auc: 0.9

[8000]	training's auc: 0.922561	valid_1's auc: 0.892476
[9000]	training's auc: 0.925088	valid_1's auc: 0.893699
[10000]	training's auc: 0.927175	valid_1's auc: 0.894428
[11000]	training's auc: 0.929044	valid_1's auc: 0.894949
[12000]	training's auc: 0.930604	valid_1's auc: 0.895295
[13000]	training's auc: 0.932098	valid_1's auc: 0.89561
[14000]	training's auc: 0.933543	valid_1's auc: 0.895758
[15000]	training's auc: 0.934894	valid_1's auc: 0.895838
[16000]	training's auc: 0.936151	valid_1's auc: 0.895794
[17000]	training's auc: 0.937358	valid_1's auc: 0.895691
Early stopping, best iteration is:
[14956]	training's auc: 0.934834	valid_1's auc: 0.895864
AUC = 0.8958635990996353
[[33730  2251]
 [ 1555  2465]]


Fold 2
train 1:16078
train 0:16078
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.866096	valid_1's auc: 0.853578
[2000]	training's auc: 0.883568	valid_1's auc: 0.868385
[3000]	training's auc: 0.894895	valid_1's auc: 

[17000]	training's auc: 0.936039	valid_1's auc: 0.896033
Early stopping, best iteration is:
[14679]	training's auc: 0.933138	valid_1's auc: 0.896302
AUC = 0.8963022436455854
[[33678  2302]
 [ 1543  2477]]


Fold 3
train 1:16079
train 0:16079
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.866809	valid_1's auc: 0.859071
[2000]	training's auc: 0.882017	valid_1's auc: 0.871206
[3000]	training's auc: 0.894041	valid_1's auc: 0.88122
[4000]	training's auc: 0.903074	valid_1's auc: 0.888117
[5000]	training's auc: 0.9091	valid_1's auc: 0.892577
[6000]	training's auc: 0.913809	valid_1's auc: 0.895745
[7000]	training's auc: 0.917505	valid_1's auc: 0.897934
[8000]	training's auc: 0.920496	valid_1's auc: 0.899535
[9000]	training's auc: 0.922893	valid_1's auc: 0.900737
[10000]	training's auc: 0.924882	valid_1's auc: 0.901328
[11000]	training's auc: 0.926749	valid_1's auc: 0.901945
[12000]	training's auc: 0.928365	valid_1's auc: 0.9022

Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.866106	valid_1's auc: 0.850572
[2000]	training's auc: 0.883993	valid_1's auc: 0.866723
[3000]	training's auc: 0.896354	valid_1's auc: 0.876867
[4000]	training's auc: 0.905254	valid_1's auc: 0.883826
[5000]	training's auc: 0.911621	valid_1's auc: 0.888188
[6000]	training's auc: 0.916576	valid_1's auc: 0.891425
[7000]	training's auc: 0.920285	valid_1's auc: 0.893686
[8000]	training's auc: 0.923171	valid_1's auc: 0.894907
[9000]	training's auc: 0.925715	valid_1's auc: 0.895801
[10000]	training's auc: 0.927874	valid_1's auc: 0.896482
[11000]	training's auc: 0.929803	valid_1's auc: 0.896978
[12000]	training's auc: 0.931478	valid_1's auc: 0.89727
[13000]	training's auc: 0.93299	valid_1's auc: 0.897501
[14000]	training's auc: 0.934399	valid_1's auc: 0.897488
[15000]	training's auc: 0.935751	valid_1's auc: 0.897524
[16000]	training's auc: 0.937082	valid_1's auc: 0.897486
[17000]	training's auc: 0.938286	va

In [43]:
import glob
train_stack_df=pd.DataFrame(y_train)
test_stack_df=pd.DataFrame()
files = glob.glob('../../03_predict_train/*')
i=0
for file in files:
    if 'US4' in file:
        i+=1
        print(file)
        train_stack_df_tmp=pd.read_csv(file)
        train_stack_df['pred_'+str(i)]=train_stack_df_tmp

files = glob.glob('../../04_predict_test/*')
i=0
for file in files:
    if 'US4' in file:
        i+=1
        print(file)
        test_stack_df_tmp=pd.read_csv(file)
        
        if i == 1 :
            test_stack_df=test_stack_df_tmp
            test_stack_df.columns=['pred_1']
        else:
            test_stack_df['pred_'+str(i)]=test_stack_df_tmp


../../03_predict_train/US4_train_6.csv
../../03_predict_train/US4_train_0.csv
../../03_predict_train/US4_train_10.csv
../../03_predict_train/US4_train_2.csv
../../03_predict_train/US4_train_9.csv
../../03_predict_train/US4_train_12.csv
../../03_predict_train/US4_train_3.csv
../../03_predict_train/US4_train_13.csv
../../03_predict_train/US4_train_5.csv
../../03_predict_train/US4_train_14.csv
../../03_predict_train/US4_train_4.csv
../../03_predict_train/US4_train_11.csv
../../03_predict_train/US4_train_7.csv
../../03_predict_train/US4_train_8.csv
../../03_predict_train/US4_train_1.csv
../../04_predict_test/US4_test_2.csv
../../04_predict_test/US4_test_5.csv
../../04_predict_test/US4_test_7.csv
../../04_predict_test/US4_test_3.csv
../../04_predict_test/US4_test_12.csv
../../04_predict_test/US4_test_14.csv
../../04_predict_test/US4_test_1.csv
../../04_predict_test/US4_test_10.csv
../../04_predict_test/US4_test_11.csv
../../04_predict_test/US4_test_8.csv
../../04_predict_test/US4_test_4.csv

In [44]:
pd.set_option('display.max_rows', 500)
train_stack_df

Unnamed: 0,target,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,pred_9,pred_10,pred_11,pred_12,pred_13,pred_14,pred_15
0,0,0.052079,0.031713,0.035480,0.039307,0.035985,0.027121,0.028039,0.025121,0.033111,0.037276,0.044877,0.029682,0.035349,0.038714,0.047973
1,0,0.667171,0.635790,0.698043,0.711116,0.806744,0.659812,0.730814,0.709804,0.596899,0.679815,0.808299,0.601934,0.770502,0.718913,0.658280
2,0,0.026545,0.021594,0.032187,0.014066,0.022994,0.024364,0.019981,0.020349,0.014042,0.018554,0.026500,0.023222,0.041059,0.023959,0.030498
3,0,0.556307,0.616654,0.443866,0.418800,0.432806,0.449670,0.481231,0.493116,0.573818,0.548497,0.532940,0.417150,0.414880,0.502474,0.490040
4,0,0.276644,0.227806,0.252867,0.192376,0.283271,0.321417,0.219499,0.324173,0.265672,0.204881,0.167591,0.337346,0.249824,0.268328,0.291390
5,0,0.058496,0.095212,0.078045,0.078587,0.083342,0.113306,0.108515,0.064336,0.060302,0.084020,0.069516,0.067374,0.071991,0.089081,0.094919
6,0,0.365617,0.284446,0.284630,0.308387,0.308295,0.333347,0.325806,0.351844,0.219929,0.237872,0.352156,0.195247,0.220906,0.331294,0.275306
7,0,0.216346,0.199884,0.150696,0.158361,0.159124,0.215368,0.168003,0.210862,0.183186,0.288769,0.217961,0.165945,0.292777,0.154378,0.211319
8,0,0.361577,0.243293,0.216996,0.179118,0.332003,0.185872,0.290422,0.265407,0.215659,0.177651,0.212144,0.207208,0.231543,0.345183,0.213261
9,0,0.061365,0.062464,0.027330,0.063967,0.039341,0.045156,0.043926,0.036649,0.041906,0.045918,0.042853,0.042500,0.048358,0.043037,0.058247


In [45]:
test_stack_df

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,pred_9,pred_10,pred_11,pred_12,pred_13,pred_14,pred_15
0,0.351845,0.287049,0.338893,0.301523,0.314517,0.317818,0.326513,0.273614,0.315235,0.329322,0.308512,0.302337,0.272527,0.299520,0.349885
1,0.430416,0.438003,0.398194,0.420364,0.400058,0.380706,0.391792,0.462299,0.467725,0.468847,0.452122,0.466909,0.447137,0.438719,0.408615
2,0.422369,0.469076,0.401089,0.474984,0.439485,0.421118,0.440327,0.389649,0.439583,0.447883,0.470608,0.434656,0.435010,0.429370,0.418825
3,0.377699,0.403414,0.390639,0.396505,0.372701,0.404724,0.393548,0.431346,0.414353,0.438294,0.442795,0.340346,0.385191,0.426123,0.436163
4,0.120688,0.117561,0.120819,0.144650,0.145883,0.133725,0.115471,0.116814,0.143039,0.098506,0.113495,0.125937,0.117471,0.113983,0.130939
5,0.008543,0.007918,0.007615,0.008425,0.007238,0.007091,0.008520,0.008013,0.007683,0.007387,0.006904,0.008961,0.006540,0.008527,0.008082
6,0.017891,0.021898,0.016508,0.021101,0.017794,0.024119,0.025422,0.025068,0.025560,0.024076,0.018309,0.022528,0.018945,0.024515,0.025996
7,0.397416,0.363283,0.432741,0.406151,0.343510,0.409389,0.361734,0.383287,0.432089,0.466993,0.388408,0.407245,0.426339,0.408939,0.380521
8,0.009163,0.009971,0.007941,0.008206,0.009895,0.008314,0.009327,0.008238,0.010264,0.007271,0.009795,0.007960,0.009037,0.009730,0.007952
9,0.030382,0.027777,0.030035,0.021619,0.027818,0.025358,0.035090,0.029011,0.034118,0.025515,0.027242,0.032408,0.025679,0.025423,0.029018


In [46]:
train_stack_df.describe()

Unnamed: 0,target,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,pred_9,pred_10,pred_11,pred_12,pred_13,pred_14,pred_15
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0
mean,0.10049,0.201627,0.201752,0.201337,0.200567,0.201348,0.201011,0.200892,0.20171,0.201247,0.200876,0.201138,0.20129,0.20122,0.200802,0.201019
std,0.300653,0.226904,0.230667,0.22962,0.231889,0.23056,0.231077,0.230822,0.232703,0.228686,0.229307,0.232885,0.227363,0.229651,0.228983,0.229783
min,0.0,0.000694,0.000503,0.000546,0.000373,0.000455,0.000424,0.000496,0.000442,0.000452,0.000457,0.00043,0.000468,0.000479,0.000455,0.000601
25%,0.0,0.043623,0.041343,0.041747,0.039857,0.041076,0.040486,0.040638,0.040094,0.04216,0.041387,0.039406,0.043086,0.041537,0.041726,0.041363
50%,0.0,0.10889,0.105957,0.106074,0.103565,0.105528,0.104766,0.104729,0.10429,0.10723,0.106014,0.103283,0.108068,0.106045,0.106396,0.10601
75%,0.0,0.271207,0.272331,0.271833,0.270322,0.271919,0.271204,0.270152,0.271381,0.270542,0.270054,0.270791,0.270749,0.271274,0.270473,0.270964
max,1.0,0.999717,0.99978,0.999687,0.999747,0.999805,0.999759,0.999781,0.999817,0.999852,0.9998,0.999898,0.999687,0.999597,0.999812,0.9997


In [47]:
test_stack_df.describe()

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,pred_9,pred_10,pred_11,pred_12,pred_13,pred_14,pred_15
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0
mean,0.19545,0.196338,0.196174,0.195754,0.195956,0.195929,0.195967,0.196345,0.196264,0.195857,0.19603,0.196745,0.196799,0.196358,0.196679
std,0.213432,0.210549,0.211477,0.21242,0.212864,0.211203,0.211691,0.211392,0.209257,0.210902,0.214443,0.208969,0.2143,0.212317,0.212271
min,0.000445,0.000548,0.000558,0.000508,0.000423,0.000562,0.000583,0.000595,0.000588,0.000564,0.000435,0.000567,0.000455,0.000484,0.000532
25%,0.044466,0.046891,0.046142,0.045304,0.045229,0.046308,0.045935,0.046375,0.047796,0.046433,0.04416,0.048379,0.044791,0.045829,0.045997
50%,0.110944,0.114347,0.113363,0.112276,0.11209,0.113296,0.112944,0.113673,0.115067,0.113579,0.111215,0.116103,0.1121,0.113058,0.113629
75%,0.268419,0.26927,0.269044,0.268675,0.268845,0.26869,0.268834,0.26915,0.269124,0.267956,0.269058,0.269632,0.271166,0.269272,0.269868
max,0.99975,0.999662,0.999517,0.999705,0.999738,0.999745,0.999644,0.999585,0.99958,0.99965,0.999646,0.999626,0.999711,0.999767,0.999706


In [48]:
y_train_stack=train_stack_df['target']
x_train_stack=train_stack_df.drop('target',axis=1)
x_test_stack=test_stack_df

In [49]:
for i_df in [x_train_stack,x_test_stack]:
    i_df['mean']=i_df.mean(axis=1)
    i_df['max']=i_df.max(axis=1)
    i_df['min']=i_df.min(axis=1)
    

In [50]:
x_train_stack

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,pred_9,pred_10,pred_11,pred_12,pred_13,pred_14,pred_15,mean,max,min
0,0.052079,0.031713,0.035480,0.039307,0.035985,0.027121,0.028039,0.025121,0.033111,0.037276,0.044877,0.029682,0.035349,0.038714,0.047973,0.036122,0.052079,0.025121
1,0.667171,0.635790,0.698043,0.711116,0.806744,0.659812,0.730814,0.709804,0.596899,0.679815,0.808299,0.601934,0.770502,0.718913,0.658280,0.696929,0.808299,0.596899
2,0.026545,0.021594,0.032187,0.014066,0.022994,0.024364,0.019981,0.020349,0.014042,0.018554,0.026500,0.023222,0.041059,0.023959,0.030498,0.023994,0.041059,0.014042
3,0.556307,0.616654,0.443866,0.418800,0.432806,0.449670,0.481231,0.493116,0.573818,0.548497,0.532940,0.417150,0.414880,0.502474,0.490040,0.491483,0.616654,0.414880
4,0.276644,0.227806,0.252867,0.192376,0.283271,0.321417,0.219499,0.324173,0.265672,0.204881,0.167591,0.337346,0.249824,0.268328,0.291390,0.258872,0.337346,0.167591
5,0.058496,0.095212,0.078045,0.078587,0.083342,0.113306,0.108515,0.064336,0.060302,0.084020,0.069516,0.067374,0.071991,0.089081,0.094919,0.081136,0.113306,0.058496
6,0.365617,0.284446,0.284630,0.308387,0.308295,0.333347,0.325806,0.351844,0.219929,0.237872,0.352156,0.195247,0.220906,0.331294,0.275306,0.293005,0.365617,0.195247
7,0.216346,0.199884,0.150696,0.158361,0.159124,0.215368,0.168003,0.210862,0.183186,0.288769,0.217961,0.165945,0.292777,0.154378,0.211319,0.199532,0.292777,0.150696
8,0.361577,0.243293,0.216996,0.179118,0.332003,0.185872,0.290422,0.265407,0.215659,0.177651,0.212144,0.207208,0.231543,0.345183,0.213261,0.245156,0.361577,0.177651
9,0.061365,0.062464,0.027330,0.063967,0.039341,0.045156,0.043926,0.036649,0.041906,0.045918,0.042853,0.042500,0.048358,0.043037,0.058247,0.046868,0.063967,0.027330


In [55]:
x_train_stack=pd.concat([x_train_stack,x_train_base],axis=1)
x_test_stack=pd.concat([x_test_stack,x_test],axis=1)

In [56]:
#model_param
param = {
    "objective" : "binary", 
    "boost":"gbdt",
    "metric":"auc",
    "boost_from_average":"false",
    "num_threads":28,
    "learning_rate" : 0.0083,
    "num_leaves" : 13,
    "max_depth":5,
    "tree_learner" : "serial",
    "feature_fraction" : 0.0405,
    "bagging_freq" : 5,
    "bagging_fraction" : 0.331,
    "min_data_in_leaf" : 80,
    "min_sum_hessian_in_leaf" : 10.0,
    "verbosity" : 1,
    'seed': 44000,
    }

In [57]:
folds = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=SEED)
oof_preds = np.zeros((len(x_train_stack), 1))
test_preds = np.zeros((len(x_test_stack), 1))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train_stack.values, y_train_stack.values)):
    print("\n")
    print("Fold {}".format(fold_))
    trn_x,trn_y = x_train_stack.iloc[trn_idx], y_train_stack.iloc[trn_idx]
    val_x,val_y = x_train_stack.iloc[val_idx], y_train_stack.iloc[val_idx]
    trn_data = lgb.Dataset(trn_x,trn_y)
    val_data = lgb.Dataset(val_x,val_y)
    
    num_round = 1000000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3000)
    val_pred = clf.predict(val_x, num_iteration=clf.best_iteration)
    test_pred = clf.predict(x_test_stack, num_iteration=clf.best_iteration)
    
    print("AUC = {}".format(roc_auc_score(val_y, val_pred)))
    print(confusion_matrix(val_y, pd.DataFrame(np.round(val_pred))))

    #print("val = {}".format(val_pred)
    oof_preds[val_idx, :] = val_pred.reshape((-1, 1))
    test_preds += test_pred.reshape((-1, 1))
    
test_preds /= fold_num
roc_score = roc_auc_score(y_train_stack, oof_preds.ravel())
print("Overall AUC = {}".format(roc_score))
print(confusion_matrix(y_train_stack, pd.DataFrame(np.round(oof_preds))))




Fold 0
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.905141	valid_1's auc: 0.899683
[2000]	training's auc: 0.909715	valid_1's auc: 0.899144
[3000]	training's auc: 0.914371	valid_1's auc: 0.898708
Early stopping, best iteration is:
[266]	training's auc: 0.902389	valid_1's auc: 0.899884
AUC = 0.899884129006174
[[35737   244]
 [ 2870  1150]]


Fold 1
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.905228	valid_1's auc: 0.899222
[2000]	training's auc: 0.909939	valid_1's auc: 0.898675
[3000]	training's auc: 0.914655	valid_1's auc: 0.898088
Early stopping, best iteration is:
[582]	training's auc: 0.903601	valid_1's auc: 0.899357
AUC = 0.8993573930187865
[[35554   427]
 [ 2590  1430]]


Fold 2
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.905434	valid_1's auc: 0.898529
[2000]	training's auc: 0.910046	valid_1's auc: 0.898033
[3000]	training's auc: 0.914568	valid_1's 

In [58]:
#結果保存
if (submit_flg ==1 and sampling_flg==0):
    series_oof_preds = pd.Series(data=oof_preds[:,0], name=save_col_name, dtype='float')
    series_oof_preds.to_csv(fin_train_preds_dir,header=True, index=False)

    sample = pd.read_csv(sample_submission_dir)
    sample.target = test_preds[:,0].astype(float)
    sample.ID_code = test_df['ID_code']
    sample.to_csv(fin_test_preds_dir, index=False)
    