In [1]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

warnings.filterwarnings('ignore')


基本変数定義

In [2]:
sampling_flg=0 #サンプリング有無をコントロール
submit_flg=1 #保存するかをコントロール（サンプリングしない時のみ）

SEED=12345
sample_num=10000
fold_num=5

#train関連
train_dir='../../01_input/train.csv'
train_FN_dir='../../03_predict_train/FalseNegative_train.csv'
train_drop_col=['ID_code', 'target']
train_label='target'

#test関連
test_dir='../../01_input/test.csv'
test_drop_col=['ID_code']

#結果ファイル関連　nameは自分の名前に変更する
fin_train_preds_dir='../../03_predict_train/oka_250-4_LightGBM_train.csv'
fin_test_preds_dir='../../04_predict_test/oka_250-4_LightGBM_submission.csv'
save_col_name='oof_xgb'

sample_submission_dir='../../01_input/sample_submission.csv'
submission_target_col_name='target'
submission_id_col_name='ID_code'

前処理

In [3]:
#ロード
train_df=pd.read_csv(train_dir)
test_df=pd.read_csv(test_dir)
#train_FN_df=pd.read_csv(train_FN_dir)

In [4]:
#サンプリング
if sampling_flg ==1:
    train_df=train_df.sample(n=sample_num,random_state=SEED)
    test_df=test_df.sample(n=sample_num,random_state=SEED)

In [5]:
#x,y作成
x_train=train_df.drop(train_drop_col,axis=1)
y_train=train_df[train_label]
x_train_base=train_df.drop(train_drop_col,axis=1)
y_train_base=train_df[train_label]
x_test=test_df.drop(test_drop_col,axis=1)
x_train_col=x_train.columns

In [10]:
param = {
    "objective" : "binary", 
    "boost":"gbdt",
    "metric":"auc",
    "boost_from_average":"false",
    "num_threads":28,
    "learning_rate" : 0.01,
    "num_leaves" : 13,
    "max_depth":-1,
    "tree_learner" : "serial",
    "feature_fraction" : 0.05,
    "bagging_freq" : 5,
    "bagging_fraction" : 0.4,
    "min_data_in_leaf" : 80,
    "min_sum_hessian_in_leaf" : 10.0,
    "verbosity" : 1,
    'seed': i,
    }


モデル実行

In [6]:
def lgbm(train_x,train_y,test_x,test_y):
    #model_param
    param = {
        "objective" : "binary", 
        "boost":"gbdt",
        "metric":"auc",
        "boost_from_average":"false",
        "num_threads":28,
        "learning_rate" : 0.01,
        "num_leaves" : 13,
        "max_depth":-1,
        "tree_learner" : "serial",
        "feature_fraction" : 0.05,
        "bagging_freq" : 5,
        "bagging_fraction" : 0.4,
        "min_data_in_leaf" : 80,
        "min_sum_hessian_in_leaf" : 10.0,
        "verbosity" : 1,
        'seed': i,
        }

    trn_data = lgb.Dataset(train_x,train_y)
    val_data = lgb.Dataset(test_x,test_y)
    
    num_round = 1000000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3000)
    
    val_pred = clf.predict(test_x, num_iteration=clf.best_iteration)
    auc_score=roc_auc_score(test_y, val_pred)
    print("")
    print("AUC = {}".format(auc_score))
    print(confusion_matrix(test_y, pd.DataFrame(np.round(val_pred))))
    
    return clf

In [7]:
def split_func(df,SEED):
    ss=ShuffleSplit(n_splits=1,train_size=0.8,test_size=0.2,random_state=SEED)
    x=df.drop(train_drop_col,axis=1)
    y=df[train_label]
    train_index,test_index=next(ss.split(x,y))
    train_df_sample=df.iloc[train_index]
    test_df_sample=df.iloc[test_index]
    return train_df_sample,test_df_sample

In [8]:
train_df_sample,test_df_sample=split_func(train_df,1111)


In [11]:
j=0
for i in range(0,200):
    print(i)
    folds = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=i)
    oof_preds = np.zeros((len(x_train), 1))
    test_preds = np.zeros((len(x_test), 1))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train.values, y_train.values)):
        print("\n")
        print("Fold {}".format(fold_))
        j+=1
        train_df_sample=train_df.iloc[trn_idx]
        sumple_num=round((len(train_df_sample[train_df_sample['target']==1])))
        train_df_sample_1=train_df_sample[train_df_sample['target']==1].sample(n=sumple_num,random_state=j)
        train_df_sample_0=train_df_sample[train_df_sample['target']==0].sample(n=round(sumple_num/10),random_state=j+100)
        train_df_sample=pd.concat([train_df_sample_0,train_df_sample_1],axis=0)
        print('train 1:'+str(len(train_df_sample[train_df_sample['target']==1])))
        print('train 0:'+str(len(train_df_sample[train_df_sample['target']==0])))
        print('test 1:'+str(len(test_df_sample[test_df_sample['target']==1])))
        print('test 0:'+str(len(test_df_sample[test_df_sample['target']==0])))

        trn_x=train_df_sample.drop(train_drop_col,axis=1)
        trn_y=train_df_sample[train_label]
        val_x,val_y = x_train.iloc[val_idx], y_train.iloc[val_idx]

        trn_data = lgb.Dataset(trn_x,trn_y)
        val_data = lgb.Dataset(val_x,val_y)

        num_round = 1000000
        clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3000)
        val_pred = clf.predict(val_x, num_iteration=clf.best_iteration)
        test_pred = clf.predict(x_test, num_iteration=clf.best_iteration)

        print("AUC = {}".format(roc_auc_score(val_y, val_pred)))
        print(confusion_matrix(val_y, pd.DataFrame(np.round(val_pred))))
        #print("val = {}".format(val_pred)
        oof_preds[val_idx, :] = val_pred.reshape((-1, 1))
        test_preds += test_pred.reshape((-1, 1))

    test_preds /= fold_num
    roc_score = roc_auc_score(y_train, oof_preds.ravel())
    print("Overall AUC = {}".format(roc_score))
    print(confusion_matrix(y_train, pd.DataFrame(np.round(oof_preds))))

    train_preds_dir='../../03_predict_train/US4_train_'+str(i)+'.csv'
    test_preds_dir='../../04_predict_test/US4_test_'+str(i)+'.csv'
    
    pd.DataFrame(oof_preds).to_csv(train_preds_dir, index=False)
    pd.DataFrame(test_preds).to_csv(test_preds_dir, index=False)
    print('save done')


0


Fold 0
train 1:16078
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.979018	valid_1's auc: 0.860602
[2000]	training's auc: 0.98929	valid_1's auc: 0.868901
[3000]	training's auc: 0.994885	valid_1's auc: 0.871679
[4000]	training's auc: 0.997984	valid_1's auc: 0.87259
[5000]	training's auc: 0.999322	valid_1's auc: 0.87236
[6000]	training's auc: 0.999803	valid_1's auc: 0.871751
[7000]	training's auc: 0.999936	valid_1's auc: 0.870995
Early stopping, best iteration is:
[4204]	training's auc: 0.998389	valid_1's auc: 0.872648
AUC = 0.8726483477114303
[[ 3669 32312]
 [   14  4006]]


Fold 1
train 1:16078
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.977935	valid_1's auc: 0.863214
[2000]	training's auc: 0.988071	valid_1's auc: 0.87121
[3000]	training's auc: 0.994238	valid_1's auc: 0.873601
[4000]	training's auc: 0.99756	valid_1's auc:

[6000]	training's auc: 0.999367	valid_1's auc: 0.877252
[7000]	training's auc: 0.999528	valid_1's auc: 0.876638
Early stopping, best iteration is:
[4306]	training's auc: 0.998019	valid_1's auc: 0.878739
AUC = 0.878738678757408
[[ 3938 32042]
 [    6  4014]]


Fold 3
train 1:16079
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.977297	valid_1's auc: 0.865124
[2000]	training's auc: 0.988014	valid_1's auc: 0.87287
[3000]	training's auc: 0.994215	valid_1's auc: 0.875257
[4000]	training's auc: 0.997586	valid_1's auc: 0.876279
[5000]	training's auc: 0.999187	valid_1's auc: 0.876553
[6000]	training's auc: 0.999799	valid_1's auc: 0.876324
[7000]	training's auc: 0.999968	valid_1's auc: 0.875693
[8000]	training's auc: 0.999998	valid_1's auc: 0.874637
Early stopping, best iteration is:
[5024]	training's auc: 0.999216	valid_1's auc: 0.876645
AUC = 0.876644969192334
[[ 4699 31281]
 [   19  4000]]


Fold 4
train 1:16079
t

AUC = 0.8806212320272481
[[ 3911 32069]
 [   10  4009]]
Overall AUC = 0.8785690364681128
[[ 20213 159689]
 [    56  20042]]
save done
5


Fold 0
train 1:16078
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.977584	valid_1's auc: 0.863117
[2000]	training's auc: 0.988542	valid_1's auc: 0.871161
[3000]	training's auc: 0.994596	valid_1's auc: 0.873491
[4000]	training's auc: 0.997649	valid_1's auc: 0.874261
[5000]	training's auc: 0.999221	valid_1's auc: 0.874014
[6000]	training's auc: 0.99982	valid_1's auc: 0.87289
Early stopping, best iteration is:
[3611]	training's auc: 0.996729	valid_1's auc: 0.874524
AUC = 0.8745239783130427
[[ 3054 32927]
 [    8  4012]]


Fold 1
train 1:16078
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.978653	valid_1's auc: 0.871696
[2000]	training's auc: 0.98854	valid_1's auc: 0.878157
[3000]	training's auc:

Early stopping, best iteration is:
[4203]	training's auc: 0.998373	valid_1's auc: 0.879654
AUC = 0.8796539536890313
[[ 4101 31879]
 [   11  4009]]


Fold 3
train 1:16079
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.978574	valid_1's auc: 0.865571
[2000]	training's auc: 0.989006	valid_1's auc: 0.875388
[3000]	training's auc: 0.994709	valid_1's auc: 0.87722
[4000]	training's auc: 0.997926	valid_1's auc: 0.878151
[5000]	training's auc: 0.999373	valid_1's auc: 0.877546
[6000]	training's auc: 0.999837	valid_1's auc: 0.877115
[7000]	training's auc: 0.999951	valid_1's auc: 0.876267
Early stopping, best iteration is:
[4142]	training's auc: 0.998207	valid_1's auc: 0.878248
AUC = 0.8782475431804542
[[ 3936 32044]
 [   10  4009]]


Fold 4
train 1:16079
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.97727	valid_1's auc: 0.867105
[2000]	tra

[2000]	training's auc: 0.987983	valid_1's auc: 0.870746
[3000]	training's auc: 0.994109	valid_1's auc: 0.874058
[4000]	training's auc: 0.99767	valid_1's auc: 0.875071
[5000]	training's auc: 0.999276	valid_1's auc: 0.875321
[6000]	training's auc: 0.999851	valid_1's auc: 0.874581
[7000]	training's auc: 0.999969	valid_1's auc: 0.873894
Early stopping, best iteration is:
[4403]	training's auc: 0.998508	valid_1's auc: 0.875552
AUC = 0.8755518494351843
[[ 4336 31645]
 [   14  4006]]


Fold 1
train 1:16078
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.980469	valid_1's auc: 0.872935
[2000]	training's auc: 0.990125	valid_1's auc: 0.880664
[3000]	training's auc: 0.995561	valid_1's auc: 0.882317
[4000]	training's auc: 0.998257	valid_1's auc: 0.881789
[5000]	training's auc: 0.999352	valid_1's auc: 0.880679
[6000]	training's auc: 0.999823	valid_1's auc: 0.879688
Early stopping, best iteration is:
[3152]	training's auc:

train 1:16079
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.976481	valid_1's auc: 0.872016
[2000]	training's auc: 0.987786	valid_1's auc: 0.881327
[3000]	training's auc: 0.994185	valid_1's auc: 0.883097
[4000]	training's auc: 0.997577	valid_1's auc: 0.883614
[5000]	training's auc: 0.999096	valid_1's auc: 0.882733
[6000]	training's auc: 0.999651	valid_1's auc: 0.881483
Early stopping, best iteration is:
[3772]	training's auc: 0.996961	valid_1's auc: 0.883879
AUC = 0.8838793938906923
[[ 3334 32646]
 [   11  4008]]


Fold 4
train 1:16079
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.978419	valid_1's auc: 0.86195
[2000]	training's auc: 0.988526	valid_1's auc: 0.871566
[3000]	training's auc: 0.994333	valid_1's auc: 0.874139
[4000]	training's auc: 0.997656	valid_1's auc: 0.874665
[5000]	training's auc: 0.999145	valid_1's auc: 0.8746

[6000]	training's auc: 0.999855	valid_1's auc: 0.877398
[7000]	training's auc: 0.99997	valid_1's auc: 0.87657
Early stopping, best iteration is:
[4408]	training's auc: 0.998496	valid_1's auc: 0.878624
AUC = 0.8786236821229999
[[ 4339 31642]
 [   15  4005]]


Fold 1
train 1:16078
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.977578	valid_1's auc: 0.865151
[2000]	training's auc: 0.988375	valid_1's auc: 0.873131
[3000]	training's auc: 0.994168	valid_1's auc: 0.875923
[4000]	training's auc: 0.997519	valid_1's auc: 0.876286
[5000]	training's auc: 0.999075	valid_1's auc: 0.875792
[6000]	training's auc: 0.999736	valid_1's auc: 0.87524
[7000]	training's auc: 0.999951	valid_1's auc: 0.874923
Early stopping, best iteration is:
[4305]	training's auc: 0.998131	valid_1's auc: 0.876465
AUC = 0.8764654465921138
[[ 4200 31781]
 [   13  4007]]


Fold 2
train 1:16078
train 0:1608
test 1:3970
test 0:36030
Training until vali

[4000]	training's auc: 0.99816	valid_1's auc: 0.881105
[5000]	training's auc: 0.999395	valid_1's auc: 0.880248
[6000]	training's auc: 0.999826	valid_1's auc: 0.879187
Early stopping, best iteration is:
[3672]	training's auc: 0.997405	valid_1's auc: 0.881376
AUC = 0.8813755354119074
[[ 3294 32686]
 [   14  4005]]


Fold 4
train 1:16079
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.978834	valid_1's auc: 0.864441
[2000]	training's auc: 0.989233	valid_1's auc: 0.871099
[3000]	training's auc: 0.995339	valid_1's auc: 0.873459
[4000]	training's auc: 0.998263	valid_1's auc: 0.873514
[5000]	training's auc: 0.999467	valid_1's auc: 0.873188
[6000]	training's auc: 0.999845	valid_1's auc: 0.872642
Early stopping, best iteration is:
[3370]	training's auc: 0.996694	valid_1's auc: 0.873755
AUC = 0.8737548894004175
[[ 3229 32751]
 [   10  4009]]
Overall AUC = 0.8779702722659348
[[ 18077 161825]
 [    55  20043]]
save done


Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.97924	valid_1's auc: 0.872188
[2000]	training's auc: 0.988907	valid_1's auc: 0.879121
[3000]	training's auc: 0.994652	valid_1's auc: 0.880667
[4000]	training's auc: 0.997888	valid_1's auc: 0.881034
[5000]	training's auc: 0.999232	valid_1's auc: 0.880928
[6000]	training's auc: 0.99976	valid_1's auc: 0.880387
Early stopping, best iteration is:
[3388]	training's auc: 0.996078	valid_1's auc: 0.881156
AUC = 0.8811557882746575
[[ 3057 32924]
 [    9  4011]]


Fold 2
train 1:16078
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.977523	valid_1's auc: 0.862516
[2000]	training's auc: 0.988292	valid_1's auc: 0.871367
[3000]	training's auc: 0.994225	valid_1's auc: 0.873375
[4000]	training's auc: 0.997537	valid_1's auc: 0.873999
[5000]	training's auc: 0.999085	valid_1's auc: 0.873688
[6000]	training's auc: 0.999656	valid_1's auc: 0.8

[7000]	training's auc: 0.999935	valid_1's auc: 0.876765
Early stopping, best iteration is:
[4443]	training's auc: 0.998221	valid_1's auc: 0.878178
AUC = 0.8781777178192357
[[ 4060 31920]
 [   12  4007]]


Fold 4
train 1:16079
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.976285	valid_1's auc: 0.864018
[2000]	training's auc: 0.987135	valid_1's auc: 0.871333
[3000]	training's auc: 0.993535	valid_1's auc: 0.874047
[4000]	training's auc: 0.997002	valid_1's auc: 0.874864
[5000]	training's auc: 0.998676	valid_1's auc: 0.874389
[6000]	training's auc: 0.999349	valid_1's auc: 0.873127
[7000]	training's auc: 0.999547	valid_1's auc: 0.872139
Early stopping, best iteration is:
[4049]	training's auc: 0.997128	valid_1's auc: 0.874899
AUC = 0.8748988925726756
[[ 3860 32120]
 [   17  4002]]
Overall AUC = 0.8775558547898126
[[ 18405 161497]
 [    57  20041]]
save done
23


Fold 0
train 1:16078
train 0:1608
test 1:3970
test

[1000]	training's auc: 0.978778	valid_1's auc: 0.867287
[2000]	training's auc: 0.988893	valid_1's auc: 0.875343
[3000]	training's auc: 0.994633	valid_1's auc: 0.877206
[4000]	training's auc: 0.997847	valid_1's auc: 0.877986
[5000]	training's auc: 0.999275	valid_1's auc: 0.877944
[6000]	training's auc: 0.999795	valid_1's auc: 0.877749
[7000]	training's auc: 0.999929	valid_1's auc: 0.877246
Early stopping, best iteration is:
[4064]	training's auc: 0.99798	valid_1's auc: 0.878081
AUC = 0.878081024244277
[[ 4092 31889]
 [    9  4011]]


Fold 2
train 1:16078
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.978644	valid_1's auc: 0.864288
[2000]	training's auc: 0.988598	valid_1's auc: 0.872043
[3000]	training's auc: 0.99445	valid_1's auc: 0.874198
[4000]	training's auc: 0.997637	valid_1's auc: 0.87508
[5000]	training's auc: 0.999007	valid_1's auc: 0.875027
[6000]	training's auc: 0.999463	valid_1's auc: 0.874248
Earl

train 1:16079
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.976498	valid_1's auc: 0.864756
[2000]	training's auc: 0.987937	valid_1's auc: 0.873798
[3000]	training's auc: 0.994478	valid_1's auc: 0.876385
[4000]	training's auc: 0.997957	valid_1's auc: 0.876926
[5000]	training's auc: 0.999415	valid_1's auc: 0.876714
[6000]	training's auc: 0.999872	valid_1's auc: 0.875745
[7000]	training's auc: 0.99998	valid_1's auc: 0.874991
Early stopping, best iteration is:
[4379]	training's auc: 0.998743	valid_1's auc: 0.87713
AUC = 0.8771298049108314
[[ 4279 31701]
 [   18  4001]]
Overall AUC = 0.8779924651074307
[[ 18227 161675]
 [    54  20044]]
save done
28


Fold 0
train 1:16078
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.978881	valid_1's auc: 0.862103
[2000]	training's auc: 0.988725	valid_1's auc: 0.870018
[3000]	training's auc: 0.9942

[2000]	training's auc: 0.989215	valid_1's auc: 0.873619
[3000]	training's auc: 0.994755	valid_1's auc: 0.876474
[4000]	training's auc: 0.997822	valid_1's auc: 0.877292
[5000]	training's auc: 0.999184	valid_1's auc: 0.876729
[6000]	training's auc: 0.999684	valid_1's auc: 0.876205
Early stopping, best iteration is:
[3896]	training's auc: 0.997561	valid_1's auc: 0.877465
AUC = 0.8774647924326009
[[ 3683 32298]
 [   10  4010]]


Fold 2
train 1:16078
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.977869	valid_1's auc: 0.869753
[2000]	training's auc: 0.988421	valid_1's auc: 0.877738
[3000]	training's auc: 0.994493	valid_1's auc: 0.880076
[4000]	training's auc: 0.997957	valid_1's auc: 0.88096
[5000]	training's auc: 0.999461	valid_1's auc: 0.880115
[6000]	training's auc: 0.999909	valid_1's auc: 0.879205
Early stopping, best iteration is:
[3959]	training's auc: 0.997855	valid_1's auc: 0.881076
AUC = 0.88107595706846

Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.978126	valid_1's auc: 0.867914
[2000]	training's auc: 0.988358	valid_1's auc: 0.875526
[3000]	training's auc: 0.994344	valid_1's auc: 0.876824
[4000]	training's auc: 0.997605	valid_1's auc: 0.876983
[5000]	training's auc: 0.999098	valid_1's auc: 0.876618
[6000]	training's auc: 0.999588	valid_1's auc: 0.875885
Early stopping, best iteration is:
[3213]	training's auc: 0.995169	valid_1's auc: 0.877219
AUC = 0.8772192909140173
[[ 2971 33009]
 [    6  4013]]
Overall AUC = 0.8773508347191722
[[ 18104 161798]
 [    46  20052]]
save done
33


Fold 0
train 1:16078
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.977462	valid_1's auc: 0.864175
[2000]	training's auc: 0.987812	valid_1's auc: 0.873137
[3000]	training's auc: 0.993945	valid_1's auc: 0.875703
[4000]	training's auc: 0.997367	valid_1's auc: 0.876239
[5000]	training's auc: 

train 1:16078
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.977092	valid_1's auc: 0.863529
[2000]	training's auc: 0.987991	valid_1's auc: 0.873098
[3000]	training's auc: 0.994606	valid_1's auc: 0.876201
[4000]	training's auc: 0.998037	valid_1's auc: 0.876827
[5000]	training's auc: 0.999499	valid_1's auc: 0.876691
[6000]	training's auc: 0.999914	valid_1's auc: 0.875822
[7000]	training's auc: 0.999995	valid_1's auc: 0.874635
Early stopping, best iteration is:
[4255]	training's auc: 0.998564	valid_1's auc: 0.877202
AUC = 0.8772022945306818
[[ 4047 31933]
 [   11  4009]]


Fold 3
train 1:16079
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.976893	valid_1's auc: 0.869795
[2000]	training's auc: 0.987745	valid_1's auc: 0.876042
[3000]	training's auc: 0.993903	valid_1's auc: 0.878935
[4000]	training's auc: 0.99733	valid_1's auc: 0.8793

[5000]	training's auc: 0.999079	valid_1's auc: 0.87724
[6000]	training's auc: 0.999687	valid_1's auc: 0.876425
Early stopping, best iteration is:
[3379]	training's auc: 0.995369	valid_1's auc: 0.878073
AUC = 0.878073329007946
[[ 3284 32696]
 [    8  4011]]
Overall AUC = 0.8785243385332073
[[ 15899 164003]
 [    36  20062]]
save done
38


Fold 0
train 1:16078
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.979228	valid_1's auc: 0.862141
[2000]	training's auc: 0.989589	valid_1's auc: 0.870592
[3000]	training's auc: 0.995031	valid_1's auc: 0.873473
[4000]	training's auc: 0.998028	valid_1's auc: 0.874188
[5000]	training's auc: 0.999298	valid_1's auc: 0.874204
[6000]	training's auc: 0.999717	valid_1's auc: 0.874004
Early stopping, best iteration is:
[3798]	training's auc: 0.997552	valid_1's auc: 0.8744
AUC = 0.8743995690926432
[[ 3529 32452]
 [    8  4012]]


Fold 1
train 1:16078
train 0:1608
test 1:3970
test 0:3

[6000]	training's auc: 0.999678	valid_1's auc: 0.875601
Early stopping, best iteration is:
[3720]	training's auc: 0.997174	valid_1's auc: 0.877422
AUC = 0.8774219646625129
[[ 3408 32572]
 [    7  4013]]


Fold 3
train 1:16079
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.978007	valid_1's auc: 0.868984
[2000]	training's auc: 0.988673	valid_1's auc: 0.877256
[3000]	training's auc: 0.994854	valid_1's auc: 0.879298
[4000]	training's auc: 0.998081	valid_1's auc: 0.879764
[5000]	training's auc: 0.999445	valid_1's auc: 0.879265
[6000]	training's auc: 0.999868	valid_1's auc: 0.87876
[7000]	training's auc: 0.999979	valid_1's auc: 0.877592
Early stopping, best iteration is:
[4199]	training's auc: 0.99847	valid_1's auc: 0.880023
AUC = 0.8800228652643688
[[ 4024 31956]
 [   15  4004]]


Fold 4
train 1:16079
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	tra

Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.978988	valid_1's auc: 0.870843
[2000]	training's auc: 0.988522	valid_1's auc: 0.877577
[3000]	training's auc: 0.994395	valid_1's auc: 0.880924
[4000]	training's auc: 0.997603	valid_1's auc: 0.881511
[5000]	training's auc: 0.999097	valid_1's auc: 0.881262
[6000]	training's auc: 0.999662	valid_1's auc: 0.880924
Early stopping, best iteration is:
[3401]	training's auc: 0.995892	valid_1's auc: 0.88177
AUC = 0.881769856147129
[[ 3005 32976]
 [    5  4015]]


Fold 1
train 1:16078
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.978747	valid_1's auc: 0.867481
[2000]	training's auc: 0.988469	valid_1's auc: 0.875202
[3000]	training's auc: 0.994474	valid_1's auc: 0.876401
[4000]	training's auc: 0.997723	valid_1's auc: 0.877089
[5000]	training's auc: 0.999245	valid_1's auc: 0.87626
[6000]	training's auc: 0.999785	valid_1's auc: 0.87

Early stopping, best iteration is:
[4353]	training's auc: 0.998374	valid_1's auc: 0.879882
AUC = 0.8798820447512299
[[ 4200 31780]
 [   10  4010]]


Fold 3
train 1:16079
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.978958	valid_1's auc: 0.864386
[2000]	training's auc: 0.988794	valid_1's auc: 0.872624
[3000]	training's auc: 0.994552	valid_1's auc: 0.875512
[4000]	training's auc: 0.99768	valid_1's auc: 0.875949
[5000]	training's auc: 0.998993	valid_1's auc: 0.875901
[6000]	training's auc: 0.999457	valid_1's auc: 0.875012
[7000]	training's auc: 0.999589	valid_1's auc: 0.874555
Early stopping, best iteration is:
[4207]	training's auc: 0.998044	valid_1's auc: 0.876211
AUC = 0.8762109274996022
[[ 3769 32211]
 [    8  4011]]


Fold 4
train 1:16079
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.978012	valid_1's auc: 0.870938
[2000]	tr

[1000]	training's auc: 0.976164	valid_1's auc: 0.867285
[2000]	training's auc: 0.98735	valid_1's auc: 0.876435
[3000]	training's auc: 0.993735	valid_1's auc: 0.878522
[4000]	training's auc: 0.997353	valid_1's auc: 0.878992
[5000]	training's auc: 0.999017	valid_1's auc: 0.878206
[6000]	training's auc: 0.999663	valid_1's auc: 0.877235
Early stopping, best iteration is:
[3738]	training's auc: 0.996633	valid_1's auc: 0.879268
AUC = 0.8792677962567587
[[ 3408 32573]
 [    5  4015]]


Fold 1
train 1:16078
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.981287	valid_1's auc: 0.865827
[2000]	training's auc: 0.990137	valid_1's auc: 0.874395
[3000]	training's auc: 0.995583	valid_1's auc: 0.877261
[4000]	training's auc: 0.998395	valid_1's auc: 0.877169
[5000]	training's auc: 0.999522	valid_1's auc: 0.876549
[6000]	training's auc: 0.999874	valid_1's auc: 0.876219
Early stopping, best iteration is:
[3165]	training's auc:

AUC = 0.877778139596625
[[ 4237 31743]
 [   15  4005]]


Fold 3
train 1:16079
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.978074	valid_1's auc: 0.864408
[2000]	training's auc: 0.988358	valid_1's auc: 0.872415
[3000]	training's auc: 0.994531	valid_1's auc: 0.875102
[4000]	training's auc: 0.997823	valid_1's auc: 0.875597
[5000]	training's auc: 0.99929	valid_1's auc: 0.874861
[6000]	training's auc: 0.999797	valid_1's auc: 0.873873
Early stopping, best iteration is:
[3849]	training's auc: 0.997452	valid_1's auc: 0.875808
AUC = 0.8758079569515618
[[ 3576 32404]
 [    8  4011]]


Fold 4
train 1:16079
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.980288	valid_1's auc: 0.866886
[2000]	training's auc: 0.990042	valid_1's auc: 0.873063
[3000]	training's auc: 0.995443	valid_1's auc: 0.874627
[4000]	training's auc: 0.998303	valid_1's auc

AUC = 0.882341785970235
[[ 3795 32186]
 [    8  4012]]


Fold 1
train 1:16078
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.978359	valid_1's auc: 0.865657
[2000]	training's auc: 0.989198	valid_1's auc: 0.873459
[3000]	training's auc: 0.995129	valid_1's auc: 0.876644
[4000]	training's auc: 0.998159	valid_1's auc: 0.876705
[5000]	training's auc: 0.999402	valid_1's auc: 0.8767
[6000]	training's auc: 0.999814	valid_1's auc: 0.876282
[7000]	training's auc: 0.999949	valid_1's auc: 0.87528
Early stopping, best iteration is:
[4454]	training's auc: 0.998877	valid_1's auc: 0.877069
AUC = 0.8770685841518624
[[ 4292 31689]
 [    8  4012]]


Fold 2
train 1:16078
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.97906	valid_1's auc: 0.862595
[2000]	training's auc: 0.989321	valid_1's auc: 0.872709
[3000]	training's auc: 0.995123	valid_1's auc: 0

[6000]	training's auc: 0.999904	valid_1's auc: 0.878914
Early stopping, best iteration is:
[3925]	training's auc: 0.997987	valid_1's auc: 0.880296
AUC = 0.8802957699122609
[[ 3792 32188]
 [    8  4011]]


Fold 4
train 1:16079
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.979669	valid_1's auc: 0.862271
[2000]	training's auc: 0.989465	valid_1's auc: 0.869556
[3000]	training's auc: 0.995434	valid_1's auc: 0.872619
[4000]	training's auc: 0.99807	valid_1's auc: 0.872905
[5000]	training's auc: 0.999408	valid_1's auc: 0.872811
[6000]	training's auc: 0.99981	valid_1's auc: 0.871986
Early stopping, best iteration is:
[3925]	training's auc: 0.997892	valid_1's auc: 0.873037
AUC = 0.8730368644989662
[[ 3848 32132]
 [   10  4009]]
Overall AUC = 0.8792509022716795
[[ 18859 161043]
 [    44  20054]]
save done
56


Fold 0
train 1:16078
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve fo

[3000]	training's auc: 0.994794	valid_1's auc: 0.88083
[4000]	training's auc: 0.99782	valid_1's auc: 0.881811
[5000]	training's auc: 0.999323	valid_1's auc: 0.880609
[6000]	training's auc: 0.999856	valid_1's auc: 0.880113
[7000]	training's auc: 0.999984	valid_1's auc: 0.878874
Early stopping, best iteration is:
[4017]	training's auc: 0.99787	valid_1's auc: 0.881851
AUC = 0.8818512838658215
[[ 3826 32155]
 [   10  4010]]


Fold 2
train 1:16078
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.978413	valid_1's auc: 0.865116
[2000]	training's auc: 0.988396	valid_1's auc: 0.873665
[3000]	training's auc: 0.994401	valid_1's auc: 0.876586
[4000]	training's auc: 0.997595	valid_1's auc: 0.877197
[5000]	training's auc: 0.998984	valid_1's auc: 0.877029
[6000]	training's auc: 0.999562	valid_1's auc: 0.876664
[7000]	training's auc: 0.999718	valid_1's auc: 0.875733
Early stopping, best iteration is:
[4225]	training's auc: 0

train 1:16079
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.978877	valid_1's auc: 0.865635
[2000]	training's auc: 0.989112	valid_1's auc: 0.874466
[3000]	training's auc: 0.994869	valid_1's auc: 0.877173
[4000]	training's auc: 0.997914	valid_1's auc: 0.877794
[5000]	training's auc: 0.999352	valid_1's auc: 0.878154
[6000]	training's auc: 0.999834	valid_1's auc: 0.877252
[7000]	training's auc: 0.999971	valid_1's auc: 0.876406
Early stopping, best iteration is:
[4449]	training's auc: 0.998708	valid_1's auc: 0.87836
AUC = 0.8783602167082678
[[ 4078 31902]
 [   12  4007]]
Overall AUC = 0.877566471631448
[[ 17748 162154]
 [    53  20045]]
save done
61


Fold 0
train 1:16078
train 0:1608
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.978897	valid_1's auc: 0.865758
[2000]	training's auc: 0.989449	valid_1's auc: 0.874291
[3000]	training's auc: 0.9952

OSError: [Errno 28] No space left on device

In [None]:
import glob
train_stack_df=pd.DataFrame(y_train)
test_stack_df=pd.DataFrame()
files = glob.glob('../../03_predict_train/*')
i=0
for file in files:
    if 'US4' in file:
        i+=1
        print(file)
        train_stack_df_tmp=pd.read_csv(file)
        train_stack_df['pred_'+str(i)]=train_stack_df_tmp

files = glob.glob('../../04_predict_test/*')
i=0
for file in files:
    if 'US4' in file:
        i+=1
        print(file)
        test_stack_df_tmp=pd.read_csv(file)
        
        if i == 1 :
            test_stack_df=test_stack_df_tmp
            test_stack_df.columns=['pred_1']
        else:
            test_stack_df['pred_'+str(i)]=test_stack_df_tmp


In [None]:
pd.set_option('display.max_rows', 500)
train_stack_df

In [None]:
test_stack_df

In [None]:
train_stack_df.describe()

In [None]:
test_stack_df.describe()

In [None]:
y_train_stack=train_stack_df['target']
x_train_stack=train_stack_df.drop('target',axis=1)
x_test_stack=test_stack_df

In [None]:
for i_df in [x_train_stack,x_test_stack]:
    i_df['mean']=i_df.mean(axis=1)
    i_df['max']=i_df.max(axis=1)
    i_df['min']=i_df.min(axis=1)
    

In [None]:
x_train_stack

In [None]:
x_train_stack=pd.concat([x_train_stack,x_train_base],axis=1)
x_test_stack=pd.concat([x_test_stack,x_test],axis=1)

In [None]:
#model_param
param = {
    "objective" : "binary", 
    "boost":"gbdt",
    "metric":"auc",
    "boost_from_average":"false",
    "num_threads":28,
    "learning_rate" : 0.01,
    "num_leaves" : 13,
    "max_depth":5,
    "tree_learner" : "serial",
    "feature_fraction" : 0.05,
    "bagging_freq" : 5,
    "bagging_fraction" : 0.4,
    "min_data_in_leaf" : 80,
    "min_sum_hessian_in_leaf" : 10.0,
    "verbosity" : 1,
    'seed': 44000,
    }

In [None]:
folds = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=SEED)
oof_preds = np.zeros((len(x_train_stack), 1))
test_preds = np.zeros((len(x_test_stack), 1))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train_stack.values, y_train_stack.values)):
    print("\n")
    print("Fold {}".format(fold_))
    trn_x,trn_y = x_train_stack.iloc[trn_idx], y_train_stack.iloc[trn_idx]
    val_x,val_y = x_train_stack.iloc[val_idx], y_train_stack.iloc[val_idx]
    trn_data = lgb.Dataset(trn_x,trn_y)
    val_data = lgb.Dataset(val_x,val_y)
    
    num_round = 1000000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3000)
    val_pred = clf.predict(val_x, num_iteration=clf.best_iteration)
    test_pred = clf.predict(x_test_stack, num_iteration=clf.best_iteration)
    
    print("AUC = {}".format(roc_auc_score(val_y, val_pred)))
    print(confusion_matrix(val_y, pd.DataFrame(np.round(val_pred))))

    #print("val = {}".format(val_pred)
    oof_preds[val_idx, :] = val_pred.reshape((-1, 1))
    test_preds += test_pred.reshape((-1, 1))
    
test_preds /= fold_num
roc_score = roc_auc_score(y_train_stack, oof_preds.ravel())
print("Overall AUC = {}".format(roc_score))
print(confusion_matrix(y_train_stack, pd.DataFrame(np.round(oof_preds))))


In [None]:
#結果保存
if (submit_flg ==1 and sampling_flg==0):
    series_oof_preds = pd.Series(data=oof_preds[:,0], name=save_col_name, dtype='float')
    series_oof_preds.to_csv(fin_train_preds_dir,header=True, index=False)

    sample = pd.read_csv(sample_submission_dir)
    sample.target = test_preds[:,0].astype(float)
    sample.ID_code = test_df['ID_code']
    sample.to_csv(fin_test_preds_dir, index=False)
    