In [1]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

warnings.filterwarnings('ignore')


基本変数定義

In [2]:
sampling_flg=0 #サンプリング有無をコントロール
submit_flg=1 #保存するかをコントロール（サンプリングしない時のみ）

SEED=12345
sample_num=10000
fold_num=5

#train関連
train_dir='../../01_input/train.csv'
train_FN_dir='../../03_predict_train/FalseNegative_train.csv'
train_drop_col=['ID_code', 'target']
train_label='target'

#test関連
test_dir='../../01_input/test.csv'
test_drop_col=['ID_code']

#結果ファイル関連　nameは自分の名前に変更する
fin_train_preds_dir='../../03_predict_train/oka_252-2_LightGBM_train.csv'
fin_test_preds_dir='../../04_predict_test/oka_252-2_LightGBM_submission.csv'
save_col_name='oof_xgb'

sample_submission_dir='../../01_input/sample_submission.csv'
submission_target_col_name='target'
submission_id_col_name='ID_code'

前処理

In [3]:
#ロード
train_df=pd.read_csv(train_dir)
test_df=pd.read_csv(test_dir)
#train_FN_df=pd.read_csv(train_FN_dir)

In [4]:
#サンプリング
if sampling_flg ==1:
    train_df=train_df.sample(n=sample_num,random_state=SEED)
    test_df=test_df.sample(n=sample_num,random_state=SEED)

In [5]:
#x,y作成
x_train=train_df.drop(train_drop_col,axis=1)
y_train=train_df[train_label]
x_train_base=train_df.drop(train_drop_col,axis=1)
y_train_base=train_df[train_label]
x_test=test_df.drop(test_drop_col,axis=1)
x_train_col=x_train.columns

モデル実行

In [6]:
def lgbm(train_x,train_y,test_x,test_y):
    #model_param
    param = {
        "objective" : "binary", 
        "boost":"gbdt",
        "metric":"auc",
        "boost_from_average":"false",
        "num_threads":28,
        "learning_rate" : 0.01,
        "num_leaves" : 13,
        "max_depth":-1,
        "tree_learner" : "serial",
        "feature_fraction" : 0.05,
        "bagging_freq" : 5,
        "bagging_fraction" : 0.4,
        "min_data_in_leaf" : 80,
        "min_sum_hessian_in_leaf" : 10.0,
        "verbosity" : 1,
        'seed': 44000,
        }

    trn_data = lgb.Dataset(train_x,train_y)
    val_data = lgb.Dataset(test_x,test_y)
    
    num_round = 1000000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3000)
    
    val_pred = clf.predict(test_x, num_iteration=clf.best_iteration)
    auc_score=roc_auc_score(test_y, val_pred)
    print("")
    print("AUC = {}".format(auc_score))
    print(confusion_matrix(test_y, pd.DataFrame(np.round(val_pred))))
    
    return clf

In [7]:
def split_func(df,SEED):
    ss=ShuffleSplit(n_splits=1,train_size=0.8,test_size=0.2,random_state=SEED)
    x=df.drop(train_drop_col,axis=1)
    y=df[train_label]
    train_index,test_index=next(ss.split(x,y))
    train_df_sample=df.iloc[train_index]
    test_df_sample=df.iloc[test_index]
    return train_df_sample,test_df_sample

In [8]:
train_df_sample,test_df_sample=split_func(train_df,1111)


In [None]:
j=0
for i in range(11,20):
    print(i)
    folds = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=i)
    oof_preds = np.zeros((len(x_train), 1))
    test_preds = np.zeros((len(x_test), 1))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train.values, y_train.values)):
        print("\n")
        print("Fold {}".format(fold_))
        j+=1
        train_df_sample=train_df.iloc[trn_idx]
        sumple_num=round((len(train_df_sample[train_df_sample['target']==1])))
        train_df_sample_1=train_df_sample[train_df_sample['target']==1].sample(n=sumple_num,random_state=j)
        train_df_sample_0=train_df_sample[train_df_sample['target']==0].sample(n=round(sumple_num),random_state=j+100)
        train_df_sample=pd.concat([train_df_sample_0,train_df_sample_1],axis=0)
        print('train 1:'+str(len(train_df_sample[train_df_sample['target']==1])))
        print('train 0:'+str(len(train_df_sample[train_df_sample['target']==0])))
        print('test 1:'+str(len(test_df_sample[test_df_sample['target']==1])))
        print('test 0:'+str(len(test_df_sample[test_df_sample['target']==0])))

        trn_x=train_df_sample.drop(train_drop_col,axis=1)
        trn_y=train_df_sample[train_label]
        val_x,val_y = x_train.iloc[val_idx], y_train.iloc[val_idx]

        trn_data = lgb.Dataset(trn_x,trn_y)
        val_data = lgb.Dataset(val_x,val_y)

        param = {
        "objective" : "binary", 
        "boost":"gbdt",
        "metric":"auc",
        "boost_from_average":"false",
        "num_threads":28,
        "learning_rate" : 0.01,
        "num_leaves" : 13,
        "max_depth":2,
        "tree_learner" : "serial",
        "feature_fraction" : 0.05,
        "bagging_freq" : 5,
        "bagging_fraction" : 0.4,
        "min_data_in_leaf" : 80,
        "min_sum_hessian_in_leaf" : 10.0,
        "verbosity" : 1,
        'seed': i,
        'scale_pos_weight':0.35
        }
        num_round = 1000000
        clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3000)
        val_pred = clf.predict(val_x, num_iteration=clf.best_iteration)
        test_pred = clf.predict(x_test, num_iteration=clf.best_iteration)

        print("AUC = {}".format(roc_auc_score(val_y, val_pred)))
        print(confusion_matrix(val_y, pd.DataFrame(np.round(val_pred))))
        #print("val = {}".format(val_pred)
        oof_preds[val_idx, :] = val_pred.reshape((-1, 1))
        test_preds += test_pred.reshape((-1, 1))

    test_preds /= fold_num
    roc_score = roc_auc_score(y_train, oof_preds.ravel())
    print("Overall AUC = {}".format(roc_score))
    print(confusion_matrix(y_train, pd.DataFrame(np.round(oof_preds))))

    train_preds_dir='../../03_predict_train/US4_train_'+str(i)+'.csv'
    test_preds_dir='../../04_predict_test/US4_test_'+str(i)+'.csv'
    
    pd.DataFrame(oof_preds).to_csv(train_preds_dir, index=False)
    pd.DataFrame(test_preds).to_csv(test_preds_dir, index=False)
    print('save done')


0


Fold 0
train 1:16078
train 0:16078
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.866142	valid_1's auc: 0.842625
[2000]	training's auc: 0.883461	valid_1's auc: 0.858983
[3000]	training's auc: 0.896027	valid_1's auc: 0.870142
[4000]	training's auc: 0.904426	valid_1's auc: 0.877235
[5000]	training's auc: 0.910776	valid_1's auc: 0.882379
[6000]	training's auc: 0.915473	valid_1's auc: 0.885932
[7000]	training's auc: 0.919229	valid_1's auc: 0.888714
[8000]	training's auc: 0.922146	valid_1's auc: 0.890571
[9000]	training's auc: 0.924628	valid_1's auc: 0.891944
[10000]	training's auc: 0.926621	valid_1's auc: 0.892893
[11000]	training's auc: 0.928462	valid_1's auc: 0.8936
[12000]	training's auc: 0.930118	valid_1's auc: 0.894188
[13000]	training's auc: 0.931657	valid_1's auc: 0.894558
[14000]	training's auc: 0.93308	valid_1's auc: 0.894763
[15000]	training's auc: 0.934491	valid_1's auc: 0.894925
[16000]	training's auc: 0.935

[5000]	training's auc: 0.90985	valid_1's auc: 0.884591
[6000]	training's auc: 0.914635	valid_1's auc: 0.887754
[7000]	training's auc: 0.91841	valid_1's auc: 0.889782
[8000]	training's auc: 0.921422	valid_1's auc: 0.891659
[9000]	training's auc: 0.924015	valid_1's auc: 0.892972
[10000]	training's auc: 0.926225	valid_1's auc: 0.893712
[11000]	training's auc: 0.92809	valid_1's auc: 0.894112
[12000]	training's auc: 0.929783	valid_1's auc: 0.894497
[13000]	training's auc: 0.931283	valid_1's auc: 0.894631
[14000]	training's auc: 0.93264	valid_1's auc: 0.89491
[15000]	training's auc: 0.934028	valid_1's auc: 0.894773
[16000]	training's auc: 0.935331	valid_1's auc: 0.894882
[17000]	training's auc: 0.936551	valid_1's auc: 0.894829
Early stopping, best iteration is:
[14072]	training's auc: 0.932743	valid_1's auc: 0.894944
AUC = 0.8949436207417928
[[33750  2231]
 [ 1606  2414]]


Fold 2
train 1:16078
train 0:16078
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rou

[11000]	training's auc: 0.926936	valid_1's auc: 0.897184
[12000]	training's auc: 0.92867	valid_1's auc: 0.897467
[13000]	training's auc: 0.930153	valid_1's auc: 0.897717
[14000]	training's auc: 0.931569	valid_1's auc: 0.897858
[15000]	training's auc: 0.932913	valid_1's auc: 0.897913
[16000]	training's auc: 0.934289	valid_1's auc: 0.89786
[17000]	training's auc: 0.935585	valid_1's auc: 0.897775
[18000]	training's auc: 0.936842	valid_1's auc: 0.897563
Early stopping, best iteration is:
[15401]	training's auc: 0.933474	valid_1's auc: 0.897922
AUC = 0.8979224845754552
[[33718  2262]
 [ 1540  2480]]


Fold 3
train 1:16079
train 0:16079
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.866821	valid_1's auc: 0.851813
[2000]	training's auc: 0.882359	valid_1's auc: 0.864658
[3000]	training's auc: 0.894567	valid_1's auc: 0.87488
[4000]	training's auc: 0.902858	valid_1's auc: 0.881084
[5000]	training's auc: 0.909024	valid_1's auc: 0.

[16000]	training's auc: 0.934117	valid_1's auc: 0.898125
[17000]	training's auc: 0.935371	valid_1's auc: 0.898026
[18000]	training's auc: 0.936602	valid_1's auc: 0.897992
Early stopping, best iteration is:
[15452]	training's auc: 0.933422	valid_1's auc: 0.898153
AUC = 0.8981527779180078
[[33696  2284]
 [ 1533  2486]]


Fold 4
train 1:16079
train 0:16079
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.866638	valid_1's auc: 0.847047
[2000]	training's auc: 0.88338	valid_1's auc: 0.862663
[3000]	training's auc: 0.895299	valid_1's auc: 0.872845
[4000]	training's auc: 0.903854	valid_1's auc: 0.879653
[5000]	training's auc: 0.910001	valid_1's auc: 0.884512
[6000]	training's auc: 0.914787	valid_1's auc: 0.888258
[7000]	training's auc: 0.918251	valid_1's auc: 0.890588
[8000]	training's auc: 0.921162	valid_1's auc: 0.892603
[9000]	training's auc: 0.923569	valid_1's auc: 0.894244
[10000]	training's auc: 0.92562	valid_1's auc: 0.895

[17000]	training's auc: 0.93556	valid_1's auc: 0.898679
[18000]	training's auc: 0.936794	valid_1's auc: 0.898612
Early stopping, best iteration is:
[15722]	training's auc: 0.933979	valid_1's auc: 0.898849
AUC = 0.898849357989793
[[33658  2322]
 [ 1513  2506]]
Overall AUC = 0.8978350566996759
[[168010  11892]
 [  7478  12620]]
save done
5


Fold 0
train 1:16078
train 0:16078
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.864465	valid_1's auc: 0.85331
[2000]	training's auc: 0.882398	valid_1's auc: 0.86742
[3000]	training's auc: 0.89449	valid_1's auc: 0.876818
[4000]	training's auc: 0.903368	valid_1's auc: 0.883033
[5000]	training's auc: 0.909722	valid_1's auc: 0.887515
[6000]	training's auc: 0.91466	valid_1's auc: 0.890638
[7000]	training's auc: 0.91847	valid_1's auc: 0.892671
[8000]	training's auc: 0.92152	valid_1's auc: 0.894161
[9000]	training's auc: 0.924002	valid_1's auc: 0.895081
[10000]	training's auc: 0.926113	val

[3000]	training's auc: 0.893415	valid_1's auc: 0.874603
[4000]	training's auc: 0.901937	valid_1's auc: 0.881561
[5000]	training's auc: 0.908033	valid_1's auc: 0.886564
[6000]	training's auc: 0.912721	valid_1's auc: 0.889966
[7000]	training's auc: 0.916402	valid_1's auc: 0.892401
[8000]	training's auc: 0.919503	valid_1's auc: 0.894082
[9000]	training's auc: 0.921973	valid_1's auc: 0.895221
[10000]	training's auc: 0.924085	valid_1's auc: 0.896105
[11000]	training's auc: 0.926004	valid_1's auc: 0.896686
[12000]	training's auc: 0.927633	valid_1's auc: 0.89705
[13000]	training's auc: 0.929176	valid_1's auc: 0.897411
[14000]	training's auc: 0.930683	valid_1's auc: 0.897587
[15000]	training's auc: 0.932092	valid_1's auc: 0.897633
[16000]	training's auc: 0.933421	valid_1's auc: 0.897579
[17000]	training's auc: 0.934649	valid_1's auc: 0.897484
[18000]	training's auc: 0.935949	valid_1's auc: 0.897351
Early stopping, best iteration is:
[15634]	training's auc: 0.932966	valid_1's auc: 0.897648
AUC 

[14000]	training's auc: 0.932948	valid_1's auc: 0.898106
[15000]	training's auc: 0.934248	valid_1's auc: 0.89824
[16000]	training's auc: 0.935517	valid_1's auc: 0.898151
[17000]	training's auc: 0.936738	valid_1's auc: 0.898192
Early stopping, best iteration is:
[14636]	training's auc: 0.933778	valid_1's auc: 0.898279
AUC = 0.8982791987809701
[[33751  2229]
 [ 1483  2537]]


Fold 3
train 1:16079
train 0:16079
test 1:3970
test 0:36030
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.865133	valid_1's auc: 0.848323
[2000]	training's auc: 0.883627	valid_1's auc: 0.863542
[3000]	training's auc: 0.895226	valid_1's auc: 0.873697
[4000]	training's auc: 0.904117	valid_1's auc: 0.880587
[5000]	training's auc: 0.91047	valid_1's auc: 0.885128
[6000]	training's auc: 0.915418	valid_1's auc: 0.888672
[7000]	training's auc: 0.919028	valid_1's auc: 0.891096
[8000]	training's auc: 0.922072	valid_1's auc: 0.892697
[9000]	training's auc: 0.924429	valid_1's auc: 0.893

[1000]	training's auc: 0.862145	valid_1's auc: 0.848816
[2000]	training's auc: 0.882421	valid_1's auc: 0.86658
[3000]	training's auc: 0.894456	valid_1's auc: 0.876262
[4000]	training's auc: 0.903338	valid_1's auc: 0.883223
[5000]	training's auc: 0.909389	valid_1's auc: 0.887504
[6000]	training's auc: 0.914064	valid_1's auc: 0.890779
[7000]	training's auc: 0.917687	valid_1's auc: 0.892927
[8000]	training's auc: 0.92061	valid_1's auc: 0.894432
[9000]	training's auc: 0.923143	valid_1's auc: 0.895667
[10000]	training's auc: 0.925226	valid_1's auc: 0.896252
[11000]	training's auc: 0.927025	valid_1's auc: 0.89675
[12000]	training's auc: 0.928672	valid_1's auc: 0.897021
[13000]	training's auc: 0.930228	valid_1's auc: 0.897111
[14000]	training's auc: 0.931685	valid_1's auc: 0.89707
[15000]	training's auc: 0.933023	valid_1's auc: 0.897071
[16000]	training's auc: 0.934331	valid_1's auc: 0.897095
[17000]	training's auc: 0.935664	valid_1's auc: 0.897018
[18000]	training's auc: 0.936927	valid_1's a

In [19]:
import glob
train_stack_df=pd.DataFrame(y_train)
test_stack_df=pd.DataFrame()
files = glob.glob('../../03_predict_train/*')
i=0
for file in files:
    if 'US4' in file:
        i+=1
        print(file)
        train_stack_df_tmp=pd.read_csv(file)
        train_stack_df['pred_'+str(i)]=train_stack_df_tmp

files = glob.glob('../../04_predict_test/*')
i=0
for file in files:
    if 'US4' in file:
        i+=1
        print(file)
        test_stack_df_tmp=pd.read_csv(file)
        
        if i == 1 :
            test_stack_df=test_stack_df_tmp
            test_stack_df.columns=['pred_1']
        else:
            test_stack_df['pred_'+str(i)]=test_stack_df_tmp


../../03_predict_train/US4_train_6.csv
../../03_predict_train/US4_train_0.csv
../../03_predict_train/US4_train_2.csv
../../03_predict_train/US4_train_9.csv
../../03_predict_train/US4_train_3.csv
../../03_predict_train/US4_train_5.csv
../../03_predict_train/US4_train_4.csv
../../03_predict_train/US4_train_7.csv
../../03_predict_train/US4_train_8.csv
../../03_predict_train/US4_train_1.csv
../../04_predict_test/US4_test_2.csv
../../04_predict_test/US4_test_5.csv
../../04_predict_test/US4_test_7.csv
../../04_predict_test/US4_test_3.csv
../../04_predict_test/US4_test_1.csv
../../04_predict_test/US4_test_8.csv
../../04_predict_test/US4_test_4.csv
../../04_predict_test/US4_test_6.csv
../../04_predict_test/US4_test_9.csv
../../04_predict_test/US4_test_0.csv


In [20]:
pd.set_option('display.max_rows', 500)
train_stack_df

Unnamed: 0,target,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,pred_9,pred_10
0,0,0.052079,0.031713,0.039307,0.035985,0.028039,0.033111,0.044877,0.035349,0.038714,0.047973
1,0,0.667171,0.635790,0.711116,0.806744,0.730814,0.596899,0.808299,0.770502,0.718913,0.658280
2,0,0.026545,0.021594,0.014066,0.022994,0.019981,0.014042,0.026500,0.041059,0.023959,0.030498
3,0,0.556307,0.616654,0.418800,0.432806,0.481231,0.573818,0.532940,0.414880,0.502474,0.490040
4,0,0.276644,0.227806,0.192376,0.283271,0.219499,0.265672,0.167591,0.249824,0.268328,0.291390
5,0,0.058496,0.095212,0.078587,0.083342,0.108515,0.060302,0.069516,0.071991,0.089081,0.094919
6,0,0.365617,0.284446,0.308387,0.308295,0.325806,0.219929,0.352156,0.220906,0.331294,0.275306
7,0,0.216346,0.199884,0.158361,0.159124,0.168003,0.183186,0.217961,0.292777,0.154378,0.211319
8,0,0.361577,0.243293,0.179118,0.332003,0.290422,0.215659,0.212144,0.231543,0.345183,0.213261
9,0,0.061365,0.062464,0.063967,0.039341,0.043926,0.041906,0.042853,0.048358,0.043037,0.058247


In [21]:
test_stack_df

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,pred_9,pred_10
0,0.351845,0.287049,0.338893,0.301523,0.326513,0.329322,0.308512,0.302337,0.299520,0.349885
1,0.430416,0.438003,0.398194,0.420364,0.391792,0.468847,0.452122,0.466909,0.438719,0.408615
2,0.422369,0.469076,0.401089,0.474984,0.440327,0.447883,0.470608,0.434656,0.429370,0.418825
3,0.377699,0.403414,0.390639,0.396505,0.393548,0.438294,0.442795,0.340346,0.426123,0.436163
4,0.120688,0.117561,0.120819,0.144650,0.115471,0.098506,0.113495,0.125937,0.113983,0.130939
5,0.008543,0.007918,0.007615,0.008425,0.008520,0.007387,0.006904,0.008961,0.008527,0.008082
6,0.017891,0.021898,0.016508,0.021101,0.025422,0.024076,0.018309,0.022528,0.024515,0.025996
7,0.397416,0.363283,0.432741,0.406151,0.361734,0.466993,0.388408,0.407245,0.408939,0.380521
8,0.009163,0.009971,0.007941,0.008206,0.009327,0.007271,0.009795,0.007960,0.009730,0.007952
9,0.030382,0.027777,0.030035,0.021619,0.035090,0.025515,0.027242,0.032408,0.025423,0.029018


In [22]:
train_stack_df.describe()

Unnamed: 0,target,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,pred_9,pred_10
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0
mean,0.10049,0.201627,0.201752,0.200567,0.201348,0.200892,0.201247,0.201138,0.20122,0.200802,0.201019
std,0.300653,0.226904,0.230667,0.231889,0.23056,0.230822,0.228686,0.232885,0.229651,0.228983,0.229783
min,0.0,0.000694,0.000503,0.000373,0.000455,0.000496,0.000452,0.00043,0.000479,0.000455,0.000601
25%,0.0,0.043623,0.041343,0.039857,0.041076,0.040638,0.04216,0.039406,0.041537,0.041726,0.041363
50%,0.0,0.10889,0.105957,0.103565,0.105528,0.104729,0.10723,0.103283,0.106045,0.106396,0.10601
75%,0.0,0.271207,0.272331,0.270322,0.271919,0.270152,0.270542,0.270791,0.271274,0.270473,0.270964
max,1.0,0.999717,0.99978,0.999747,0.999805,0.999781,0.999852,0.999898,0.999597,0.999812,0.9997


In [23]:
test_stack_df.describe()

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,pred_9,pred_10
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0
mean,0.19545,0.196338,0.196174,0.195754,0.195967,0.195857,0.19603,0.196745,0.196358,0.196679
std,0.213432,0.210549,0.211477,0.21242,0.211691,0.210902,0.214443,0.208969,0.212317,0.212271
min,0.000445,0.000548,0.000558,0.000508,0.000583,0.000564,0.000435,0.000567,0.000484,0.000532
25%,0.044466,0.046891,0.046142,0.045304,0.045935,0.046433,0.04416,0.048379,0.045829,0.045997
50%,0.110944,0.114347,0.113363,0.112276,0.112944,0.113579,0.111215,0.116103,0.113058,0.113629
75%,0.268419,0.26927,0.269044,0.268675,0.268834,0.267956,0.269058,0.269632,0.269272,0.269868
max,0.99975,0.999662,0.999517,0.999705,0.999644,0.99965,0.999646,0.999626,0.999767,0.999706


In [24]:
y_train_stack=train_stack_df['target']
x_train_stack=train_stack_df.drop('target',axis=1)
x_test_stack=test_stack_df

In [25]:
for i_df in [x_train_stack,x_test_stack]:
    i_df['mean']=i_df.mean(axis=1)
    i_df['max']=i_df.max(axis=1)
    i_df['min']=i_df.min(axis=1)
    

In [26]:
x_train_stack

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,pred_9,pred_10,mean,max,min
0,0.052079,0.031713,0.039307,0.035985,0.028039,0.033111,0.044877,0.035349,0.038714,0.047973,0.038715,0.052079,0.028039
1,0.667171,0.635790,0.711116,0.806744,0.730814,0.596899,0.808299,0.770502,0.718913,0.658280,0.710453,0.808299,0.596899
2,0.026545,0.021594,0.014066,0.022994,0.019981,0.014042,0.026500,0.041059,0.023959,0.030498,0.024124,0.041059,0.014042
3,0.556307,0.616654,0.418800,0.432806,0.481231,0.573818,0.532940,0.414880,0.502474,0.490040,0.501995,0.616654,0.414880
4,0.276644,0.227806,0.192376,0.283271,0.219499,0.265672,0.167591,0.249824,0.268328,0.291390,0.244240,0.291390,0.167591
5,0.058496,0.095212,0.078587,0.083342,0.108515,0.060302,0.069516,0.071991,0.089081,0.094919,0.080996,0.108515,0.058496
6,0.365617,0.284446,0.308387,0.308295,0.325806,0.219929,0.352156,0.220906,0.331294,0.275306,0.299214,0.365617,0.219929
7,0.216346,0.199884,0.158361,0.159124,0.168003,0.183186,0.217961,0.292777,0.154378,0.211319,0.196134,0.292777,0.154378
8,0.361577,0.243293,0.179118,0.332003,0.290422,0.215659,0.212144,0.231543,0.345183,0.213261,0.262420,0.361577,0.179118
9,0.061365,0.062464,0.063967,0.039341,0.043926,0.041906,0.042853,0.048358,0.043037,0.058247,0.050546,0.063967,0.039341


In [27]:
#x_train_stack=pd.concat([x_train_stack,x_train_base],axis=1)
#x_test_stack=pd.concat([x_test_stack,x_test],axis=1)

In [28]:
#model_param
param = {
    "objective" : "binary", 
    "boost":"gbdt",
    "metric":"auc",
    "boost_from_average":"false",
    "num_threads":28,
    "learning_rate" : 0.0083,
    "num_leaves" : 13,
    "max_depth":5,
    "tree_learner" : "serial",
    "feature_fraction" : 0.0405,
    "bagging_freq" : 5,
    "bagging_fraction" : 0.331,
    "min_data_in_leaf" : 80,
    "min_sum_hessian_in_leaf" : 10.0,
    "verbosity" : 1,
    'seed': 44000,
    }

In [None]:
folds = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=SEED)
oof_preds = np.zeros((len(x_train_stack), 1))
test_preds = np.zeros((len(x_test_stack), 1))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train_stack.values, y_train_stack.values)):
    print("\n")
    print("Fold {}".format(fold_))
    trn_x,trn_y = x_train_stack.iloc[trn_idx], y_train_stack.iloc[trn_idx]
    val_x,val_y = x_train_stack.iloc[val_idx], y_train_stack.iloc[val_idx]
    trn_data = lgb.Dataset(trn_x,trn_y)
    val_data = lgb.Dataset(val_x,val_y)
    
    num_round = 1000000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3000)
    val_pred = clf.predict(val_x, num_iteration=clf.best_iteration)
    test_pred = clf.predict(x_test_stack, num_iteration=clf.best_iteration)
    
    print("AUC = {}".format(roc_auc_score(val_y, val_pred)))
    print(confusion_matrix(val_y, pd.DataFrame(np.round(val_pred))))

    #print("val = {}".format(val_pred)
    oof_preds[val_idx, :] = val_pred.reshape((-1, 1))
    test_preds += test_pred.reshape((-1, 1))
    
test_preds /= fold_num
roc_score = roc_auc_score(y_train_stack, oof_preds.ravel())
print("Overall AUC = {}".format(roc_score))
print(confusion_matrix(y_train_stack, pd.DataFrame(np.round(oof_preds))))




Fold 0
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.902749	valid_1's auc: 0.899843
[2000]	training's auc: 0.904329	valid_1's auc: 0.899531
[3000]	training's auc: 0.905542	valid_1's auc: 0.899145
Early stopping, best iteration is:
[318]	training's auc: 0.90154	valid_1's auc: 0.899949
AUC = 0.8999493582917795
[[35443   538]
 [ 2473  1547]]


Fold 1
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.903001	valid_1's auc: 0.899195
[2000]	training's auc: 0.904688	valid_1's auc: 0.898778
[3000]	training's auc: 0.905878	valid_1's auc: 0.898282
Early stopping, best iteration is:
[349]	training's auc: 0.901779	valid_1's auc: 0.899323
AUC = 0.8993225314742538
[[35442   539]
 [ 2470  1550]]


Fold 2
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.903019	valid_1's auc: 0.898681
[2000]	training's auc: 0.904684	valid_1's auc: 0.898422
[3000]	training's auc: 0.905909	valid_1's 

In [18]:
#結果保存
if (submit_flg ==1 and sampling_flg==0):
    series_oof_preds = pd.Series(data=oof_preds[:,0], name=save_col_name, dtype='float')
    series_oof_preds.to_csv(fin_train_preds_dir,header=True, index=False)

    sample = pd.read_csv(sample_submission_dir)
    sample.target = test_preds[:,0].astype(float)
    sample.ID_code = test_df['ID_code']
    sample.to_csv(fin_test_preds_dir, index=False)
    