In [1]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

warnings.filterwarnings('ignore')


基本変数定義

In [2]:
sampling_flg=0 #サンプリング有無をコントロール
submit_flg=1 #保存するかをコントロール（サンプリングしない時のみ）

SEED=12345
sample_num=10000
fold_num=5

#train関連
train_dir='../../01_input/train.csv'
train_drop_col=['ID_code', 'target']
train_label='target'

#test関連
test_dir='../../01_input/test.csv'
test_drop_col=['ID_code']

#結果ファイル関連　nameは自分の名前に変更する
fin_train_preds_dir='../../03_predict_train/oka_250-4_LightGBM_train.csv'
fin_test_preds_dir='../../04_predict_test/oka_250-4_LightGBM_submission.csv'
save_col_name='oof_xgb'

sample_submission_dir='../../01_input/sample_submission.csv'
submission_target_col_name='target'
submission_id_col_name='ID_code'

前処理

In [3]:
#ロード
train_df=pd.read_csv(train_dir)
test_df=pd.read_csv(test_dir)

In [4]:
#サンプリング
if sampling_flg ==1:
    train_df=train_df.sample(n=sample_num,random_state=SEED)
    test_df=test_df.sample(n=sample_num,random_state=SEED)

In [5]:
#x,y作成
x_train=train_df.drop(train_drop_col,axis=1)
y_train=train_df[train_label]
x_train_base=train_df.drop(train_drop_col,axis=1)
y_train_base=train_df[train_label]
x_test=test_df.drop(test_drop_col,axis=1)
x_train_col=x_train.columns

モデル実行

In [6]:
def pred_lgbm(trn_x,trn_y,val_x,val_y,train_x,train_y,test_x,test_y):
    #model_param
    param = {
        "objective" : "binary", 
        "boost":"gbdt",
        "metric":"auc",
        "boost_from_average":"false",
        "num_threads":28,
        "learning_rate" : 0.01,
        "num_leaves" : 13,
        "max_depth":-1,
        "tree_learner" : "serial",
        "feature_fraction" : 0.05,
        "bagging_freq" : 5,
        "bagging_fraction" : 0.4,
        "min_data_in_leaf" : 80,
        "min_sum_hessian_in_leaf" : 10.0,
        "verbosity" : 1,
        'seed': i,
        }

    trn_data = lgb.Dataset(trn_x,trn_y)
    val_data = lgb.Dataset(val_x,val_y)
    
    num_round = 1000000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3000)
        
    val_pred=clf.predict(val_x, num_iteration=clf.best_iteration)
    print("val AUC = {}".format(roc_auc_score(val_y, val_pred)))
    print(confusion_matrix(val_y, pd.DataFrame(np.round(val_pred))))

    train_pred=clf.predict(x_train, num_iteration=clf.best_iteration)
    print("train AUC = {}".format(roc_auc_score(y_train, train_pred)))
    print(confusion_matrix(y_train, pd.DataFrame(np.round(train_pred))))
        
    test_pred = model.predict(x_test, num_iteration=model.best_iteration)

    
    return clf

In [7]:
def split_func(df,SEED):
    ss=ShuffleSplit(n_splits=1,train_size=0.8,test_size=0.2,random_state=SEED)
    x=df.drop(train_drop_col,axis=1)
    y=df[train_label]
    train_index,test_index=next(ss.split(x,y))
    train_df_sample=df.iloc[train_index]
    test_df_sample=df.iloc[test_index]
    return train_df_sample,test_df_sample

In [10]:
j=0
for i in range(0,3):
    print(i)
    folds = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=i)
    train_preds = np.zeros((len(x_train), 1))
    test_preds = np.zeros((len(x_test), 1))
    
    train_df_sample,test_df_sample=split_func(train_df,i)
    sumple_num=round((len(train_df_sample[train_df_sample['target']==1])))
    train_df_sample_1=train_df_sample[train_df_sample['target']==1].sample(n=sumple_num,random_state=j)
    train_df_sample_0=train_df_sample[train_df_sample['target']==0].sample(n=sumple_num,random_state=j+100)
    train_df_sample=pd.concat([train_df_sample_0,train_df_sample_1],axis=0)
    
    print('trn 1:'+str(len(train_df_sample[train_df_sample['target']==1])))
    print('trn 0:'+str(len(train_df_sample[train_df_sample['target']==0])))
    print('val 1:'+str(len(test_df_sample[test_df_sample['target']==1])))
    print('val 0:'+str(len(test_df_sample[test_df_sample['target']==0])))

    trn_x=train_df_sample.drop(train_drop_col,axis=1)
    trn_y=train_df_sample[train_label]
    val_x=test_df_sample.drop(train_drop_col,axis=1)
    val_y=test_df_sample[train_label]
    
    train_pred,test_pred=pred_lgbm(trn_x,trn_y,val_x,val_y)
    
    val_pred=model.predict(val_x, num_iteration=model.best_iteration)
    print("val AUC = {}".format(roc_auc_score(val_y, val_pred)))
    print(confusion_matrix(val_y, pd.DataFrame(np.round(val_pred))))

    train_pred=model.predict(x_train, num_iteration=model.best_iteration)
    print("train AUC = {}".format(roc_auc_score(y_train, train_pred)))
    print(confusion_matrix(y_train, pd.DataFrame(np.round(train_pred))))
        
    test_pred = model.predict(x_test, num_iteration=model.best_iteration)

    train_preds_dir='../../03_predict_train/SIRUS_train_'+str(i)+'.csv'
    test_preds_dir='../../04_predict_test/SIRUS_test_'+str(i)+'.csv'
    
    pd.DataFrame(train_pred).to_csv(train_preds_dir, index=False)
    pd.DataFrame(test_pred).to_csv(test_preds_dir, index=False)
    print('save done')

0
trn 1:16084
trn 0:16084
val 1:4014
val 0:35986
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.918241	valid_1's auc: 0.884058
[2000]	training's auc: 0.930066	valid_1's auc: 0.889127
[3000]	training's auc: 0.939581	valid_1's auc: 0.891103
[4000]	training's auc: 0.947634	valid_1's auc: 0.892265
[5000]	training's auc: 0.954881	valid_1's auc: 0.892716
[6000]	training's auc: 0.961313	valid_1's auc: 0.892708
[7000]	training's auc: 0.967155	valid_1's auc: 0.892733
[8000]	training's auc: 0.972464	valid_1's auc: 0.892395
[9000]	training's auc: 0.977112	valid_1's auc: 0.892236
[10000]	training's auc: 0.981238	valid_1's auc: 0.892005
Early stopping, best iteration is:
[7142]	training's auc: 0.967905	valid_1's auc: 0.892833

AUC = 0.8928331786892378
[[29673  6313]
 [  816  3198]]
val AUC = 0.8928331786892378
[[29673  6313]
 [  816  3198]]
train AUC = 0.9317855097431288
[[149620  30282]
 [  2438  17660]]
save done
1
trn 1:16142
trn 0:16142
val 1:3956
val 0

In [11]:
import glob
train_stack_df=pd.DataFrame(y_train)
test_stack_df=pd.DataFrame()
files = glob.glob('../../03_predict_train/*')
i=0
for file in files:
    if 'SIRUS' in file:
        i+=1
        print(file)
        train_stack_df_tmp=pd.read_csv(file)
        train_stack_df['pred_'+str(i)]=train_stack_df_tmp

files = glob.glob('../../04_predict_test/*')
i=0
for file in files:
    if 'SIRUS' in file:
        i+=1
        print(file)
        test_stack_df_tmp=pd.read_csv(file)
        
        if i == 1 :
            test_stack_df=test_stack_df_tmp
            test_stack_df.columns=['pred_1']
        else:
            test_stack_df['pred_'+str(i)]=test_stack_df_tmp


../../03_predict_train/SIRUS_train_2.csv
../../03_predict_train/SIRUS_train_0.csv
../../03_predict_train/SIRUS_train_1.csv
../../04_predict_test/SIRUS_test_2.csv
../../04_predict_test/SIRUS_test_0.csv
../../04_predict_test/SIRUS_test_1.csv


In [12]:
pd.set_option('display.max_rows', 500)
train_stack_df

Unnamed: 0,target,pred_1,pred_2,pred_3
0,0,0.081447,0.110411,0.140546
1,0,0.775823,0.870638,0.852268
2,0,0.060758,0.052969,0.074399
3,0,0.570718,0.605894,0.713260
4,0,0.494205,0.380052,0.464552
5,0,0.179598,0.186662,0.215541
6,0,0.483196,0.635032,0.511003
7,0,0.357910,0.419908,0.302092
8,0,0.446777,0.405371,0.304027
9,0,0.118277,0.106326,0.161635


In [13]:
test_stack_df

Unnamed: 0,pred_1,pred_2,pred_3
0,0.622778,0.576876,0.577579
1,0.574820,0.634911,0.756247
2,0.774024,0.587731,0.537907
3,0.559717,0.724482,0.603391
4,0.262128,0.222649,0.231122
5,0.019776,0.028834,0.027080
6,0.073971,0.073890,0.065149
7,0.611328,0.599692,0.726596
8,0.020273,0.026612,0.035827
9,0.034816,0.052427,0.099335


In [14]:
train_stack_df.describe()

Unnamed: 0,target,pred_1,pred_2,pred_3
count,200000.0,200000.0,200000.0,200000.0
mean,0.10049,0.324803,0.321706,0.326403
std,0.300653,0.260962,0.264412,0.259171
min,0.0,0.001639,0.001096,0.001693
25%,0.0,0.110876,0.10488,0.114024
50%,0.0,0.245535,0.238541,0.248924
75%,0.0,0.485791,0.485159,0.487138
max,1.0,0.999573,0.999467,0.999598


In [15]:
test_stack_df.describe()

Unnamed: 0,pred_1,pred_2,pred_3
count,200000.0,200000.0,200000.0
mean,0.324459,0.321579,0.32608
std,0.247436,0.250733,0.245465
min,0.000982,0.001064,0.001903
25%,0.119535,0.113368,0.122776
50%,0.257321,0.251651,0.261009
75%,0.484505,0.483961,0.485757
max,0.999049,0.998987,0.998569


In [16]:
y_train_stack=train_stack_df['target']
x_train_stack=train_stack_df.drop('target',axis=1)
x_test_stack=test_stack_df

In [17]:
for i_df in [x_train_stack,x_test_stack]:
    i_df['mean']=i_df.mean(axis=1)
    i_df['max']=i_df.max(axis=1)
    i_df['min']=i_df.min(axis=1)
    

In [18]:
x_train_stack

Unnamed: 0,pred_1,pred_2,pred_3,mean,max,min
0,0.081447,0.110411,0.140546,0.110801,0.140546,0.081447
1,0.775823,0.870638,0.852268,0.832910,0.870638,0.775823
2,0.060758,0.052969,0.074399,0.062709,0.074399,0.052969
3,0.570718,0.605894,0.713260,0.629957,0.713260,0.570718
4,0.494205,0.380052,0.464552,0.446270,0.494205,0.380052
5,0.179598,0.186662,0.215541,0.193934,0.215541,0.179598
6,0.483196,0.635032,0.511003,0.543077,0.635032,0.483196
7,0.357910,0.419908,0.302092,0.359970,0.419908,0.302092
8,0.446777,0.405371,0.304027,0.385392,0.446777,0.304027
9,0.118277,0.106326,0.161635,0.128746,0.161635,0.106326


In [19]:
x_train_stack=pd.concat([x_train_stack,x_train_base],axis=1)
x_test_stack=pd.concat([x_test_stack,x_test],axis=1)

In [20]:
#model_param
param = {
    "objective" : "binary", 
    "boost":"gbdt",
    "metric":"auc",
    "boost_from_average":"false",
    "num_threads":28,
    "learning_rate" : 0.01,
    "num_leaves" : 13,
    "max_depth":5,
    "tree_learner" : "serial",
    "feature_fraction" : 0.05,
    "bagging_freq" : 5,
    "bagging_fraction" : 0.4,
    "min_data_in_leaf" : 80,
    "min_sum_hessian_in_leaf" : 10.0,
    "verbosity" : 1,
    'seed': 44000,
    }

In [None]:
folds = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=SEED)
oof_preds = np.zeros((len(x_train_stack), 1))
test_preds = np.zeros((len(x_test_stack), 1))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train_stack.values, y_train_stack.values)):
    print("\n")
    print("Fold {}".format(fold_))
    trn_x,trn_y = x_train_stack.iloc[trn_idx], y_train_stack.iloc[trn_idx]
    val_x,val_y = x_train_stack.iloc[val_idx], y_train_stack.iloc[val_idx]
    trn_data = lgb.Dataset(trn_x,trn_y)
    val_data = lgb.Dataset(val_x,val_y)
    
    num_round = 1000000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3000)
    val_pred = clf.predict(val_x, num_iteration=clf.best_iteration)
    test_pred = clf.predict(x_test_stack, num_iteration=clf.best_iteration)
    
    print("AUC = {}".format(roc_auc_score(val_y, val_pred)))
    print(confusion_matrix(val_y, pd.DataFrame(np.round(val_pred))))

    #print("val = {}".format(val_pred)
    oof_preds[val_idx, :] = val_pred.reshape((-1, 1))
    test_preds += test_pred.reshape((-1, 1))
    
test_preds /= fold_num
roc_score = roc_auc_score(y_train_stack, oof_preds.ravel())
print("Overall AUC = {}".format(roc_score))
print(confusion_matrix(y_train_stack, pd.DataFrame(np.round(oof_preds))))




Fold 0
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.936351	valid_1's auc: 0.931479
[2000]	training's auc: 0.941421	valid_1's auc: 0.932408
[3000]	training's auc: 0.946376	valid_1's auc: 0.932886
[4000]	training's auc: 0.951049	valid_1's auc: 0.933385
[5000]	training's auc: 0.955275	valid_1's auc: 0.933631
[6000]	training's auc: 0.959137	valid_1's auc: 0.934038
[7000]	training's auc: 0.962724	valid_1's auc: 0.934309
[8000]	training's auc: 0.966023	valid_1's auc: 0.934707
[9000]	training's auc: 0.969103	valid_1's auc: 0.935002
[10000]	training's auc: 0.972007	valid_1's auc: 0.93523
[11000]	training's auc: 0.974681	valid_1's auc: 0.935465
[12000]	training's auc: 0.977088	valid_1's auc: 0.935606
[13000]	training's auc: 0.979392	valid_1's auc: 0.935751
[14000]	training's auc: 0.981498	valid_1's auc: 0.935943
[15000]	training's auc: 0.983445	valid_1's auc: 0.936045
[16000]	training's auc: 0.98509	valid_1's auc: 0.93629
[17000]	training's auc: 0.9

In [None]:
#結果保存
if (submit_flg ==1 and sampling_flg==0):
    series_oof_preds = pd.Series(data=oof_preds[:,0], name=save_col_name, dtype='float')
    series_oof_preds.to_csv(fin_train_preds_dir,header=True, index=False)

    sample = pd.read_csv(sample_submission_dir)
    sample.target = test_preds[:,0].astype(float)
    sample.ID_code = test_df['ID_code']
    sample.to_csv(fin_test_preds_dir, index=False)
    