In [1]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Suppress every Python warning for the rest of the session. Because no
# category is given, this also hides deprecation warnings raised by the
# libraries imported above.
warnings.filterwarnings('ignore')


基本変数定義

In [42]:
# ---------------------------------------------------------------------------
# Run switches
# ---------------------------------------------------------------------------
sampling_flg = 0  # controls whether sampling is applied
submit_flg = 1    # controls whether results are saved (only when not sampling)

# ---------------------------------------------------------------------------
# Global numeric settings
# ---------------------------------------------------------------------------
SEED = 12345
sample_num = 10000
fold_num = 5

# ---------------------------------------------------------------------------
# Train-related inputs
# ---------------------------------------------------------------------------
train_dir = '../../01_input/train.csv'
train_mean_dir = '../../02_feature/122-2_train_mean.csv'
train_label = 'target'
train_drop_col = ['ID_code', 'target', 'mean']

# ---------------------------------------------------------------------------
# Test-related inputs
# ---------------------------------------------------------------------------
test_dir = '../../01_input/test.csv'
test_mean_dir = '../../02_feature/122-2_test_mean.csv'
test_drop_col = ['ID_code', 'mean']

# ---------------------------------------------------------------------------
# Previously produced predictions (read as inputs by this notebook)
# ---------------------------------------------------------------------------
pred_train_dir = '../../03_predict_train/oka_252-2_LightGBM_train.csv'
pred_test_dir = '../../04_predict_test/oka_252-2_LightGBM_submission.csv'

# ---------------------------------------------------------------------------
# Result files -- change the name part of the path to your own name
# ---------------------------------------------------------------------------
train_preds_dir = '../../03_predict_train/oka_252-3_LightGBM_train.csv'
test_preds_dir = '../../04_predict_test/oka_252-3_LightGBM_submission.csv'
save_col_name = 'oof_xgb'

# ---------------------------------------------------------------------------
# Submission template and its column names
# ---------------------------------------------------------------------------
sample_submission_dir = '../../01_input/sample_submission.csv'
submission_id_col_name = 'ID_code'
submission_target_col_name = 'target'

前処理

In [3]:
# Load
# Raw train / test tables.
train_df, test_df = (pd.read_csv(path) for path in (train_dir, test_dir))

# "mean" feature files: read without a header row, first column used as index.
train_mean_df, test_mean_df = (
    pd.read_csv(path, header=None, index_col=0)
    for path in (train_mean_dir, test_mean_dir)
)

# Previously saved train / test predictions.
pred_train_df, pred_test_df = (
    pd.read_csv(path) for path in (pred_train_dir, pred_test_dir)
)

In [4]:
# Preview the first rows of the train predictions loaded from pred_train_dir.
pred_train_df.head()

Unnamed: 0,oof_xgb
0,0.032594
1,0.422843
2,0.014319
3,0.233246
4,0.095201


In [5]:
# Attach the pre-computed "mean" feature to the train and test tables.
# The feature files were read with header=None / index_col=0, so each frame is
# expected to hold a single value column; select it explicitly as a Series
# instead of relying on implicit DataFrame-to-column assignment.
train_df['mean'] = train_mean_df.iloc[:, 0]
test_df['mean'] = test_mean_df.iloc[:, 0]

# Column assignment aligns on the index, so any index mismatch produces NaN.
# NaN rows satisfy neither `mean < 0.5` nor `mean >= 0.5` and would silently
# vanish from both segments built from this column, so fail loudly here.
assert train_df['mean'].notna().all(), "train 'mean' contains NaN after index alignment"
assert test_df['mean'].notna().all(), "test 'mean' contains NaN after index alignment"

In [6]:
# Partition each table on the "mean" feature at 0.5:
#   *_0 -> rows with mean below 0.5, *_1 -> rows with mean at or above 0.5.
train_df_0 = train_df.loc[train_df['mean'].lt(0.5)]
train_df_1 = train_df.loc[train_df['mean'].ge(0.5)]

test_df_0 = test_df.loc[test_df['mean'].lt(0.5)]
test_df_1 = test_df.loc[test_df['mean'].ge(0.5)]

In [7]:
# Row counts (train, test) of the segment whose "mean" is >= 0.5.
len(train_df_1),len(test_df_1)

(23553, 20979)

In [8]:
# Row counts (train, test) of the segment whose "mean" is < 0.5.
len(train_df_0),len(test_df_0)

(176447, 179021)

In [9]:
# Build x (features) and y (labels).

# Whole training set.
y_train = train_df[train_label]
x_train = train_df.drop(columns=train_drop_col)

# Segment 0 (mean < 0.5).
y_train_0 = train_df_0[train_label]
x_train_0 = train_df_0.drop(columns=train_drop_col)
x_test_0 = test_df_0.drop(columns=test_drop_col)

# Segment 1 (mean >= 0.5).
y_train_1 = train_df_1[train_label]
x_train_1 = train_df_1.drop(columns=train_drop_col)
x_test_1 = test_df_1.drop(columns=test_drop_col)

In [10]:
# Baseline: score the previously saved train predictions on the full train labels.
saved_pred_auc = roc_auc_score(y_train, pred_train_df)
print("AUC = {}".format(saved_pred_auc))

# Rounding turns the scores into hard 0/1 predictions for the confusion matrix.
saved_pred_hard = pd.DataFrame(np.round(pred_train_df))
print(confusion_matrix(y_train, saved_pred_hard))

AUC = 0.887815127880921
[[177973   1929]
 [ 13225   6873]]


In [39]:
# Baseline: score the pre-computed "mean" feature on the full train labels.
mean_feature_auc = roc_auc_score(y_train, train_mean_df)
print("AUC = {}".format(mean_feature_auc))

# Rounding turns the scores into hard 0/1 predictions for the confusion matrix.
mean_feature_hard = pd.DataFrame(np.round(train_mean_df))
print(confusion_matrix(y_train, mean_feature_hard))

AUC = 0.900925089190569
[[168847  11055]
 [  7600  12498]]


"\nnew_pred_train=pd.DataFrame()\nnew_train=pd.DataFrame()\n\nnew_pred_train=train_df_0['ID_code']\nnew_pred_train=pd.concat([new_pred_train,train_df_0['mean']],axis=1)\nnew_pred_train['ID_code']=new_pred_train['ID_code'].astype('object')\n\nnew_train['target']=y_train\nnew_train=pd.concat([new_train,train_df['ID_code']],axis=1)\nnew_train['ID_code']=new_train['ID_code'].astype('object')\n\nprint(new_train.describe())\nprint(new_pred_train.describe())\nnew_train=pd.merge(new_train, new_pred_train, how='left', on='ID_code')\nnew_train=pd.concat([new_train,x_train],axis=1)\n\nnew_train.head()\n"

モデル実行

In [27]:
# Baseline restricted to segment 0 (rows with mean < 0.5).
segment0_mean = train_df_0['mean']
segment0_auc = roc_auc_score(y_train_0, segment0_mean.to_frame())
print("AUC = {}".format(segment0_auc))

# Every score in this segment is below 0.5, so rounding maps all of them to 0.
segment0_hard = np.round(segment0_mean).to_frame()
print(confusion_matrix(y_train_0, segment0_hard))

AUC = 0.8132500546274687
[[168847      0]
 [  7600      0]]


In [13]:
# LightGBM parameters used by the segment-0 training cell.
# Note: the model seed below is a separate value from the notebook-level SEED.
param = dict(
    objective="binary",
    boost="gbdt",
    metric="auc",
    boost_from_average="false",   # passed as the string "false", not a bool
    num_threads=28,
    learning_rate=0.01,
    num_leaves=13,
    max_depth=-1,
    tree_learner="serial",
    feature_fraction=0.05,
    bagging_freq=5,
    bagging_fraction=0.4,
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    verbosity=1,
    seed=44000,
)

In [14]:
# Stratified K-fold cross-validation with LightGBM on segment 0
# (x_train_0 / y_train_0 / x_test_0). Out-of-fold predictions are collected
# for the train rows, and test predictions are averaged over the folds.
folds = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=SEED)
oof_preds = np.zeros((len(x_train_0), 1))
test_preds = np.zeros((len(x_test_0), 1))

# Upper bound on boosting rounds; early stopping picks the actual count.
num_round = 1000000

for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train_0.values, y_train_0.values)):
    print("\n")
    print("Fold {}".format(fold_))
    trn_x, trn_y = x_train_0.iloc[trn_idx], y_train_0.iloc[trn_idx]
    val_x, val_y = x_train_0.iloc[val_idx], y_train_0.iloc[val_idx]
    trn_data = lgb.Dataset(trn_x, trn_y)
    val_data = lgb.Dataset(val_x, val_y)

    # NOTE(review): verbose_eval / early_stopping_rounds are passed as
    # lgb.train keyword arguments; newer LightGBM releases expect callbacks
    # (lgb.log_evaluation / lgb.early_stopping) instead -- confirm against the
    # installed version before upgrading.
    clf = lgb.train(
        param,
        trn_data,
        num_round,
        valid_sets=[trn_data, val_data],
        verbose_eval=1000,
        early_stopping_rounds=1000,
    )
    val_pred = clf.predict(val_x, num_iteration=clf.best_iteration)
    test_pred = clf.predict(x_test_0, num_iteration=clf.best_iteration)

    print("AUC = {}".format(roc_auc_score(val_y, val_pred)))
    oof_preds[val_idx, :] = val_pred.reshape((-1, 1))
    test_preds += test_pred.reshape((-1, 1))

    # Rounding thresholds the probabilities into hard 0/1 predictions.
    print(confusion_matrix(val_y, np.round(val_pred)))

# Average the per-fold test predictions.
test_preds /= fold_num
roc_score = roc_auc_score(y_train_0, oof_preds.ravel())
print("Overall AUC = {}".format(roc_score))
print(confusion_matrix(y_train_0, np.round(oof_preds.ravel())))

# Keep segment-specific copies: the segment-1 training cell reuses the names
# `oof_preds` / `test_preds`, which would otherwise discard these results.
oof_preds_0 = oof_preds.copy()
test_preds_0 = test_preds.copy()



Fold 0
Training until validation scores don't improve for 1000 rounds.
[1000]	training's auc: 0.90329	valid_1's auc: 0.770944
[2000]	training's auc: 0.923424	valid_1's auc: 0.785547
[3000]	training's auc: 0.936173	valid_1's auc: 0.790419
[4000]	training's auc: 0.945745	valid_1's auc: 0.794136
[5000]	training's auc: 0.953686	valid_1's auc: 0.795884
[6000]	training's auc: 0.960491	valid_1's auc: 0.796433
[7000]	training's auc: 0.966296	valid_1's auc: 0.796161
Early stopping, best iteration is:
[6194]	training's auc: 0.961628	valid_1's auc: 0.796732
AUC = 0.7967316833689199
[[33770     0]
 [ 1520     0]]


Fold 1
Training until validation scores don't improve for 1000 rounds.
[1000]	training's auc: 0.899979	valid_1's auc: 0.781675
[2000]	training's auc: 0.920656	valid_1's auc: 0.795968
[3000]	training's auc: 0.934289	valid_1's auc: 0.802065
[4000]	training's auc: 0.944122	valid_1's auc: 0.80366
[5000]	training's auc: 0.952508	valid_1's auc: 0.805559
[6000]	training's auc: 0.959207	valid

In [33]:
# Side-by-side comparison table for segment 0:
#   base -> the pre-computed "mean" feature, new -> the out-of-fold predictions.
# Both are put on a fresh 0..n-1 index so they line up positionally.
base = train_df_0[['mean']].reset_index(drop=True)
new = pd.DataFrame(oof_preds)

# Positional copy of the labels for this table only. y_train_0 itself is left
# untouched so it keeps sharing its index with x_train_0 / train_df_0.
labels_0 = y_train_0.reset_index(drop=True)

diff = pd.concat([base, new], axis=1)
diff['label'] = labels_0

# Sanity check: all four lengths should be equal.
len(base), len(new), len(y_train_0), len(diff)

(176447, 176447, 176447, 176447)

In [34]:
# Display the comparison table (columns: mean, 0 = new prediction, label).
diff

Unnamed: 0,mean,0,label
0,0.036122,0.015763,0
1,0.023994,0.008886,0
2,0.491483,0.257509,0
3,0.258872,0.110139,0
4,0.081136,0.031916,0
5,0.293005,0.065244,0
6,0.199532,0.050436,0
7,0.245156,0.058438,0
8,0.046868,0.015661,0
9,0.034361,0.007665,0


In [35]:
# Score the baseline ("mean" feature) first, then the new out-of-fold
# predictions, against the segment-0 labels.
for candidate in (base, new):
    candidate_auc = roc_auc_score(y_train_0, candidate)
    print("Overall AUC = {}".format(candidate_auc))
    candidate_hard = pd.DataFrame(np.round(candidate))
    print(confusion_matrix(y_train_0, candidate_hard))

Overall AUC = 0.8132500546274687
[[168847      0]
 [  7600      0]]
Overall AUC = 0.7946585923475409
[[168847      0]
 [  7600      0]]


In [36]:
# Baseline restricted to segment 1 (rows with mean >= 0.5).
segment1_mean = train_df_1['mean']
segment1_auc = roc_auc_score(y_train_1, segment1_mean)
print("AUC = {}".format(segment1_auc))

# Rounding thresholds the scores into hard 0/1 predictions.
segment1_hard = pd.DataFrame(np.round(segment1_mean))
print(confusion_matrix(y_train_1, segment1_hard))

AUC = 0.7498668443667404
[[    0 11055]
 [    0 12498]]


In [46]:
# LightGBM parameters used by the segment-1 training cell. Rebinds `param`;
# the only value that differs from the earlier definition is max_depth (2).
# NOTE(review): with max_depth=2 a tree can presumably hold at most 4 leaves,
# so num_leaves=13 would not be the binding limit -- confirm this is intended.
param = dict(
    objective="binary",
    boost="gbdt",
    metric="auc",
    boost_from_average="false",   # passed as the string "false", not a bool
    num_threads=28,
    learning_rate=0.01,
    num_leaves=13,
    max_depth=2,
    tree_learner="serial",
    feature_fraction=0.05,
    bagging_freq=5,
    bagging_fraction=0.4,
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    verbosity=1,
    seed=44000,
)

In [47]:
# Stratified K-fold cross-validation with LightGBM on segment 1
# (x_train_1 / y_train_1 / x_test_1). Out-of-fold predictions are collected
# for the train rows, and test predictions are averaged over the folds.
# NOTE(review): random_state=1 is used here rather than the configured SEED
# that the segment-0 cell uses -- confirm this is intentional.
folds = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=1)
oof_preds = np.zeros((len(x_train_1), 1))
test_preds = np.zeros((len(x_test_1), 1))

# Upper bound on boosting rounds; early stopping picks the actual count.
num_round = 1000000

for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train_1.values, y_train_1.values)):
    print("\n")
    print("Fold {}".format(fold_))
    trn_x, trn_y = x_train_1.iloc[trn_idx], y_train_1.iloc[trn_idx]
    val_x, val_y = x_train_1.iloc[val_idx], y_train_1.iloc[val_idx]
    trn_data = lgb.Dataset(trn_x, trn_y)
    val_data = lgb.Dataset(val_x, val_y)

    # NOTE(review): verbose_eval / early_stopping_rounds are passed as
    # lgb.train keyword arguments; newer LightGBM releases expect callbacks
    # (lgb.log_evaluation / lgb.early_stopping) instead -- confirm against the
    # installed version before upgrading.
    clf = lgb.train(
        param,
        trn_data,
        num_round,
        valid_sets=[trn_data, val_data],
        verbose_eval=1000,
        early_stopping_rounds=3000,
    )
    val_pred = clf.predict(val_x, num_iteration=clf.best_iteration)
    test_pred = clf.predict(x_test_1, num_iteration=clf.best_iteration)

    print("AUC = {}".format(roc_auc_score(val_y, val_pred)))
    oof_preds[val_idx, :] = val_pred.reshape((-1, 1))
    test_preds += test_pred.reshape((-1, 1))

    # Rounding thresholds the probabilities into hard 0/1 predictions.
    print(confusion_matrix(val_y, np.round(val_pred)))

# Average the per-fold test predictions.
test_preds /= fold_num
roc_score = roc_auc_score(y_train_1, oof_preds.ravel())
print("Overall AUC = {}".format(roc_score))
print(confusion_matrix(y_train_1, np.round(oof_preds.ravel())))

# Also expose the results under segment-specific names, since `oof_preds` /
# `test_preds` are shared with the segment-0 training cell.
oof_preds_1 = oof_preds.copy()
test_preds_1 = test_preds.copy()



Fold 0
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.776765	valid_1's auc: 0.655143
[2000]	training's auc: 0.80197	valid_1's auc: 0.669496
[3000]	training's auc: 0.821347	valid_1's auc: 0.680055
[4000]	training's auc: 0.835698	valid_1's auc: 0.686815
[5000]	training's auc: 0.8471	valid_1's auc: 0.695493
[6000]	training's auc: 0.855692	valid_1's auc: 0.699679
[7000]	training's auc: 0.863847	valid_1's auc: 0.704073
[8000]	training's auc: 0.869592	valid_1's auc: 0.70517
[9000]	training's auc: 0.875205	valid_1's auc: 0.706953
[10000]	training's auc: 0.881286	valid_1's auc: 0.709052
[11000]	training's auc: 0.886112	valid_1's auc: 0.710611
[12000]	training's auc: 0.890662	valid_1's auc: 0.712268
[13000]	training's auc: 0.894725	valid_1's auc: 0.713312
[14000]	training's auc: 0.898526	valid_1's auc: 0.714733
[15000]	training's auc: 0.901961	valid_1's auc: 0.71573
[16000]	training's auc: 0.905916	valid_1's auc: 0.717226
[17000]	training's auc: 0.909

[4000]	training's auc: 0.83643	valid_1's auc: 0.681972
[5000]	training's auc: 0.848158	valid_1's auc: 0.687447
[6000]	training's auc: 0.856548	valid_1's auc: 0.692187
[7000]	training's auc: 0.864268	valid_1's auc: 0.695016
[8000]	training's auc: 0.870768	valid_1's auc: 0.698122
[9000]	training's auc: 0.876415	valid_1's auc: 0.70064
[10000]	training's auc: 0.881903	valid_1's auc: 0.703692
[11000]	training's auc: 0.886974	valid_1's auc: 0.704937
[12000]	training's auc: 0.891307	valid_1's auc: 0.705946
[13000]	training's auc: 0.89513	valid_1's auc: 0.70594
[14000]	training's auc: 0.898805	valid_1's auc: 0.70689
[15000]	training's auc: 0.902539	valid_1's auc: 0.707743
[16000]	training's auc: 0.906115	valid_1's auc: 0.708712
[17000]	training's auc: 0.909463	valid_1's auc: 0.710394
[18000]	training's auc: 0.912595	valid_1's auc: 0.711265
[19000]	training's auc: 0.915801	valid_1's auc: 0.711578
[20000]	training's auc: 0.918754	valid_1's auc: 0.712852
[21000]	training's auc: 0.921587	valid_1's

In [44]:
# Save results (only when saving is enabled and sampling is off).
if submit_flg == 1 and sampling_flg == 0:
    sample = pd.read_csv(sample_submission_dir)
    # NOTE(review): the target written here is the pre-computed test "mean"
    # feature, not the `test_preds` produced by the LightGBM cells -- confirm
    # that this is the intended submission content.
    # Bracket assignment with the configured column names always writes a real
    # column; attribute-style assignment would only set a Python attribute if
    # the column were missing from the template.
    sample[submission_target_col_name] = test_mean_df.iloc[:, 0]
    sample[submission_id_col_name] = test_df[submission_id_col_name]
    sample.to_csv(test_preds_dir, index=False)
    

In [None]:
# Segment-0 rows only: ID plus the pre-computed "mean" feature.
new_pred_train = train_df_0[['ID_code', 'mean']].copy()
new_pred_train['ID_code'] = new_pred_train['ID_code'].astype('object')

# All train rows: label plus ID (column order: target, ID_code).
new_train = pd.DataFrame({'target': y_train, 'ID_code': train_df['ID_code']})
new_train['ID_code'] = new_train['ID_code'].astype('object')

print(new_train.describe())
print(new_pred_train.describe())

# Left join keeps every train row; rows outside segment 0 get NaN for "mean".
new_train = new_train.merge(new_pred_train, how='left', on='ID_code')
# Append the feature columns positionally.
new_train = pd.concat([new_train, x_train], axis=1)

new_train.head()