In [1]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

warnings.filterwarnings('ignore')


基本変数定義

In [2]:
# --- Run switches ---
sampling_flg = 0  # 1: run on a random subsample of train/test, 0: use the full data
submit_flg = 1    # 1: write the submission file (only honoured when sampling is off)

# --- General settings ---
SEED = 12345        # random_state used for the optional subsampling
sample_num = 10000  # number of rows drawn when sampling_flg == 1
fold_num = 5        # fold count (not referenced in the cells visible here)

# --- Train data ---
train_dir = '../../01_input/train.csv'
train_FN_dir = '../../03_predict_train/FalseNegative_train.csv'
train_drop_col = ['ID_code', 'target']  # columns removed to build the feature matrix
train_label = 'target'

# --- Test data ---
test_dir = '../../01_input/test.csv'
test_drop_col = ['ID_code']

# --- Result files (change the name part of the file names to your own) ---
train_preds_dir = '../../03_predict_train/oka_250_LightGBM_train.csv'
test_preds_dir = '../../04_predict_test/oka_250_LightGBM_submission.csv'
save_col_name = 'oof_xgb'

# --- Submission template ---
sample_submission_dir = '../../01_input/sample_submission.csv'
submission_target_col_name = 'target'
submission_id_col_name = 'ID_code'

前処理

In [3]:
# Load the train, test and false-negative-train CSV files (in that order).
# train_FN_df is not referenced in the cells visible here.
train_df, test_df, train_FN_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_dir, test_dir, train_FN_dir)
)

In [4]:
# Optional subsampling: when enabled, draw sample_num rows from both the
# train and the test frame with the same fixed random_state.
if sampling_flg == 1:
    train_df, test_df = (
        frame.sample(n=sample_num, random_state=SEED)
        for frame in (train_df, test_df)
    )

In [5]:
# Build feature matrices and label vectors.
# Labels: the `train_label` column of the training frame.
y_train = train_df[train_label]
y_train_base = train_df[train_label]

# Features: every column except the ones listed in the *_drop_col settings.
x_train = train_df.drop(columns=train_drop_col)
x_train_base = train_df.drop(columns=train_drop_col)
x_test = test_df.drop(columns=test_drop_col)

# Feature names, kept for later reference.
x_train_col = x_train.columns

モデル実行

In [6]:
def lgbm(train_x,train_y,test_x,test_y,seed=None):
    """Train a LightGBM binary classifier with early stopping and score it.

    Parameters
    ----------
    train_x, train_y
        Features and labels the booster is fitted on.
    test_x, test_y
        Features and labels used as the second validation set (the one that
        drives early stopping) and for the printed AUC / confusion matrix.
    seed : int, optional
        Value for LightGBM's ``seed`` parameter. When omitted, the function
        falls back to the module-level variable ``i`` (the loop counter of
        the calling cell), which is what it always read before this
        parameter existed. A ``NameError`` is raised if neither is available.

    Returns
    -------
    tuple
        ``(val_pred, clf)``: predictions for ``test_x`` at the best
        iteration, and the trained booster.
    """
    if seed is None:
        # Legacy behaviour: pick up the caller's global loop counter.
        # Prefer passing `seed` explicitly to avoid this hidden dependency.
        seed = i

    #model_param
    param = {
        "objective" : "binary", 
        "boost":"gbdt",
        "metric":"auc",
        "boost_from_average":"false",
        "num_threads":28,
        "learning_rate" : 0.01,
        "num_leaves" : 13,
        "max_depth":-1,
        "tree_learner" : "serial",
        "feature_fraction" : 0.05,
        "bagging_freq" : 5,
        "bagging_fraction" : 0.4,
        "min_data_in_leaf" : 80,
        "min_sum_hessian_in_leaf" : 10.0,
        "verbosity" : 1,
        'seed': seed,
        }

    trn_data = lgb.Dataset(train_x,train_y)
    val_data = lgb.Dataset(test_x,test_y)
    
    # Upper bound on boosting rounds; early stopping is expected to end training long before.
    num_round = 1000000
    # NOTE(review): the `verbose_eval` / `early_stopping_rounds` keywords were
    # removed from lgb.train in LightGBM 4.x in favour of callbacks - confirm
    # against the installed version before upgrading.
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3000)
    
    val_pred = clf.predict(test_x, num_iteration=clf.best_iteration)
    auc_score=roc_auc_score(test_y, val_pred)
    print("")
    print("AUC = {}".format(auc_score))
    # Hard labels are obtained by rounding the predicted probabilities.
    print(confusion_matrix(test_y, np.round(val_pred)))
    
    return val_pred,clf

In [9]:
# Bagging: fit 10 LightGBM models on different random subsamples of the
# training frame and average their predictions for train and test.
y_test = np.zeros((len(x_test), 1))
y_train_preds = np.zeros((len(x_train), 1))  # running sum of per-model train predictions
y_test_preds = np.zeros((len(x_test), 1))    # running sum of per-model test predictions

# Loop-invariant: each subsample has as many rows as there are positive rows.
positive_num = int((train_df[train_label] == 1).sum())

# NOTE(review): the subsample is drawn from the whole frame, so it is NOT
# class-balanced, and the validation set passed to lgbm() (x_train) contains
# the rows the model is fitted on - confirm both are intended.
# lgbm() picks up the module-level `i` as its LightGBM seed, so the loop
# variable must keep this name.
for i in range(0,10):
    print(i)
    train_df_sample=train_df.sample(n=positive_num,random_state=i)
    x_train_sample=train_df_sample.drop(train_drop_col,axis=1)
    y_train_sample=train_df_sample[train_label]

    y_val_pred,model=lgbm(x_train_sample,y_train_sample,x_train,y_train)
    
    # lgbm() already scored x_train at the best iteration; reuse that result
    # instead of predicting the full training frame a second time.
    y_train_pred=y_val_pred
    y_test_pred=model.predict(x_test, num_iteration=model.best_iteration)
    
    y_train_preds+=y_train_pred.reshape((-1, 1))
    y_test_preds+=y_test_pred.reshape((-1, 1))
    
    # Average over the models trained so far.
    y_train_preds_tmp=y_train_preds/(i+1)
    y_test_preds_tmp=y_test_preds/(i+1)
    print("means AUC = {}".format(roc_auc_score(y_train, y_train_preds_tmp)))
    print(confusion_matrix(y_train, np.round(y_train_preds_tmp).ravel()))

print("Over All AUC = {}".format(roc_auc_score(y_train, y_train_preds_tmp)))
    

0
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.965772	valid_1's auc: 0.890138
[2000]	training's auc: 0.980589	valid_1's auc: 0.896433
[3000]	training's auc: 0.989815	valid_1's auc: 0.898692
[4000]	training's auc: 0.995115	valid_1's auc: 0.898953
[5000]	training's auc: 0.997996	valid_1's auc: 0.898826
[6000]	training's auc: 0.999379	valid_1's auc: 0.898326
[7000]	training's auc: 0.999859	valid_1's auc: 0.898077
Early stopping, best iteration is:
[4150]	training's auc: 0.995645	valid_1's auc: 0.899123

AUC = 0.8991225922021239
[[179303    599]
 [ 15468   4630]]
means AUC = 0.8991225922021239
[[179303    599]
 [ 15468   4630]]
1
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.965528	valid_1's auc: 0.889408
[2000]	training's auc: 0.980558	valid_1's auc: 0.895327
[3000]	training's auc: 0.989777	valid_1's auc: 0.897445
[4000]	training's auc: 0.995029	valid_1's auc: 0.897899
[5000]	training's auc: 0.998001	va

In [13]:
# Column-vector copies of the averaged predictions, used when saving results.
y_train_preds_save = np.reshape(y_train_preds_tmp, (-1, 1))
y_test_preds_save = np.reshape(y_test_preds_tmp, (-1, 1))

(200000, 1)

In [24]:
# Save results: write the averaged test predictions into the submission
# template. Only runs when saving is enabled and the full data was used.
if (submit_flg ==1 and sampling_flg==0):

    sample = pd.read_csv(sample_submission_dir)
    # Flatten the (N, 1) prediction array so a plain 1-D column is stored.
    sample[submission_target_col_name] = y_test_preds_save.astype(float).ravel()
    # Assign IDs positionally; a length mismatch raises instead of silently
    # producing NaN through index alignment.
    sample[submission_id_col_name] = test_df[submission_id_col_name].values

    # Make sure the output directory exists before writing.
    output_dir = os.path.dirname(test_preds_dir)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    sample.to_csv(test_preds_dir, index=False)
    