In [1]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit

from sklearn.model_selection import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV

warnings.filterwarnings('ignore')




基本変数定義

In [2]:
# ---------------------------------------------------------------------------
# Run switches
# ---------------------------------------------------------------------------
sampling_flg = 0  # 1 -> subsample train/test before modelling, 0 -> use every row
submit_flg = 1    # 1 -> write prediction files (only honoured when sampling is off)

# ---------------------------------------------------------------------------
# General settings
# ---------------------------------------------------------------------------
SEED = 12345        # random_state for sampling and for the CV splitter
sample_num = 10000  # rows drawn from each frame when sampling_flg == 1
fold_num = 5        # number of stratified CV folds

# ---------------------------------------------------------------------------
# Training inputs
# ---------------------------------------------------------------------------
train_dir = '../../02_feature/114_train_kmeans.csv'
train_feature_dir_1 = '../../02_feature/113_train_NegativeCount.csv'
train_feature_dir_2 = '../../02_feature/102_train_kmeans.csv'
train_drop_col = ['ID_code', 'target']  # columns removed to build x_train
train_label = 'target'

# ---------------------------------------------------------------------------
# Test inputs
# ---------------------------------------------------------------------------
test_dir = '../../02_feature/114_test_kmeans.csv'
test_feature_dir_1 = '../../02_feature/113_test_NegativeCount.csv'
test_feature_dir_2 = '../../02_feature/102_test_kmeans.csv'
test_drop_col = ['ID_code']  # columns removed to build x_test

# ---------------------------------------------------------------------------
# Output files -- change the name prefix in the file names to your own name
# ---------------------------------------------------------------------------
train_preds_dir = '../../03_predict_train/oka_222_LightGBM_Mean_NegCount_kmeans_train.csv'
test_preds_dir = '../../04_predict_test/oka_222_LightGBM_Mean_NegCount_kmeans_submission.csv'
# NOTE(review): the out-of-fold column is called 'oof_xgb' although the model
# trained in this notebook is LightGBM -- confirm downstream readers before renaming.
save_col_name = 'oof_xgb'

# Submission template and the column names expected in it
sample_submission_dir = '../../01_input/sample_submission.csv'
submission_target_col_name = 'target'
submission_id_col_name = 'ID_code'

前処理

In [3]:
# Load the base table plus the two extra feature tables, for train and test.
train_df, train_feature_df_1, train_feature_df_2 = (
    pd.read_csv(csv_path)
    for csv_path in (train_dir, train_feature_dir_1, train_feature_dir_2)
)
test_df, test_feature_df_1, test_feature_df_2 = (
    pd.read_csv(csv_path)
    for csv_path in (test_dir, test_feature_dir_1, test_feature_dir_2)
)

# Append the feature tables column-wise: feature set 1 first, then feature set 2.
# pd.concat(axis=1) matches rows by index label, so this presumably relies on
# every CSV listing the rows in the same order -- TODO confirm.
train_df = pd.concat([train_df, train_feature_df_1], axis=1)
train_df = pd.concat([train_df, train_feature_df_2], axis=1)

test_df = pd.concat([test_df, test_feature_df_1], axis=1)
test_df = pd.concat([test_df, test_feature_df_2], axis=1)

In [4]:
# Preview the first five rows of the merged training frame.
train_df.head(n=5)

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,kmeans_5,kmeans_6,kmeans_7,kmeans_8,kmeans_9,kmeans_10,kmeans_11,kmeans_12,kmeans_13,kmeans_14
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,0,4,2,0,1,9,0,0,6,4
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,1,1,6,5,5,7,6,5,11,0
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,4,2,1,6,6,1,1,11,9,6
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4,2,3,7,3,5,4,1,3,11
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,4,2,3,7,3,5,4,1,3,2


In [5]:
# Preview the first five rows of the merged test frame.
test_df.head(n=5)

Unnamed: 0,ID_code,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,kmeans_5,kmeans_6,kmeans_7,kmeans_8,kmeans_9,kmeans_10,kmeans_11,kmeans_12,kmeans_13,kmeans_14
0,test_0,11.0656,7.7798,12.9536,9.4292,11.4327,-2.3805,5.8493,18.2675,2.1337,...,1,1,6,5,5,7,6,5,11,0
1,test_1,8.5304,1.2543,11.3047,5.1858,9.1974,-4.0117,6.0196,18.6316,-4.4131,...,4,2,1,6,6,1,1,11,9,6
2,test_2,5.4827,-10.3581,10.1407,7.0479,10.2628,9.8052,4.895,20.2537,1.5233,...,1,1,6,5,5,7,6,5,11,0
3,test_3,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,...,1,1,6,5,7,2,2,4,1,13
4,test_4,11.7058,-0.1327,14.1295,7.7506,9.1035,-8.5848,6.8595,10.6048,2.989,...,4,2,1,6,6,1,1,11,9,6


In [6]:
# Optional down-sampling: when enabled, keep sample_num randomly chosen rows
# of each frame, drawn with the fixed SEED so the subset is repeatable.
if sampling_flg == 1:
    train_df = train_df.sample(
        n=sample_num,
        random_state=SEED,
    )
    test_df = test_df.sample(
        n=sample_num,
        random_state=SEED,
    )

In [7]:
# Build the model inputs: the training label and the train/test feature frames.
y_train = train_df[train_label]
x_train = train_df.drop(train_drop_col, axis='columns')
x_test = test_df.drop(test_drop_col, axis='columns')

モデル実行

In [8]:
# LightGBM training parameters, passed unchanged to lgb.train.
# NOTE(review): with num_leaves=2 each tree has a single split, so max_depth=14
# can never be reached -- confirm that this combination is intended.
# NOTE(review): drop_seed is documented as a DART-only setting while
# boosting_type is 'gbdt' -- presumably it has no effect here; verify.
param = dict(
    # tree shape / binning
    num_leaves=2,
    max_bin=63,
    min_data_in_leaf=45,
    learning_rate=0.01,
    min_sum_hessian_in_leaf=0.000446,
    # row sub-sampling
    bagging_fraction=0.55,
    bagging_freq=5,
    max_depth=14,
    save_binary=True,
    # seeds and column sub-sampling
    seed=31452,
    feature_fraction_seed=31415,
    feature_fraction=0.51,
    bagging_seed=31415,
    drop_seed=31415,
    data_random_seed=31415,
    # task definition
    objective='binary',
    boosting_type='gbdt',
    verbose=1,
    metric='auc',
    is_unbalance=True,
    boost_from_average=False,
)

In [9]:
# Stratified K-fold training.
# For every fold a LightGBM model is fitted on the training slice, the
# validation-slice predictions are stored in oof_preds (out-of-fold), and the
# test predictions are summed into test_preds; the sum is divided by fold_num
# after the loop so test_preds ends up as the mean over folds.
folds = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=SEED)
oof_preds = np.zeros((len(x_train), 1))
test_preds = np.zeros((len(x_test), 1))

# Upper bound on boosting rounds handed to lgb.train (same for every fold).
num_round = 80000

for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train.values, y_train.values)):
    print("\n")
    print("Fold {}".format(fold_))

    # Positional slices for this fold.
    trn_x, val_x = x_train.iloc[trn_idx], x_train.iloc[val_idx]
    trn_y, val_y = y_train.iloc[trn_idx], y_train.iloc[val_idx]
    trn_data = lgb.Dataset(trn_x, trn_y)
    val_data = lgb.Dataset(val_x, val_y)

    # Both sets are evaluated; progress is logged every 1000 rounds and
    # training stops after 5000 rounds without validation improvement.
    clf = lgb.train(
        param,
        trn_data,
        num_round,
        valid_sets=[trn_data, val_data],
        verbose_eval=1000,
        early_stopping_rounds=5000,
    )

    # Predict with the best iteration found during training.
    val_pred = clf.predict(val_x, num_iteration=clf.best_iteration)
    test_pred = clf.predict(x_test, num_iteration=clf.best_iteration)

    print("AUC = {}".format(roc_auc_score(val_y, val_pred)))

    oof_preds[val_idx, 0] = val_pred
    test_preds[:, 0] += test_pred

test_preds /= fold_num
roc_score = roc_auc_score(y_train, oof_preds[:, 0])
print("Overall AUC = {}".format(roc_score))



Fold 0
Training until validation scores don't improve for 5000 rounds.
[1000]	training's auc: 0.837155	valid_1's auc: 0.831902
[2000]	training's auc: 0.852914	valid_1's auc: 0.846961
[3000]	training's auc: 0.861079	valid_1's auc: 0.854475
[4000]	training's auc: 0.866759	valid_1's auc: 0.859435
[5000]	training's auc: 0.871195	valid_1's auc: 0.863664
[6000]	training's auc: 0.874981	valid_1's auc: 0.867064
[7000]	training's auc: 0.878053	valid_1's auc: 0.8699
[8000]	training's auc: 0.880636	valid_1's auc: 0.872414
[9000]	training's auc: 0.882892	valid_1's auc: 0.874494
[10000]	training's auc: 0.885047	valid_1's auc: 0.876459
[11000]	training's auc: 0.886901	valid_1's auc: 0.878036
[12000]	training's auc: 0.888697	valid_1's auc: 0.87958
[13000]	training's auc: 0.890303	valid_1's auc: 0.880896
[14000]	training's auc: 0.891725	valid_1's auc: 0.882108
[15000]	training's auc: 0.893147	valid_1's auc: 0.883348
[16000]	training's auc: 0.894375	valid_1's auc: 0.884373
[17000]	training's auc: 0.8

[19000]	training's auc: 0.897887	valid_1's auc: 0.887186
[20000]	training's auc: 0.898855	valid_1's auc: 0.887979
[21000]	training's auc: 0.899774	valid_1's auc: 0.888682
[22000]	training's auc: 0.900632	valid_1's auc: 0.889355
[23000]	training's auc: 0.901431	valid_1's auc: 0.890003
[24000]	training's auc: 0.902293	valid_1's auc: 0.890647
[25000]	training's auc: 0.902997	valid_1's auc: 0.891129
[26000]	training's auc: 0.903613	valid_1's auc: 0.891523
[27000]	training's auc: 0.904352	valid_1's auc: 0.892114
[28000]	training's auc: 0.904915	valid_1's auc: 0.892499
[29000]	training's auc: 0.905511	valid_1's auc: 0.892865
[30000]	training's auc: 0.906017	valid_1's auc: 0.893212
[31000]	training's auc: 0.906551	valid_1's auc: 0.893553
[32000]	training's auc: 0.907048	valid_1's auc: 0.893841
[33000]	training's auc: 0.90752	valid_1's auc: 0.894131
[34000]	training's auc: 0.907903	valid_1's auc: 0.894303
[35000]	training's auc: 0.908332	valid_1's auc: 0.894509
[36000]	training's auc: 0.908744

[36000]	training's auc: 0.907057	valid_1's auc: 0.902144
[37000]	training's auc: 0.907437	valid_1's auc: 0.902255
[38000]	training's auc: 0.90785	valid_1's auc: 0.90252
[39000]	training's auc: 0.908192	valid_1's auc: 0.902627
[40000]	training's auc: 0.908496	valid_1's auc: 0.902716
[41000]	training's auc: 0.908879	valid_1's auc: 0.902809
[42000]	training's auc: 0.909145	valid_1's auc: 0.902828
[43000]	training's auc: 0.909438	valid_1's auc: 0.902886
[44000]	training's auc: 0.909725	valid_1's auc: 0.902958
[45000]	training's auc: 0.910033	valid_1's auc: 0.903059
[46000]	training's auc: 0.910302	valid_1's auc: 0.903152
[47000]	training's auc: 0.910561	valid_1's auc: 0.903224
[48000]	training's auc: 0.910815	valid_1's auc: 0.903215
[49000]	training's auc: 0.911049	valid_1's auc: 0.903254
[50000]	training's auc: 0.911263	valid_1's auc: 0.903304
[51000]	training's auc: 0.911496	valid_1's auc: 0.903323
[52000]	training's auc: 0.911701	valid_1's auc: 0.903305
[53000]	training's auc: 0.9119	va

In [10]:
# Save results (only for a full, non-sampled run with submit_flg enabled).
#  1. Out-of-fold train predictions -> train_preds_dir, one column named save_col_name.
#  2. Fold-averaged test predictions -> test_preds_dir, using the sample
#     submission file as the template.
if submit_flg == 1 and sampling_flg == 0:
    series_oof_preds = pd.Series(data=oof_preds[:, 0], name=save_col_name, dtype='float')
    series_oof_preds.to_csv(train_preds_dir, header=True, index=False)

    sample = pd.read_csv(sample_submission_dir)
    # Use bracket assignment with the configured column names. Attribute-style
    # assignment (sample.target = ...) only sets a plain Python attribute when
    # the column does not exist, and the pandas warning about it is hidden by
    # warnings.filterwarnings('ignore'), so the CSV would silently be unchanged.
    sample[submission_target_col_name] = test_preds[:, 0].astype(float)
    # .values assigns by position; a Series would be aligned on index labels
    # and could introduce NaNs if the two indexes ever differ.
    sample[submission_id_col_name] = test_df[submission_id_col_name].values
    sample.to_csv(test_preds_dir, index=False)
    