In [1]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV
warnings.filterwarnings('ignore')




基本変数定義

In [2]:
# --- run switches ---
submit_flg = 1  # 1: process the full data set; any other value: work on a random sample
grid_flg = 0    # 1: run the hyper-parameter search cell; any other value: skip it

SEED = 12345
sample_num = 10000  # rows drawn from each frame when sampling is enabled
fold_num = 5        # number of cross-validation folds

# --- train data ---
train_dir = './feature/101_train.csv'
train_drop_col = ['ID_code', 'target']  # columns removed to obtain the feature matrix
train_label = 'target'

# --- test data ---
test_dir = './feature/101_test.csv'
test_drop_col = ['ID_code']

# --- result files ---
oof_preds_dir = './oof/102_LightGBM_preds.csv'
test_preds_dir = './oof/102_LightGBM_tests.csv'
submission_dir = './submission/102_LightGBM_submission.csv'
# NOTE(review): the column is called 'oof_xgb' although the files are named
# LightGBM -- confirm whether downstream readers expect this name.
save_col_name = 'oof_xgb'

sample_submission_dir = './submission/sample_submission.csv'
submission_target_col_name = 'target'
submission_id_col_name = 'ID_code'

In [3]:
# LightGBM parameters handed to lgb.train() by the LGBM() helper.
param = {
    # task / metric
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'is_unbalance': True,
    'boost_from_average': False,
    # tree shape
    'num_leaves': 6,
    'max_depth': 14,
    'max_bin': 63,
    'min_data_in_leaf': 45,
    'min_sum_hessian_in_leaf': 0.000446,
    # learning rate and row / column subsampling
    'learning_rate': 0.01,
    'bagging_fraction': 0.55,
    'bagging_freq': 5,
    'feature_fraction': 0.51,
    # seeds
    'seed': 31452,
    'feature_fraction_seed': 31415,
    'bagging_seed': 31415,
    'drop_seed': 31415,
    'data_random_seed': 31415,
    # misc
    'save_binary': True,
    'verbose': 1,
}

テンプレ処理

In [4]:
# Load the train and test feature tables (train first, then test).
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_dir, test_dir))

In [5]:
# Sampling: unless this is a full submission run, shrink both frames to
# sample_num random rows (same seed for both) to speed up experiments.
if submit_flg != 1:
    train_df, test_df = (
        frame.sample(n=sample_num, random_state=SEED)
        for frame in (train_df, test_df)
    )

In [6]:
# Build x / y: the label column, then the feature matrices without id/label columns.
y_train = train_df[train_label]
x_train, x_test = (
    train_df.drop(train_drop_col, axis=1),
    test_df.drop(test_drop_col, axis=1),
)

GridSearch

In [7]:
# Optional randomized hyper-parameter search over an XGBoost model.
# Only runs when grid_flg == 1.
if grid_flg ==1:
    # The estimator must be bound to the name that is handed to the search
    # below; previously the search referenced an undefined `model`.
    model = xgb.XGBRegressor()
    parameters = {
        'learning_rate':[0.1,0.01],
        'n_estimators':[10,100,1000],
        'max_depth':[3,4,5,10,15],
        'min_child_weight':[1,2,3],
        'max_delta_step':[0,5],
        'gamma':[0,3,10,30],
        'subsample':[0.8,1],
        'colsample_bytree':[0.8,1],
        'objective':['binary:logistic','reg:linear'],
        'booster':['gbtree','gblinear'],
        'nthread':[None,4],
        'scale_pos_weight':[1],
        'seed':[SEED]
    }
    # scoring: the estimator is a regressor that outputs continuous scores,
    # which the "accuracy" metric cannot evaluate against binary labels.
    # ROC AUC ranks continuous scores and is the metric used by the rest of
    # this notebook.
    clf = RandomizedSearchCV(
        estimator=model,
        param_distributions=parameters,
        cv=5,               # number of CV folds
        scoring="roc_auc",  # metric
        n_jobs=1,           # number of cores
        verbose=0,
        random_state=1,
    )
    clf.fit(x_train, y_train)

    # Report the outcome so the search result is actually usable.
    print("Best AUC = {}".format(clf.best_score_))
    print("Best params = {}".format(clf.best_params_))

モデル実行

In [8]:
def LGBM(x_train,y_train,x_test):
    """Train LightGBM with stratified K-fold CV and return OOF / test predictions.

    Reads the module-level ``param`` (LightGBM parameters), ``fold_num``
    (number of folds) and ``SEED`` (fold shuffling seed).

    Parameters
    ----------
    x_train : feature table; indexed with ``.iloc`` and ``.values``.
    y_train : labels aligned with ``x_train``; indexed with ``.iloc`` and ``.values``.
    x_test : feature table scored by every fold model.

    Returns
    -------
    (oof_preds, test_preds)
        ``oof_preds`` has shape ``(len(x_train), 1)`` and holds, for each row,
        the prediction of the fold model that did not train on it.
        ``test_preds`` has shape ``(len(x_test), 1)`` and is the mean of the
        fold models' predictions.
    """
    splitter = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=SEED)
    oof_preds = np.zeros((len(x_train), 1))
    test_preds = np.zeros((len(x_test), 1))
    max_rounds = 15000  # upper bound; early stopping normally ends training sooner

    fold_iter = splitter.split(x_train.values, y_train.values)
    for fold_no, (fit_rows, holdout_rows) in enumerate(fold_iter):
        print("\n")
        print("Fold {}".format(fold_no))

        fit_x = x_train.iloc[fit_rows]
        fit_y = y_train.iloc[fit_rows]
        holdout_x = x_train.iloc[holdout_rows]
        holdout_y = y_train.iloc[holdout_rows]
        fit_set = lgb.Dataset(fit_x, fit_y)
        holdout_set = lgb.Dataset(holdout_x, holdout_y)

        booster = lgb.train(
            param,
            fit_set,
            max_rounds,
            valid_sets=[fit_set, holdout_set],
            verbose_eval=1000,
            early_stopping_rounds=250,
        )

        # Score with the iteration that early stopping selected.
        best_iter = booster.best_iteration
        holdout_pred = booster.predict(holdout_x, num_iteration=best_iter)
        fold_test_pred = booster.predict(x_test, num_iteration=best_iter)

        print("AUC = {}".format(roc_auc_score(holdout_y, holdout_pred)))
        oof_preds[holdout_rows, :] = holdout_pred.reshape((-1, 1))
        test_preds += fold_test_pred.reshape((-1, 1))

    # Turn the summed fold predictions into their mean.
    test_preds /= fold_num
    overall_auc = roc_auc_score(y_train, oof_preds.ravel())
    print("Overall AUC = {}".format(overall_auc))

    return oof_preds,test_preds
    

In [14]:
def downsampling(train_df,SEED):
    """Return a class-balanced copy of ``train_df``.

    All rows with ``target == 1`` are kept; from the rows with ``target == 0``
    a random sample of the same size is drawn (without replacement, seeded by
    ``SEED``). The result lists the sampled zeros first, then the ones, and
    keeps the original index labels.
    """
    positives = train_df[train_df['target'] == 1]
    negatives = train_df[train_df['target'] == 0]
    kept_negatives = negatives.sample(n=len(positives), random_state=SEED)
    return pd.concat([kept_negatives, positives])



40196

In [8]:
# Bagging over several class-balanced (downsampled) training sets.
#
# Every bag contains a different subset of the rows of train_df, so its
# out-of-fold predictions cannot be added position by position to a
# full-length array. They are therefore accumulated per original row
# (sum + hit count) and averaged over the bags in which the row took part.
n_train = len(train_df)
oof_sum = np.zeros((n_train, 1))
oof_hits = np.zeros((n_train, 1))
all_test_preds = np.zeros((len(x_test), 1))
count=0
for i in [111,222,333,444,555,666,777,888,999,0]:
    print("\n")
    print(count)
    new_train_df=downsampling(train_df,i)
    # Bag-local names: the global x_train / y_train are left untouched.
    bag_x=new_train_df.drop(train_drop_col,axis=1)
    bag_y=new_train_df[train_label]
    oof_preds,test_preds=LGBM(bag_x,bag_y,x_test)

    roc_score = roc_auc_score(bag_y, oof_preds.ravel())
    print("Overall AUC = {}".format(roc_score))

    # Positions (within train_df) of the rows that make up this bag; the
    # downsampled frame keeps the original index labels.
    rows = train_df.index.get_indexer(new_train_df.index)
    oof_sum[rows] += oof_preds
    oof_hits[rows] += 1
    all_test_preds+= test_preds

    count+=1


all_test_preds/=count

# Average the OOF predictions per row. Rows that were never drawn into any
# bag have no OOF prediction and stay NaN.
scored = oof_hits[:, 0] > 0
all_oof_preds = np.full((n_train, 1), np.nan)
all_oof_preds[scored] = oof_sum[scored] / oof_hits[scored]

# Make sure the global feature / label frames refer to the full data.
x_train=train_df.drop(train_drop_col,axis=1)
y_train=train_df[train_label]
x_test=test_df.drop(test_drop_col,axis=1)

# Overall AUC, restricted to the rows that received an OOF prediction.
roc_score = roc_auc_score(y_train.values[scored], all_oof_preds[scored, 0])
print("Overall AUC = {}".format(roc_score))
   



Fold 0
Training until validation scores don't improve for 250 rounds.
[1000]	training's auc: 0.862598	valid_1's auc: 0.847093
[2000]	training's auc: 0.890756	valid_1's auc: 0.872657
[3000]	training's auc: 0.904074	valid_1's auc: 0.883861
[4000]	training's auc: 0.912153	valid_1's auc: 0.889894
[5000]	training's auc: 0.917564	valid_1's auc: 0.893037
[6000]	training's auc: 0.921532	valid_1's auc: 0.895081
[7000]	training's auc: 0.924844	valid_1's auc: 0.896213
[8000]	training's auc: 0.927874	valid_1's auc: 0.896824
[9000]	training's auc: 0.930895	valid_1's auc: 0.896968
[10000]	training's auc: 0.933874	valid_1's auc: 0.897171
[11000]	training's auc: 0.936791	valid_1's auc: 0.897261
Early stopping, best iteration is:
[11150]	training's auc: 0.937224	valid_1's auc: 0.897306
AUC = 0.8973056260621796


Fold 1
Training until validation scores don't improve for 250 rounds.
[1000]	training's auc: 0.86275	valid_1's auc: 0.846109
[2000]	training's auc: 0.891152	valid_1's auc: 0.871568
[3000]	tra

In [9]:
# Save results (full submission runs only).
#
# all_oof_preds / all_test_preds are the averages over every downsampling
# bag; oof_preds / test_preds only hold the output of the last bag, so the
# averaged arrays are the ones that must be written out.
if submit_flg ==1:
    series_oof_preds = pd.Series(data=all_oof_preds[:,0], name=save_col_name, dtype='float')
    series_test_preds = pd.Series(data=all_test_preds[:,0], name=save_col_name, dtype='float')

    series_oof_preds.to_csv(oof_preds_dir,header=True, index=False)
    series_test_preds.to_csv(test_preds_dir,header=True, index=False)

    # Fill the sample submission with the averaged test predictions. Column
    # names come from the configuration cell; the ids are assigned by
    # position (.values) so a differing index cannot silently produce NaN.
    sample = pd.read_csv(sample_submission_dir)
    sample[submission_target_col_name] = all_test_preds[:,0].astype(float)
    sample[submission_id_col_name] = test_df['ID_code'].values
    sample.to_csv(submission_dir, index=False)
    