In [1]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV
warnings.filterwarnings('ignore')




基本変数定義

In [2]:
# --- Run-mode switches ---
submit_flg = 1  # 1 = process the full dataset; anything else = work on a sample
grid_flg = 0    # controls whether GridSearch is run

SEED = 12345
sample_num = 10000
fold_num = 5

# --- Train-related settings ---
train_dir = './feature/101_train.csv'
train_drop_col = ['ID_code', 'target']
train_label = 'target'

# --- Test-related settings ---
test_dir = './feature/101_test.csv'
test_drop_col = ['ID_code']

# --- Result-file settings ---
oof_preds_dir = './oof/110_LightGBM_downsample_preds.csv'
test_preds_dir = './oof/110_LightGBM_downsample_tests.csv'
submission_dir = './submission/110_LightGBM_downsample_submission.csv'
# NOTE(review): the column is called 'oof_xgb' although the output files are
# named after LightGBM -- confirm what downstream consumers expect before renaming.
save_col_name = 'oof_xgb'

sample_submission_dir = './submission/sample_submission.csv'
submission_target_col_name = 'target'
submission_id_col_name = 'ID_code'

テンプレ処理

In [3]:
# Load the train / test feature tables from the configured CSV paths
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_dir, test_dir))

In [4]:
# Sampling: outside of a full (submit) run, work on a fixed-seed random subset
if submit_flg != 1:
    train_df = train_df.sample(n=sample_num, random_state=SEED)
    test_df = test_df.sample(n=sample_num, random_state=SEED)

In [5]:
# Build x / y: the label vector first, then the feature matrices
# (identifier and label columns are dropped from the features)
y_train = train_df[train_label]
x_train = train_df.drop(train_drop_col, axis=1)
x_test = test_df.drop(test_drop_col, axis=1)

In [6]:
def LGBM(x_train,y_train,x_test,seed):
    """Train LightGBM with stratified K-fold CV; return OOF and test predictions.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    y_train : pandas.Series
        Binary labels, row-aligned with ``x_train``.
    x_test : pandas.DataFrame
        Test features; every fold model predicts on all of it.
    seed : int
        Seed for the fold split and for every LightGBM random stream.

    Returns
    -------
    oof_preds : numpy.ndarray, shape (len(x_train), 1)
        Out-of-fold prediction for every training row.
    test_preds : numpy.ndarray, shape (len(x_test), 1)
        Mean of the per-fold test predictions.

    Notes
    -----
    The number of folds comes from the notebook-level ``fold_num``.
    Per-fold frames, Datasets and the booster are released explicitly at the
    end of each fold so that two folds' worth of data never coexist in memory.
    """
    #model_param
    param = {
            'num_leaves': 6,
            'max_bin': 63,
            'min_data_in_leaf': 45,
            'learning_rate': 0.01,
            'min_sum_hessian_in_leaf': 0.000446,
            'bagging_fraction': 0.55,
            'bagging_freq': 5,
            'max_depth': 14,
            'save_binary': True,
            'seed': seed,
            'feature_fraction_seed': seed,
            'feature_fraction': 0.51,
            'bagging_seed': seed,
            'drop_seed': seed,
            'data_random_seed': seed,
            'objective': 'binary',
            'boosting_type': 'gbdt',
            'verbose': 1,
            'metric': 'auc',
            'is_unbalance': True,
            'boost_from_average': False,
        }
    # Upper bound on boosting rounds; early stopping normally ends training sooner.
    num_round = 15000

    folds = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=seed)
    oof_preds = np.zeros((len(x_train), 1))
    test_preds = np.zeros((len(x_test), 1))
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train.values, y_train.values)):
        print("\n")
        print("Fold {}".format(fold_))
        trn_x,trn_y = x_train.iloc[trn_idx], y_train.iloc[trn_idx]
        val_x,val_y = x_train.iloc[val_idx], y_train.iloc[val_idx]
        trn_data = lgb.Dataset(trn_x,trn_y)
        val_data = lgb.Dataset(val_x,val_y)

        # NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword
        # arguments of older LightGBM releases; newer releases expect callbacks
        # instead -- confirm against the installed version before upgrading.
        clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 250)
        val_pred = clf.predict(val_x, num_iteration=clf.best_iteration)
        test_pred = clf.predict(x_test, num_iteration=clf.best_iteration)

        print("AUC = {}".format(roc_auc_score(val_y, val_pred)))
        oof_preds[val_idx, :] = val_pred.reshape((-1, 1))
        test_preds += test_pred.reshape((-1, 1))

        # Release this fold's data and model before the next fold allocates its
        # own copies; otherwise both generations are alive at the same time.
        del trn_x, trn_y, val_x, val_y, trn_data, val_data, clf, val_pred, test_pred
        gc.collect()

    # Divide by the splitter's own fold count so the average always matches
    # the number of models that were actually summed.
    test_preds /= folds.get_n_splits()
    roc_score = roc_auc_score(y_train, oof_preds.ravel())
    print("Overall AUC = {}".format(roc_score))

    return oof_preds,test_preds
    

In [7]:
def downsampling(train_df,SEED,label_col='target'):
    """Balance a binary-labelled frame by randomly downsampling class 0.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame containing a binary label column with values 0 and 1.
    SEED : int
        Random state used to draw the class-0 rows.
    label_col : str, default 'target'
        Name of the label column.

    Returns
    -------
    pandas.DataFrame
        The sampled class-0 rows followed by all class-1 rows; original index
        labels are preserved.
    """
    negatives = train_df[train_df[label_col] == 0]
    positives = train_df[train_df[label_col] == 1]
    # Never ask for more class-0 rows than exist: sampling without replacement
    # raises ValueError when n exceeds the population size.
    n_keep = min(len(positives), len(negatives))
    negatives = negatives.sample(n=n_keep, random_state=SEED)
    return pd.concat([negatives, positives])



In [10]:
# Number of independent downsampling rounds. Each round draws a different
# subset of class-0 rows (seeded by the round index) and trains a fresh CV model.
n_rounds = 9

# Every round contains a DIFFERENT set of class-0 rows, so OOF predictions must
# be accumulated per original training row, not by position inside the
# downsampled frame. Rows are located through train_df's index, which is
# assumed unique (it is the default index produced by read_csv, possibly
# subsampled without replacement).
train_labels = train_df[train_label].values
oof_sum = np.zeros(len(train_df))
oof_hits = np.zeros(len(train_df))
all_test_preds = np.zeros((len(x_test), 1))

for i in range(n_rounds):
    print("\n")
    print(i)
    new_train_df = downsampling(train_df, i)
    # Round-local names: the full-data x_train / y_train stay untouched.
    x_down = new_train_df.drop(train_drop_col, axis=1)
    y_down = new_train_df[train_label]

    # LGBM prints the per-fold and overall AUC for this round itself.
    oof_preds, test_preds = LGBM(x_down, y_down, x_test, i)

    # Map this round's rows back to their positions in train_df.
    row_pos = train_df.index.get_indexer(new_train_df.index)
    oof_sum[row_pos] += oof_preds.ravel()
    oof_hits[row_pos] += 1
    all_test_preds += test_preds

    # Free the round's frames and predictions before the next round allocates.
    del new_train_df, x_down, y_down, oof_preds, test_preds, row_pos
    gc.collect()

all_test_preds /= n_rounds

# Average each row over the rounds in which it actually appeared; rows that were
# never sampled have no out-of-fold prediction and are left as NaN.
covered = oof_hits > 0
all_oof_preds = np.full((len(train_df), 1), np.nan)
all_oof_preds[covered, 0] = oof_sum[covered] / oof_hits[covered]

print("Blended OOF AUC = {}".format(roc_auc_score(train_labels[covered], all_oof_preds[covered, 0])))

   



0


Fold 0
Training until validation scores don't improve for 250 rounds.
[1000]	training's auc: 0.871137	valid_1's auc: 0.842834
[2000]	training's auc: 0.90023	valid_1's auc: 0.868897
[3000]	training's auc: 0.914534	valid_1's auc: 0.880079
[4000]	training's auc: 0.923265	valid_1's auc: 0.885849
[5000]	training's auc: 0.929711	valid_1's auc: 0.88872
[6000]	training's auc: 0.935189	valid_1's auc: 0.890497
[7000]	training's auc: 0.940317	valid_1's auc: 0.891486
[8000]	training's auc: 0.945267	valid_1's auc: 0.891965
Early stopping, best iteration is:
[8363]	training's auc: 0.946889	valid_1's auc: 0.892127
AUC = 0.8921274844682062


Fold 1
Training until validation scores don't improve for 250 rounds.
[1000]	training's auc: 0.870115	valid_1's auc: 0.846859
[2000]	training's auc: 0.899745	valid_1's auc: 0.873151
[3000]	training's auc: 0.913807	valid_1's auc: 0.883015
[4000]	training's auc: 0.922759	valid_1's auc: 0.888504
[5000]	training's auc: 0.929407	valid_1's auc: 0.891542
[6000]	tra

MemoryError: 

In [None]:
# Display the test predictions accumulated over the downsampling rounds
all_test_preds

In [None]:
# Save results (full runs only)
if submit_flg == 1:
    # Create the output folders up front so a long training run is not lost
    # to a missing directory at write time.
    for out_path in (oof_preds_dir, test_preds_dir, submission_dir):
        out_folder = os.path.dirname(out_path)
        if out_folder:
            os.makedirs(out_folder, exist_ok=True)

    series_oof_preds = pd.Series(data=all_oof_preds[:, 0], name=save_col_name, dtype='float')
    series_test_preds = pd.Series(data=all_test_preds[:, 0], name=save_col_name, dtype='float')

    series_oof_preds.to_csv(oof_preds_dir, header=True, index=False)
    series_test_preds.to_csv(test_preds_dir, header=True, index=False)

    sample = pd.read_csv(sample_submission_dir)
    # Use the configured column names instead of hard-coded attributes, and
    # assign the IDs positionally (.values) so the result never depends on
    # index alignment between `sample` and `test_df`; a length mismatch raises
    # instead of silently producing NaN.
    sample[submission_target_col_name] = all_test_preds[:, 0].astype(float)
    sample[submission_id_col_name] = test_df[submission_id_col_name].values
    sample.to_csv(submission_dir, index=False)
    