In [12]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV
warnings.filterwarnings('ignore')


基本変数定義

In [13]:
submit_flg=1 #全件処理かどうかをコントロール
grid_flg=1 #GridSearchをするかコントロール
SEED=12345
sample_num=10000
fold_num=5

#train関連
train_dir='./feature/101_train.csv'
train_drop_col=['ID_code', 'target']
train_label='target'

#test関連
test_dir='./feature/101_test.csv'
test_drop_col=['ID_code']

#結果ファイル関連
oof_preds_dir='./oof/101_XGBoost_preds.csv'
test_preds_dir='./oof/101_XGBoost_tests.csv'
submission_dir='./submission/101_XGBoost_submission.csv'
save_col_name='oof_xgb'

sample_submission_dir='./submission/sample_submission.csv'
submission_target_col_name='target'
submission_id_col_name='ID_code'

In [14]:
#model_param
param = {}

テンプレ処理

In [15]:
#ロード
train_df=pd.read_csv(train_dir)
test_df=pd.read_csv(test_dir)

In [16]:
#サンプリング
if not(submit_flg ==1):
    train_df=train_df.sample(n=sample_num,random_state=SEED)
    test_df=test_df.sample(n=sample_num,random_state=SEED)

In [17]:
#x,y作成
x_train=train_df.drop(train_drop_col,axis=1)
y_train=train_df[train_label]
x_test=test_df.drop(test_drop_col,axis=1)

GridSearch

In [18]:
model = xgb.XGBRegressor()
if grid_flg ==1:
    parameters = {
        'learning_rate':[0.1,0.01],
        'n_estimators':[10,100,1000],
        'max_depth':[3,4,5,10,15],
        'min_child_weight':[1,2,3],
        'max_delta_step':[0,5],
        'gamma':[0,3,10,30],
        'subsample':[0.8,1],
        'colsample_bytree':[0.8,1],
        'objective':['binary:logistic','reg:linear'],
        'booster':['gbtree','gblinear'],
        'nthread':[None,4],
        'scale_pos_weight':[1],
        'seed':[SEED]
    }
    clf = RandomizedSearchCV( estimator=model,
                                        param_distributions=parameters,
                                        cv=5,              #CV
                                        scoring="accuracy", #metrics
                                        n_jobs=1,           #num of core
                                        verbose=0,          
                                        random_state=1)
    clf.fit(x_train, y_train)

ValueError: Can't handle mix of binary and continuous

In [20]:
clf.best_params_, clf.best_score_, clf.best_estimator_

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_params_'

モデル実行

In [7]:
folds = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=SEED)
oof_preds = np.zeros((len(x_train), 1))
test_preds = np.zeros((len(x_test), 1))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train.values, y_train.values)):
    print("Fold {}".format(fold_))
    trn_x,trn_y = x_train.iloc[trn_idx], y_train.iloc[trn_idx]
    val_x,val_y = x_train.iloc[val_idx], y_train.iloc[val_idx]
    
    clf = xgb.XGBRegressor()
    clf.fit(trn_x,trn_y)
    
    val_pred=clf.predict(val_x)
    test_pred=clf.predict(x_test)
    
    print("AUC = {}".format(roc_auc_score(val_y, val_pred)))
    #print("val = {}".format(val_pred)
    oof_preds[val_idx, :] = val_pred.reshape((-1, 1))
    test_preds += test_pred.reshape((-1, 1))
    
test_preds /= fold_num
roc_score = roc_auc_score(y_train, oof_preds.ravel())
print("Overall AUC = {}".format(roc_score))

Fold 0
AUC = 0.8239975983731601
Fold 1
AUC = 0.8261265654164353
Fold 2
AUC = 0.8253671919723229
Fold 3
AUC = 0.8287133302748576
Fold 4
AUC = 0.833134827468358
Overall AUC = 0.8274632678935152


AttributeError: 'RandomizedSearchCV' object has no attribute 'best_params_'

In [8]:
#結果保存
series_oof_preds = pd.Series(data=oof_preds[:,0], name=save_col_name, dtype='float')
series_test_preds = pd.Series(data=test_preds[:,0], name=save_col_name, dtype='float')

series_oof_preds.to_csv(oof_preds_dir,header=True, index=False)
series_test_preds.to_csv(test_preds_dir,header=True, index=False)

sample = pd.read_csv(sample_submission_dir)
sample.target = test_preds[:,0].astype(float)
sample.ID_code = test_df['ID_code']
sample.to_csv(submission_dir, index=False)