<a href="https://colab.research.google.com/github/steelpipe75/kagglebook-for-colab/blob/master/ch02/ch02-02-custom-usage.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ---------------------------------
# データ等の準備
# ----------------------------------
import numpy as np
import pandas as pd

In [2]:
import importlib
import sys
import subprocess

# Google Colab 上で実行しているかどうかを判断するフラグ
ON_COLAB = "google.colab" in sys.modules
print(f"ON_COLAB: {ON_COLAB}")

if ON_COLAB:
    USE_GIT = True # Gitを使う
    # USE_GIT = False # Gitを使わない

    print(f"USE_GIT: {USE_GIT}")
    if USE_GIT:
        !git clone https://github.com/ghmagazine/kagglebook.git
    else:
        # Google Drive にマウントする
        drive = importlib.import_module("google.colab.drive")
        drive.mount("/content/drive/")

        import os
        colab_dir = "/content/drive/MyDrive/kagglebook/" # データ置き場

ON_COLAB: True
USE_GIT: True
fatal: destination path 'kagglebook' already exists and is not an empty directory.


train_xは学習データ、train_yは目的変数、test_xはテストデータ
pandasのDataFrame, Seriesで保持します。（numpyのarrayで保持することもあります）

In [3]:
if ON_COLAB:
    if USE_GIT:
        train = pd.read_csv('/content/kagglebook/input/sample-data/train_preprocessed.csv')
    else:
        train = pd.read_csv(os.path.join(colab_dir, 'input/sample-data/train_preprocessed.csv'))
else:
    train = pd.read_csv('../input/sample-data/train_preprocessed.csv')
train_x = train.drop(['target'], axis=1)
train_y = train['target']
if ON_COLAB:
    if USE_GIT:
        test_x = pd.read_csv('/content/kagglebook/input/sample-data/test_preprocessed.csv')
    else:
        test_x = pd.read_csv(os.path.join(colab_dir, 'input/sample-data/test_preprocessed.csv'))
else:
    test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv')

In [4]:
from sklearn.model_selection import KFold

In [5]:
kf = KFold(n_splits=4, shuffle=True, random_state=71)
tr_idx, va_idx = list(kf.split(train_x))[0]

In [6]:
# 学習データを学習データとバリデーションデータに分ける
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

In [7]:
# -----------------------------------
# xgboost におけるカスタム評価指標と目的関数の例
# （参考）https://github.com/dmlc/xgboost/blob/master/demo/guide-python/custom_objective.py
# -----------------------------------
import xgboost as xgb
from sklearn.metrics import log_loss

In [8]:
# 特徴量と目的変数をxgboostのデータ構造に変換する
# 学習データの特徴量と目的変数がtr_x, tr_y、バリデーションデータの特徴量と目的変数がva_x, va_yとする
dtrain = xgb.DMatrix(tr_x, label=tr_y)
dvalid = xgb.DMatrix(va_x, label=va_y)

In [9]:
# カスタム目的関数（この場合はloglossであり、xgboostの'binary:logistic'と等価）
def logregobj(preds, dtrain):
    labels = dtrain.get_label()  # 真の値のラベルを取得
    preds = 1.0 / (1.0 + np.exp(-preds))  # シグモイド関数
    grad = preds - labels  # 勾配
    hess = preds * (1.0 - preds)  # 二階微分値
    return grad, hess

In [10]:
# カスタム評価指標（この場合は誤答率）
def evalerror(preds, dtrain):
    labels = dtrain.get_label()  # 真の値のラベルを取得
    return 'custom-error', float(sum(labels != (preds > 0.0))) / len(labels)

In [11]:
# ハイパーパラメータの設定
params = {'silent': 1, 'random_state': 71}
num_round = 50
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

In [12]:
# モデルの学習の実行
bst = xgb.train(params, dtrain, num_round, watchlist, obj=logregobj, feval=evalerror)

[0]	train-rmse:0.40096	train-custom-error:0.17067	eval-rmse:0.42510	eval-custom-error:0.19160
[1]	train-rmse:0.70012	train-custom-error:0.11627	eval-rmse:0.71947	eval-custom-error:0.14880


Parameters: { "silent" } are not used.



[2]	train-rmse:0.98004	train-custom-error:0.10707	eval-rmse:0.99417	eval-custom-error:0.14120
[3]	train-rmse:1.22553	train-custom-error:0.09853	eval-rmse:1.23620	eval-custom-error:0.13680
[4]	train-rmse:1.43887	train-custom-error:0.09307	eval-rmse:1.45035	eval-custom-error:0.13280
[5]	train-rmse:1.62458	train-custom-error:0.09027	eval-rmse:1.62992	eval-custom-error:0.13160
[6]	train-rmse:1.79119	train-custom-error:0.08507	eval-rmse:1.79517	eval-custom-error:0.13080
[7]	train-rmse:1.92439	train-custom-error:0.08133	eval-rmse:1.92730	eval-custom-error:0.13720
[8]	train-rmse:2.05722	train-custom-error:0.07693	eval-rmse:2.05313	eval-custom-error:0.12880
[9]	train-rmse:2.16512	train-custom-error:0.07427	eval-rmse:2.15662	eval-custom-error:0.12320
[10]	train-rmse:2.25427	train-custom-error:0.07227	eval-rmse:2.24850	eval-custom-error:0.12160
[11]	train-rmse:2.34389	train-custom-error:0.06680	eval-rmse:2.33289	eval-custom-error:0.11640
[12]	train-rmse:2.43563	train-custom-error:0.06373	eval-rm

In [13]:
# 目的関数にbinary:logisticを指定したときと違い、確率に変換する前の値で予測値が出力されるので変換が必要
pred_val = bst.predict(dvalid)
pred = 1.0 / (1.0 + np.exp(-pred_val))
logloss = log_loss(va_y, pred)
print(logloss)

0.21992204652445954


In [14]:
# （参考）通常の方法で学習を行う場合
params = {'silent': 1, 'random_state': 71, 'objective': 'binary:logistic'}
bst = xgb.train(params, dtrain, num_round, watchlist)

[0]	train-logloss:0.41663	eval-logloss:0.43550
[1]	train-logloss:0.37126	eval-logloss:0.39889
[2]	train-logloss:0.33889	eval-logloss:0.37205
[3]	train-logloss:0.31320	eval-logloss:0.35606
[4]	train-logloss:0.29062	eval-logloss:0.33709
[5]	train-logloss:0.27315	eval-logloss:0.32549
[6]	train-logloss:0.25735	eval-logloss:0.31328
[7]	train-logloss:0.24299	eval-logloss:0.30434
[8]	train-logloss:0.22850	eval-logloss:0.29755
[9]	train-logloss:0.21984	eval-logloss:0.29490


Parameters: { "silent" } are not used.



[10]	train-logloss:0.21094	eval-logloss:0.28867
[11]	train-logloss:0.20280	eval-logloss:0.28290
[12]	train-logloss:0.19173	eval-logloss:0.27823
[13]	train-logloss:0.18626	eval-logloss:0.27372
[14]	train-logloss:0.17750	eval-logloss:0.27174
[15]	train-logloss:0.17111	eval-logloss:0.26724
[16]	train-logloss:0.16269	eval-logloss:0.26364
[17]	train-logloss:0.15749	eval-logloss:0.26171
[18]	train-logloss:0.15286	eval-logloss:0.25777
[19]	train-logloss:0.14977	eval-logloss:0.25564
[20]	train-logloss:0.14463	eval-logloss:0.25519
[21]	train-logloss:0.13899	eval-logloss:0.25306
[22]	train-logloss:0.13512	eval-logloss:0.25066
[23]	train-logloss:0.13177	eval-logloss:0.24852
[24]	train-logloss:0.12694	eval-logloss:0.24723
[25]	train-logloss:0.12473	eval-logloss:0.24621
[26]	train-logloss:0.12146	eval-logloss:0.24510
[27]	train-logloss:0.11716	eval-logloss:0.24219
[28]	train-logloss:0.11285	eval-logloss:0.24188
[29]	train-logloss:0.11031	eval-logloss:0.24129
[30]	train-logloss:0.10748	eval-logloss:

In [15]:
pred = bst.predict(dvalid)
logloss = log_loss(va_y, pred)
print(logloss)

0.22572590332195305
