<a href="https://colab.research.google.com/github/steelpipe75/kagglebook-for-colab/blob/master/ch05/ch05-01-validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ---------------------------------
# データ等の準備
# ----------------------------------
import numpy as np
import pandas as pd

In [2]:
import importlib
import sys
import subprocess

# Google Colab 上で実行しているかどうかを判断するフラグ
ON_COLAB = "google.colab" in sys.modules
print(f"ON_COLAB: {ON_COLAB}")

if ON_COLAB:
    USE_GIT = True # Gitを使う
    # USE_GIT = False # Gitを使わない

    print(f"USE_GIT: {USE_GIT}")
    if USE_GIT:
        !git clone https://github.com/ghmagazine/kagglebook.git
    else:
        # Google Drive にマウントする
        drive = importlib.import_module("google.colab.drive")
        drive.mount("/content/drive/")

        import os
        colab_dir = "/content/drive/MyDrive/kagglebook/" # データ置き場

ON_COLAB: True
USE_GIT: True
fatal: destination path 'kagglebook' already exists and is not an empty directory.


train_xは学習データ、train_yは目的変数、test_xはテストデータです。
これらはpandasのDataFrame、Seriesで保持します（numpyのarrayで保持することもあります）。

In [3]:
# Work out where the preprocessed CSV files live for the current environment:
# local run, Colab with a Git checkout, or Colab with Google Drive.
if not ON_COLAB:
    train_path = '../input/sample-data/train_preprocessed.csv'
    test_path = '../input/sample-data/test_preprocessed.csv'
elif USE_GIT:
    train_path = '/content/kagglebook/input/sample-data/train_preprocessed.csv'
    test_path = '/content/kagglebook/input/sample-data/test_preprocessed.csv'
else:
    train_path = os.path.join(colab_dir, 'input/sample-data/train_preprocessed.csv')
    test_path = os.path.join(colab_dir, 'input/sample-data/test_preprocessed.csv')

# Training data: every column except 'target' is a feature, 'target' is the label.
train = pd.read_csv(train_path)
train_x = train.drop(columns=['target'])
train_y = train['target']

# Test data is read as-is and used as the feature frame.
test_x = pd.read_csv(test_path)

In [4]:
# xgboostによる学習・予測を行うクラス
import xgboost as xgb

In [5]:
class Model:
    """Small wrapper that trains an xgboost booster and predicts with it.

    ``fit`` trains with ``xgb.train`` on the training split while reporting
    the metric on both the training and validation splits; ``predict`` returns
    whatever the trained booster's ``predict`` returns for the given features.
    """

    # Baseline booster parameters; entries of the ``params`` dict passed to the
    # constructor override these. 'verbosity': 0 replaces the former
    # 'silent': 1 flag, which current xgboost reports as an unused parameter.
    DEFAULT_PARAMS = {'objective': 'binary:logistic', 'verbosity': 0, 'random_state': 71}

    # Number of boosting rounds run by ``fit``.
    NUM_ROUND = 10

    def __init__(self, params=None):
        """Store the optional parameter overrides.

        Args:
            params: dict of xgboost parameters that override ``DEFAULT_PARAMS``;
                ``None`` means no overrides.
        """
        self.model = None
        self.params = {} if params is None else params

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Train the booster on (tr_x, tr_y), evaluating on (va_x, va_y) each round.

        Args:
            tr_x: training features.
            tr_y: training labels.
            va_x: validation features.
            va_y: validation labels.
        """
        # Copy the defaults so the class-level dict is never mutated.
        params = dict(self.DEFAULT_PARAMS)
        params.update(self.params)
        dtrain = xgb.DMatrix(tr_x, label=tr_y)
        dvalid = xgb.DMatrix(va_x, label=va_y)
        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
        self.model = xgb.train(params, dtrain, self.NUM_ROUND, evals=watchlist)

    def predict(self, x):
        """Return the trained booster's predictions for the features ``x``.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.model is None:
            raise RuntimeError('Model.predict() was called before Model.fit()')
        data = xgb.DMatrix(x)
        return self.model.predict(data)

-----------------------------------
hold-out法
-----------------------------------
hold-out法でのバリデーションデータの分割

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Hold-out split with train_test_split: a shuffled 25% of the rows becomes the
# validation set, the rest is used for training.
tr_x, va_x, tr_y, va_y = train_test_split(
    train_x,
    train_y,
    test_size=0.25,
    shuffle=True,
    random_state=71,
)

-----------------------------------
hold-out法でバリデーションを行う

In [8]:
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

Modelクラスを定義しているものとする
Modelクラスは、fitで学習し、predictで予測値の確率を出力する

In [9]:
# Hold-out split with train_test_split: a shuffled 25% of the rows becomes the
# validation set, the rest is used for training.
tr_x, va_x, tr_y, va_y = train_test_split(
    train_x,
    train_y,
    test_size=0.25,
    shuffle=True,
    random_state=71,
)

In [10]:
# Train on the training split, predict the validation split, and report the
# validation log loss.
model = Model()
model.fit(tr_x=tr_x, tr_y=tr_y, va_x=va_x, va_y=va_y)
va_pred = model.predict(x=va_x)
score = log_loss(va_y, va_pred)
print(score)

[0]	train-logloss:0.41663	eval-logloss:0.43550
[1]	train-logloss:0.37126	eval-logloss:0.39889
[2]	train-logloss:0.33889	eval-logloss:0.37205
[3]	train-logloss:0.31320	eval-logloss:0.35606
[4]	train-logloss:0.29062	eval-logloss:0.33709
[5]	train-logloss:0.27315	eval-logloss:0.32549


Parameters: { "silent" } are not used.



[6]	train-logloss:0.25735	eval-logloss:0.31328
[7]	train-logloss:0.24299	eval-logloss:0.30434
[8]	train-logloss:0.22850	eval-logloss:0.29755
[9]	train-logloss:0.21984	eval-logloss:0.29490
0.2948980098778489


-----------------------------------
KFoldクラスを用いてhold-out法でバリデーションデータを分割

In [11]:
from sklearn.model_selection import KFold

In [12]:
# Hold-out split using KFold: take only the first of the four folds.
kf = KFold(n_splits=4, shuffle=True, random_state=71)
tr_idx, va_idx = next(iter(kf.split(train_x)))
tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
va_x, va_y = train_x.iloc[va_idx], train_y.iloc[va_idx]

-----------------------------------
クロスバリデーション
-----------------------------------
クロスバリデーションでのデータの分割

In [13]:
from sklearn.model_selection import KFold

In [14]:
# Cross-validation split using KFold: each iteration yields the positional
# indices of the training rows and of the validation rows for one fold.
kf = KFold(n_splits=4, shuffle=True, random_state=71)
for tr_idx, va_idx in kf.split(train_x):
    tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
    va_x, va_y = train_x.iloc[va_idx], train_y.iloc[va_idx]

-----------------------------------
クロスバリデーションを行う

In [15]:
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

Modelクラスを定義しているものとする
Modelクラスは、fitで学習し、predictで予測値の確率を出力する

In [16]:
# Empty list that collects one validation score per fold.
scores = list()

In [17]:
# Cross-validation with KFold: train and score one model per fold.
# The score list is reset here so that re-running this cell does not keep
# appending to results from a previous run, which would skew the mean.
scores = []
kf = KFold(n_splits=4, shuffle=True, random_state=71)
for tr_idx, va_idx in kf.split(train_x):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

    # Train, predict the validation fold, and record its log loss
    model = Model()
    model.fit(tr_x, tr_y, va_x, va_y)
    va_pred = model.predict(va_x)
    score = log_loss(va_y, va_pred)
    scores.append(score)

Parameters: { "silent" } are not used.



[0]	train-logloss:0.41663	eval-logloss:0.43550
[1]	train-logloss:0.37126	eval-logloss:0.39889
[2]	train-logloss:0.33889	eval-logloss:0.37205
[3]	train-logloss:0.31320	eval-logloss:0.35606
[4]	train-logloss:0.29062	eval-logloss:0.33709
[5]	train-logloss:0.27315	eval-logloss:0.32549
[6]	train-logloss:0.25735	eval-logloss:0.31328
[7]	train-logloss:0.24299	eval-logloss:0.30434
[8]	train-logloss:0.22850	eval-logloss:0.29755
[9]	train-logloss:0.21984	eval-logloss:0.29490
[0]	train-logloss:0.41964	eval-logloss:0.41762


Parameters: { "silent" } are not used.



[1]	train-logloss:0.37576	eval-logloss:0.38620
[2]	train-logloss:0.34343	eval-logloss:0.36018
[3]	train-logloss:0.31776	eval-logloss:0.34322
[4]	train-logloss:0.29396	eval-logloss:0.32650
[5]	train-logloss:0.27618	eval-logloss:0.31509
[6]	train-logloss:0.26377	eval-logloss:0.30590
[7]	train-logloss:0.24848	eval-logloss:0.29695
[8]	train-logloss:0.23521	eval-logloss:0.28744
[9]	train-logloss:0.22311	eval-logloss:0.28122
[0]	train-logloss:0.41890	eval-logloss:0.43249
[1]	train-logloss:0.37293	eval-logloss:0.39662
[2]	train-logloss:0.34196	eval-logloss:0.37120
[3]	train-logloss:0.31459	eval-logloss:0.35191
[4]	train-logloss:0.29310	eval-logloss:0.33963
[5]	train-logloss:0.27737	eval-logloss:0.32667
[6]	train-logloss:0.25757	eval-logloss:0.31103
[7]	train-logloss:0.24559	eval-logloss:0.30324
[8]	train-logloss:0.23499	eval-logloss:0.29798


Parameters: { "silent" } are not used.



[9]	train-logloss:0.22137	eval-logloss:0.28779
[0]	train-logloss:0.41761	eval-logloss:0.43674
[1]	train-logloss:0.37038	eval-logloss:0.39970
[2]	train-logloss:0.33898	eval-logloss:0.37438
[3]	train-logloss:0.31250	eval-logloss:0.35629
[4]	train-logloss:0.28992	eval-logloss:0.34197
[5]	train-logloss:0.27194	eval-logloss:0.33111
[6]	train-logloss:0.25641	eval-logloss:0.32024
[7]	train-logloss:0.24382	eval-logloss:0.31233
[8]	train-logloss:0.23406	eval-logloss:0.30578
[9]	train-logloss:0.22478	eval-logloss:0.30087


Parameters: { "silent" } are not used.



In [18]:
# 各foldのスコアの平均をとる
print(np.mean(scores))

0.2911975271969166


In [19]:
# -----------------------------------
# Stratified K-Fold
# -----------------------------------
from sklearn.model_selection import StratifiedKFold

In [20]:
# Stratified split with StratifiedKFold; the target is passed to split() so
# the folds are built with respect to the class labels.
kf = StratifiedKFold(n_splits=4, shuffle=True, random_state=71)
for tr_idx, va_idx in kf.split(train_x, train_y):
    tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
    va_x, va_y = train_x.iloc[va_idx], train_y.iloc[va_idx]

In [21]:
# -----------------------------------
# GroupKFold
# -----------------------------------
# Pretend the data contains four consecutive records per user:
# rows 0-3 get user_id 0, rows 4-7 get user_id 1, and so on.
train_x['user_id'] = np.arange(len(train_x)) // 4
# -----------------------------------

In [22]:
from sklearn.model_selection import KFold, GroupKFold

In [23]:
# The split is made per customer, using the IDs stored in the user_id column.
user_id = train_x['user_id']
unique_user_ids = pd.unique(user_id)

In [24]:
# Split per customer ID with KFold: the folds are drawn over the unique IDs,
# not over the individual records.
scores = []
kf = KFold(n_splits=4, shuffle=True, random_state=71)
for tr_group_idx, va_group_idx in kf.split(unique_user_ids):
    # Customer IDs assigned to the training side and to the validation side
    tr_groups = unique_user_ids[tr_group_idx]
    va_groups = unique_user_ids[va_group_idx]

    # Route each record according to the side its customer ID belongs to
    is_tr = user_id.isin(tr_groups)
    is_va = user_id.isin(va_groups)
    tr_x, tr_y = train_x[is_tr], train_y[is_tr]
    va_x, va_y = train_x[is_va], train_y[is_va]

In [25]:
# (Reference) Original author's note: GroupKFold is awkward to use because
# shuffling and a random seed cannot be specified.
# NOTE(review): this may no longer hold for newer scikit-learn releases - confirm.
kf = GroupKFold(n_splits=4)
for tr_idx, va_idx in kf.split(train_x, train_y, groups=user_id):
    tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
    va_x, va_y = train_x.iloc[va_idx], train_y.iloc[va_idx]

In [26]:
# -----------------------------------
# leave-one-out
# -----------------------------------
# Pretend that only 100 records are available
train_x = train_x.iloc[:100, :].copy()
# Truncate the target the same way so features and labels keep the same
# length; previously train_y kept every row and the later positional
# indexing only worked because the indices stayed below 100.
train_y = train_y.iloc[:100].copy()
# -----------------------------------
from sklearn.model_selection import LeaveOneOut

In [27]:
# Leave-one-out: each iteration holds out one record for validation and
# trains on all the others.
loo = LeaveOneOut()
for tr_idx, va_idx in loo.split(train_x):
    tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
    va_x, va_y = train_x.iloc[va_idx], train_y.iloc[va_idx]