<a href="https://colab.research.google.com/github/shuta13/sklearn-sandbox-colab/blob/main/scikit_learn_sandbox.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ref: https://colab.research.google.com/github/chainer/tutorials/blob/master/ja/09_Introduction_to_Scikit-learn.ipynb

In [2]:
!pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.0.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (23.2 MB)
[K     |████████████████████████████████| 23.2 MB 2.2 MB/s 
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.0.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-1.0.1 threadpoolctl-3.0.0


In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_boston.html
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

# 6:2:2 に分割する
X, X_test, y, y_test = train_test_split(data, target, test_size=0.2, random_state=0)

In [5]:
# 交差検証のジェネレーター関数
def gen_cv():
  m_train = np.floor(len(y) * 0.75).astype(int)
  train_indices = np.arange(m_train)
  test_indices = np.arange(m_train, len(y))
  yield (train_indices, test_indices)

print("訓練データ、交差検証データ、テストデータの数 : ", end="")
print(len(next(gen_cv())[0]), len(next(gen_cv())[1]), len(y_test) )

訓練データ、交差検証データ、テストデータの数 : 303 101 102


In [6]:
# 訓練データを基準に標準化
scaler = StandardScaler()
X_norm = scaler.fit_transform(X)
# テストデータも標準化
# X_test_norm = scaler.transform(X_test)
X_test_norm = scaler.fit_transform(X_test)

In [7]:
# ハイパーパラメータのチューニング
params_cnt = 20

KERNEL = "rbf"

# params for rbf(default)
params = {
    "C": np.logspace(2, 0, params_cnt),
    "epsilon": np.logspace(-1, 1, params_cnt),
}

if KERNEL == "poly":
  params[""]
elif KERNEL == "sigmoid":
  params["gamma"] = np.logspace(-4, -1, params_cnt)
  params["coef0"] = np.logspace(-3, 0, params_cnt)
  grid_search = GridSearchCV(SVR(kernel=KERNEL), params, cv=gen_cv(), return_train_score=True, verbose=True)
  grid_search.fit(X_norm, y)
elif KERNEL == "linear":
  params[""]
elif KERNEL == "rbf":
  grid_search = GridSearchCV(SVR(kernel=KERNEL), params, cv=gen_cv(), scoring="r2", return_train_score=True)
  grid_search.fit(X_norm, y)

print("最適なパラメーター : ", grid_search.best_params_)
print("決定係数 : ", grid_search.best_score_)

最適なパラメーター :  {'C': 78.47599703514615, 'epsilon': 0.8858667904100825}
決定係数 :  0.9320541390535148


In [8]:
# チューニングしたC, εでフィット
regr = grid_search.best_estimator_
train_indices = next(gen_cv())[0]
valid_indices = next(gen_cv())[1]
regr.fit(X_norm[train_indices, :], y[train_indices])

SVR(C=78.47599703514615, epsilon=0.8858667904100825)

In [9]:
# テストデータの精度を計算
print("テストデータの精度 : ", regr.score(X_test_norm, y_test))

テストデータの精度 :  0.7319634328087761


メモ
* kernel を sigmoid にすると精度が下がった(0.5ぐらい)
* ボストンで行っているので本番どうなるかはわからない
* 本番の colab は公開しない( private で行って論文に書く )

