## パラメータ・チューニング（グリッドサーチ）

In [1]:
# Clone the course repository only when it is not already present, so the cell can be re-run
# safely (a plain re-run would fail, or clone a nested copy once the working directory has changed).
![ -d /content/AIJobcolle ] || git clone https://github.com/saiku122/AIJobcolle.git /content/AIJobcolle

Cloning into 'AIJobcolle'...
remote: Enumerating objects: 465, done.[K
remote: Counting objects: 100% (465/465), done.[K
remote: Compressing objects: 100% (342/342), done.[K
remote: Total 465 (delta 195), reused 221 (delta 70), pack-reused 0[K
Receiving objects: 100% (465/465), 12.08 MiB | 15.50 MiB/s, done.
Resolving deltas: 100% (195/195), done.


In [2]:
# Move into the cloned repository's Python materials.
# The explicit %cd magic keeps working even if automagic is disabled or a variable named `cd` exists.
%cd /content/AIJobcolle/MachineLearning/python

/content/AIJobcolle/MachineLearning/python


##### サンプルデータの読み込み
ここではグリッドサーチの実行方法について学びます。<br>
まずはサンプルデータ（分類用）を読み込みます。

In [3]:
# Binary-classification sample data: scikit-learn's breast cancer dataset
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Feature matrix X (with named columns) and label vector y
dataset = load_breast_cancer()
X = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
y = pd.Series(data=dataset.target, name='y')

# Preview the first three rows with the label appended as the last column
X.assign(y=y).head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,y
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0


##### チューニングするパイプラインの設定
分類器としてロジスティック回帰を設定しておきます。<br>
その際、以下を認識しておきましょう。
- ここでは、学習器の略称として"est"、次元圧縮に"pca"を使います
- 本記号がグリッドサーチ時の設定にも使われます。

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

# Pipeline definition: standardize -> PCA -> logistic regression.
# Each step is registered under a short name ('scl', 'pca', 'est').
pipe_logistic = Pipeline(
    steps=[
        ('scl', StandardScaler()),
        ('pca', PCA(random_state=1)),
        ('est', LogisticRegression(solver='lbfgs', random_state=1)),
    ]
)

##### グリッドサーチ条件の設定方法
グリッドサーチによる探索条件の設定は、以下のルールに従います。
- 上記指定の文字列"pca"や"est"と、各アルゴリズムが持つパラメータ名の文字列を、アンダースコア2つ（`__`）で結合（例：`pca__n_components`）

下記設定例で、具体的に、上記ルールの設定方法を確認しましょう。
- 主成分圧縮：抽出主成分数を5,7,9の3通り
- ロジスティック回帰：正則化パラメータC（正則化の強さの逆数）を0.1, 1, 10, 100の4通り

In [5]:
# Parameter grid: each key is '<pipeline step name>__<parameter name>',
# each value is the list of candidate settings to try.
param_grid_logistic = dict(
    pca__n_components=[5, 7, 9],
    est__C=[0.1, 1.0, 10.0, 100.0],
)

##### グリッドサーチの実行（学習）
ハイパーパラメータの異なるモデルの評価はGridSearchCVで行うことができます。<br>
gsをfitした時点で、各パラメータのモデルの構築と評価を終え、ベストモデルを内部で保持するところまで全自動で動きます。

In [6]:
# Show the search space, then run the grid search:
# every parameter combination is scored by F1 using 3-fold cross-validation.
print('探索空間:%s' % param_grid_logistic)
gs = GridSearchCV(
    pipe_logistic,
    param_grid_logistic,
    scoring='f1',
    cv=3,
    return_train_score=False,
)
gs.fit(X, y)

探索空間:{'pca__n_components': [5, 7, 9], 'est__C': [0.1, 1.0, 10.0, 100.0]}


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scl',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None, random_state=1,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('est',
                                        LogisticRegression(C=1.0,
                                                           class_weight=None,
                                                           dual=False,
                                                           fit_in

ベストモデルで予測をしたい場合は以下です。

In [7]:
# Predict class probabilities for X with the fitted grid search (prediction uses the best model it holds)
gs.predict_proba(X)

array([[9.99999991e-01, 9.16949126e-09],
       [9.99858219e-01, 1.41781006e-04],
       [9.99999722e-01, 2.78322042e-07],
       ...,
       [9.94479111e-01, 5.52088883e-03],
       [1.00000000e+00, 2.00899844e-11],
       [7.89925108e-06, 9.99992101e-01]])

グリッドサーチの探索結果を閲覧したい場合は以下です。

In [8]:
# Tabulate the per-candidate cross-validation results and show the first rows
pd.DataFrame(gs.cv_results_).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_est__C,param_pca__n_components,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.022323,0.013587,0.004618,0.001824,0.1,5,"{'est__C': 0.1, 'pca__n_components': 5}",0.979424,0.97541,0.978903,0.977912,0.001782,5
1,0.012787,0.000774,0.003422,0.000127,0.1,7,"{'est__C': 0.1, 'pca__n_components': 7}",0.983471,0.97541,0.983051,0.980644,0.003705,2
2,0.013005,0.000161,0.006128,0.004042,0.1,9,"{'est__C': 0.1, 'pca__n_components': 9}",0.983471,0.971429,0.983193,0.979364,0.005613,4
3,0.015355,0.002602,0.003933,0.000858,1.0,5,"{'est__C': 1.0, 'pca__n_components': 5}",0.983333,0.983471,0.974359,0.980388,0.004263,3
4,0.014794,0.001019,0.003361,0.000151,1.0,7,"{'est__C': 1.0, 'pca__n_components': 7}",0.983333,0.971193,0.974359,0.976295,0.005142,6


以上でグリッドサーチの実行は終了です。

<b>[確認してみよう]</b>Scikit-learn公式ドキュメントで、GridSearchCVクラスのメソッド（predict, predict_proba）が、ベストモデルで行われているということを確認しよう。
- 以下のページで、Methods部分の説明文を読んでみよう
- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV