# 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [1]:
%matplotlib inline
from sklearn import datasets, metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
plt.style.use('ggplot')

In [3]:
digits = datasets.load_digits()
# 切分訓練集/測試集
x_train, x_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.25, random_state=4)


In [4]:
def get_best_model_and_accuracy(model, params, X, y):
    from sklearn.model_selection import GridSearchCV
    grid = GridSearchCV(model, params, cv=5, error_score=0, n_jobs=-1)
    grid.fit(X, y)
    print(f"Best accuracy: {grid.best_score_}")
    print(f"Best params: {grid.best_params_}")
    print(f"Average time to fit (s): {grid.cv_results_['mean_fit_time'].mean():.3f}")
    print(f"Average time to score (s): {grid.cv_results_['mean_score_time'].mean():.3f}")

In [9]:
from sklearn.pipeline import Pipeline
import time

clf = GradientBoostingClassifier()

model = Pipeline([
    ('clf', clf)
])
params = {
    'clf__n_estimators': [100,200,500,800,1000],
    'clf__max_depth': [5,10,20,50]
}
time_start = time.time()
get_best_model_and_accuracy(model, params, digits.data, digits.target)
print(f"Time elapsed = {time.time() - time_start} (sec)")



Best accuracy: 0.9053978853644964
Best params: {'clf__max_depth': 5, 'clf__n_estimators': 200}
Average time to fit (s): 12.424
Average time to score (s): 0.013
Time elapsed = 322.7537338733673 (sec)


In [10]:
# 建立模型
clf = GradientBoostingClassifier(n_estimators=200,max_depth=5)

# 訓練模型
clf.fit(x_train, y_train)

# 預測測試集
y_pred = clf.predict(x_test)

In [11]:
import pandas as pd
pd.crosstab(y_test, y_pred, rownames=['label'], colnames=['predict'])

predict,0,1,2,3,4,5,6,7,8,9
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,46,1,0,0,0,0,0,0,1,0
1,0,37,1,1,0,0,0,0,0,0
2,0,0,47,1,0,0,0,0,0,0
3,0,0,0,40,0,0,1,0,1,1
4,0,0,0,0,41,1,0,0,0,0
5,0,1,0,0,0,48,0,0,0,0
6,0,0,0,1,1,0,36,0,1,0
7,0,0,0,1,0,0,0,48,0,0
8,0,1,0,0,0,1,0,0,41,0
9,0,0,0,0,0,1,0,0,1,48


In [12]:
acc = metrics.accuracy_score(y_test, y_pred)
print("Acuuracy: ", acc)

Acuuracy:  0.96
