In [40]:
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
## 梯度提升樹算法 https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
## 梯度提升用法 補充資料: https://sklearn.apachecn.org/docs/master/12.html 
from sklearn.ensemble import GradientBoostingRegressor

In [41]:
# 讀取波士頓房價資料集
boston = datasets.load_boston()

In [42]:
## 查看數據集
boston

{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
         4.9800e+00],
        [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
         9.1400e+00],
        [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
         4.0300e+00],
        ...,
        [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         5.6400e+00],
        [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
         6.4800e+00],
        [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         7.8800e+00]]),
 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
        18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
        15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
        13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
        21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
        35.4, 24.7, 3

In [43]:
# 切分訓練集/測試集
x_train, x_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size=0.25, random_state=42)

# 建立模型
clf = GradientBoostingRegressor(random_state=7)

# 先看看使用預設參數得到的結果，約為 8.379 的 MSE
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(y_pred)
print(metrics.mean_squared_error(y_test, y_pred))

[23.04052211 30.80125144 16.44079235 24.28386434 17.51775094 22.20424803
 18.37679613 13.87826852 20.69567546 21.05630329 20.65800177 18.25866334
  7.12849106 21.70412526 20.42411344 25.68150134 19.65025197  9.04186004
 45.92622827 16.24352807 24.16750847 25.58242866 13.55637357 21.63216117
 15.24383795 16.02780139 21.97039232 14.1308893  19.7882457  21.4943401
 19.96374277 23.55638271 23.44639924 19.94756007 14.59751632 17.07294658
 33.48430381 19.44918017 21.13751246 23.61682922 18.32202324 30.25253963
 45.28348352 20.90449728 22.53997442 15.13571919 16.28600727 23.72253077
 18.01993116 27.80166399 20.29367355 35.77815626 16.5197479  25.490704
 47.51880799 21.53764501 15.99636471 31.79864176 21.85748794 18.29080884
 22.7379009  34.0542103  30.7125856  19.8255971  24.76787729 18.05612794
 14.58785612 23.67111194 28.79028621 15.10698901 21.29168096 26.46787175
 10.38756407 20.76810568 22.63374113  5.83334008 20.52105443 45.32080046
 12.12777804 11.99035843 21.61277948 11.8664967  17.97

In [44]:
# 設定要訓練的超參數組合
n_estimators = [100, 200, 300, 400, 500]
max_depth = [1, 3, 5, 7, 9]
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)

## 建立搜尋物件，放入模型及參數組合字典 (n_jobs=-1 會使用全部 cpu 平行運算)
## GridSearchCV:https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
## scoring選擇 https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
grid_search = GridSearchCV(clf, param_grid, scoring="neg_mean_squared_error", n_jobs=-1, verbose=1)
# 開始搜尋最佳參數
grid_result = grid_search.fit(x_train, y_train)
# 預設會跑 5-fold cross-validadtion，總共 9 種參數組合，總共要 train 27 次模型

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:    5.9s finished


In [45]:
# 印出最佳結果與最佳參數
print("Best Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best Accuracy: -10.843075 using {'max_depth': 3, 'n_estimators': 100}


In [46]:
grid_result.best_params_

{'max_depth': 3, 'n_estimators': 100}

In [49]:
# 使用最佳參數重新建立模型
clf_bestparam = GradientBoostingRegressor(max_depth=grid_result.best_params_['max_depth'],
                                           n_estimators=grid_result.best_params_['n_estimators'])

# 訓練模型
clf_bestparam.fit(x_train, y_train)

# 預測測試集
y_pred = clf_bestparam.predict(x_test)
y_pred

array([23.04052211, 30.80125144, 16.44079235, 24.28386434, 17.51775094,
       22.20424803, 18.37679613, 13.87826852, 20.69567546, 21.05630329,
       20.65800177, 18.17504716,  8.02424687, 21.70412526, 20.42411344,
       25.68150134, 19.65025197,  9.04186004, 45.84663215, 16.24352807,
       24.16750847, 25.58242866, 13.55637357, 21.63216117, 15.24383795,
       16.02780139, 21.97039232, 14.1308893 , 19.7882457 , 21.4943401 ,
       19.96374277, 23.55638271, 23.44639924, 19.94756007, 14.59751632,
       17.07294658, 33.48430381, 19.44918017, 21.13751246, 23.5933276 ,
       18.38875615, 30.25253963, 45.28348352, 20.90449728, 22.53997442,
       15.13571919, 16.28600727, 23.60866936, 18.01993116, 27.80166399,
       20.29367355, 35.77815626, 16.5197479 , 25.490704  , 47.51880799,
       21.53764501, 15.99636471, 31.79864176, 21.85748794, 18.29080884,
       22.7379009 , 34.20017994, 30.7125856 , 19.8255971 , 24.76787729,
       18.05612794, 14.58785612, 23.67111194, 28.79028621, 15.10

In [50]:
# 調整參數後約可降至 8.30 的 MSE
print(metrics.mean_squared_error(y_test, y_pred))

8.92635103667199
