In [5]:
import pandas as pd
import numpy as np
import time

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import HalvingRandomSearchCV as HRSCV
from xgboost import XGBRegressor

from scipy.stats import norm

from project_module import regression_report
from project_module.feature_selection import SelectKBestByCoefficient

In [6]:
""" Your code here: 讀取資料 """
# load data
x_train = np.load('./x_train.npy')
y_train = np.load('./y_train.npy')
x_test = np.load('./x_test.npy')
y_test = np.load('./y_test.npy')

print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(1095, 89) (1095,) (365, 89) (365,)


## 解題步驟：

1. 讀取 x_train.npy, y_train.npy, x_test.npy, y_test.npy
2. 先以上課的知識或 default hyperparameter 調整出一個不會 over-fitting 太多的 XGBoost 模型
3. 以該組超參數為基準，搜尋附近的參數(可以用自己偏好的搜尋策略)
4. 將最終調整結果與一開始的模型做比較，誤差是否有降低
5. 請比較 Random Forest, XGBoost(有時間的同學可以增加 GBDT, Adaboost) 的超參數搜尋時間與誤差(記得要控制 n_estimators 等會影響到時間的參數，使其叫)

In [8]:
""" Your code here: 初始化一個 XGBoost 模型並判斷其誤差 """

def get_XGBoost(params: dict) -> XGBRegressor:
    XGB = XGBRegressor(**params)
    return XGB

In [9]:
XGB = XGBRegressor()
XGB.fit(x_train, y_train)

pred = XGB.predict(x_test)
regression_report(y_test, pred)

mse = 1577792384.2319
mae = 18800.7493
rmse = 39721.4348
mape = 0.1029


## [XGBoost 官方文檔](https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn)

## 計算 XGBoost 超參數搜尋時間

In [16]:
""" Your code here: 搜尋超參數，並計算搜尋時間 """
XGB = XGBRegressor(n_jobs = -1)

# set searching hyperparameters
search_params = {
    'n_estimators': np.arange(70, 200),
    'max_depth': np.arange(4, 8),
    'eta': np.abs(norm(loc = 0.3, scale = 0.5).rvs(size=40)),
    'objective': ['reg:squarederror', 'reg:squaredlogerror', 'reg:pseudohubererror'],
    'min_child_weight': np.abs(norm(loc = 1, scale = 0.5).rvs(size=40)),
    'subsample': np.arange(0.7, 1, 0.01),
    'lambda': np.arange(0.01, 1, 0.05),
    'alpha': np.arange(0.01, 1, 0.05)
}

In [19]:
# set Successive Halving algorithm
SH_search = HRSCV(
    estimator = XGB, param_distributions = search_params, n_candidates = 200, 
    factor = 2, resource = 'n_samples', max_resources='auto', min_resources='smallest', 
    aggressive_elimination=False, cv=3, scoring='neg_mean_absolute_percentage_error', refit=True,
    return_train_score=True, random_state=None, n_jobs=-1, verbose=0
)



In [20]:
start = time.time()
SH_search.fit(x_train, y_train)
end = time.time()


OverflowError: Python int too large to convert to C long

In [21]:
print(f'HRSCV ended in {end - start:.4f}s\n')

print(f'Best score is {(-1) * SH_search.best_score_:.4f}')
print(f'Best params is {SH_search.best_params_}')

HRSCV ended in -582.1792s



AttributeError: 'HalvingRandomSearchCV' object has no attribute 'best_score_'

## 計算 XGBoost 單輪訓練時間

In [11]:
""" Your code here: 使用搜尋到的參數分析在 testing data 上的誤差表現，並計算時間 """
# 這組參數是我搜尋到比較好的一組參數
best_params = {'subsample': 0.71, 'objective': 'reg:squarederror', 'n_estimators': 135, 'min_child_weight': 1.106778500318303, 'max_depth': 4, 'lambda': 0.66, 'eta': 0.06554882531678949, 'alpha': 0.81}
best_params['n_jobs'] = -1
XGB = get_XGBoost(best_params)

start = time.time()
XGB.fit(x_train, y_train)
end = time.time()
print(f'XGBoost was trained in {end - start:.4f}s\n')

pred = XGB.predict(x_test)
regression_report(y_test, pred)

XGBoost was trained in 0.3172s

mse = 1010869446.2126
mae = 15842.4891
rmse = 31794.1731
mape = 0.0904


## 計算 Random Forest 超參數搜尋時間

In [12]:
""" Your code here: 搜尋 Random Forest 超參數，並計算搜尋時間 """
RF = RandomForestRegressor(n_jobs = -1)

# set searching hyperparameters
search_params = {
    'n_estimators': np.arange(70, 200),
    'max_depth': np.arange(7, 20),
    'ccp_alpha': np.abs(norm(loc = 1.5, scale = 0.5).rvs(size=20)),
    'criterion': ['mae', 'mse'],
    'min_samples_split': np.arange(2, 10),
    'max_features': ['sqrt', 'log2'],
    'min_impurity_decrease': np.abs(norm(loc = 1, scale = 0.5).rvs(size=40)),
    'max_samples': np.arange(0.7, 0.99, 0.01)
}

# set Successive Halving algorithm
SH_search = HRSCV(
    estimator = RF, param_distributions = search_params, n_candidates = 200, 
    factor = 2, resource = 'n_samples', max_resources='auto', min_resources='smallest', 
    aggressive_elimination=False, cv=3, scoring='neg_mean_absolute_percentage_error', refit=True,
    return_train_score=True, random_state=None, n_jobs = -1, verbose=0
)

start = time.time()
SH_search.fit(x_train, y_train)
end = time.time()
print(f'HRSCV ended in {end - start:.4f}s\n')

print(f'Best score is {(-1) * SH_search.best_score_:.4f}')
print(f'Best params is {SH_search.best_params_}')

HRSCV ended in 136.2125s

Best score is 0.1087
Best params is {'n_estimators': 151, 'min_samples_split': 2, 'min_impurity_decrease': 1.217730660943888, 'max_samples': 0.9700000000000002, 'max_features': 'sqrt', 'max_depth': 13, 'criterion': 'mse', 'ccp_alpha': 0.7034844342131232}


## 計算 Random Forest 單輪訓練時間

In [14]:
""" Your code here: 使用搜尋到的參數分析在 testing data 上的誤差表現，並計算時間 """
best_params = {'n_estimators': 107, 'min_samples_split': 3, 'min_impurity_decrease': 1.5204750662278061, 'max_samples': 0.81, 'max_features': 'sqrt', 'max_depth': 11, 'criterion': 'mse', 'ccp_alpha': 1.205496871022525}
best_params['n_jobs'] = 4

RF = RandomForestRegressor(**best_params)
start = time.time()
RF.fit(x_train, y_train)
end = time.time()
print(f'Random Forest was trained in {end - start:.4f}s\n')

pred = RF.predict(x_test)
regression_report(y_test, pred, verbose = 1)

Random Forest was trained in 0.2551s

mse = 1261008143.8174
mae = 18715.7155
rmse = 35510.6765
mape = 0.1061
