## [作業重點]
了解如何使用 Sklearn 中的 hyper-parameter search 找出最佳的超參數

### 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [100]:
from sklearn import datasets, metrics
digits = datasets.load_digits()

In [101]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import cross_validate

In [102]:
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.20)

In [103]:
lgb_train = lgb.Dataset(X_train, y_train)
#lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)
#lgb_val要用lgb.cv進行設定才會有cross validation的效果，lgb_val只是切下固定大小的training set作為validation set

In [104]:
#這邊的設定是for mulituclass，尚未進行optimization的參數
params_lightGB = {
    'task': 'train',
    'objective':'multiclass',
    'boosting': 'gbdt',
    'min_data_in_leaf':40,
    'metric': 'multi_logloss',
    'metric_freq':50,
    'max_depth':10,
    'num_leaves': 70,
    'learning_rate': 0.01,
    'num_class':10,
    'verbose':None
}

In [105]:
#設定cross validation的參數，部分的參數follow 上面的params_lightGB參數
cv_results = lgb.cv(
    params_lightGB, train_set=lgb_train, num_boost_round=10000, nfold=5,
    shuffle=True, stratified=True, verbose_eval=None, early_stopping_rounds=100)

# verbose_eval設定為沒幾個evaluation會出現一次數值，沒必要使用
# classification using stratified, if regression set False

In [106]:
nround = np.argmin(cv_results['multi_logloss-mean'])

In [107]:
#求出上面這些參數能夠得到最佳解的round是多少？
nround

1255

In [108]:
#開始真正的訓練
gbm = lgb.train(params_lightGB, lgb_train, num_boost_round=nround)

In [109]:
#np.argmax( )参数返回的是沿轴axis最大值的索引值
#refer to https://blog.csdn.net/qq1483661204/article/details/78959293

y_pred = gbm.predict(X_test) #這個是機率值
y_pred_1 = [np.argmax(line) for line in y_pred] 
#共有360次猜測，每次猜測共有10個類別(0~9)，所以有10個機率值，從中選出最大機率值所對應的索引值，就是gbm所預測的值

test_acc_score = metrics.accuracy_score(y_test, y_pred_1) #拿實際值與gbm所預測的值做比較
print("val_acc_score:", test_acc_score)

test_acc = metrics.classification_report(y_test, y_pred_1)
print(test_acc)

val_acc_score: 0.9777777777777777
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        35
           1       0.93      0.97      0.95        39
           2       1.00      1.00      1.00        35
           3       0.97      0.97      0.97        35
           4       1.00      0.95      0.97        38
           5       1.00      1.00      1.00        39
           6       1.00      1.00      1.00        37
           7       0.95      1.00      0.97        37
           8       1.00      0.91      0.96        35
           9       0.94      0.97      0.95        30

    accuracy                           0.98       360
   macro avg       0.98      0.98      0.98       360
weighted avg       0.98      0.98      0.98       360



In [110]:
#RandomizedSearchCV引數說明，clf1設定訓練的學習器
#param_dist字典型別，放入引數搜尋範圍
#scoring = 'neg_log_loss'，精度評價方式設定為“neg_log_loss“
#n_iter=300，訓練300次，數值越大，獲得的引數精度越大，但是搜尋時間越長
#n_jobs = -1，使用所有的CPU進行訓練，預設為1，使用1個CPU
from sklearn.model_selection import RandomizedSearchCV

In [111]:
hw = datasets.load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    hw.data, hw.target, test_size=0.20)
param_grid_dist = {
        'min_data_in_leaf':range(5,20,5),
        'max_depth':range(5,20,5),
        'learning_rate':np.linspace(0.1,0.5,num=4),
        'num_leaves':range(30,50,5)
        }
clf = lgb.LGBMClassifier()
grid = RandomizedSearchCV(
    clf,param_grid_dist,cv=5,scoring='neg_log_loss',n_iter=150,n_jobs = -1)

search = grid.fit(X_train, y_train)
search.best_params_



{'num_leaves': 35,
 'min_data_in_leaf': 15,
 'max_depth': 10,
 'learning_rate': 0.23333333333333334}

In [117]:
#這邊的設定是for mulituclass
params_lightGB = {
    'task': 'train',
    'objective':'multiclass',
    'boosting': 'gbdt',
    'min_data_in_leaf':search.best_params_["min_data_in_leaf"],
    'metric': 'multi_logloss',
    'metric_freq':50,
    'max_depth':search.best_params_["max_depth"],
    'num_leaves':search.best_params_["num_leaves"],
    'learning_rate':0.1,
    'num_class':10,
    'verbose':None
}

In [118]:
lgb_train = lgb.Dataset(X_train, y_train)
#設定cross validation的參數，部分的參數follow 上面的params_lightGB參數
cv_results = lgb.cv(
    params_lightGB, train_set=lgb_train, num_boost_round=10000, nfold=5,
    shuffle=True, stratified=True, verbose_eval=None, early_stopping_rounds=100)

# verbose_eval設定為沒幾個evaluation會出現一次數值，沒必要使用
# classification using stratified, if regression set False

In [119]:
nround = np.argmin(cv_results['multi_logloss-mean'])
print(nround)

126


In [120]:
gbm = lgb.train(params_lightGB, lgb_train, num_boost_round=nround)

In [121]:
#np.argmax( )参数返回的是沿轴axis最大值的索引值
#refer to https://blog.csdn.net/qq1483661204/article/details/78959293

y_pred = gbm.predict(X_test) #這個是機率值
y_pred_1 = [np.argmax(line) for line in y_pred] 
#共有360次猜測，每次猜測共有10個類別(0~9)，所以有10個機率值，從中選出最大機率值所對應的索引值，就是gbm所預測的值

test_acc_score = metrics.accuracy_score(y_test, y_pred_1) #拿實際值與gbm所預測的值做比較
print("val_acc_score:", test_acc_score)

test_acc = metrics.classification_report(y_test, y_pred_1)
print(test_acc)

val_acc_score: 0.9638888888888889
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        33
           1       0.95      0.98      0.96        41
           2       0.97      0.97      0.97        40
           3       1.00      1.00      1.00        34
           4       0.92      0.97      0.94        35
           5       0.93      0.97      0.95        29
           6       1.00      0.94      0.97        35
           7       0.95      0.97      0.96        36
           8       0.98      0.89      0.93        46
           9       0.97      0.97      0.97        31

    accuracy                           0.96       360
   macro avg       0.96      0.97      0.96       360
weighted avg       0.96      0.96      0.96       360

