In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_absolute_percentage_error as MAPE

## 解題步驟：

1. 讀取 x_train.npy, y_train.npy, x_test.npy, y_test.npy
2. 將 training dataset 再切分為 training set, validation set (圖一)
3. 先以上課的知識調整出一個不會 over-fitting 太多的決策樹模型
4. 以 validation set 作為調整參數的基準，陸續調整其他參數 (請同學測試看看 validation 要佔多少比例，後續的調整會比較客觀)
5. 將最終調整結果與一開始的決策樹做比較，誤差是否有降低
6. 同學若也懂其他模型的知識也可以試試看一樣的做法比較看看

#### 圖一 (Train, Validation and Test)
<img src="./train_val_test.png" style="zoom:30%;" />

#### 圖二 (是我前後調整的結果)
<img src="./report.png" style="zoom:30%;" />

In [2]:
# Read data
X_train = np.load('x_train.npy')
x_test = np.load('x_test.npy')
Y_train = np.load('y_train.npy')
y_test = np.load('y_test.npy')

## 請同學先依照上課的知識，調整一個沒有 overfitting 的決策樹，請以 mape 作為參考誤差指標，較容易看出關係

In [3]:
triallist = [0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.01]
for num in triallist:
    x_train, x_val, y_train, y_val = train_test_split(X_train, Y_train, test_size=num)
    print('test_size:',num)
    print('Before tuning')
    reg = DecisionTreeRegressor().fit(x_train, y_train)
    y_pred_train = reg.predict(x_train)
    print('train:', MAPE(y_pred_train, y_train))
    y_pred = reg.predict(x_val)
    before_val = MAPE(y_pred, y_val)
    print('val:', before_val)
    y_pred_test = reg.predict(x_test)
    before_test = MAPE(y_pred_test, y_test)
    print('test:', before_test)
    print('----------------------------')
    print('After tuning')
    reg_tune = DecisionTreeRegressor(max_depth=5, min_samples_split=3, min_samples_leaf=2, max_features=0.8, min_impurity_decrease=0.5, ccp_alpha=1.0, random_state=42).fit(x_train, y_train)
    y_pred_train = reg_tune.predict(x_train)
    print('train:', MAPE(y_pred_train, y_train))
    y_pred = reg_tune.predict(x_val)
    tune_val = MAPE(y_pred, y_val)
    print('val:', tune_val)
    y_pred_test = reg_tune.predict(x_test)
    tune_test = MAPE(y_pred_test, y_test)
    print('test:', tune_test)
    print('變動幅度衡量: ', (before_test-tune_test)/(before_val-tune_val))
    print('============================')

test_size: 0.5
Before tuning
train: 0.0
val: 0.16611151411831768
test: 0.1680821741494072
----------------------------
After tuning
train: 0.10095119237548746
val: 0.14515996369878895
test: 0.15501782844667686
變動幅度衡量:  0.6235503073105848
test_size: 0.4
Before tuning
train: 0.0
val: 0.14433650152954408
test: 0.16546807794053275
----------------------------
After tuning
train: 0.10062583670008882
val: 0.12938607279881958
test: 0.13896177043059474
變動幅度衡量:  1.7729463139384847
test_size: 0.3
Before tuning
train: 0.0
val: 0.15335225500144978
test: 0.14719260024809944
----------------------------
After tuning
train: 0.10820634499598887
val: 0.1434810950625671
test: 0.13753635079227636
變動幅度衡量:  0.9782284468704566
test_size: 0.2
Before tuning
train: 0.0
val: 0.14711411482111325
test: 0.1644773269223586
----------------------------
After tuning
train: 0.10835996557581787
val: 0.13082307795408604
test: 0.1404385860954004
變動幅度衡量:  1.475580776298668
test_size: 0.1
Before tuning
train: 0.0
val: 0.12

In [4]:
#由上面實驗發現當test_size為0.1時，val的變動幅度與test的變動幅度為0.97最為相近，
#代表調參時val最能反映test的測試結果

## 調整 決策樹參數
### 請同學從 criterion 的設定中，判斷這組資料集裡的 outlier 多嗎？

In [5]:
#MSE較MAE的MAPE高一些，表示資料內應有離群值，但我也不知道您的outlier要多少才算多~
x_train, x_val, y_train, y_val = train_test_split(X_train, Y_train, test_size=0.1, random_state=42)
reg = DecisionTreeRegressor().fit(x_train, y_train)
y_pred = reg.predict(x_val)
before_val = MAPE(y_pred, y_val)
print('criterion:MSE', before_val)
reg = DecisionTreeRegressor(criterion='mae').fit(x_train, y_train)
y_pred = reg.predict(x_val)
before_val = MAPE(y_pred, y_val)
print('criterion:MAE', before_val)

criterion:MSE 0.16627169158136268
criterion:MAE 0.13967992009617888
