# 4.5 使用堆疊法處理迴歸問題

In [40]:
# --- 第 1 部分 ---
# 載入函式庫與資料集
from sklearn.datasets import load_diabetes
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import KFold
from sklearn import metrics
import numpy as np
# Load the diabetes regression dataset that ships with scikit-learn.
diabetes = load_diabetes()

# Fixed (unshuffled) split: the first 400 samples are used for training,
# everything after them is held out for testing.
train_x = diabetes.data[:400]
train_y = diabetes.target[:400]
test_x = diabetes.data[400:]
test_y = diabetes.target[400:]


In [41]:
# --- Part 2 ---
# Create the base learners and the meta-learner.

# Three different base regressors; the tree's random_state is fixed so
# repeated runs build the same tree.
knn = KNeighborsRegressor(n_neighbors=5)
dtr = DecisionTreeRegressor(max_depth=4, random_state=123456)
ridge = Ridge()

# Keep the base learners in a list; their position in this list is the index
# used for their predictions in the meta-data arrays.
base_learners = [knn, dtr, ridge]

# The meta-learner that combines the base predictions is a linear regression.
meta_learner = LinearRegression()

In [42]:
# --- Part 3 ---
# Generate the meta-data used to train the meta-learner.

# Buffers for the meta-features and their labels. While filling, each ROW
# holds one base learner's predictions; the array is transposed at the end.
meta_data = np.zeros((len(base_learners), len(train_x)))  # shape (3, 400)
meta_targets = np.zeros(len(train_x))                     # shape (400,)

# 5-fold cross-validation: every training sample is predicted by models that
# were fitted on the other four folds, so the meta-features are out-of-fold.
KF = KFold(n_splits=5)
index = 0  # write offset into the meta-data buffers
for train_indices, test_indices in KF.split(train_x):
    # train_indices covers 4/5 of the samples, test_indices the remaining 1/5.
    fold_end = index + len(test_indices)
    fold_fit_x = train_x[train_indices]
    fold_fit_y = train_y[train_indices]
    fold_holdout_x = train_x[test_indices]

    for i, learner in enumerate(base_learners):
        learner.fit(fold_fit_x, fold_fit_y)
        meta_data[i, index:fold_end] = learner.predict(fold_holdout_x)

    # Store the labels of the held-out fold at the same offsets as its
    # predictions so features and targets stay aligned.
    meta_targets[index:fold_end] = train_y[test_indices]
    index = fold_end

# Meta-features now live in meta_data, their labels in meta_targets.

# Transpose into the layout the meta-learner expects: one row per sample,
# one column per base learner's prediction.
meta_data = meta_data.transpose()  # shape (400, 3)

In [43]:
# --- Part 4 ---
# Generate the meta-learner's test data and score each base learner alone.

# One row per base learner while filling; transposed afterwards.
test_meta_data = np.zeros((len(base_learners), len(test_x)))  # shape (3, 42)
base_errors = []  # mean squared error of each base learner on the test set
base_r2 = []      # R^2 score of each base learner on the test set
for i, learner in enumerate(base_learners):
    # Refit on the full training set (the earlier cross-validation left each
    # learner fitted on a single fold split), then predict the test set.
    predictions = learner.fit(train_x, train_y).predict(test_x)
    test_meta_data[i] = predictions

    base_errors.append(metrics.mean_squared_error(test_y, predictions))
    base_r2.append(metrics.r2_score(test_y, predictions))

# One row per test sample, one column per base learner.
test_meta_data = test_meta_data.transpose()  # shape (42, 3)


In [44]:
# --- Part 5 ---
# Train the meta-learner on the cross-validated meta-data, then let it combine
# the base learners' test-set predictions into the ensemble prediction.
ensemble_predictions = meta_learner.fit(meta_data, meta_targets).predict(test_meta_data)

# Ensemble metrics on the test set, using the same measures as the base learners.
r2 = metrics.r2_score(test_y, ensemble_predictions)
err = metrics.mean_squared_error(test_y, ensemble_predictions)


In [45]:
# --- Part 6 ---
# Print the results: one line per base learner, then the ensemble.
print('ERROR  R2  Name')
print('-' * 20)
# The three sequences are index-aligned, so walk them in lockstep.
for e, r, b in zip(base_errors, base_r2, base_learners):
    print(f'{e:.1f} {r:.2f} {b.__class__.__name__}')
print(f'{err:.1f} {r2:.2f} Ensemble')


ERROR  R2  Name
--------------------
2697.8 0.51 KNeighborsRegressor
3142.5 0.43 DecisionTreeRegressor
2564.8 0.54 Ridge
2066.6 0.63 Ensemble
