In [55]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score

In [56]:
#匯入資料
boston = pd.read_csv('boston_house_prices.csv')

In [57]:
boston

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [58]:
#檢查資料是否有空值
print(boston.isnull().sum())

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64


In [59]:
X = boston.drop('MEDV', axis=1).values
y = boston['MEDV'].values

In [60]:
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(boston), columns=boston.columns)
X_scaled

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.000000,0.18,0.067815,0.0,0.314815,0.577505,0.641607,0.269203,0.000000,0.208015,0.287234,1.000000,0.089680,0.422222
1,0.000236,0.00,0.242302,0.0,0.172840,0.547998,0.782698,0.348962,0.043478,0.104962,0.553191,1.000000,0.204470,0.368889
2,0.000236,0.00,0.242302,0.0,0.172840,0.694386,0.599382,0.348962,0.043478,0.104962,0.553191,0.989737,0.063466,0.660000
3,0.000293,0.00,0.063050,0.0,0.150206,0.658555,0.441813,0.448545,0.086957,0.066794,0.648936,0.994276,0.033389,0.631111
4,0.000705,0.00,0.063050,0.0,0.150206,0.687105,0.528321,0.448545,0.086957,0.066794,0.648936,1.000000,0.099338,0.693333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.000633,0.00,0.420455,0.0,0.386831,0.580954,0.681771,0.122671,0.000000,0.164122,0.893617,0.987619,0.219095,0.386667
502,0.000438,0.00,0.420455,0.0,0.386831,0.490324,0.760041,0.105293,0.000000,0.164122,0.893617,1.000000,0.202815,0.346667
503,0.000612,0.00,0.420455,0.0,0.386831,0.654340,0.907312,0.094381,0.000000,0.164122,0.893617,1.000000,0.107892,0.420000
504,0.001161,0.00,0.420455,0.0,0.386831,0.619467,0.889804,0.114514,0.000000,0.164122,0.893617,0.991301,0.131071,0.377778


In [61]:
#資料切割
X_train, X_test, y_train, y_test = train_test_split (X_scaled, y, test_size=0.20, random_state = 4)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(404, 14)
(102, 14)
(404,)
(102,)


In [62]:
xgb = XGBRegressor()
xgb.fit(X_train, y_train)

In [63]:
# xgb = XGBRegressor(
#     n_estimators=100,
#     reg_lambda=1,
#     gamma=0,
#     max_depth=3
# )
# xgb.fit(X_train, y_train)

In [64]:
#進行K-FOLD
kf = KFold(n_splits=5, shuffle=True, random_state=42)

fold_rmse_scores = []
fold_mape_scores = []
fold_r2_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    xgb.fit(X_train, y_train)
    
    y_pred = xgb.predict(X_test)
    
    mape = np.mean(np.abs((y_test - y_pred) / y_test))
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    
    fold_mape_scores.append(mape)
    fold_rmse_scores.append(rmse)
    fold_r2_scores.append(r2)
    
    print(f'Fold MAPE: {mape}, RMSE: {rmse}, R2: {r2}')

Fold MAPE: 0.10536624699449401, RMSE: 2.6285417184029902, R2: 0.9057837838492537
Fold MAPE: 0.10474927719938894, RMSE: 2.9846397745145574, R2: 0.8865588384043797
Fold MAPE: 0.11910151896475836, RMSE: 3.524328708581476, R2: 0.8634744838656743
Fold MAPE: 0.11016201794054921, RMSE: 3.3435833178143564, R2: 0.8929646083333921
Fold MAPE: 0.10076554512601095, RMSE: 2.9433444944610128, R2: 0.8795200958464624


In [65]:
average_rmse = np.mean(fold_rmse_scores)
average_mape = np.mean(fold_mape_scores)
average_r2 = np.mean(fold_r2_scores)

print(f'\nAverage Average MAPE: {average_mape}, RMSE: {average_rmse}, Average R2: {average_r2}')


Average RMSE: 3.084887602754878, Average MAPE: 0.1080289212450403, Average R2: 0.8856603620598325
