<a href="https://colab.research.google.com/github/shu0518/hw2_m11423036/blob/main/%E5%AF%A6%E9%A9%97%E4%B8%80/XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor

# --- 1. Data definition and loading ---
# Column names for the Adult data files, which are read without a header row.
COLUMNS = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'
]
# Regression target predicted by the model.
TARGET_COLUMN = 'hours-per-week'

# Both splits are downloaded from the course repository on GitHub.
TRAIN_URL = 'https://raw.githubusercontent.com/shu0518/hw2_m11423036/refs/heads/main/adult/adult.data'
TEST_URL = 'https://raw.githubusercontent.com/shu0518/hw2_m11423036/refs/heads/main/adult/adult.test'

# Parser options shared by both files: fields are separated by a comma followed
# by optional whitespace, and neither file carries a header row.
CSV_OPTIONS = dict(sep=r',\s*', engine='python', header=None, names=COLUMNS,
                   skipinitialspace=True)

print("--- 載入 Adult 資料集 ---")
try:
    # Training split
    df_train = pd.read_csv(TRAIN_URL, **CSV_OPTIONS)
    # Test split: skip the first line of the file
    df_test = pd.read_csv(TEST_URL, skiprows=1, **CSV_OPTIONS)
except OSError as err:
    # Reading from a URL fails with urllib's URLError/HTTPError rather than
    # FileNotFoundError; all of them are OSError subclasses, so this handler
    # covers network, HTTP and local-path failures alike.
    print(f"❌ 錯誤：無法下載或讀取資料檔案，請確認網路連線與資料網址是否正確。({err})")
    # Stop with a non-zero status so callers can tell the load failed.
    raise SystemExit(1) from err

# Drop a trailing noise row from the test split if one was parsed as data.
if '|1x3 Cross validator' in df_test.iloc[-1].values:
    df_test = df_test.iloc[:-1].copy()

print("資料載入成功！")

# --- 2. Split features and target ---
# The target itself and the 'income' column are both excluded from the predictors.
non_feature_columns = [TARGET_COLUMN, 'income']

y_train_raw = df_train[TARGET_COLUMN]
X_train_raw = df_train.drop(columns=non_feature_columns)

y_test = df_test[TARGET_COLUMN]
X_test_raw = df_test.drop(columns=non_feature_columns)

# --- 3. Data cleaning and missing-value handling ---
print("--- 數據清洗與缺失值處理 ---")

# Missing entries are encoded as '?' (with or without a leading space);
# turn both spellings into NaN in one pass.
MISSING_MARKERS = ['?', ' ?']
X_train_raw = X_train_raw.replace(MISSING_MARKERS, np.nan)
X_test_raw = X_test_raw.replace(MISSING_MARKERS, np.nan)

# Training set: drop every row that contains a missing value and keep the
# target aligned with the surviving rows by index label.
X_train = X_train_raw.dropna()
y_train = y_train_raw.loc[X_train.index]
print(f"訓練集樣本數: {len(X_train)}")

# Feature types (derived from the cleaned training set).
# Every column is either numeric or treated as categorical, so no column can
# fall between the two lists because of its exact storage dtype
# (e.g. int32 vs int64, or a string dtype instead of object).
numerical_features = X_train.select_dtypes(include='number').columns.tolist()
categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()

# Test set: impute with statistics learned on the training set instead of
# dropping rows, so every test sample is kept.
# Categorical columns use the training mode, numeric columns the training median.
fill_values = {col: X_train[col].mode().iloc[0] for col in categorical_features}
fill_values.update({col: X_train[col].median() for col in numerical_features})
X_test = X_test_raw.fillna(fill_values)
print(f"測試集樣本數: {len(X_test)}")

# --- 4. Feature encoding and scaling (ColumnTransformer) ---

# Numeric features are standardized.
numeric_step = ('num', StandardScaler(), numerical_features)
# Categorical features are one-hot encoded; categories that were not seen
# while fitting are ignored at transform time instead of raising.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

# remainder='passthrough' forwards any column that appears in neither feature
# list without transforming it.
# NOTE(review): fnlwgt is not dropped upstream, so if it is parsed as an
# integer column it lands in numerical_features and is standardized rather
# than passed through — confirm that is intended.
preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder='passthrough',
)

# Fit on the training data only, then apply the fitted transform to the test data.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
# --- 5. XGBoost model training and evaluation ---

# Initialise the XGBoost regressor.
xgb_model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,  # number of trees
    random_state=42
)

# Train the model. Intervals are measured with time.perf_counter(), a
# monotonic high-resolution clock meant for timing; time.time() is wall-clock
# time and can jump if the system clock is adjusted.
print("--- XGBoost 模型開始訓練 ---")
start_time = time.perf_counter()
xgb_model.fit(X_train_processed, y_train)
train_end_time = time.perf_counter()
training_time = train_end_time - start_time
print(f"訓練時間: {training_time:.4f} 秒")

# Predict on the test set. The timer is restarted after the print above so
# that console output is not counted as prediction time.
predict_start_time = time.perf_counter()
y_pred = xgb_model.predict(X_test_processed)
prediction_time = time.perf_counter() - predict_start_time

# Total compute time is exactly training + prediction.
total_time = training_time + prediction_time
print(f"預測時間: {prediction_time:.4f} 秒")
print(f"總計算時間: {total_time:.4f} 秒")


# 7. Performance evaluation helper
def evaluate_model(y_true, y_pred):
    """Compute RMSE, R^2 and MAPE for a set of regression predictions.

    Args:
        y_true: Ground-truth target values (array-like).
        y_pred: Predicted values, in the same order as ``y_true``.

    Returns:
        A dict with the keys ``'RMSE'``, ``'R^2'`` and ``'MAPE'``; MAPE is
        expressed in percent.
    """
    # Work on plain NumPy arrays so every metric compares values by position.
    # With pandas inputs, `y_true - y_pred` would otherwise align on index
    # labels and could yield NaN for differently indexed Series, while the
    # scikit-learn metrics below compare positionally. Lists work as well.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # RMSE is computed manually as the square root of the MSE instead of
    # relying on the `squared=False` argument.
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)

    # R^2 (coefficient of determination)
    r2 = r2_score(y_true, y_pred)

    # MAPE (mean absolute percentage error); zero targets are replaced by
    # 1e-8 in the denominator to avoid division by zero.
    safe_denominator = np.where(y_true == 0, 1e-8, y_true)
    mape = np.mean(np.abs((y_true - y_pred) / safe_denominator)) * 100

    return {'RMSE': rmse, 'R^2': r2, 'MAPE': mape}

# Score the test-set predictions and print the report in one write.
metrics = evaluate_model(y_test, y_pred)

report_lines = [
    "\n--- XGBoost 預測績效 ---",
    f"MAPE: {metrics['MAPE']:.4f}%",
    f"RMSE: {metrics['RMSE']:.4f}",
    f"R²: {metrics['R^2']:.4f}",
]
print("\n".join(report_lines))


--- 載入 Adult 資料集 ---
資料載入成功！
--- 數據清洗與缺失值處理 ---
訓練集樣本數: 30162
測試集樣本數: 16281
--- XGBoost 模型開始訓練 ---
訓練時間: 1.9141 秒
預測時間: 0.1596 秒
總計算時間: 2.0738 秒

--- XGBoost 預測績效 ---
MAPE: 30.6772%
RMSE: 10.8548
R^2: 0.2434
