<a href="https://colab.research.google.com/github/shu0518/hw2_m11423036/blob/main/%E5%AF%A6%E9%A9%97%E4%B8%80/KNN_regressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import time
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score

# -----------------------------
# 1. Load data
# -----------------------------
train_path = "https://raw.githubusercontent.com/shu0518/hw2_m11423036/refs/heads/main/adult/adult.data"
test_path  = "https://raw.githubusercontent.com/shu0518/hw2_m11423036/refs/heads/main/adult/adult.test"

# The files are read with header=None, so column names are supplied here.
column_names = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "income"
]


def _strip_strings(frame):
    """Return a new DataFrame with surrounding whitespace stripped from string cells.

    Cells that are not ``str`` instances (numbers, NaN, ...) are passed
    through unchanged.  The work is done column by column with
    ``Series.map`` because ``DataFrame.applymap`` is deprecated since
    pandas 2.1 and emits a FutureWarning on every call.
    """
    def _strip(value):
        return value.strip() if isinstance(value, str) else value

    return frame.apply(lambda column: column.map(_strip))


train_df = pd.read_csv(train_path, header=None, names=column_names, sep=",", engine="python")
test_df  = pd.read_csv(test_path,  header=None, names=column_names, sep=",", engine="python")

# Strip whitespace around string values (e.g. the blank that follows each comma).
train_df = _strip_strings(train_df)
test_df  = _strip_strings(test_df)

# -----------------------------
# 2. Coerce numeric columns
# -----------------------------
numeric_cols_all = [
    "age",
    "fnlwgt",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
]

# Anything that cannot be parsed as a number becomes NaN (errors="coerce").
for df in (train_df, test_df):
    for col in numeric_cols_all:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# Keep only the rows whose target (hours_per_week) and age are both valid numbers.
train_df = train_df[train_df[["hours_per_week", "age"]].notna().all(axis=1)]
test_df  = test_df[test_df[["hours_per_week", "age"]].notna().all(axis=1)]

# -----------------------------
# 3. Features (X) / target (y)
# -----------------------------
target_col = "hours_per_week"

# Targets: the regression label as float.
y_train = train_df[target_col].astype(float)
y_test = test_df[target_col].astype(float)

# Features: everything except the target and the "income" column,
# with the "?" placeholder turned into NaN so the imputers can fill it.
X_train = train_df.drop(columns=[target_col, "income"]).replace("?", np.nan)
X_test = test_df.drop(columns=[target_col, "income"]).replace("?", np.nan)

numeric_features = ["age", "fnlwgt", "education_num", "capital_gain", "capital_loss"]
# Every remaining feature column is treated as categorical.
categorical_features = list(
    filter(lambda column: column not in numeric_features, X_train.columns)
)

# -----------------------------
# 4. Preprocessing
# -----------------------------
# Numeric columns: fill missing values with the median, then standardize.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each group of columns through its own transformer.
preprocess = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# -----------------------------
# 5. Define models
# -----------------------------
# Mapping of display name -> estimator; each entry is trained and scored below.
models = dict(
    KNN_regressor=KNeighborsRegressor(weights="distance", n_neighbors=10),
)

# One metrics dict per evaluated model is appended here.
results = list()

# -----------------------------
# 6. Train + predict + evaluate
# -----------------------------
for name, model in models.items():
    print("\n" + "============================")
    print(f"訓練模型：{name}")
    print("============================")

    # Chain the shared preprocessing step with the current estimator.
    clf = Pipeline(steps=[("preprocess", preprocess), ("model", model)])

    # Wall-clock time spent fitting on the training split.
    t0 = time.perf_counter()
    clf.fit(X_train, y_train)
    train_time = time.perf_counter() - t0

    # Wall-clock time spent predicting the test split.
    t0 = time.perf_counter()
    y_pred = clf.predict(X_test)
    pred_time = time.perf_counter() - t0

    # Evaluation: RMSE is the square root of the MSE; MAPE is reported in percent.
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mape = mean_absolute_percentage_error(y_test, y_pred) * 100
    r2 = r2_score(y_test, y_pred)

    for line in (
        f"RMSE : {rmse:.4f}",
        f"MAPE : {mape:.2f}%",
        f"R²   : {r2:.4f}",
        f"訓練時間: {train_time:.4f} 秒",
        f"預測時間: {pred_time:.4f} 秒",
    ):
        print(line)

    # Record this model's metrics for later comparison.
    results.append(
        {
            "model": name,
            "RMSE": rmse,
            "MAPE(%)": mape,
            "R2": r2,
            "train_time(s)": train_time,
            "predict_time(s)": pred_time,
        }
    )


  train_df = train_df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  test_df  = test_df.applymap(lambda x: x.strip() if isinstance(x, str) else x)



訓練模型：KNN_regressor
RMSE : 11.2969
MAPE : 32.15%
R²   : 0.1805
訓練時間: 0.1931 秒
預測時間: 40.9884 秒
