In [7]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from scipy.stats import skew

# ============================
# 1. File locations
# ============================
# Every input and output file lives in the same competition directory.
base_dir = "/Users/sum/Desktop/house-prices-advanced-regression-techniques"
train_path, test_path, sample_path, output_path = (
    os.path.join(base_dir, file_name)
    for file_name in (
        "train.csv",
        "test.csv",
        "sample_submission.csv",
        "submission.csv",
    )
)

# ============================
# 2. Load the data
# ============================
train, test, submission = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path, sample_path)
)

# ============================
# 3. Target variable and feature engineering
# ============================
# The model is trained on log(SalePrice); the CV metric printed further down
# is reported in that log space. pop() both returns the column and removes it
# from `train`, so the target never ends up among the features.
y = np.log(train.pop('SalePrice'))

# Helper that derives the engineered features used by the model.
def create_features(df):
    """Return a copy of *df* with four engineered columns added.

    Added columns:
        TotalSF         TotalBsmtSF + 1stFlrSF + 2ndFlrSF
        Age             YrSold - YearBuilt
        RemodAge        YrSold - YearRemodAdd
        TotalBathrooms  FullBath + 0.5 * HalfBath
                        + BsmtFullBath + 0.5 * BsmtHalfBath

    A missing value in one of the summed area / bathroom components is
    counted as 0, so the derived total is still computed from the
    components that are present instead of becoming NaN as a whole.

    Args:
        df: DataFrame containing the raw columns listed above.

    Returns:
        A new DataFrame; the frame passed in is left unmodified.

    Raises:
        KeyError: if one of the required raw columns is absent.
    """
    # Work on a copy so the caller's frame is not changed behind its back.
    out = df.copy()

    # Total living area: basement + first floor + second floor.
    # sum(axis=1) skips NaN by default, i.e. a missing component counts as 0.
    area_cols = ['TotalBsmtSF', '1stFlrSF', '2ndFlrSF']
    out['TotalSF'] = out[area_cols].sum(axis=1)

    # Age of the house, and time since the last remodel, at the time of sale.
    out['Age'] = out['YrSold'] - out['YearBuilt']
    out['RemodAge'] = out['YrSold'] - out['YearRemodAdd']

    # Total bathrooms: full baths count 1, half baths count 0.5, both above
    # ground and in the basement.
    full_baths = out[['FullBath', 'BsmtFullBath']].sum(axis=1)
    half_baths = out[['HalfBath', 'BsmtHalfBath']].sum(axis=1)
    out['TotalBathrooms'] = full_baths + 0.5 * half_baths
    return out

# Add the engineered features to both the training and the test frame.
train = create_features(train)
test = create_features(test)

# ============================
# 4. Combine the data and preprocess it
# ============================
# train and test are stacked so that imputation, skew handling and one-hot
# encoding are applied identically to both; they are split apart again later.
# Id is not a predictive feature: keep it aside (in case it is needed later)
# and remove it from the feature frames.
train_ids = train.pop('Id')
test_ids = test.pop('Id')

# ignore_index=True gives the stacked frame a fresh 0..n-1 index. Without it
# the row labels of train and test overlap, and any label-based selection or
# alignment on all_data would silently match rows from both frames.
all_data = pd.concat([train, test], axis=0, ignore_index=True)

# 4-1. Impute missing values
# Numeric columns: fill with the column median.
num_cols = all_data.select_dtypes(include=[np.number]).columns
all_data[num_cols] = all_data[num_cols].fillna(all_data[num_cols].median())

# Categorical (object) columns: fill with the most frequent value.
cat_cols = all_data.select_dtypes(include=["object"]).columns
for col in cat_cols:
    modes = all_data[col].mode()
    # mode() is empty when every value in the column is missing; there is
    # nothing to impute with in that case, so the column is left as it is
    # instead of failing on modes[0].
    if not modes.empty:
        all_data[col] = all_data[col].fillna(modes.iloc[0])

# 4-2. Reduce skew: apply log1p to strongly skewed numeric features so that
# extreme values weigh less.
skewness = all_data[num_cols].apply(skew)
skewed_feats = skewness[skewness.abs() > 0.75].index
if len(skewed_feats) > 0:
    # log1p is only finite for values > -1 (it gives -inf at -1 and NaN
    # below). Differences such as Age / RemodAge can be negative, so a column
    # is transformed only when all of its values are inside that domain;
    # otherwise -inf/NaN would end up in the feature matrix.
    col_min = all_data[skewed_feats].min()
    skewed_feats = col_min[col_min > -1].index
if len(skewed_feats) > 0:
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

# 4-3. One-hot encode the categorical variables.
all_data = pd.get_dummies(all_data)

# ============================
# 5. Split back into training and test sets
# ============================
# The training rows were stacked first, so the first len(train) rows of
# all_data are the training set and everything after them is the test set.
X_train_processed = all_data.iloc[:len(train)].copy()
X_test_processed = all_data.iloc[len(train):].copy()

# ============================
# 6. Model training: Ridge regression
# ============================
model = Ridge(alpha=10, random_state=42)

# 5-fold cross-validation to estimate model quality. cross_val_score fits
# clones of `model`, so it does not matter that it runs before the final fit.
# The scorer returns negated MSE, hence the sign flip before the square root.
cv_scores = cross_val_score(
    model,
    X_train_processed,
    y,
    scoring="neg_mean_squared_error",
    cv=5,
)
cv_rmse = np.sqrt(np.negative(cv_scores))
print(f"Cross-validated RMSE (log space): {cv_rmse.mean():.5f}")

# Final fit on the complete training set; this is the model used to predict.
model.fit(X_train_processed, y)

# ============================
# 7. Predict the test set and write the submission file
# ============================
# The model was trained on log(SalePrice), so its output is exponentiated to
# get back to the price scale.
pred_log = model.predict(X_test_processed)
pred = np.exp(pred_log)

# Put the predictions into the sample-submission frame and write it out.
submission = submission.assign(SalePrice=pred)
submission.to_csv(output_path, index=False)
print(f"Submission file saved to: {output_path}")

# ============================
# 8. Open submission.csv automatically (macOS `open` command)
# ============================
# Kept as the last expression of the cell: os.system returns the exit status
# of the command.
os.system(f'open "{output_path}"')


Cross-validated RMSE (log space): 0.12655
Submission file saved to: /Users/sum/Desktop/house-prices-advanced-regression-techniques/submission.csv


0