In [18]:

# 1. Imports
import numpy as np
import pandas as pd
import joblib

# 2. Load the trained model
# The artifact is read from the current working directory. Later cells pass
# the feature frame straight to `model.predict`.
# NOTE(review): presumably a fitted scikit-learn pipeline (Ridge + preprocessing),
# judging by the file name — confirm against the training notebook.
# NOTE(review): joblib artifacts are pickle-based; only load files you trust.
model = joblib.load("ridge_house_prices.pkl")


In [19]:
# =========================
# 3. Load test data
# =========================
# Read the raw test split once; `test` itself is left untouched afterwards.
test = pd.read_csv("data/test.csv")

# The Id column is not a feature: keep it aside for the submission file ...
test_ids = test["Id"]

# ... and build the feature frame from every other column.
X_test = test.drop(columns=["Id"])


In [20]:
# =========================
# 4. DOMAIN-AWARE MISSING VALUE HANDLING
# (must EXACTLY match training)
# =========================

# Categorical columns where NaN means the feature is absent -> "None"
none_cols = [
    "Alley", "BsmtQual", "BsmtCond", "BsmtExposure",
    "BsmtFinType1", "BsmtFinType2", "FireplaceQu",
    "GarageType", "GarageFinish", "GarageQual", "GarageCond",
    "PoolQC", "Fence", "MiscFeature"
]

for col in none_cols:
    X_test[col] = X_test[col].fillna("None")


# Numeric columns where NaN means the feature is absent -> 0
# BsmtFullBath / BsmtHalfBath are included because the TotalBathrooms feature
# built in the feature-engineering cell adds them directly: a single NaN in
# either column would turn the whole engineered value into NaN.
# NOTE(review): filling them is a no-op wherever the columns have no missing
# values, so it should not change the training-side transform — confirm that
# the training split indeed has no NaN in these two columns.
zero_cols = [
    "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
    "TotalBsmtSF", "GarageCars", "GarageArea", "MasVnrArea",
    "BsmtFullBath", "BsmtHalfBath"
]

for col in zero_cols:
    X_test[col] = X_test[col].fillna(0)


# LotFrontage -> median of the same Neighborhood
# NOTE(review): the medians are computed from X_test itself, not from the
# training data, and a neighborhood whose LotFrontage is entirely missing
# stays NaN after this step — verify this matches what training did.
X_test["LotFrontage"] = X_test.groupby("Neighborhood")["LotFrontage"] \
                              .transform(lambda x: x.fillna(x.median()))

# GarageYrBlt -> fall back to the year the house was built
X_test["GarageYrBlt"] = X_test["GarageYrBlt"].fillna(X_test["YearBuilt"])


In [21]:
# =========================
# 5. FEATURE ENGINEERING (same as training)
# =========================
# Every feature below is derived from the raw (not yet log-transformed)
# columns, so this cell has to run before the log-transform cell.

# Total floor area: basement + first floor + second floor.
basement_sf = X_test["TotalBsmtSF"]
X_test["TotalSF"] = basement_sf + X_test["1stFlrSF"] + X_test["2ndFlrSF"]

# Bathroom count, with half baths weighted 0.5 (above grade, then basement).
above_grade_baths = X_test["FullBath"] + 0.5 * X_test["HalfBath"]
X_test["TotalBathrooms"] = (
    above_grade_baths
    + X_test["BsmtFullBath"]
    + 0.5 * X_test["BsmtHalfBath"]
)

# Age of the house at the time of sale.
X_test["HouseAge"] = X_test["YrSold"].sub(X_test["YearBuilt"])

# 1 when the remodel year differs from the construction year, else 0.
X_test["Remodeled"] = X_test["YearRemodAdd"].ne(X_test["YearBuilt"]).astype(int)

# 0/1 presence flags: 1 when the source column is strictly positive.
presence_flags = {
    "HasGarage": "GarageArea",
    "HasBasement": "TotalBsmtSF",
    "HasFireplace": "Fireplaces",
}
for flag_name, source_col in presence_flags.items():
    X_test[flag_name] = X_test[source_col].gt(0).astype(int)


In [22]:
# =========================
# 6. LOG TRANSFORM SKEWED FEATURES
# (same as training)
# =========================
# NOTE: this overwrites the columns in place, so running the cell twice
# applies log1p twice. Re-run from the data-loading cell if in doubt.

skewed_features = [
    "LotArea", "GrLivArea", "TotalBsmtSF",
    "1stFlrSF", "2ndFlrSF"
]

# log1p (= log(1 + x)) is applied column by column.
X_test[skewed_features] = X_test[skewed_features].apply(np.log1p)


In [23]:
# =========================
# 7. Predict (log scale)
# =========================
# The model's raw output is treated as a log-scale price.
# NOTE(review): presumably the target was trained as log1p(SalePrice) — confirm.
log_preds = model.predict(X_test)

# =========================
# 8. Exponentiate predictions
# =========================
# expm1 (= exp(x) - 1) is the inverse of log1p: back to the price scale.
final_preds = np.expm1(log_preds)


In [24]:
# =========================
# 9. Create submission file
# =========================

# Refuse to write a submission with NaN / inf prices: without this check the
# file would be written and the success message printed regardless.
bad_pred_count = int((~np.isfinite(np.asarray(final_preds, dtype=float))).sum())
if bad_pred_count:
    raise ValueError(
        f"{bad_pred_count} prediction(s) are NaN or infinite; "
        "submission.csv was not written"
    )

# One row per test Id, paired positionally with its predicted price.
submission = pd.DataFrame({
    "Id": test_ids,
    "SalePrice": final_preds
})

# index=False keeps the DataFrame index out of the CSV.
submission.to_csv("submission.csv", index=False)

print("submission.csv created successfully!")


submission.csv created successfully!
