<a href="https://colab.research.google.com/github/seliyos/Predicting-Real-Estate-Prices/blob/master/notebooks/final_house_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ============================================================
# 0) Imports
# ============================================================
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

%matplotlib inline


In [2]:
# ============================================================
# 1) Load data
# ============================================================

# Candidate locations for the training CSV, tried in the order listed.
possible_paths = ["train.csv", "./train.csv", "../data/train.csv", "data/train.csv"]

# First candidate that exists on disk, or None when none of them do.
train_path = next(
    (candidate for candidate in possible_paths if os.path.exists(candidate)),
    None,
)

# Stop with an actionable message instead of letting read_csv fail on None.
if train_path is None:
    raise FileNotFoundError("Could not find train.csv. Upload it in Colab Files (left panel) and try again.")

train_df = pd.read_csv(train_path)
print("Loaded:", train_path)
print(f"Dataset shape: {train_df.shape}")
train_df.head()


FileNotFoundError: Could not find train.csv. Upload it in Colab Files (left panel) and try again.

In [None]:
# ============================================================
# 2) Quick EDA
# ============================================================

# First ten column names, followed by summary statistics of the target.
print("\nColumns:", train_df.columns[:10].tolist(), "...")
print("\nTarget summary (SalePrice):")
print(train_df["SalePrice"].describe())

# Histogram of the target, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(train_df["SalePrice"], bins=50)
ax.set_xlabel("Sale Price")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Sale Prices")
plt.show()

# Per-column count of missing values, largest first; only the top 15 are shown.
missing_counts = train_df.isna().sum()
missing_counts = missing_counts.sort_values(ascending=False)
print("\nTop 15 columns with most missing values:")
print(missing_counts.head(15))


In [None]:
# ============================================================
# 3) Define features (X) and target (y)
# ============================================================

# Remove the "Id" column when it is present; a frame without it passes through unchanged.
train_df = train_df.drop(columns=["Id"], errors="ignore")

# Features are every remaining column except the target; the target is SalePrice.
X = train_df.drop(columns=["SalePrice"])
y = train_df["SalePrice"]

print("X shape:", X.shape)
print("y shape:", y.shape)


In [None]:
# ============================================================
# 4) Train/Test split (reproducible)
# ============================================================

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

n_train, n_test = len(X_train), len(X_test)
print("Train size:", n_train, "Test size:", n_test)


In [None]:
# ============================================================
# 5) Preprocessing pipeline (handles missing + categorical encoding)
# ============================================================

# Select columns by dtype *family* rather than by exact dtype name.
# ColumnTransformer drops every column that is not listed in a transformer
# (remainder="drop" is its default), so matching only "int64"/"float64"/"object"
# would silently discard columns stored as e.g. int32, category or the pandas
# string dtype. Taking "number" and its complement guarantees that every
# feature column is routed to exactly one branch.
numeric_features = X.select_dtypes(include="number").columns
categorical_features = X.select_dtypes(exclude="number").columns

# Numeric branch: fill missing values with the column median.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

# Non-numeric branch: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown="ignore" keeps categories that were not seen
# during fit from raising at transform time.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Apply each branch to its own column set and concatenate the results.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

print("Numeric features:", len(numeric_features))
print("Categorical features:", len(categorical_features))


In [None]:
# ============================================================
# Helper: metrics function
# ============================================================

def regression_metrics(y_true, y_pred):
    """Compute RMSE, MAE and R^2 for a set of regression predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(rmse, mae, r2)`` in that order.
    """
    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
    # so passing it raises TypeError on recent releases.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return rmse, mae, r2


In [None]:
# ============================================================
# 5.5) Dumb baseline (extremely simple): predict median SalePrice
# ============================================================

# Constant predictor: every test row receives the median of the training targets.
dumb_pred = np.full(len(y_test), np.median(y_train))

# Score the constant prediction on the test split; RMSE is derived from the MSE.
dumb_mae = mean_absolute_error(y_test, dumb_pred)
dumb_r2 = r2_score(y_test, dumb_pred)
dumb_mse = mean_squared_error(y_test, dumb_pred)
dumb_rmse = np.sqrt(dumb_mse)

print("Dumb baseline (Median predictor) metrics on TEST:")
print("RMSE:", round(dumb_rmse, 2))
print("MAE :", round(dumb_mae, 2))
print("R^2 :", round(dumb_r2, 4))


In [None]:
# ============================================================
# 6) Baseline model (REQUIRED): Linear Regression
# ============================================================

# Preprocessing and the linear regressor are chained into a single estimator.
baseline_model = Pipeline([
    ("preprocess", preprocess),
    ("model", LinearRegression()),
])

# Fit on the training split, then predict the held-out test split.
baseline_pred = baseline_model.fit(X_train, y_train).predict(X_test)

# Test-set metrics; RMSE is taken as the square root of the MSE.
baseline_mae = mean_absolute_error(y_test, baseline_pred)
baseline_r2 = r2_score(y_test, baseline_pred)
mse = mean_squared_error(y_test, baseline_pred)
baseline_rmse = np.sqrt(mse)

# Report
print("Baseline (Linear Regression) metrics on TEST:")
print("RMSE:", round(baseline_rmse, 2))
print("MAE :", round(baseline_mae, 2))
print("R^2 :", round(baseline_r2, 4))


In [None]:
# ============================================================
# 7) Oracle model
# ============================================================

# Random forest (500 trees, fixed seed) behind the shared preprocessing step.
oracle_forest = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1)
oracle_model = Pipeline([
    ("preprocess", preprocess),
    ("model", oracle_forest),
])

# Fit on the training split and predict that SAME split: the resulting scores
# are an optimistic reference, not an estimate of generalisation.
oracle_pred_train = oracle_model.fit(X_train, y_train).predict(X_train)

# Train-set metrics; RMSE is taken as the square root of the MSE.
oracle_mae = mean_absolute_error(y_train, oracle_pred_train)
oracle_r2 = r2_score(y_train, oracle_pred_train)
oracle_mse = mean_squared_error(y_train, oracle_pred_train)
oracle_rmse = np.sqrt(oracle_mse)

# Report
print("Oracle (Cheating) metrics on TRAIN (upper bound reference):")
print("RMSE:", round(oracle_rmse, 2))
print("MAE :", round(oracle_mae, 2))
print("R^2 :", round(oracle_r2, 4))


In [None]:
# ============================================================
# 8) Advanced model (main method): Random Forest Regressor
# ============================================================

# Main model: a 500-tree random forest (fixed seed) behind the shared preprocessing step.
rf_regressor = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1)
rf_model = Pipeline([
    ("preprocess", preprocess),
    ("model", rf_regressor),
])

# Fit on the training split, then predict the held-out test split.
rf_pred = rf_model.fit(X_train, y_train).predict(X_test)

# Test-set metrics; RMSE is taken as the square root of the MSE.
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_r2 = r2_score(y_test, rf_pred)
rf_mse = mean_squared_error(y_test, rf_pred)
rf_rmse = np.sqrt(rf_mse)

print("Advanced Model (Random Forest) metrics on TEST:")
print("RMSE:", round(rf_rmse, 2))
print("MAE :", round(rf_mae, 2))
print("R^2 :", round(rf_r2, 4))


In [None]:
# ============================================================
# 9) Model comparison table
# ============================================================

# One record per model, in the order they were evaluated above.
# The label states which split each score was computed on.
results = pd.DataFrame(
    [
        ("Dumb baseline: Median predictor (TEST)", dumb_rmse, dumb_mae, dumb_r2),
        ("Baseline: Linear Regression (TEST)", baseline_rmse, baseline_mae, baseline_r2),
        ("Oracle: Random Forest (TRAIN, cheating)", oracle_rmse, oracle_mae, oracle_r2),
        ("Advanced: Random Forest (TEST)", rf_rmse, rf_mae, rf_r2),
    ],
    columns=["Model", "RMSE", "MAE", "R2"],
)

results


In [None]:
# ============================================================
# 10) VISUALIZATION & ERROR ANALYSIS
# ============================================================

In [None]:
# 10.1 Feature Correlation with SalePrice

# Pairwise correlations between the numeric columns only.
corr = train_df.corr(numeric_only=True)

# The 15 columns with the largest absolute correlation to SalePrice
# (SalePrice itself is part of this ranking).
top_corr = corr["SalePrice"].abs().sort_values(ascending=False).index[:15]

# Heatmap restricted to those columns on both axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr.loc[top_corr, top_corr],
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    ax=ax,
)
ax.set_title("Top Feature Correlations with SalePrice")
plt.show()


In [None]:
# 10.2 Predicted vs Actual (Random Forest)

fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, rf_pred, alpha=0.5)

# Dashed red diagonal spanning the range of actual prices: points on it are perfect predictions.
ax.plot(
    [y_test.min(), y_test.max()],
    [y_test.min(), y_test.max()],
    'r--',
)

ax.set_xlabel("Actual SalePrice")
ax.set_ylabel("Predicted SalePrice")
ax.set_title("Random Forest: Predicted vs Actual")
plt.show()


In [None]:
# 10.4 Residuals vs Predictions (Random Forest)

# Residuals are defined here because no earlier cell creates them; without
# this line the cell raises NameError on a fresh kernel (Restart & Run All).
# Sign convention matches the y-axis label below: actual minus predicted.
residuals = y_test - rf_pred

plt.figure(figsize=(6,5))
plt.scatter(rf_pred, residuals, alpha=0.5)
# Zero line: points above it are under-predictions, points below are over-predictions.
plt.axhline(0, color="red", linestyle="--")
plt.xlabel("Predicted SalePrice")
plt.ylabel("Residual (Actual - Predicted)")
plt.title("Residuals vs Predictions (Random Forest)")
plt.show()


In [None]:
# 10.5 Error analysis: Where does the model struggle? (RMSE by Neighborhood)

if "Neighborhood" in X_test.columns:
    # One row per test example: its neighborhood, the true price and the
    # Random Forest prediction. `.values` discards the original indices so the
    # three inputs are aligned by position.
    err_df = pd.DataFrame({
        "Neighborhood": X_test["Neighborhood"].values,
        "y_true": y_test.values,
        "y_pred": rf_pred
    })
    err_df["residual"] = err_df["y_true"] - err_df["y_pred"]

    # Aggregate with vectorised named aggregations instead of
    # groupby(...).apply(lambda ...): apply with a frame-level lambda triggers
    # a deprecation warning on pandas 2.2+ (it operates on the grouping
    # column) and coerces the integer group size to float.
    by_nbhd = (
        err_df.assign(
            sq_err=err_df["residual"] ** 2,
            abs_err=err_df["residual"].abs(),
        )
        .groupby("Neighborhood")
        .agg(
            count=("residual", "size"),
            mse=("sq_err", "mean"),
            mae=("abs_err", "mean"),
        )
        # RMSE is the square root of the per-group mean squared error.
        .assign(rmse=lambda stats: np.sqrt(stats["mse"]))
        .loc[:, ["count", "rmse", "mae"]]
        .sort_values("rmse", ascending=False)
    )

    print("Worst 10 Neighborhoods by RMSE (Random Forest):")
    print(by_nbhd.head(10))

else:
    print("Neighborhood column not found in X_test. Skipping neighborhood error analysis.")
