# ============================================================
# California Housing Price Prediction
# ============================================================
# Goal: Predict median house values in California districts and understand the
# structural, socioeconomic, and geographic drivers of those values.
# Key Tasks:
#   1. Explore and visualize the housing data.
#   2. Create a robust train/test split using stratified sampling on income.
#   3. Engineer meaningful features (e.g., rooms per household).
#   4. Build preprocessing pipelines for numeric + categorical variables.
#   5. Train and compare multiple models (Linear, Tree, Random Forest).
#   6. Fine-tune a Random Forest with GridSearchCV.
#   7. Evaluate the best model on a held-out test set and save it.
# Tools: Python, pandas, numpy, matplotlib, scikit-learn, joblib
# Dataset: Kaggle (California Housing dataset)

In [None]:
# ---------- IMPORT LIBRARIES ----------
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
from matplotlib.ticker import FuncFormatter
from pandas.plotting import scatter_matrix

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
# ---------- FEATURE LABELS WITH UNITS ----------
# Human-readable axis labels, keyed by DataFrame column name.
feature_labels = {
    # Raw dataset columns
    "longitude": "Longitude (degrees, west negative)",
    "latitude": "Latitude (degrees)",
    "housing_median_age": "Median Housing Age (years)",
    "total_rooms": "Total Rooms",
    "total_bedrooms": "Total Bedrooms",
    "population": "Population (persons)",
    "households": "Households (count)",
    "median_income": "Median Household Income (10,000s USD)",
    "median_house_value": "Median House Value (USD)",
    # Derived columns
    "rooms_per_household": "Rooms per Household",
    "bedrooms_ratio": "Bedrooms / Rooms Ratio",
    "population_per_household": "Population per Household",
    "income_cat": "Income Category",
}

# ---------- PROJECT SETUP ----------
# All artifact folders live directly under the current working directory.
project_dir = Path.cwd()
fig_dir, models_dir, outputs_dir = (
    project_dir / folder_name
    for folder_name in ("figures", "models", "outputs")
)

# Create the artifact folders on first run; existing folders are left alone.
for artifact_dir in (fig_dir, models_dir, outputs_dir):
    artifact_dir.mkdir(exist_ok=True)

def save_plot(filename, width=8, height=5, dpi=300):
    """
    Resize the current Matplotlib figure and write it to the figures/ folder.

    filename: file name (relative to ``fig_dir``) for the saved image.
    width, height: figure size in inches applied before saving.
    dpi: resolution passed to ``plt.savefig``.
    """
    current_fig = plt.gcf()
    current_fig.set_size_inches(width, height)
    target = fig_dir / filename
    # bbox_inches="tight" crops surrounding whitespace in the saved file.
    plt.savefig(target, dpi=dpi, bbox_inches="tight")

def save_output(df, filename):
    """
    Write a pandas DataFrame to the outputs/ directory as CSV.

    The index is not written. Prints the destination path and returns it.
    """
    destination = outputs_dir.joinpath(filename)
    df.to_csv(destination, index=False)
    print(f"Saved output: {destination}")
    return destination

# -------------------- PLOT STYLE --------------------
# seaborn documents `sns.set` as an alias of `set_theme` that may be removed,
# so call the preferred interface directly.
# NOTE(review): `set_theme` requires seaborn >= 0.11 - confirm the pinned version.
sns.set_theme(style="whitegrid")

In [None]:
# -------------------- LOAD DATA & BASIC INFO --------------------
# The CSV is expected next to the notebook/script (in project_dir).
data_path = project_dir / "california_housing_data.csv"
cali_df = pd.read_csv(data_path)

# Global Matplotlib font sizes used by every figure created afterwards.
plt.rc("font", size=14)
plt.rc("axes", labelsize=14, titlesize=14)
plt.rc("legend", fontsize=12)
plt.rc("xtick", labelsize=10)
plt.rc("ytick", labelsize=10)

print("First 5 rows:")
print(cali_df.head(), "\n")

print("Info:")
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() would append a stray "None" line to the output.
cali_df.info()
print()

print("Ocean proximity value counts:")
print(cali_df["ocean_proximity"].value_counts(), "\n")

print("Descriptive statistics:")
print(cali_df.describe(include="all"), "\n")

In [None]:
# -------------------- HISTOGRAMS OF NUMERICAL FEATURES --------------------
def thousands_formatter(x, pos):
    """
    Tick formatter: render ``x`` as whole thousands followed by "k".

    ``pos`` is the tick-position argument supplied by ``FuncFormatter``; it is
    not used. The quotient is truncated toward zero by ``int``.
    """
    whole_thousands = int(x / 1000)
    return str(whole_thousands) + "k"

# Columns shown in the 3x3 histogram grid (one panel per column).
num_cols_hist = [
    "longitude", "latitude", "housing_median_age",
    "total_rooms", "total_bedrooms", "population",
    "households", "median_income", "median_house_value"
]

# Count-like columns with long right tails; their x-axis is clipped below.
right_skew_cols = ["total_rooms", "total_bedrooms", "population", "households"]

fig, axes = plt.subplots(3, 3, figsize=(14, 9))
axes = axes.flatten()

for ax, col in zip(axes, num_cols_hist):
    values = cali_df[col]

    ax.hist(values, bins=50)
    ax.set_xlabel(feature_labels.get(col, col))
    ax.set_ylabel("Frequency")
    ax.set_title("")

    # The three special cases are mutually exclusive, so a single chain suffices.
    if col in right_skew_cols:
        # Zoom in on very skewed count variables (show up to 99th percentile)
        ax.set_xlim(values.min(), values.quantile(0.99))
    elif col == "median_house_value":
        # House value: thousands-formatted ticks over a fixed 0-500k range
        ax.xaxis.set_major_formatter(FuncFormatter(thousands_formatter))
        ax.set_xlim(0, 500000)
    elif col == "median_income":
        # Income: keep typical range
        ax.set_xlim(0, 16)

# Drop any grid panels that did not receive a histogram.
for spare_ax in axes[len(num_cols_hist):]:
    fig.delaxes(spare_ax)

fig.suptitle("Distribution of Numerical Features", y=1.02)
plt.tight_layout()
save_plot("hist_numerical_features.png", width=14, height=9, dpi=300)
plt.show()

In [None]:
# -------------------- TRAIN / TEST SPLIT (STRATIFIED) --------------------
# Bucket median income into five categories (edges 0, 1.5, 3, 4.5, 6, inf)
# so the split can preserve the income distribution in both subsets.
cali_df["income_cat"] = pd.cut(
    cali_df["median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5]
)

# 80/20 split, stratified on the income category, reproducible via the seed.
strat_train_set, strat_test_set = train_test_split(
    cali_df,
    test_size=0.2,
    stratify=cali_df["income_cat"],
    random_state=42
)

# The income category was only needed for stratification; remove it again.
strat_train_set = strat_train_set.drop(columns="income_cat")
strat_test_set = strat_test_set.drop(columns="income_cat")

# Features (X) and labels (y) for each subset.
X_train = strat_train_set.drop(columns="median_house_value")
X_test = strat_test_set.drop(columns="median_house_value")
y_train = strat_train_set["median_house_value"].copy()
y_test = strat_test_set["median_house_value"].copy()

# Independent copy of the training data for visualization / correlations.
cali_df_vis = strat_train_set.copy()

In [None]:
# -------------------- DATA VISUALIZATION --------------------
# Basic geographic scatter: one translucent point per training row.
locations_plot_options = {
    "kind": "scatter",
    "x": "longitude",
    "y": "latitude",
    "alpha": 0.2,
    "grid": True,
    "figsize": (8, 6),
    "title": "California Housing - Locations",
}
ax = cali_df_vis.plot(**locations_plot_options)

# Replace the raw column names on the axes with the labelled-with-units text.
ax.set_xlabel(feature_labels["longitude"])
ax.set_ylabel(feature_labels["latitude"])

save_plot("locations_scatter.png", width=8, height=6, dpi=300)
plt.show()

# Geographic scatter with population & price:
# marker area is population / 100, marker color is the median house value.
marker_sizes = cali_df_vis["population"] / 100
price_map_options = {
    "kind": "scatter",
    "x": "longitude",
    "y": "latitude",
    "grid": True,
    "s": marker_sizes,
    "c": "median_house_value",
    "cmap": "jet",
    "colorbar": True,
    "figsize": (10, 7),
    "alpha": 0.6,
    "title": "Locations Colored by Median House Value (Size ~ Population)",
}
ax = cali_df_vis.plot(**price_map_options)

ax.set_xlabel(feature_labels["longitude"])
ax.set_ylabel(feature_labels["latitude"])

save_plot("locations_population_price.png", width=10, height=7, dpi=300)
plt.show()

# Correlation matrix for selected numeric features
num_cols = [
    "longitude", "latitude", "housing_median_age",
    "total_rooms", "total_bedrooms",
    "population", "households",
    "median_income", "median_house_value"
]

# Pairwise correlations computed on the training copy only.
corr_matrix = cali_df_vis[num_cols].corr()

print("Correlation with target (median_house_value):")
print(corr_matrix["median_house_value"].sort_values(ascending=False), "\n")

plt.figure(figsize=(8, 6))
plt.title("Correlation Heatmap (Selected Numeric Features)")
# Pin the color scale to the full correlation range [-1, 1] so the diverging
# "coolwarm" map is centered on zero (neutral color == no correlation) instead
# of being stretched to whatever min/max happens to occur in the data.
im = plt.imshow(
    corr_matrix,
    cmap="coolwarm",
    interpolation="nearest",
    vmin=-1,
    vmax=1
)
# The "whitegrid" plot style enables axis grid lines; switch them off here so
# they are not drawn across the heatmap cells.
plt.grid(False)
plt.colorbar(im, fraction=0.046, pad=0.04)
plt.xticks(range(len(num_cols)), num_cols, rotation=45, ha="right")
plt.yticks(range(len(num_cols)), num_cols)
plt.tight_layout()
save_plot("correlation_heatmap_selected.png", width=8, height=6, dpi=300)
plt.show()

# -------------------- SCATTER MATRIX --------------------
# Target first, then the three features plotted against it.
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]

# Short axis labels so they fit in the small matrix panels.
scatter_labels = {
    "median_house_value": "Median Value (USD)",
    "median_income": "Median Income (10k USD)",
    "total_rooms": "Total Rooms",
    "housing_median_age": "Median Age (yrs)"
}

sm = scatter_matrix(
    cali_df_vis[attributes],
    figsize=(12, 8),
    alpha=0.15,
    diagonal="hist"
)

fig = plt.gcf()

# Smaller tick labels
for ax in sm.flatten():
    ax.tick_params(labelsize=8)

# Label bottom row x-axes
for ax, col in zip(sm[-1, :], attributes):
    ax.set_xlabel(scatter_labels.get(col, col), fontsize=10)

# Label left column y-axes
for ax, col in zip(sm[:, 0], attributes):
    ax.set_ylabel(scatter_labels.get(col, col), fontsize=10)

# Format house value axes in thousands
for i, row_attr in enumerate(attributes):
    for j, col_attr in enumerate(attributes):
        ax = sm[i, j]
        if col_attr == "median_house_value":
            ax.xaxis.set_major_formatter(FuncFormatter(thousands_formatter))
        # Skip the diagonal for the y-axis: that panel is a histogram, so its
        # y coordinates are bin counts, not house values. pandas' scatter_matrix
        # places the top-left panel's y ticks in count coordinates and attaches
        # fixed data-value labels to them; a FuncFormatter here would replace
        # those labels with counts rendered as "0k", "1k", ...
        # NOTE(review): relies on pandas' scatter_matrix tick handling - confirm
        # against the installed pandas version.
        if row_attr == "median_house_value" and i != j:
            ax.yaxis.set_major_formatter(FuncFormatter(thousands_formatter))

# Manual spacing instead of tight_layout to prevent overlap
fig.suptitle("Scatter Matrix - Key Features", y=0.96)
fig.subplots_adjust(
    left=0.08, right=0.98,
    bottom=0.08, top=0.90,
    hspace=0.35, wspace=0.25
)

save_plot("scatter_matrix_key_features.png", width=12, height=8, dpi=300)
plt.show()

# Median income vs median house value (very translucent points to show density).
income_price_options = {
    "kind": "scatter",
    "x": "median_income",
    "y": "median_house_value",
    "alpha": 0.1,
    "grid": True,
    "figsize": (8, 6),
    "title": "Median Income vs Median House Value",
}
ax = cali_df_vis.plot(**income_price_options)

ax.set_xlabel(feature_labels["median_income"])
ax.set_ylabel(feature_labels["median_house_value"])

save_plot("median_income_vs_price.png", width=8, height=6, dpi=300)
plt.show()

In [None]:
# -------------------- FEATURE ENGINEERING --------------------
# Ratio features added to the exploration copy only (X_train is untouched).
_households = cali_df_vis["households"]
_total_rooms = cali_df_vis["total_rooms"]
cali_df_vis["rooms_per_household"] = _total_rooms / _households
cali_df_vis["bedrooms_ratio"] = cali_df_vis["total_bedrooms"] / _total_rooms
cali_df_vis["population_per_household"] = cali_df_vis["population"] / _households

# Re-check correlations with the target now that the ratios exist.
num_cols_new = [
    *num_cols,
    "rooms_per_household", "bedrooms_ratio", "population_per_household",
]
corr_matrix_new = cali_df_vis[num_cols_new].corr()
print("Correlation with target after feature engineering:")
print(corr_matrix_new["median_house_value"].sort_values(ascending=False), "\n")

# -------------------- CUSTOM ATTRIBUTE ADDER --------------------
# Numeric columns are detected from X_train; the categorical column is fixed.
num_attribs = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_attribs = ["ocean_proximity"]

# Positions of the count columns inside the numeric block; the custom
# transformer uses these to index the plain arrays it receives.
rooms_ix, bedrooms_ix, population_ix, households_ix = (
    num_attribs.index(column)
    for column in ("total_rooms", "total_bedrooms", "population", "households")
)

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """
    Append ratio features to a 2-D numeric array.

    Always appends rooms-per-household and population-per-household; appends
    bedrooms-per-room as well when ``add_bedrooms_per_room`` is true. Column
    positions come from the module-level ``rooms_ix``, ``bedrooms_ix``,
    ``population_ix`` and ``households_ix``.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` with the derived ratio columns appended on the right."""
        rooms = X[:, rooms_ix]
        households = X[:, households_ix]

        # Output column order: rooms/household, population/household,
        # then (optionally) bedrooms/room.
        derived = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            derived.append(X[:, bedrooms_ix] / rooms)

        return np.c_[(X, *derived)]

# -------------------- PREPROCESSING PIPELINES --------------------
# Numeric branch: median-impute missing values, append the ratio features,
# then standardize every resulting column.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

# Categorical branch: one-hot encode, dropping the first category.
cat_encoder_step = OneHotEncoder(drop="first")

preprocessing = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", cat_encoder_step, cat_attribs),
])

# Fit on the training features only; the test features are just transformed.
X_train_prepared = preprocessing.fit_transform(X_train)
X_test_prepared = preprocessing.transform(X_test)

In [None]:
# -------------------- MODEL TRAINING & EVALUATION --------------------
def rmse(y_true, y_pred):
    """Return the root of ``mean_squared_error(y_true, y_pred)``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def _cv_rmse(model):
    """Per-fold RMSE of ``model`` from 10-fold CV on the prepared training data."""
    neg_scores = cross_val_score(
        model,
        X_train_prepared,
        y_train,
        scoring="neg_root_mean_squared_error",
        cv=10
    )
    # The scorer reports negated RMSE; flip the sign back.
    return -neg_scores


# Linear Regression: training-set error, then cross-validated error.
lin_reg = LinearRegression()
lin_reg.fit(X_train_prepared, y_train)
y_train_lin_pred = lin_reg.predict(X_train_prepared)
lin_train_rmse = rmse(y_train, y_train_lin_pred)
print(f"Linear Regression RMSE (train): {lin_train_rmse:.2f}")

lin_rmse_cv = _cv_rmse(lin_reg)
print(f"Linear Regression RMSE (CV mean): {lin_rmse_cv.mean():.2f}")

# Decision Tree: training-set error, then cross-validated error.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train_prepared, y_train)
tree_pred_train = tree_reg.predict(X_train_prepared)
tree_train_rmse = rmse(y_train, tree_pred_train)
print(f"Decision Tree RMSE (train): {tree_train_rmse:.2f} (likely overfitting)")

tree_rmse_cv = _cv_rmse(tree_reg)
print(f"Decision Tree RMSE (CV mean): {tree_rmse_cv.mean():.2f}")

# Random Forest: cross-validated error only (no separate training-set fit).
forest_reg = RandomForestRegressor(random_state=42)
forest_rmse_cv = _cv_rmse(forest_reg)
print(f"Random Forest RMSE (CV mean): {forest_rmse_cv.mean():.2f}")

# One row per model, in the order the models were evaluated.
results = [
    {"model": "LinearRegression", "cv_rmse": lin_rmse_cv.mean()},
    {"model": "DecisionTree", "cv_rmse": tree_rmse_cv.mean()},
    {"model": "RandomForest (baseline)", "cv_rmse": forest_rmse_cv.mean()},
]

results_df = pd.DataFrame(results)
print("\nCross-validated RMSE comparison:")
print(results_df, "\n")

save_output(results_df, "cv_rmse_summary.csv")

# Bar chart of the mean CV RMSE per model.
plt.figure(figsize=(6, 4))
sns.barplot(data=results_df, x="model", y="cv_rmse")
plt.title("Cross-Validated RMSE by Model (Lower is Better)")
plt.xlabel("Model")
plt.ylabel("RMSE (Median House Value, USD)")
plt.tight_layout()
save_plot("cv_rmse_comparison.png", width=6, height=4, dpi=300)
plt.show()

In [None]:
# -------------------- GRID SEARCH (RANDOM FOREST) --------------------
# Two sub-grids: 3 x 4 = 12 combinations with the default bootstrap setting,
# plus 2 x 3 = 6 combinations with bootstrapping switched off.
default_bootstrap_grid = {
    "n_estimators": [30, 50, 100],
    "max_features": [4, 6, 8, 10],
}
no_bootstrap_grid = {
    "bootstrap": [False],
    "n_estimators": [30, 50],
    "max_features": [4, 6, 8],
}
param_grid = [default_bootstrap_grid, no_bootstrap_grid]

# 5-fold CV on negated MSE, using every available core.
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
    return_train_score=True,
    n_jobs=-1
)
grid_search.fit(X_train_prepared, y_train)

print("Best parameters from grid search:")
print(grid_search.best_params_, "\n")

# Best estimator found by the search; used for all later steps.
best_model = grid_search.best_estimator_

# -------------------- FEATURE IMPORTANCE --------------------
feature_importances = best_model.feature_importances_

# Rebuild the column names of the prepared matrix by hand: numeric columns,
# then the ratio columns in the order CombinedAttributesAdder appends them,
# then the one-hot columns.
extra_attribs = ["rooms_per_household", "population_per_household", "bedrooms_per_room"]
cat_encoder = preprocessing.named_transformers_["cat"]
# The encoder is configured with drop="first", so the first category has no
# output column and is skipped here.
cat_one_hot_attribs = list(cat_encoder.categories_[0])[1:]

attributes_all = num_attribs + extra_attribs + cat_one_hot_attribs

# zip() stops at the shorter input, so a mismatch between the hand-built name
# list and the model's importance vector would silently mislabel or drop
# features. Fail loudly instead.
if len(attributes_all) != len(feature_importances):
    raise ValueError(
        f"Feature name count ({len(attributes_all)}) does not match "
        f"importance count ({len(feature_importances)}); check extra_attribs "
        "and the one-hot encoder settings."
    )

# (importance, name) pairs, most important first.
feat_importance_pairs = sorted(
    zip(feature_importances, attributes_all),
    reverse=True
)

print("Top features by importance:")
for importance, name in feat_importance_pairs[:10]:
    print(f"{name}: {importance:.4f}")

feat_importance_df = pd.DataFrame(
    feat_importance_pairs,
    columns=["importance", "feature"]
)
save_output(feat_importance_df, "feature_importances_random_forest.csv")

plt.figure(figsize=(10, 6))
top_feats = feat_importance_pairs[:15]
feat_values, feat_names = zip(*top_feats)
# Reverse so the most important feature is drawn at the top of the chart.
plt.barh(feat_names[::-1], feat_values[::-1])
plt.xlabel("Feature Importance")
plt.title("Random Forest Feature Importance (Top 15)")
plt.tight_layout()
save_plot("feature_importance_random_forest.png", width=10, height=6, dpi=300)
plt.show()

In [None]:
# -------------------- TEST SET EVALUATION --------------------
# Score the tuned model on the held-out test features.
final_predictions = best_model.predict(X_test_prepared)
final_r2 = r2_score(y_test, final_predictions)
final_rmse = rmse(y_test, final_predictions)

for summary_line in (
    "\n=== Final Model Summary (Random Forest) ===",
    f"Test RMSE: {final_rmse:.2f}",
    f"Test R²:   {final_r2:.3f}",
):
    print(summary_line)

# Single-row table of the headline test metrics.
metrics_table = {
    "model": ["RandomForest (tuned)"],
    "test_rmse": [final_rmse],
    "test_r2": [final_r2]
}
metrics_df = pd.DataFrame(metrics_table)
save_output(metrics_df, "test_metrics.csv")

# Row-aligned true vs predicted values for every test row.
prediction_columns = {
    "y_true": y_test.values,
    "y_pred": final_predictions
}
test_predictions_df = pd.DataFrame(prediction_columns)
save_output(test_predictions_df, "test_set_predictions.csv")

# Predicted vs True Plot
plt.figure(figsize=(8, 6))
plt.scatter(final_predictions, y_test, alpha=0.3)

# Dashed red y = x reference line spanning the range of the true values.
min_val = y_test.min()
max_val = y_test.max()
diagonal_ends = [min_val, max_val]
plt.plot(diagonal_ends, diagonal_ends, "r--", lw=2)

plt.title("Predicted vs True Median House Value (Test Set)")
plt.xlabel("Predicted Median House Value (USD)")
plt.ylabel("True Median House Value (USD)")
plt.tight_layout()
save_plot("predicted_vs_true.png", width=8, height=6, dpi=300)
plt.show()

# --- SAVE BEST MODEL ---
# Only the fitted estimator is written; the preprocessing object is not part
# of this file.
model_path = models_dir.joinpath("california_housing_best_model.pkl")
joblib.dump(best_model, model_path)
print(f"\nSaved best model to: {model_path}")

In [None]:
# ---------- DEMO PREDICTIONS ----------
# Spot-check: predictions next to the actual labels for the first test rows.
n_demo_rows = 5
demo_preds = best_model.predict(X_test_prepared[:n_demo_rows])
demo_actual = y_test.iloc[:n_demo_rows].values
print("\nDemo predictions (first 5 test rows):")
print("Predicted:", np.round(demo_preds, 2))
print("Actual:", np.round(demo_actual, 2))

# ---------- PIPELINE COMPLETE ----------
# Closing recap of the workflow, printed one line at a time.
completion_summary = (
    "Pipeline complete:",
    "- Data loading, cleaning, and exploratory analysis",
    "- Stratified train/test split based on income distribution",
    "- Feature engineering (rooms per household, ratios, population density)",
    "- Preprocessing pipelines (imputation, scaling, one-hot encoding)",
    "- Model training & comparison (Linear, Decision Tree, Random Forest)",
    "- Hyperparameter tuning with GridSearchCV (Random Forest)",
    "- Final model evaluation on held-out test set (RMSE, R²)",
    "- Feature importance analysis and model interpretability",
    "- Artifacts saved to /figures, /outputs, and /models",
)
for summary_line in completion_summary:
    print(summary_line)