In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# 1. Load dataset
# Read the housing CSV straight from its GitHub raw URL into a DataFrame,
# then print the first five rows as a quick sanity check.
url = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv"
df = pd.read_csv(filepath_or_buffer=url)
print(df.head(n=5))

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  


In [2]:

# 2. Features & target
# Everything except the target column is a feature.
X = df.drop("median_house_value", axis=1)
y = df["median_house_value"]

# Identify categorical and numerical features by dtype:
# object columns are treated as categorical, everything else as numerical.
categorical = X.select_dtypes(include=["object"]).columns
numerical = X.select_dtypes(exclude=["object"]).columns

# 3. Preprocessing
# Standardise the numerical columns and one-hot encode the categorical ones.
# handle_unknown="ignore" keeps prediction from raising on a category that
# was not present in the training split.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical)
    ]
)

# 4. Models
# Three gradient-boosting regressors with the same number of trees and
# learning rate so the comparison is like-for-like.
models = {
    "XGBoost": XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=6),
    "LightGBM": LGBMRegressor(n_estimators=500, learning_rate=0.05, max_depth=-1),
    "CatBoost": CatBoostRegressor(n_estimators=500, learning_rate=0.05, depth=6, verbose=0)
}

# 5. Train/test split (80/20, fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Train and evaluate
# Collect each model's metrics and fitted pipeline by name. Without this,
# every iteration overwrites rmse / r2 / pipeline and only the last model in
# `models` is still reachable once the loop ends.
results = {}
fitted_pipelines = {}
for name, model in models.items():
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)
    results[name] = {"rmse": float(rmse), "r2": float(r2)}
    fitted_pipelines[name] = pipeline
    print(f"{name} -> RMSE: {rmse:.2f}, R²: {r2:.3f}")

# Side-by-side comparison, best (lowest RMSE) first.
pd.DataFrame(results).T.sort_values("rmse")


XGBoost -> RMSE: 47365.40, R²: 0.829
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000537 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1846
[LightGBM] [Info] Number of data points in the train set: 16512, number of used features: 12
[LightGBM] [Info] Start training from score 207194.693738
LightGBM -> RMSE: 45699.72, R²: 0.841
CatBoost -> RMSE: 48347.41, R²: 0.822


In [None]:
import json
import os
from datetime import datetime

import joblib

# 1. Make logs directory
# Single source of truth for where runs are written. The run folder below is
# built from this same root, so the directory created here is the one used.
LOGS_ROOT = "logs"
os.makedirs(LOGS_ROOT, exist_ok=True)

# 2. Timestamped run folder (one sub-folder per execution of this cell)
run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
run_dir = os.path.join(LOGS_ROOT, run_id)
os.makedirs(run_dir, exist_ok=True)

# 3. Save metrics
# NOTE: rmse, r2, name and pipeline are the values left behind by the final
# iteration of the training loop, so this records only the last model that
# was trained, not every model and not necessarily the best one.
metrics = {
    "rmse": float(rmse),  # plain float so json can always serialise it
    "r2": float(r2),
    "model": name
}
with open(os.path.join(run_dir, "metrics.json"), "w") as f:
    json.dump(metrics, f, indent=2)

# 4. Save trained model (the full preprocessing + estimator pipeline)
joblib.dump(pipeline, os.path.join(run_dir, f"{name}_model.pkl"))

print(f"Saved run logs to {run_dir}")


Saved run logs to logs/20250921_125610
