# Use Optuna to explore our CatBoost hyperparameter space and efficiently drive validation RMSE down.

### Imports & data loading

In [83]:
import time
import numpy as np
import joblib
import warnings
warnings.filterwarnings('ignore')

from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error, r2_score

import optuna

# 1) Load your preprocessed splits
X_train, y_train = joblib.load("data/processed/train.pkl")
X_val,   y_val   = joblib.load("data/processed/val.pkl")
X_test,  y_test  = joblib.load("data/processed/test.pkl")

# Optionally merge train+val later:
X_train_full = np.vstack([X_train, X_val])
y_train_full = np.concatenate([y_train, y_val])

### Baseline CatBoost

In [85]:
baseline = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.07,        # a strong default
    depth=8,
    random_seed=42,
    loss_function="RMSE",
    verbose=False,
)

# train with early stopping on VAL
t0 = time.time()
baseline.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    early_stopping_rounds=50,
)
baseline_time = time.time() - t0

# inference + metrics
t1 = time.time()
y_pred = baseline.predict(X_test)
pred_time = time.time() - t1

test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
test_r2   = r2_score(y_test, y_pred)

print("🏁 Baseline CatBoost")
print(f"⏱ Train time     : {baseline_time:.2f}s")
print(f"🔎 Test RMSE      : {test_rmse:,.2f}")
print(f"📈 Test R²        : {test_r2:.3f}")
print(f"⚡️ Predict time   : {pred_time:.3f}s for {len(X_test)} samples\n")

🏁 Baseline CatBoost
⏱ Train time     : 13.73s
🔎 Test RMSE      : 5,826.89
📈 Test R²        : 0.940
⚡️ Predict time   : 1.354s for 43165 samples



The out-of-the-box CatBoostRegressor (LR=0.07, depth = 8) already outperforms our tuned XGBoost/LightGBM baselines, achieving an R² of 0.94 and bringing error down to about $5.8k 

### Optuna tuning around that sweet spot

In [86]:
def objective(trial):
    # narrow around 0.02–0.12 and depth 6–10
    lr    = trial.suggest_float("learning_rate", 0.02, 0.12, log=False)
    depth = trial.suggest_int("depth", 6, 10)
    subs  = trial.suggest_float("subsample", 0.6, 1.0)
    colsm = trial.suggest_float("colsample_bylevel", 0.6, 1.0)

    model = CatBoostRegressor(
        iterations=2000,             # allow more rounds but early stop
        learning_rate=lr,
        depth=depth,
        subsample=subs,
        colsample_bylevel=colsm,
        random_seed=42,
        loss_function="RMSE",
        verbose=False,
    )

    # use Pool to avoid pandas wrappers
    train_pool = Pool(X_train_full, y_train_full)
    val_pool   = Pool(X_val, y_val)

    model.fit(
        train_pool,
        eval_set=val_pool,
        early_stopping_rounds=50,
    )
    preds = model.predict(X_val)
    return mean_squared_error(y_val, preds, squared=False)  # RMSE

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30, show_progress_bar=True)

print("\n🔍 Best CatBoost trial:")
best = study.best_trial
print(f"  RMSE={best.value:.2f}")
for k,v in best.params.items():
    print(f"  • {k:<20} = {v}")

[I 2025-05-24 20:25:44,191] A new study created in memory with name: no-name-2be0363f-25cf-43b6-8912-60b9193b542a


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-05-24 20:26:09,298] Trial 0 finished with value: 1286.7508224274256 and parameters: {'learning_rate': 0.10478777287244835, 'depth': 7, 'subsample': 0.6330673671473559, 'colsample_bylevel': 0.7545190672922635}. Best is trial 0 with value: 1286.7508224274256.
[I 2025-05-24 20:26:50,501] Trial 1 finished with value: 2448.0141631667043 and parameters: {'learning_rate': 0.04251427298249675, 'depth': 7, 'subsample': 0.9308164973534913, 'colsample_bylevel': 0.6242298001090923}. Best is trial 0 with value: 1286.7508224274256.
[I 2025-05-24 20:28:13,852] Trial 2 finished with value: 1686.8910284603305 and parameters: {'learning_rate': 0.03288859795526015, 'depth': 10, 'subsample': 0.8647433809800993, 'colsample_bylevel': 0.8200377304336339}. Best is trial 0 with value: 1286.7508224274256.
[I 2025-05-24 20:28:36,239] Trial 3 finished with value: 1624.9710761983627 and parameters: {'learning_rate': 0.10815961329900999, 'depth': 6, 'subsample': 0.7990106721294418, 'colsample_bylevel': 0.83

Optuna’s best search on the validation split finds a sharper configuration (notably a higher learning rate and deeper trees). This single‐trial val‐RMSE (~594) hints at far lower error

### Re-fit on Train+Val with best params

In [89]:
best_params = best.params.copy()
best_params.update({
    "iterations": 2000,
    "loss_function": "RMSE",
    "random_seed": 42,
    "verbose": False
})

final_model = CatBoostRegressor(**best_params)

t2 = time.time()
final_model.fit(
    X_train_full, y_train_full,
    eval_set=(X_val, y_val),
    early_stopping_rounds=50,
)
final_time = time.time() - t2

### Final evaluation on Test

In [91]:
t3 = time.time()
y_pred_final = final_model.predict(X_test)
final_pred_time = time.time() - t3

final_rmse = np.sqrt(mean_squared_error(y_test, y_pred_final))
final_r2   = r2_score(y_test, y_pred_final)

In [93]:
print("\n🏆 Tuned CatBoost + Optuna")
print(f"⏱ Train time     : {final_time:.2f}s")
print(f"🔎 Test RMSE      : {final_rmse:,.2f}")
print(f"📈 Test R²        : {final_r2:.3f}")
print(f"⚡️ Predict time   : {final_pred_time:.3f}s for {len(X_test)} samples")


🏆 Tuned CatBoost + Optuna
⏱ Train time     : 44.32s
🔎 Test RMSE      : 5,194.53
📈 Test R²        : 0.952
⚡️ Predict time   : 1.348s for 43165 samples


After refitting on train + val with the top Optuna parameters, CatBoost drops test RMSE to ~$5.2k and raises R² to 0.952—an additional 11% error reduction from the baseline. This makes it our new best model overall:

✔️ Better generalization vs. baseline CatBoost (R² +0.012)

✔️ Stronger performance vs. tuned XGBoost/LightGBM/NeuralNets

⚡ Inference speed unchanged (~1.35 s)