In [1]:
# Import Libraries
from init import *  # Adds project root to sys.path
from src import config
from src.utils import log_score

import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_log_error

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import optuna
import logging

import mlflow
import mlflow.sklearn
import mlflow.catboost
from mlflow.models.signature import infer_signature

optuna.logging.set_verbosity(logging.WARNING)
optuna.logging.set_verbosity(logging.ERROR)

In [2]:
# Load data
train = pd.read_pickle(config.DATA_PATH + "processed/X_train_fe.pkl")
test = pd.read_pickle(config.DATA_PATH + "processed/X_test_fe.pkl")
sample = pd.read_csv(config.DATA_PATH + 'raw/sample_submission.csv')

In [3]:
# Data Preaparting only with Best Features from previous notebook
# 1. Define the exact features you want
selected_features = [
    'Sex',
    'Age',
    'Height',
    'Weight',
    'Duration',
    'Heart_Rate',
    'HR_per_min',
    'Age_Group_Adult',
    'Age_Group_Senior'
]

# 2. Extract X and y
X = train[selected_features].copy()
y = train["Calories"]

In [4]:
# Train and validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Hyperparameter Tuning - Optuna

# Random Forest
def rf_objective(trial):
    model = RandomForestRegressor(
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        max_depth=trial.suggest_int('max_depth', 3, 15),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        random_state=42,
        n_jobs=-1
    )
    score = cross_val_score(model, X, y, scoring='neg_root_mean_squared_error', cv=3).mean()
    return -score

# HistGradientBoosting
def hgb_objective(trial):
    model = HistGradientBoostingRegressor(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        max_iter=trial.suggest_int('max_iter', 50, 200),
        l2_regularization=trial.suggest_float('l2_regularization', 0.0, 1.0),
        random_state=42
    )
    score = cross_val_score(model, X, y, scoring='neg_root_mean_squared_error', cv=3).mean()
    return -score

# XGBoost
def xgb_objective(trial):
    model = XGBRegressor(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        subsample=trial.suggest_float('subsample', 0.6, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.6, 1.0),
        gamma=trial.suggest_float('gamma', 0, 5),
        reg_alpha=trial.suggest_float('reg_alpha', 0.0, 1.0),
        reg_lambda=trial.suggest_float('reg_lambda', 0.0, 1.0),
        random_state=42,
        n_jobs=-1
    )
    score = cross_val_score(model, X, y, scoring='neg_root_mean_squared_error', cv=3).mean()
    return -score

# LightGBM
def lgbm_objective(trial):
    model = LGBMRegressor(
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        max_depth=trial.suggest_int('max_depth', 3, 15),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        num_leaves=trial.suggest_int('num_leaves', 20, 100),
        subsample=trial.suggest_float('subsample', 0.6, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.6, 1.0),
        reg_alpha=trial.suggest_float('reg_alpha', 0.0, 1.0),
        reg_lambda=trial.suggest_float('reg_lambda', 0.0, 1.0),
        random_state=42,
        verbosity=-1,
        n_jobs=-1
    )
    score = cross_val_score(model, X, y, scoring='neg_root_mean_squared_error', cv=3).mean()
    return -score

# CatBoost
def catboost_objective(trial):
    model = CatBoostRegressor(
        iterations=trial.suggest_int("iterations", 100, 500),
        depth=trial.suggest_int("depth", 4, 10),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1.0, 10.0),
        random_seed=42,
        verbose=0
    )
    score = cross_val_score(model, X, y, scoring='neg_root_mean_squared_error', cv=3).mean()
    return -score

In [6]:
# Run Optuna on All Models

print("✅ Running Optuna for RandomForest...")
rf_study = optuna.create_study(direction='minimize')
rf_study.optimize(rf_objective, n_trials=100)
print("Best RF Params:", rf_study.best_params)

print("✅ Running Optuna for HGB...")
hgb_study = optuna.create_study(direction='minimize')
hgb_study.optimize(hgb_objective, n_trials=100)
print("Best HGB Params:", hgb_study.best_params)

print("✅ Running Optuna for XGB...")
xgb_study = optuna.create_study(direction='minimize')
xgb_study.optimize(xgb_objective, n_trials=100)
print("Best XGB Params:", xgb_study.best_params)

print("✅ Running Optuna for LGBM...")
lgbm_study = optuna.create_study(direction='minimize')
lgbm_study.optimize(lgbm_objective, n_trials=100)
print("Best LGBM Params:", lgbm_study.best_params)

print("✅ Running Optuna for CatBoost...")
cat_study = optuna.create_study(direction='minimize')
cat_study.optimize(catboost_objective, n_trials=100)
print("Best CatBoost Params:", cat_study.best_params)

✅ Running Optuna for RandomForest...
Best RF Params: {'n_estimators': 291, 'max_depth': 15, 'max_features': 'log2'}
✅ Running Optuna for HGB...
Best HGB Params: {'learning_rate': 0.130994262143399, 'max_depth': 10, 'max_iter': 198, 'l2_regularization': 0.7023407953668039}
✅ Running Optuna for XGB...
Best XGB Params: {'n_estimators': 195, 'max_depth': 10, 'learning_rate': 0.03990489755046095, 'subsample': 0.6270828536610373, 'colsample_bytree': 0.6832104546238639, 'gamma': 1.0687895750485235, 'reg_alpha': 0.04657777019499847, 'reg_lambda': 0.020519254830031176}
✅ Running Optuna for LGBM...
Best LGBM Params: {'n_estimators': 255, 'max_depth': 8, 'learning_rate': 0.0513321899000855, 'num_leaves': 98, 'subsample': 0.7888096900485683, 'colsample_bytree': 0.6183319373406093, 'reg_alpha': 0.5941258351285894, 'reg_lambda': 0.3016808668191065}
✅ Running Optuna for CatBoost...
Best CatBoost Params: {'iterations': 462, 'depth': 10, 'learning_rate': 0.08393764522637687, 'l2_leaf_reg': 1.4330091874

In [7]:
# Set MLflow location and experiment
mlflow.set_tracking_uri("file:../logs/mlruns")
mlflow.set_experiment("Calories - Tuned Models")

# Helper function
def rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, np.clip(y_pred, 0, None)))

# Start MLflow run
with mlflow.start_run(run_name="tuned_models_all_5"):
    scores = {}

    # 1. Random Forest
    rf = RandomForestRegressor(**rf_study.best_params, random_state=42)
    rf.fit(X_train, y_train)
    val_rf = rf.predict(X_val)
    score_rf = rmsle(y_val, val_rf)
    scores["RF"] = score_rf
    mlflow.log_params({f"rf__{k}": v for k, v in rf_study.best_params.items()})
    mlflow.log_metric("RMSLE_RF", score_rf)
    signature_rf = infer_signature(X_val, val_rf)
    mlflow.sklearn.log_model(rf, artifact_path="rf_model", signature=signature_rf, input_example=X_val.iloc[:1])

    # 2. HGB
    hgb = HistGradientBoostingRegressor(**hgb_study.best_params, random_state=42)
    hgb.fit(X_train, y_train)
    val_hgb = hgb.predict(X_val)
    score_hgb = rmsle(y_val, val_hgb)
    scores["HGB"] = score_hgb
    mlflow.log_params({f"hgb__{k}": v for k, v in hgb_study.best_params.items()})
    mlflow.log_metric("RMSLE_HGB", score_hgb)
    signature_hgb = infer_signature(X_val, val_hgb)
    mlflow.sklearn.log_model(hgb, artifact_path="hgb_model", signature=signature_hgb, input_example=X_val.iloc[:1])

    # 3. XGBoost
    xgb = XGBRegressor(**xgb_study.best_params, random_state=42, verbosity=0)
    xgb.fit(X_train, y_train)
    val_xgb = xgb.predict(X_val)
    score_xgb = rmsle(y_val, val_xgb)
    scores["XGB"] = score_xgb
    mlflow.log_params({f"xgb__{k}": v for k, v in xgb_study.best_params.items()})
    mlflow.log_metric("RMSLE_XGB", score_xgb)
    signature_xgb = infer_signature(X_val, val_xgb)
    mlflow.xgboost.log_model(xgb, artifact_path="xgb_model", signature=signature_xgb, input_example=X_val.iloc[:1])

    # 4. LightGBM
    lgbm = LGBMRegressor(**lgbm_study.best_params, random_state=42, verbosity=-1)
    lgbm.fit(X_train, y_train)
    val_lgbm = lgbm.predict(X_val)
    score_lgbm = rmsle(y_val, val_lgbm)
    scores["LGBM"] = score_lgbm
    mlflow.log_params({f"lgbm__{k}": v for k, v in lgbm_study.best_params.items()})
    mlflow.log_metric("RMSLE_LGBM", score_lgbm)
    signature_lgbm = infer_signature(X_val, val_lgbm)
    mlflow.lightgbm.log_model(lgbm, artifact_path="lgbm_model", signature=signature_lgbm, input_example=X_val.iloc[:1])

    # 5. CatBoost
    cat = CatBoostRegressor(**cat_study.best_params, random_seed=42, verbose=0, train_dir="../logs/catboost_logs")
    cat.fit(X_train, y_train)
    val_cat = cat.predict(X_val)
    score_cat = rmsle(y_val, val_cat)
    scores["CAT"] = score_cat
    mlflow.log_params({f"cat__{k}": v for k, v in cat_study.best_params.items()})
    mlflow.log_metric("RMSLE_CAT", score_cat)
    signature_cat = infer_signature(X_val, val_cat)
    mlflow.catboost.log_model(cat, artifact_path="cat_model", signature=signature_cat, input_example=X_val.iloc[:1])

    # Print scores
    print("\n✅ RMSLE Scores:")
    for model, score in scores.items():
        print(f"RMSLE {model:5s}: {score:.5f}")



✅ RMSLE Scores:
RMSLE RF   : 0.06175
RMSLE HGB  : 0.06453
RMSLE XGB  : 0.06053
RMSLE LGBM : 0.06202
RMSLE CAT  : 0.06094


In [8]:
# Log Scores
log_score("Random Forest Tuned", score_rf, "Optuna-tuned RF, best features, 100 trials")
log_score("HistGradientBoosting Tuned", score_hgb, "Optuna-tuned HGB, best features, 100 trials")
log_score("XGBoost Tuned", score_xgb, "Optuna-tuned XGB, best features, 100 trials")
log_score("LightGBM Tuned", score_lgbm, "Optuna-tuned LGBM, best features, 100 trials")
log_score("CatBoost Tuned", score_cat, "Optuna-tuned CatBoost, best features, 100 trials")

✅ Logged: Random Forest Tuned | Score: 0.06175
✅ Logged: HistGradientBoosting Tuned | Score: 0.06453
✅ Logged: XGBoost Tuned | Score: 0.06053
✅ Logged: LightGBM Tuned | Score: 0.06202
✅ Logged: CatBoost Tuned | Score: 0.06094


In [9]:
# === Simple average ensemble (5 models) ===
val_avg = (val_rf + val_hgb + val_xgb + val_lgbm + val_cat) / 5
print("RMSLE Avg Ensemble:", rmsle(y_val, val_avg))

# === Weighted average (adjust weights based on model performance if known)
val_weighted = (
    0.3 * val_xgb +
    0.25 * val_rf +
    0.25 * val_cat +
    0.1 * val_lgbm +
    0.1 * val_hgb
)
print("RMSLE Weighted Ensemble:", rmsle(y_val, val_weighted))

RMSLE Avg Ensemble: 0.060154986951392736
RMSLE Weighted Ensemble: 0.05993156413706159
