In [1]:
# Import Libraries
from init import *  # Adds project root to sys.path
from src import config
from src.utils import log_score

import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_log_error

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import optuna
import logging

import mlflow
import mlflow.sklearn
import mlflow.catboost
import mlflow.xgboost
import mlflow.lightgbm
from mlflow.models.signature import infer_signature

import json
from pathlib import Path

# Silence Optuna's per-trial log lines: only errors are reported.
# (A preceding set_verbosity(logging.WARNING) call was redundant because it was
# overwritten immediately by this one.)
optuna.logging.set_verbosity(logging.ERROR)

In [2]:
# Load the feature-engineered train/test frames and the raw sample submission.
# NOTE(review): pickle files can execute code when loaded - only read trusted files.
train, test = (
    pd.read_pickle(config.DATA_PATH + rel_path)
    for rel_path in ("processed/X_train_fe.pkl", "processed/X_test_fe.pkl")
)
sample = pd.read_csv(config.DATA_PATH + 'raw/sample_submission.csv')

In [3]:
# Data preparation: keep only the best features found in the previous notebook
# 1. Columns that form the model input
selected_features = [
    'Sex', 'Age', 'Height', 'Weight',
    'Duration', 'Heart_Rate', 'HR_per_min',
    'Age_Group_Adult', 'Age_Group_Senior',
]

# 2. Split the frame into target (y) and an independent copy of the features (X)
y = train["Calories"]
X = train.loc[:, selected_features].copy()

In [4]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Point MLflow at the local file store and select the tuning experiment
mlflow.set_tracking_uri("file:../logs/mlruns")
mlflow.set_experiment("Calories - Optuna Tuning")

# The best_params JSON files are written here, so create the folder up front
(Path("../logs") / "best_params").mkdir(exist_ok=True, parents=True)

# Define model configs
# Each sampler below maps an Optuna trial to the constructor kwargs of one model:
# tuned values come from trial.suggest_*, the remaining entries are fixed settings.
def _rf_space(t):
    """Sample RandomForestRegressor kwargs from trial `t`."""
    return dict(
        n_estimators=t.suggest_int('n_estimators', 50, 300),
        max_depth=t.suggest_int('max_depth', 3, 15),
        max_features=t.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        random_state=42,
        n_jobs=-1,
    )


def _hgb_space(t):
    """Sample HistGradientBoostingRegressor kwargs from trial `t`."""
    return dict(
        learning_rate=t.suggest_float('learning_rate', 0.01, 0.3),
        max_depth=t.suggest_int('max_depth', 3, 10),
        max_iter=t.suggest_int('max_iter', 50, 200),
        l2_regularization=t.suggest_float('l2_regularization', 0.0, 1.0),
        random_state=42,
    )


def _xgb_space(t):
    """Sample XGBRegressor kwargs from trial `t`."""
    return dict(
        n_estimators=t.suggest_int('n_estimators', 50, 200),
        max_depth=t.suggest_int('max_depth', 3, 10),
        learning_rate=t.suggest_float('learning_rate', 0.01, 0.3),
        subsample=t.suggest_float('subsample', 0.6, 1.0),
        colsample_bytree=t.suggest_float('colsample_bytree', 0.6, 1.0),
        gamma=t.suggest_float('gamma', 0, 5),
        reg_alpha=t.suggest_float('reg_alpha', 0.0, 1.0),
        reg_lambda=t.suggest_float('reg_lambda', 0.0, 1.0),
        random_state=42,
        n_jobs=-1,
    )


def _lgbm_space(t):
    """Sample LGBMRegressor kwargs from trial `t`."""
    return dict(
        n_estimators=t.suggest_int('n_estimators', 50, 300),
        max_depth=t.suggest_int('max_depth', 3, 15),
        learning_rate=t.suggest_float('learning_rate', 0.01, 0.3),
        num_leaves=t.suggest_int('num_leaves', 20, 100),
        subsample=t.suggest_float('subsample', 0.6, 1.0),
        colsample_bytree=t.suggest_float('colsample_bytree', 0.6, 1.0),
        reg_alpha=t.suggest_float('reg_alpha', 0.0, 1.0),
        reg_lambda=t.suggest_float('reg_lambda', 0.0, 1.0),
        random_state=42,
        verbosity=-1,
        n_jobs=-1,
    )


def _cat_space(t):
    """Sample CatBoostRegressor kwargs from trial `t`."""
    return dict(
        iterations=t.suggest_int("iterations", 100, 500),
        depth=t.suggest_int("depth", 4, 10),
        learning_rate=t.suggest_float("learning_rate", 0.01, 0.3),
        l2_leaf_reg=t.suggest_float("l2_leaf_reg", 1.0, 10.0),
        random_seed=42,
        verbose=0,
    )


# short model name -> estimator class ("class") and its kwargs sampler ("params")
model_configs = {
    "rf": {"class": RandomForestRegressor, "params": _rf_space},
    "hgb": {"class": HistGradientBoostingRegressor, "params": _hgb_space},
    "xgb": {"class": XGBRegressor, "params": _xgb_space},
    "lgbm": {"class": LGBMRegressor, "params": _lgbm_space},
    "cat": {"class": CatBoostRegressor, "params": _cat_space},
}

# Define objective function for Optuna
def make_objective(model_class, param_func, X_data=None, y_data=None, cv=3):
    """Build an Optuna objective that cross-validates `model_class`.

    Args:
        model_class: Estimator class; instantiated as `model_class(**kwargs)`.
        param_func: Callable taking an Optuna trial and returning the
            constructor kwargs for `model_class`.
        X_data, y_data: Optional features/target to cross-validate on. When
            omitted, the notebook-level training split (`X_train`, `y_train`)
            is used, looked up at trial time.
        cv: Number of folds (or any `cv` value `cross_val_score` accepts).

    Returns:
        A function `objective(trial) -> float` returning the mean
        cross-validated RMSE (lower is better).
    """
    def objective(trial):
        # Tune on the training split only. Cross-validating on the full X/y
        # would let the hold-out rows (X_val) influence the chosen
        # hyperparameters and make the later validation scores optimistic.
        features = X_train if X_data is None else X_data
        target = y_train if y_data is None else y_data

        model = model_class(**param_func(trial))
        # NOTE(review): tuning minimises plain RMSE while the tuned models are
        # compared with rmsle() further down - confirm this mismatch is intended.
        fold_scores = cross_val_score(
            model, features, target,
            scoring='neg_root_mean_squared_error', cv=cv,
        )
        # sklearn reports the negated error; flip the sign so Optuna minimises RMSE
        return -fold_scores.mean()
    return objective

# Run Optuna tuning, log to MLflow, save best_params
studies = {}  # short model name -> finished optuna study

# The loop variable is deliberately not called `config`: that name is the
# `src.config` module imported in the first cell, and rebinding it to a dict
# breaks every later (or re-run) use of `config.DATA_PATH`.
for name, model_cfg in model_configs.items():
    print(f"✅ Running Optuna for {name.upper()}...")

    # One MLflow run per model keeps each model's best params/score separate
    with mlflow.start_run(run_name=f"Tuning_{name.upper()}"):
        # Seed the (default) TPE sampler so the sequence of suggested
        # hyperparameters is repeatable across kernel restarts.
        study = optuna.create_study(
            direction='minimize',
            sampler=optuna.samplers.TPESampler(seed=42),
        )
        study.optimize(make_objective(model_cfg["class"], model_cfg["params"]), n_trials=100)

        best_params = study.best_params
        best_score = study.best_value

        # Log to MLflow (param keys are prefixed with the model name)
        mlflow.log_params({f"{name}__{k}": v for k, v in best_params.items()})
        mlflow.log_metric(f"{name.upper()}_best_score", best_score)

        # Save best_params to JSON
        with open(f"../logs/best_params/{name}.json", "w") as f:
            json.dump(best_params, f, indent=2)

        print(f"✅ {name.upper()} best params saved and logged.")
        studies[name] = study


2025/05/26 18:42:01 INFO mlflow.tracking.fluent: Experiment with name 'Calories - Optuna Tuning' does not exist. Creating a new experiment.


✅ Running Optuna for RF...


In [None]:
# Point MLflow at the local file store and switch to the tuned-models experiment
mlflow.set_tracking_uri("file:../logs/mlruns")
mlflow.set_experiment("Calories - Tuned Models")
# Folder that receives the per-model best_params JSON files
(Path("../logs") / "best_params").mkdir(exist_ok=True, parents=True)

# Helper Functions
def save_params_to_json(name, params):
    """Write `params` as 2-space-indented JSON to ../logs/best_params/<name>.json.

    The target directory must already exist; an existing file is overwritten.
    """
    target_path = f"../logs/best_params/{name}.json"
    with open(target_path, "w") as handle:
        handle.write(json.dumps(params, indent=2))

def train_log_model(name, model, params, pred_val, score, mlflow_module):
    """Record an already-fitted model in the active MLflow run and persist its params.

    Despite the name, no training happens here. The function:
      * logs `params` with every key prefixed by "<name>__",
      * logs `score` as the metric "RMSLE_<NAME>",
      * logs `model` through `mlflow_module.log_model` under "<name>_model",
        with a signature inferred from the notebook-level `X_val` and
        `pred_val`, and the first row of `X_val` as the input example,
      * writes `params` to JSON via `save_params_to_json`.
    """
    prefixed_params = {}
    for key, value in params.items():
        prefixed_params[f"{name}__{key}"] = value
    mlflow.log_params(prefixed_params)
    mlflow.log_metric(f"RMSLE_{name.upper()}", score)

    mlflow_module.log_model(
        model,
        artifact_path=f"{name}_model",
        signature=infer_signature(X_val, pred_val),
        input_example=X_val.iloc[:1],
    )
    save_params_to_json(name, params)

# Model config: (short name, estimator class, tuned params, MLflow flavor module)
# The tuning cell keeps every finished study in the `studies` dict, so the best
# params are read from there (there are no rf_study / hgb_study / ... variables).
model_configs = [
    ("rf", RandomForestRegressor, studies["rf"].best_params, mlflow.sklearn),
    ("hgb", HistGradientBoostingRegressor, studies["hgb"].best_params, mlflow.sklearn),
    ("xgb", XGBRegressor, studies["xgb"].best_params, mlflow.xgboost),
    ("lgbm", LGBMRegressor, studies["lgbm"].best_params, mlflow.lightgbm),
    ("cat", CatBoostRegressor, studies["cat"].best_params, mlflow.catboost)
]

# Fixed (non-tuned) constructor arguments. They mirror the fixed settings used
# in the tuning search spaces so the final models are built the same way;
# CatBoost additionally gets a dedicated train_dir for its log files.
fixed_kwargs = {
    "rf": dict(random_state=42, n_jobs=-1),
    "hgb": dict(random_state=42),
    "xgb": dict(random_state=42, n_jobs=-1),
    "lgbm": dict(random_state=42, verbosity=-1, n_jobs=-1),
    "cat": dict(random_seed=42, verbose=0, train_dir="../logs/catboost_logs"),
}

# Start MLflow run
with mlflow.start_run(run_name="tuned_models_all_5"):
    scores = {}     # "RF", "HGB", ... -> validation RMSLE
    val_preds = {}  # "rf", "hgb", ... -> validation predictions

    for name, model_cls, params, mlflow_module in model_configs:
        # Tuned params are applied on top of the fixed settings
        kwargs = dict(fixed_kwargs[name])
        kwargs.update(params)
        model = model_cls(**kwargs)

        model.fit(X_train, y_train)
        val_pred = model.predict(X_val)
        # NOTE(review): `rmsle` is not defined in any visible cell - presumably
        # it is provided by `from init import *`; confirm.
        score = rmsle(y_val, val_pred)
        scores[name.upper()] = score
        val_preds[name] = val_pred

        train_log_model(name, model, params, val_pred, score, mlflow_module)

    # Print RMSLE Scores
    print("\n✅ RMSLE Scores:")
    # Dedicated loop names so the fitted `model` / last `score` are not clobbered
    for label, value in scores.items():
        print(f"RMSLE {label:5s}: {value:.5f}")

# Expose per-model results under the names the score-logging and ensemble
# cells read, so the notebook runs top-to-bottom on a fresh kernel.
val_rf, val_hgb, val_xgb, val_lgbm, val_cat = (
    val_preds[key] for key in ("rf", "hgb", "xgb", "lgbm", "cat")
)
score_rf, score_hgb, score_xgb, score_lgbm, score_cat = (
    scores[key] for key in ("RF", "HGB", "XGB", "LGBM", "CAT")
)

In [None]:
# Log Scores
# Values are read from the `scores` dict ("RF", "HGB", ... -> RMSLE) filled by
# the tuned-models cell rather than from separate per-model variables.
tuned_scores = {
    "Random Forest Tuned": scores["RF"],
    "HistGradientBoosting Tuned": scores["HGB"],
    "XGBoost Tuned": scores["XGB"],
    "LightGBM Tuned": scores["LGBM"],
    "CatBoost Tuned": scores["CAT"]
}

for label, value in tuned_scores.items():
    # Drop only the trailing "Tuned" so multi-word names stay intact
    # ("Random Forest Tuned" -> "Random Forest", not just "Random").
    model_label = label.rsplit(" ", 1)[0]
    log_score(label, value, f"Optuna-tuned {model_label}, best features, 100 trials V2")


In [9]:
# Requires the per-model validation predictions (val_rf, val_hgb, val_xgb,
# val_lgbm, val_cat), y_val and rmsle() to exist in the kernel.

# === Simple average ensemble (5 models) ===
val_avg = sum([val_rf, val_hgb, val_xgb, val_lgbm, val_cat]) / 5
print("RMSLE Avg Ensemble:", rmsle(y_val, val_avg))

# === Weighted average (hand-picked weights; adjust to model performance if known)
val_weighted = sum(
    weight * preds
    for weight, preds in [
        (0.3, val_xgb),
        (0.25, val_rf),
        (0.25, val_cat),
        (0.1, val_lgbm),
        (0.1, val_hgb),
    ]
)
print("RMSLE Weighted Ensemble:", rmsle(y_val, val_weighted))

RMSLE Avg Ensemble: 0.060154986951392736
RMSLE Weighted Ensemble: 0.05993156413706159


In [1]:
from itertools import product

# Requires val_xgb, val_cat, val_rf, val_lgbm, val_hgb, y_val and rmsle() to
# exist in the kernel.

# Candidate values for each of the five weights; only combinations that sum
# to ~1 are evaluated
weight_range = [0.1, 0.2, 0.3, 0.4, 0.5]

best_weights = None
best_score = float('inf')

print("🔁 Running grid search for weighted ensemble...\n")

for w in product(weight_range, repeat=5):
    # Evaluate only weight vectors whose sum is within 0.001 of 1
    if abs(sum(w) - 1.0) <= 0.001:
        w_xgb, w_cat, w_rf, w_lgbm, w_hgb = w

        # Blend the predictions in the same model order as the weights
        val_pred = sum(
            weight * preds
            for weight, preds in zip(w, (val_xgb, val_cat, val_rf, val_lgbm, val_hgb))
        )
        score = rmsle(y_val, val_pred)

        # Track (and report) every strict improvement
        if score < best_score:
            best_score, best_weights = score, w

            print(f"✅ New best RMSLE: {score:.6f} with weights: XGB={w_xgb}, CAT={w_cat}, RF={w_rf}, LGBM={w_lgbm}, HGB={w_hgb}")

print(f"\n🎯 Best weights found: {best_weights} → RMSLE: {best_score:.6f}")


🔁 Running grid search for weighted ensemble...



NameError: name 'val_xgb' is not defined

In [1]:
# Notification mail. The recipient is read from the NOTIFY_EMAIL environment
# variable so no personal address is stored in the notebook (`$$` makes IPython
# pass a literal `$` through to the shell). Set NOTIFY_EMAIL before running.
!echo "✅ Model tuning is complete" | mail -s "ML Job Done" "$$NOTIFY_EMAIL"


In [4]:
# Mail delivery smoke test. The recipient is read from the NOTIFY_EMAIL
# environment variable so no personal address is stored in the notebook (`$$`
# makes IPython pass a literal `$` through to the shell).
!echo "Test email" | mail -s "Testing" "$$NOTIFY_EMAIL"