In [7]:
# Imports
from init import *
from src import config
from src.utils import log_score

import pandas as pd
import numpy as np
from pathlib import Path
import json
import logging
import optuna

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import mlflow
import mlflow.sklearn
import mlflow.xgboost
import mlflow.lightgbm
import mlflow.catboost
from mlflow.models.signature import infer_signature

optuna.logging.set_verbosity(logging.ERROR)

In [8]:
# Load data: the processed train/test pickles and the raw sample-submission CSV,
# all located relative to config.DATA_PATH.
train, test = (
    pd.read_pickle(config.DATA_PATH + relative_path)
    for relative_path in ("processed/X_train_fe.pkl", "processed/X_test_fe.pkl")
)
sample = pd.read_csv(config.DATA_PATH + 'raw/sample_submission.csv')

In [9]:
# Feature selection: keep a fixed list of input columns, use "Calories" as the
# target, and hold out 20% of the rows for validation with a seeded split.
selected_features = [
    'Sex',
    'Age',
    'Height',
    'Weight',
    'Duration',
    'Heart_Rate',
    'HR_per_min',
    'Age_Group_Adult',
    'Age_Group_Senior',
]
X = train[selected_features].copy()
y = train["Calories"]
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Model Configurations
# Each sampler below takes an Optuna trial and returns one complete set of
# constructor keyword arguments (sampled values plus fixed settings) for the
# estimator it is paired with in `model_configs`.

def _rf_space(trial):
    """Sample RandomForestRegressor keyword arguments from `trial`."""
    return {
        "n_estimators": trial.suggest_int('n_estimators', 50, 300),
        "max_depth": trial.suggest_int('max_depth', 3, 15),
        "max_features": trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        "random_state": 42,
        "n_jobs": -1,
    }


def _hgb_space(trial):
    """Sample HistGradientBoostingRegressor keyword arguments from `trial`."""
    return {
        "learning_rate": trial.suggest_float('learning_rate', 0.01, 0.3),
        "max_depth": trial.suggest_int('max_depth', 3, 10),
        "max_iter": trial.suggest_int('max_iter', 50, 200),
        "l2_regularization": trial.suggest_float('l2_regularization', 0.0, 1.0),
        "random_state": 42,
    }


def _xgb_space(trial):
    """Sample XGBRegressor keyword arguments from `trial`."""
    return {
        "n_estimators": trial.suggest_int('n_estimators', 50, 200),
        "max_depth": trial.suggest_int('max_depth', 3, 10),
        "learning_rate": trial.suggest_float('learning_rate', 0.01, 0.3),
        "subsample": trial.suggest_float('subsample', 0.6, 1.0),
        "colsample_bytree": trial.suggest_float('colsample_bytree', 0.6, 1.0),
        "gamma": trial.suggest_float('gamma', 0, 5),
        "reg_alpha": trial.suggest_float('reg_alpha', 0.0, 1.0),
        "reg_lambda": trial.suggest_float('reg_lambda', 0.0, 1.0),
        "random_state": 42,
        "n_jobs": -1,
    }


def _lgbm_space(trial):
    """Sample LGBMRegressor keyword arguments from `trial`."""
    return {
        "n_estimators": trial.suggest_int('n_estimators', 50, 300),
        "max_depth": trial.suggest_int('max_depth', 3, 15),
        "learning_rate": trial.suggest_float('learning_rate', 0.01, 0.3),
        "num_leaves": trial.suggest_int('num_leaves', 20, 100),
        "subsample": trial.suggest_float('subsample', 0.6, 1.0),
        "colsample_bytree": trial.suggest_float('colsample_bytree', 0.6, 1.0),
        "reg_alpha": trial.suggest_float('reg_alpha', 0.0, 1.0),
        "reg_lambda": trial.suggest_float('reg_lambda', 0.0, 1.0),
        "random_state": 42,
        "verbosity": -1,
        "n_jobs": -1,
    }


def _cat_space(trial):
    """Sample CatBoostRegressor keyword arguments from `trial`."""
    return {
        "iterations": trial.suggest_int("iterations", 100, 500),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 10.0),
        "random_seed": 42,
        "verbose": 0,
    }


# Short model key -> estimator class ("class") and its sampler ("params").
model_configs = {
    "rf": {"class": RandomForestRegressor, "params": _rf_space},
    "hgb": {"class": HistGradientBoostingRegressor, "params": _hgb_space},
    "xgb": {"class": XGBRegressor, "params": _xgb_space},
    "lgbm": {"class": LGBMRegressor, "params": _lgbm_space},
    "cat": {"class": CatBoostRegressor, "params": _cat_space},
}

In [11]:
# Tuning function
def run_optuna_tuning(models_to_run=None, n_trials=100, X=None, y=None, save_dir="../logs/best_params", experiment_name="Calories - Optuna Tuning", seed=42):
    """Tune the models in ``model_configs`` with Optuna and persist the best parameters.

    For each selected model an Optuna study minimises the mean RMSE obtained
    from ``cross_val_score(..., cv=3)`` on ``(X, y)``. The best sampled
    parameters and the best score are logged to an MLflow run named
    ``Tuning_<NAME>``, and the parameters are written to
    ``<save_dir>/<name>.json``.

    Note that ``study.best_params`` only contains the values drawn through
    ``trial.suggest_*``; fixed settings returned by the sampler (seeds,
    ``n_jobs``, verbosity) are not part of the saved JSON.

    Parameters
    ----------
    models_to_run : container of str, optional
        Keys of ``model_configs`` to tune; ``None`` tunes every configured model.
    n_trials : int
        Number of Optuna trials per model.
    X, y
        Features and target handed to ``cross_val_score``. Both are required.
    save_dir : str
        Directory that receives one ``<name>.json`` per tuned model; created
        if it does not exist.
    experiment_name : str
        MLflow experiment the tuning runs are recorded under.
    seed : int or None
        Seed for the TPE sampler so repeated runs propose the same trials.
        Pass ``None`` for an unseeded sampler.

    Returns
    -------
    dict
        Maps each tuned model key to its ``optuna`` study object.

    Raises
    ------
    ValueError
        If ``X`` or ``y`` is not supplied.
    """
    # Fail fast with a clear message instead of an obscure error raised from
    # inside cross_val_score on the first trial.
    if X is None or y is None:
        raise ValueError("run_optuna_tuning requires both X and y.")

    Path(save_dir).mkdir(parents=True, exist_ok=True)
    mlflow.set_tracking_uri("file:../logs/mlruns")
    mlflow.set_experiment(experiment_name)

    def make_objective(model_class, param_func):
        """Build the Optuna objective for one estimator class."""
        def objective(trial):
            model = model_class(**param_func(trial))
            score = cross_val_score(model, X, y, scoring='neg_root_mean_squared_error', cv=3).mean()
            # The scorer is negated RMSE; flip the sign so the study minimises RMSE.
            return -score
        return objective

    selected = {k: v for k, v in model_configs.items() if models_to_run is None or k in models_to_run}
    studies = {}
    # `model_cfg` rather than `config`: the notebook imports a module named `config`.
    for name, model_cfg in selected.items():
        print(f"✅ Running Optuna for {name.upper()}...")
        with mlflow.start_run(run_name=f"Tuning_{name.upper()}"):
            # TPE is Optuna's default single-objective sampler; seeding it makes
            # the sequence of proposed trials repeatable.
            sampler = optuna.samplers.TPESampler(seed=seed)
            study = optuna.create_study(direction='minimize', sampler=sampler)
            study.optimize(make_objective(model_cfg["class"], model_cfg["params"]), n_trials=n_trials)

            best_params = study.best_params
            best_score = study.best_value
            mlflow.log_params({f"{name}__{k}": v for k, v in best_params.items()})
            mlflow.log_metric(f"{name.upper()}_best_score", best_score)

            with open(f"{save_dir}/{name}.json", "w") as f:
                json.dump(best_params, f, indent=2)
            print(f"✅ {name.upper()} best params saved and logged.")
            studies[name] = study
    return studies

In [12]:
# Helper to log model after training
def save_params_to_json(name, params, save_dir="../logs/best_params"):
    """Write ``params`` as indented JSON to ``<save_dir>/<name>.json``.

    Parameters
    ----------
    name : str
        File stem of the JSON file (the model key, e.g. ``"rf"``).
    params : dict
        JSON-serialisable parameter mapping.
    save_dir : str or Path
        Target directory. Defaults to the same location ``run_optuna_tuning``
        writes to; it is created when missing so the helper also works on a
        fresh checkout.
    """
    target_dir = Path(save_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    with open(target_dir / f"{name}.json", "w") as f:
        json.dump(params, f, indent=2)

def train_log_model(name, model, params, pred_val, score, mlflow_module, X_sample=None):
    """Log a fitted model, its parameters and its validation score to MLflow.

    Despite the name, no training happens here: ``model`` is logged as given.

    Parameters
    ----------
    name : str
        Model key; used as prefix for parameter names, in the metric name
        (``RMSLE_<NAME>``), in the artifact path (``<name>_model``) and as the
        JSON file stem.
    model
        Fitted estimator passed to ``mlflow_module.log_model``.
    params : dict
        Parameters to log and to write back to JSON via ``save_params_to_json``.
    pred_val
        Predictions used together with the input frame to infer the model signature.
    score : float
        Value logged under ``RMSLE_<NAME>``.
    mlflow_module
        MLflow flavor module exposing ``log_model`` (e.g. ``mlflow.sklearn``).
    X_sample : DataFrame, optional
        Inputs matching ``pred_val``; its first row is stored as the input
        example. When omitted, the notebook-level ``X_val`` is used, which is
        the frame this helper always read before the parameter existed.
    """
    if X_sample is None:
        # Backward-compatible fallback to the notebook global.
        X_sample = X_val

    mlflow.log_params({f"{name}__{k}": v for k, v in params.items()})
    mlflow.log_metric(f"RMSLE_{name.upper()}", score)
    signature = infer_signature(X_sample, pred_val)
    mlflow_module.log_model(model, artifact_path=f"{name}_model", signature=signature, input_example=X_sample.iloc[:1])
    save_params_to_json(name, params)

In [None]:
# Run tuning on the training split only. Cross-validating on the full (X, y)
# would let the rows of X_val influence hyperparameter selection, so the RMSLE
# later reported on X_val would be optimistically biased.
# The result is bound to a name so the studies stay inspectable and the cell
# does not echo the whole dict.
studies = run_optuna_tuning(X=X_train, y=y_train)

✅ Running Optuna for RF...


In [None]:
# Train + log models
# Every model is rebuilt from the tuned parameters stored in
# ../logs/best_params/<name>.json, fitted on the training split, scored with
# RMSLE on the validation split and logged to MLflow inside one parent run.
model_classes = {
    "rf": (RandomForestRegressor, mlflow.sklearn),
    "hgb": (HistGradientBoostingRegressor, mlflow.sklearn),
    "xgb": (XGBRegressor, mlflow.xgboost),
    "lgbm": (LGBMRegressor, mlflow.lightgbm),
    "cat": (CatBoostRegressor, mlflow.catboost)
}

# Select the MLflow store and experiment explicitly (same values that
# run_optuna_tuning uses by default). Otherwise this cell only logs to the
# right place when the tuning cell ran earlier in the same kernel; on a fresh
# kernel that reuses the cached JSON files the run would land in MLflow's
# default location.
mlflow.set_tracking_uri("file:../logs/mlruns")
mlflow.set_experiment("Calories - Optuna Tuning")

with mlflow.start_run(run_name="tuned_models_all_5"):
    scores = {}
    for name, (model_cls, mlflow_module) in model_classes.items():
        with open(f"../logs/best_params/{name}.json", "r") as f:
            params = json.load(f)

        # Fixed constructor arguments; tuned values win if a key appears in both.
        const = dict(random_state=42) if name != "cat" else dict(random_seed=42, verbose=0, train_dir="../logs/catboost_logs")
        const.update(params)

        model = model_cls(**const)
        model.fit(X_train, y_train)
        val_pred = model.predict(X_val)
        # Clip at zero: mean_squared_log_error rejects negative predictions.
        score = np.sqrt(mean_squared_log_error(y_val, np.clip(val_pred, 0, None)))
        scores[name.upper()] = score
        log_score(
            name.upper() + " Tuned",
            score,
            f"Optuna-tuned {name.upper()}, best features, 100 trials"
        )

        train_log_model(name, model, params, val_pred, score, mlflow_module)

    print("\n✅ RMSLE Scores:")
    # Distinct loop names so the last fitted `model` and its `score` are not
    # overwritten by a label string / summary value.
    for model_label, model_rmsle in scores.items():
        print(f"RMSLE {model_label:5s}: {model_rmsle:.5f}")

In [None]:
import smtplib
from email.mime.text import MIMEText

def send_sms(phone_email, subject, body):
    """Send a plain-text notification e-mail via smtp.gmail.com over SSL (port 465).

    The SMTP login is taken from the environment so that no credential is
    stored in the notebook:

    * ``SMTP_USER``         - account used for login and as the ``From`` address
    * ``SMTP_APP_PASSWORD`` - app password for that account

    SECURITY: never paste a password into this cell. Any credential that has
    ever been saved in a notebook (or its version history) must be treated as
    compromised and revoked.

    Parameters
    ----------
    phone_email : str
        Recipient address (placed in the ``To`` header).
    subject : str
        Subject line.
    body : str
        Message text.

    Raises
    ------
    RuntimeError
        If ``SMTP_USER`` or ``SMTP_APP_PASSWORD`` is unset or empty. Raised
        before any network connection is attempted.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    sender = os.environ.get("SMTP_USER")
    app_password = os.environ.get("SMTP_APP_PASSWORD")
    if not sender or not app_password:
        raise RuntimeError(
            "Set the SMTP_USER and SMTP_APP_PASSWORD environment variables before calling send_sms()."
        )

    msg = MIMEText(body)
    msg['Subject'] = subject
    # Use the authenticated account as sender instead of a placeholder address.
    msg['From'] = sender
    msg['To'] = phone_email

    with smtplib.SMTP_SSL("smtp.gmail.com", 465) as server:
        server.login(sender, app_password)
        server.send_message(msg)

# Send the completion notice through send_sms (the recipient here is a plain
# e-mail address).
send_sms(
    phone_email="saayedalam@gmail.com",
    subject="ML Job",
    body="✅ Model tuning is complete!",
)