In [1]:
# Import Libraries
from init import *  # Adds project root to sys.path
from src import config
from src.utils import log_score

import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.base import clone

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import optuna
import logging

import mlflow
import mlflow.sklearn
import mlflow.catboost
import mlflow.xgboost
import mlflow.lightgbm
from mlflow.models.signature import infer_signature

import json
from pathlib import Path

optuna.logging.set_verbosity(logging.WARNING)
optuna.logging.set_verbosity(logging.ERROR)

In [2]:
"""
# Load data
train = pd.read_pickle(config.DATA_PATH + "processed/X_train_fe.pkl")
test = pd.read_pickle(config.DATA_PATH + "processed/X_test_fe.pkl")
sample = pd.read_csv(config.DATA_PATH + 'raw/sample_submission.csv')
"""
# Load data
train = pd.read_csv(config.DATA_PATH + 'raw/train.csv')
test = pd.read_csv(config.DATA_PATH + 'raw/test.csv')
sample = pd.read_csv(config.DATA_PATH + 'raw/sample_submission.csv')

In [3]:
# Data Preaparting only with Best Features from previous notebook
"""
selected_features = [
    'Sex',
    'Age',
    'Height',
    'Weight',
    'Duration',
    'Heart_Rate',
    'HR_per_min',
    'Age_Group_Adult',
    'Age_Group_Senior'
]

# Extract X and y
X = train[selected_features].copy()
y = train["Calories"]
X_test = test[selected_features].copy()
"""
# Data Preaparting
train['Sex'] = train['Sex'].map({'male': 1, 'female': 0})
test['Sex'] = test['Sex'].map({'male': 1, 'female': 0})

X = train.drop(columns=["id", "Calories"]) # Drop Target and ID
y = train["Calories"] # Define Target

X_test = test.drop(columns=["id"]) # Drop ID from Test

In [4]:
# Train and validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Set MLflow location and experiment
mlflow.set_tracking_uri("file:../logs/mlruns")
mlflow.set_experiment("Calories - Optuna Tuning - Original Features")

# Ensure directory for saving best_params
Path("../logs/best_params_orig_features").mkdir(parents=True, exist_ok=True)

# Define model configs
model_configs = {
    "rf": {
        "class": RandomForestRegressor,
        "params": lambda t: {
            "n_estimators": t.suggest_int('n_estimators', 50, 300),
            "max_depth": t.suggest_int('max_depth', 3, 15),
            "max_features": t.suggest_categorical('max_features', ['sqrt', 'log2', None]),
            "random_state": 42,
            "n_jobs": -1
        }
    },
    "hgb": {
        "class": HistGradientBoostingRegressor,
        "params": lambda t: {
            "learning_rate": t.suggest_float('learning_rate', 0.01, 0.3),
            "max_depth": t.suggest_int('max_depth', 3, 10),
            "max_iter": t.suggest_int('max_iter', 50, 200),
            "l2_regularization": t.suggest_float('l2_regularization', 0.0, 1.0),
            "random_state": 42
        }
    },
    "xgb": {
        "class": XGBRegressor,
        "params": lambda t: {
            "n_estimators": t.suggest_int('n_estimators', 50, 200),
            "max_depth": t.suggest_int('max_depth', 3, 10),
            "learning_rate": t.suggest_float('learning_rate', 0.01, 0.3),
            "subsample": t.suggest_float('subsample', 0.6, 1.0),
            "colsample_bytree": t.suggest_float('colsample_bytree', 0.6, 1.0),
            "gamma": t.suggest_float('gamma', 0, 5),
            "reg_alpha": t.suggest_float('reg_alpha', 0.0, 1.0),
            "reg_lambda": t.suggest_float('reg_lambda', 0.0, 1.0),
            "random_state": 42,
            "n_jobs": -1
        }
    },
    "lgbm": {
        "class": LGBMRegressor,
        "params": lambda t: {
            "n_estimators": t.suggest_int('n_estimators', 50, 300),
            "max_depth": t.suggest_int('max_depth', 3, 15),
            "learning_rate": t.suggest_float('learning_rate', 0.01, 0.3),
            "num_leaves": t.suggest_int('num_leaves', 20, 100),
            "subsample": t.suggest_float('subsample', 0.6, 1.0),
            "colsample_bytree": t.suggest_float('colsample_bytree', 0.6, 1.0),
            "reg_alpha": t.suggest_float('reg_alpha', 0.0, 1.0),
            "reg_lambda": t.suggest_float('reg_lambda', 0.0, 1.0),
            "random_state": 42,
            "verbosity": -1,
            "n_jobs": -1
        }
    },
    "cat": {
        "class": CatBoostRegressor,
        "params": lambda t: {
            "iterations": t.suggest_int("iterations", 100, 500),
            "depth": t.suggest_int("depth", 4, 10),
            "learning_rate": t.suggest_float("learning_rate", 0.01, 0.3),
            "l2_leaf_reg": t.suggest_float("l2_leaf_reg", 1.0, 10.0),
            "random_seed": 42,
            "verbose": 0
        }
    }
}

# Define objective function for Optuna
def make_objective(model_class, param_func):
    def objective(trial):
        model = model_class(**param_func(trial))
        score = cross_val_score(model, X, y, scoring='neg_root_mean_squared_error', cv=5).mean()
        return -score
    return objective

# Run Optuna tuning, log to MLflow, save best_params
studies = {}
for name, config in model_configs.items():
    print(f"✅ Running Optuna for {name.upper()}...")

    with mlflow.start_run(run_name=f"Tuning_{name.upper()}"):
        study = optuna.create_study(direction='minimize')
        study.optimize(make_objective(config["class"], config["params"]), n_trials=100)

        best_params = study.best_params
        best_score = study.best_value

        # Log to MLflow
        mlflow.log_params({f"{name}__{k}": v for k, v in best_params.items()})
        mlflow.log_metric(f"{name.upper()}_best_score", best_score)

        # Save best_params to JSON
        with open(f"../logs/best_params_orig_features/{name}.json", "w") as f:
            json.dump(best_params, f, indent=2)

        print(f"✅ {name.upper()} best params of original features are saved and logged.")
        studies[name] = study


✅ Running Optuna for RF...
✅ RF best params of original features are saved and logged.
✅ Running Optuna for HGB...
✅ HGB best params of original features are saved and logged.
✅ Running Optuna for XGB...
✅ XGB best params of original features are saved and logged.
✅ Running Optuna for LGBM...
✅ LGBM best params of original features are saved and logged.
✅ Running Optuna for CAT...
✅ CAT best params of original features are saved and logged.


In [7]:
# Set MLflow location and experiment
mlflow.set_tracking_uri("file:../logs/mlruns")
mlflow.set_experiment("Calories - Tuned Models - Original Features")
Path("../logs/best_params_orig_features").mkdir(parents=True, exist_ok=True)

# Helper Functions
def rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, np.clip(y_pred, 0, None)))

def save_params_to_json(name, params):
    with open(f"../logs/best_params_orig_features/{name}.json", "w") as f:
        json.dump(params, f, indent=2)

def load_params_from_json(name):
    with open(f"../logs/best_params_orig_features/{name}.json", "r") as f:
        return json.load(f)

def train_log_model(name, model, params, pred_val, score, mlflow_module):
    mlflow.log_params({f"{name}__{k}": v for k, v in params.items()})
    mlflow.log_metric(f"RMSLE_{name.upper()}", score)
    signature = infer_signature(X_val, pred_val)
    mlflow_module.log_model(model, artifact_path=f"{name}_model", signature=signature, input_example=X_val.iloc[:1])
    save_params_to_json(name, params)

# Load best parameters from JSON
model_configs = [
    ("rf", RandomForestRegressor, load_params_from_json("rf"), mlflow.sklearn),
    ("hgb", HistGradientBoostingRegressor, load_params_from_json("hgb"), mlflow.sklearn),
    ("xgb", XGBRegressor, load_params_from_json("xgb"), mlflow.xgboost),
    ("lgbm", LGBMRegressor, load_params_from_json("lgbm"), mlflow.lightgbm),
    ("cat", CatBoostRegressor, load_params_from_json("cat"), mlflow.catboost)
]

# Train and log models
with mlflow.start_run(run_name="tuned_models_all_5"):
    scores = {}

    for name, model_cls, params, mlflow_module in model_configs:
        # Extra settings for CatBoost
        kwargs = dict(random_state=42) if name != "cat" else dict(random_seed=42, verbose=0, train_dir="../logs/catboost_logs")
        kwargs.update(params)

        model = model_cls(**kwargs)
        model.fit(X_train, y_train)
        val_pred = model.predict(X_val)
        score = rmsle(y_val, val_pred)
        scores[name.upper()] = score

        train_log_model(name, model, params, val_pred, score, mlflow_module)

    # === Print RMSLE Scores ===
    print("\n✅ RMSLE Scores:")
    for model_name, score in scores.items():
        print(f"RMSLE {model_name:5s}: {score:.5f}")

2025/05/28 10:24:30 INFO mlflow.tracking.fluent: Experiment with name 'Calories - Tuned Models - Original Features' does not exist. Creating a new experiment.



✅ RMSLE Scores:
RMSLE RF   : 0.06126
RMSLE HGB  : 0.06396
RMSLE XGB  : 0.06077
RMSLE LGBM : 0.06278
RMSLE CAT  : 0.06097


In [9]:
# Define model metadata and MLflow modules
model_info = {
    "rf": (RandomForestRegressor, {"random_state": 42}, mlflow.sklearn),
    "hgb": (HistGradientBoostingRegressor, {"random_state": 42}, mlflow.sklearn),
    "xgb": (XGBRegressor, {"random_state": 42}, mlflow.xgboost),
    "lgbm": (LGBMRegressor, {"random_state": 42}, mlflow.lightgbm),
    "cat": (CatBoostRegressor, {"random_seed": 42, "verbose": 0, "train_dir": "../logs/catboost_logs"}, mlflow.catboost)
}

preds = {}
scores = {}

# Run all models and log with MLflow
with mlflow.start_run(run_name="tuned_models_refactored"):
    for name, (cls, const_params, mlflow_module) in model_info.items():
        with open(f"../logs/best_params_orig_features/{name}.json") as f:
            params = json.load(f)
        params.update(const_params)
        model = cls(**params)
        model.fit(X_train, y_train)
        val_pred = model.predict(X_val)
        preds[name] = val_pred

        score = rmsle(y_val, val_pred)
        scores[name.upper()] = score
        log_score(f"{name.upper()} Tuned", score, "Refactored full pipeline - Original Features")

        signature = infer_signature(X_val, val_pred)
        mlflow_module.log_model(model, artifact_path=f"{name}_model", signature=signature, input_example=X_val.iloc[:1])
        mlflow.log_params({f"{name}__{k}": v for k, v in params.items()})
        mlflow.log_metric(f"RMSLE_{name.upper()}", score)

    print("\n✅ RMSLE Individual Scores:")
    for model, score in scores.items():
        print(f"{model:5s}: {score:.5f}")

✅ Logged: RF Tuned | Score: 0.06126
✅ Logged: HGB Tuned | Score: 0.06396
✅ Logged: XGB Tuned | Score: 0.06077
✅ Logged: LGBM Tuned | Score: 0.06278
✅ Logged: CAT Tuned | Score: 0.06097

✅ RMSLE Individual Scores:
RF   : 0.06126
HGB  : 0.06396
XGB  : 0.06077
LGBM : 0.06278
CAT  : 0.06097


In [10]:
# Ensemble evaluations
val_avg = np.mean(list(preds.values()), axis=0)
score_avg = rmsle(y_val, val_avg)
log_score("Ensemble Avg", score_avg, "Simple mean of all 5 models")
mlflow.log_metric("RMSLE_ENSEMBLE_AVG", score_avg)

weights = {"xgb": 0.3, "rf": 0.25, "cat": 0.25, "lgbm": 0.1, "hgb": 0.1}
val_weighted = sum(w * preds[k] for k, w in weights.items())
score_weighted = rmsle(y_val, val_weighted)
log_score("Ensemble Weighted", score_weighted, "Weighted mean based on model strength")
mlflow.log_metric("RMSLE_ENSEMBLE_WEIGHTED", score_weighted)

print("\n✅ RMSLE Ensemble Scores:")
print(f"Avg     : {score_avg:.5f}")
print(f"Weighted: {score_weighted:.5f}")

✅ Logged: Ensemble Avg | Score: 0.05995
✅ Logged: Ensemble Weighted | Score: 0.05976

✅ RMSLE Ensemble Scores:
Avg     : 0.05995
Weighted: 0.05976


In [11]:
from sklearn.linear_model import Ridge

# === 1. Create validation meta-features for stacking
train_meta = np.column_stack([preds[model] for model in ["rf", "hgb", "xgb", "lgbm", "cat"]])
meta_model = Ridge(alpha=1.0, random_state=42)
meta_model.fit(train_meta, y_val)

# === 2. Generate test predictions for each base model
test_preds = {}
for name, (cls, const_params, _) in model_info.items():
    with open(f"../logs/best_params/{name}.json") as f:
        params = json.load(f)
    params.update(const_params)
    model = cls(**params)
    model.fit(X_train, y_train)
    test_preds[name] = model.predict(X_test)

# === 3. Stack test predictions for meta model input
test_meta = np.column_stack([test_preds[k] for k in ["rf", "hgb", "xgb", "lgbm", "cat"]])
final_pred = meta_model.predict(test_meta)

# === 4. Optional: Evaluate on validation set to log stacking performance
val_stack_pred = meta_model.predict(train_meta)
score_stack = rmsle(y_val, val_stack_pred)

log_score("Ensemble Stacking", score_stack, "Ridge meta-model on base model outputs")
mlflow.log_metric("RMSLE_ENSEMBLE_STACKING", score_stack)

print("\n✅ RMSLE Stacking Ensemble:")
print(f"Stacking : {score_stack:.5f}")


✅ Logged: Ensemble Stacking | Score: 0.05996

✅ RMSLE Stacking Ensemble:
Stacking : 0.05996


In [12]:
# === 1. Stacked Ensemble (Ridge)
sample["Calories"] = final_pred
submission_path_stack = "../outputs/submission_stacked_ensemble.csv"
sample.to_csv(submission_path_stack, index=False)
print(f"✅ Submission saved: {submission_path_stack}")

# === 2. Simple Average Ensemble
val_avg = np.mean(list(preds.values()), axis=0)
avg_test = np.mean([test_preds[k] for k in ["rf", "hgb", "xgb", "lgbm", "cat"]], axis=0)

sample["Calories"] = avg_test
submission_path_avg = "../outputs/submission_avg_ensemble.csv"
sample.to_csv(submission_path_avg, index=False)
print(f"✅ Submission saved: {submission_path_avg}")

# === 3. Weighted Average Ensemble
weights = {"xgb": 0.3, "rf": 0.25, "cat": 0.25, "lgbm": 0.1, "hgb": 0.1}
weighted_test = sum(weights[k] * test_preds[k] for k in weights)

sample["Calories"] = weighted_test
submission_path_weighted = "../outputs/submission_weighted_ensemble.csv"
sample.to_csv(submission_path_weighted, index=False)
print(f"✅ Submission saved: {submission_path_weighted}")

✅ Submission saved: ../outputs/submission_stacked_ensemble.csv
✅ Submission saved: ../outputs/submission_avg_ensemble.csv
✅ Submission saved: ../outputs/submission_weighted_ensemble.csv


In [13]:
# Prepare variables for Optuna weighted ensemble
val_xgb = preds["xgb"]
val_cat = preds["cat"]
val_rf = preds["rf"]
val_lgbm = preds["lgbm"]
val_hgb = preds["hgb"]

test_xgb = test_preds["xgb"]
test_cat = test_preds["cat"]
test_rf = test_preds["rf"]
test_lgbm = test_preds["lgbm"]
test_hgb = test_preds["hgb"]

In [14]:
# === Define Optuna objective ===
def weight_objective(trial):
    # Suggest weights for each model between 0 and 1
    w_xgb = trial.suggest_float("w_xgb", 0, 1)
    w_cat = trial.suggest_float("w_cat", 0, 1)
    w_rf = trial.suggest_float("w_rf", 0, 1)
    w_lgbm = trial.suggest_float("w_lgbm", 0, 1)
    w_hgb = trial.suggest_float("w_hgb", 0, 1)

    # Normalize weights to sum to 1
    total = w_xgb + w_cat + w_rf + w_lgbm + w_hgb
    w_xgb /= total
    w_cat /= total
    w_rf /= total
    w_lgbm /= total
    w_hgb /= total

    # Create blended prediction
    val_pred = (
        w_xgb * val_xgb +
        w_cat * val_cat +
        w_rf * val_rf +
        w_lgbm * val_lgbm +
        w_hgb * val_hgb
    )

    # Calculate RMSLE
    return rmsle(y_val, val_pred)

# === Run Optuna ===
optuna.logging.set_verbosity(optuna.logging.WARNING)
study = optuna.create_study(direction="minimize")
study.optimize(weight_objective, n_trials=300)

# === Get best weights ===
best_weights = study.best_params
total = sum(best_weights.values())
normalized_weights = {k: v / total for k, v in best_weights.items()}
print("\n🎯 Best normalized weights found:", normalized_weights)
print("📉 Best RMSLE:", study.best_value)

# === Apply weights to test set ===
test_weighted = (
    normalized_weights['w_xgb'] * test_xgb +
    normalized_weights['w_cat'] * test_cat +
    normalized_weights['w_rf'] * test_rf +
    normalized_weights['w_lgbm'] * test_lgbm +
    normalized_weights['w_hgb'] * test_hgb
)

# === Save submission ===
sample['Calories'] = test_weighted
sample.to_csv('../outputs/submission_tuned_weighted_ensemble.csv', index=False)
print("✅ Submission saved as '../outputs/submission_tuned_weighted_ensemble.csv'")



🎯 Best normalized weights found: {'w_xgb': 0.1477070571366856, 'w_cat': 0.35199589837970613, 'w_rf': 0.3608387220319866, 'w_lgbm': 0.06810229309383056, 'w_hgb': 0.07135602935779119}
📉 Best RMSLE: 0.0596955085912612
✅ Submission saved as '../outputs/submission_tuned_weighted_ensemble.csv'


In [15]:
import smtplib
from email.mime.text import MIMEText

def send_sms(phone_email, subject, body):
    msg = MIMEText(body)
    msg['Subject'] = subject
    msg['From'] = "your_email@gmail.com"
    msg['To'] = phone_email

    with smtplib.SMTP_SSL("smtp.gmail.com", 465) as server:
        server.login("saayedalam@gmail.com", "tbac orha quvt mzgs")  # ← Paste app password here
        server.send_message(msg)

# Send to Fido via email-to-SMS
send_sms("saayedalam@gmail.com", "ML Job", "✅ Model tuning is complete!")