In [19]:
# Import Libraries
from init import *  # Adds project root to sys.path
from src import config
from src.utils import log_score

import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import optuna
import logging

# Only surface Optuna errors. A single call is enough: each set_verbosity call
# replaces the previous level, so setting WARNING first had no lasting effect.
optuna.logging.set_verbosity(logging.ERROR)

In [20]:
# Load data
# The two pickled frames are read in order (train first, then test) from the
# "processed" folder; the sample submission CSV comes from the "raw" folder.
train, test = (
    pd.read_pickle(config.DATA_PATH + relative_path)
    for relative_path in (
        "processed/X_train_fe.pkl",
        "processed/X_test_fe.pkl",
    )
)
sample = pd.read_csv(config.DATA_PATH + "raw/sample_submission.csv")

In [21]:
# Data preparation: keep only the best features from the previous notebook
# 1. Columns that make up the model input
selected_features = [
    'Sex', 'Age', 'Height', 'Weight',
    'Duration', 'Heart_Rate', 'HR_per_min',
    'Age_Group_Adult', 'Age_Group_Senior',
]

# 2. Feature matrix (an independent copy of the selected columns) and target
X = train.loc[:, selected_features].copy()
y = train["Calories"]

In [22]:
# Train and validation split
# Hold out 20% of the rows as a validation set; the fixed random_state makes
# the partition identical on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Hyperparameter Tuning - Optuna

# Random Forest
def rf_objective(trial):
    """Optuna objective for a RandomForestRegressor.

    Samples ``n_estimators`` (50-300), ``max_depth`` (3-15) and
    ``max_features`` ('sqrt', 'log2' or None) from ``trial`` and scores the
    resulting model with 3-fold cross-validation.

    Cross-validation runs on ``X_train``/``y_train`` only, so the rows held
    out in ``X_val``/``y_val`` never influence the selected hyperparameters.

    Args:
        trial: The Optuna trial used to draw hyperparameter values.

    Returns:
        Mean RMSE across the folds (positive; lower is better).
    """
    model = RandomForestRegressor(
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        max_depth=trial.suggest_int('max_depth', 3, 15),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        random_state=42,
        n_jobs=-1
    )
    fold_scores = cross_val_score(
        model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=3
    )
    # The scorer reports negated RMSE; flip the sign so the study can minimize it.
    return -fold_scores.mean()

# HistGradientBoosting
def hgb_objective(trial):
    """Optuna objective for a HistGradientBoostingRegressor.

    Samples ``learning_rate`` (0.01-0.3), ``max_depth`` (3-10), ``max_iter``
    (50-200) and ``l2_regularization`` (0.0-1.0) from ``trial`` and scores the
    resulting model with 3-fold cross-validation.

    Cross-validation runs on ``X_train``/``y_train`` only, so the rows held
    out in ``X_val``/``y_val`` never influence the selected hyperparameters.

    Args:
        trial: The Optuna trial used to draw hyperparameter values.

    Returns:
        Mean RMSE across the folds (positive; lower is better).
    """
    model = HistGradientBoostingRegressor(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        max_iter=trial.suggest_int('max_iter', 50, 200),
        l2_regularization=trial.suggest_float('l2_regularization', 0.0, 1.0),
        random_state=42
    )
    fold_scores = cross_val_score(
        model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=3
    )
    # The scorer reports negated RMSE; flip the sign so the study can minimize it.
    return -fold_scores.mean()

# XGBoost
def xgb_objective(trial):
    """Optuna objective for an XGBRegressor.

    Samples ``n_estimators`` (50-200), ``max_depth`` (3-10), ``learning_rate``
    (0.01-0.3), ``subsample`` and ``colsample_bytree`` (0.6-1.0), ``gamma``
    (0-5), ``reg_alpha`` and ``reg_lambda`` (0.0-1.0) from ``trial`` and
    scores the resulting model with 3-fold cross-validation.

    Cross-validation runs on ``X_train``/``y_train`` only, so the rows held
    out in ``X_val``/``y_val`` never influence the selected hyperparameters.

    Args:
        trial: The Optuna trial used to draw hyperparameter values.

    Returns:
        Mean RMSE across the folds (positive; lower is better).
    """
    model = XGBRegressor(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        subsample=trial.suggest_float('subsample', 0.6, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.6, 1.0),
        gamma=trial.suggest_float('gamma', 0, 5),
        reg_alpha=trial.suggest_float('reg_alpha', 0.0, 1.0),
        reg_lambda=trial.suggest_float('reg_lambda', 0.0, 1.0),
        random_state=42,
        n_jobs=-1
    )
    fold_scores = cross_val_score(
        model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=3
    )
    # The scorer reports negated RMSE; flip the sign so the study can minimize it.
    return -fold_scores.mean()

# LightGBM
def lgbm_objective(trial):
    """Optuna objective for an LGBMRegressor.

    Samples ``n_estimators`` (50-300), ``max_depth`` (3-15), ``learning_rate``
    (0.01-0.3), ``num_leaves`` (20-100), ``subsample`` and
    ``colsample_bytree`` (0.6-1.0), ``reg_alpha`` and ``reg_lambda`` (0.0-1.0)
    from ``trial`` and scores the resulting model with 3-fold cross-validation.

    Cross-validation runs on ``X_train``/``y_train`` only, so the rows held
    out in ``X_val``/``y_val`` never influence the selected hyperparameters.

    Args:
        trial: The Optuna trial used to draw hyperparameter values.

    Returns:
        Mean RMSE across the folds (positive; lower is better).
    """
    model = LGBMRegressor(
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        max_depth=trial.suggest_int('max_depth', 3, 15),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        num_leaves=trial.suggest_int('num_leaves', 20, 100),
        subsample=trial.suggest_float('subsample', 0.6, 1.0),
        # LightGBM only applies row subsampling when the bagging frequency is
        # positive; without this the tuned `subsample` value would be ignored.
        subsample_freq=1,
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.6, 1.0),
        reg_alpha=trial.suggest_float('reg_alpha', 0.0, 1.0),
        reg_lambda=trial.suggest_float('reg_lambda', 0.0, 1.0),
        random_state=42,
        n_jobs=-1
    )
    fold_scores = cross_val_score(
        model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=3
    )
    # The scorer reports negated RMSE; flip the sign so the study can minimize it.
    return -fold_scores.mean()

# CatBoost
def catboost_objective(trial):
    """Optuna objective for a CatBoostRegressor.

    Samples ``iterations`` (100-500), ``depth`` (4-10), ``learning_rate``
    (0.01-0.3) and ``l2_leaf_reg`` (1.0-10.0) from ``trial`` and scores the
    resulting model with 3-fold cross-validation. Training output is silenced
    with ``verbose=0``.

    Cross-validation runs on ``X_train``/``y_train`` only, so the rows held
    out in ``X_val``/``y_val`` never influence the selected hyperparameters.

    Args:
        trial: The Optuna trial used to draw hyperparameter values.

    Returns:
        Mean RMSE across the folds (positive; lower is better).
    """
    model = CatBoostRegressor(
        iterations=trial.suggest_int("iterations", 100, 500),
        depth=trial.suggest_int("depth", 4, 10),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1.0, 10.0),
        random_seed=42,
        verbose=0
    )
    fold_scores = cross_val_score(
        model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=3
    )
    # The scorer reports negated RMSE; flip the sign so the study can minimize it.
    return -fold_scores.mean()

In [None]:
# Run Optuna on All Models

def _run_study(display_name, short_name, objective, n_trials):
    """Create a minimizing Optuna study, optimize ``objective`` and report the result.

    Args:
        display_name: Model name shown in the "Running Optuna for ..." message.
        short_name: Model label shown in the "Best ... Params:" message.
        objective: Callable taking an Optuna trial and returning the value to minimize.
        n_trials: Number of trials to run.

    Returns:
        The optimized ``optuna.study.Study``.
    """
    print(f"✅ Running Optuna for {display_name}...")
    # Seed the sampler so the sequence of suggested hyperparameters, and hence
    # the reported best parameters, is repeatable across kernel restarts.
    study = optuna.create_study(
        direction='minimize',
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=n_trials)
    print(f"Best {short_name} Params:", study.best_params)
    return study


rf_study = _run_study("RandomForest", "RF", rf_objective, n_trials=20)

hgb_study = _run_study("HGB", "HGB", hgb_objective, n_trials=20)

xgb_study = _run_study("XGB", "XGB", xgb_objective, n_trials=20)

lgbm_study = _run_study("LGBM", "LGBM", lgbm_objective, n_trials=20)

# CatBoost is given a larger trial budget than the other models.
cat_study = _run_study("CatBoost", "CatBoost", catboost_objective, n_trials=100)

✅ Running Optuna for RandomForest...
