In [2]:
# Import Libraries
from init import *  # Adds project root to sys.path
from src import config

import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
import optuna

In [3]:
# Load data: feature-engineered train/test frames plus the sample submission.
# NOTE(review): unpickling can execute arbitrary code, so these paths must only
# ever point at files produced by this project.
train_path = config.DATA_PATH + "processed/X_train_fe.pkl"
test_path = config.DATA_PATH + "processed/X_test_fe.pkl"
sample_path = config.DATA_PATH + 'raw/sample_submission.csv'

train = pd.read_pickle(train_path)
test = pd.read_pickle(test_path)
sample = pd.read_csv(sample_path)

In [4]:
# Data preparation
target_col = "Calories"

y = train[target_col]  # Regression target
X = train.drop(columns=[target_col])  # Feature matrix: only the target column is removed here

# Categorical feature names.
# NOTE(review): this list is not passed to any model in the cells shown — confirm whether it should be.
cat_features = ["Sex"]

In [5]:
# Train/val split: hold out 20% of the rows for validation, with a fixed seed
# so the same rows are held out on every run.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = split_parts

In [6]:
# Candidate features for the search: every column of X, in column order
all_features = [column for column in X.columns]

In [9]:
# Optuna objective for feature selection
def objective(trial):
    """Score one candidate feature subset with a default-parameter CatBoost model.

    Each name in ``all_features`` gets a True/False categorical parameter on
    the trial; the names flagged True form the subset. A ``CatBoostRegressor``
    is fit on ``X_train``/``y_train`` restricted to that subset and scored on
    ``X_val``/``y_val``.

    Args:
        trial: The trial object handed in by ``study.optimize``; only its
            ``suggest_categorical`` method is used.

    Returns:
        The RMSLE on the validation split, or ``float("inf")`` when the trial
        selected no features at all.
    """
    selected_features = []
    for feature in all_features:
        # Every feature is asked about, in order, so each trial's params hold
        # a flag for every candidate column.
        if trial.suggest_categorical(feature, [True, False]):
            selected_features.append(feature)

    if not selected_features:
        # Nothing to train on: report the worst possible score for this trial.
        return float("inf")

    regressor = CatBoostRegressor(random_state=42, verbose=0)
    regressor.fit(X_train[selected_features], y_train)

    # Floor predictions at zero before computing the log-based error.
    val_pred = np.clip(regressor.predict(X_val[selected_features]), 0, None)
    return np.sqrt(mean_squared_log_error(y_val, val_pred))

In [10]:
# Run Optuna
# Seed the sampler so the sequence of suggested feature subsets (and therefore
# the reported best subset) is repeatable after a kernel restart. TPESampler is
# already Optuna's default sampler, so only the seeding changes.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=50)

[I 2025-05-25 17:39:06,289] A new study created in memory with name: no-name-e7827549-9079-4563-ae2c-e285c4bca79e
[I 2025-05-25 17:39:36,759] Trial 0 finished with value: 0.09116449427112001 and parameters: {'Sex': True, 'Age': True, 'Height': False, 'Weight': False, 'Duration': False, 'Heart_Rate': False, 'Body_Temp': True, 'BMI': True, 'Duration_squared': False, 'Duration_Weight': True, 'Duration_HeartRate': False, 'Weight_Height_Ratio': True, 'HR_per_min': True, 'Age_Group_Adult': False, 'Age_Group_Mid-Age': False, 'Age_Group_Senior': False}. Best is trial 0 with value: 0.09116449427112001.
[I 2025-05-25 17:40:03,468] Trial 1 finished with value: 0.09461636575095163 and parameters: {'Sex': False, 'Age': False, 'Height': False, 'Weight': True, 'Duration': True, 'Heart_Rate': True, 'Body_Temp': True, 'BMI': True, 'Duration_squared': True, 'Duration_Weight': False, 'Duration_HeartRate': False, 'Weight_Height_Ratio': False, 'HR_per_min': True, 'Age_Group_Adult': False, 'Age_Group_Mid-Ag

In [11]:
# Best result
print(f"\nBest RMSLE: {study.best_value:.5f}")

# Keep the features whose flag is True in the best trial, in all_features order
best_flags = study.best_trial.params
best_features = [name for name in all_features if best_flags[name]]
print("Selected features:", best_features)


Best RMSLE: 0.06150
Selected features: ['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'HR_per_min', 'Age_Group_Adult', 'Age_Group_Senior']


In [12]:
# === Optional: Retrain final model on best subset ===
# Same model settings and same train/validation split as objective(), restricted
# to the best feature subset found by the study.
model = CatBoostRegressor(random_state=42, verbose=0)
model.fit(X_train[best_features], y_train)
y_val_pred = model.predict(X_val[best_features])
# Floor predictions at zero exactly as objective() does: mean_squared_log_error
# rejects negative values, and without this the score would not be computed the
# same way as study.best_value.
y_val_pred = np.clip(y_val_pred, 0, None)
rmsle_final = np.sqrt(mean_squared_log_error(y_val, y_val_pred))
print(f"\nFinal RMSLE with best feature subset: {rmsle_final:.5f}")


Final RMSLE with best feature subset: 0.06150


In [None]:
# Record the validation score of the retrained model for this experiment.
# NOTE(review): log_score is not defined in this notebook; presumably it comes
# from `from init import *` — confirm.
run_record = {
    "model_name": "CatBoost + Optuna FS",
    "score": rmsle_final,
    "notes": "Feature subset selected using Optuna, default CatBoost params",
}
log_score(**run_record)
