In [3]:
# Import Libraries
from init import *  # Adds project root to sys.path
from src import config
from src.utils import log_score

import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error

In [4]:
# Load the feature-engineered train/test frames and the sample submission.
# NOTE: pickle files execute arbitrary code on load; only read files this project produced.
data_root = config.DATA_PATH
train = pd.read_pickle(data_root + "processed/X_train_fe.pkl")
test = pd.read_pickle(data_root + "processed/X_test_fe.pkl")
sample = pd.read_csv(data_root + "raw/sample_submission.csv")

In [7]:
# Data preparation
y = train["Calories"]  # regression target
X = train.drop(columns=["Calories"])  # feature matrix: every column except the target

# Columns CatBoost should treat as categorical
cat_features = ["Sex"]

In [8]:
# Hold out 20% of the rows for validation; the split is seeded so it is repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Wrap the train/validation splits in CatBoost Pool objects
train_pool = Pool(X_train, y_train, cat_features=cat_features)
val_pool = Pool(X_val, y_val, cat_features=cat_features)

# No cell above defines `X_test`; the test frame is loaded as `test`.
# Select exactly the training columns so the test pool carries the same
# features, in the same order, as the data the model is fitted on
# (raises KeyError if the test frame is missing any of them).
X_test = test[X_train.columns]
test_pool = Pool(X_test, cat_features=cat_features)

In [10]:
# Fit a CatBoost regressor on the training pool; apart from the seed, verbosity
# and log directory below, all hyperparameters are left at their defaults
model = CatBoostRegressor(
    train_dir="../logs/catboost_logs",
    verbose=0,
    random_state=42,
)
model.fit(train_pool)
print("✅ Model trained.")

✅ Model trained.


In [11]:
# Evaluate on the validation set.
# mean_squared_log_error raises ValueError when given negative values, and an
# unconstrained regressor can emit them, so floor the predictions at 0 before
# scoring (a no-op when every prediction is already non-negative).
y_val_pred = np.clip(model.predict(val_pool), 0, None)
rmsle = np.sqrt(mean_squared_log_error(y_val, y_val_pred))
print(f"✅ Baseline CatBoost RMSLE: {rmsle:.5f}")

✅ Baseline CatBoost RMSLE: 0.06237


In [12]:
# Write the test-set predictions into the sample submission frame and save it as CSV
submission_path = "../outputs/submission_baseline_catboost_fe.csv"
test_preds = model.predict(test_pool)
sample["Calories"] = test_preds
sample.to_csv(submission_path, index=False)
print(f"✅ Submission saved: {submission_path}")

✅ Submission saved: ../outputs/submission_baseline_catboost_fe.csv


In [13]:
# Record this run's validation RMSLE through the project's log_score helper
log_score(
    model_name="FE CatBoost Baseline", score=rmsle, notes="feature engineering, default params"
)

✅ Logged: FE CatBoost Baseline | Score: 0.06237


In [19]:
# Pair each training column with the importance the fitted model reports for it
features = X_train.columns
importances = model.get_feature_importance()

# Tabulate, then rank from most to least important (original row index is kept)
feat_imp = pd.DataFrame({"Feature": features, "Importance": importances})
feat_imp = feat_imp.sort_values("Importance", ascending=False)

# Show the top rows of the ranking
feat_imp.head(17)

Unnamed: 0,Feature,Importance
10,Duration_HeartRate,67.960891
1,Age,11.106664
5,Heart_Rate,8.178572
8,Duration_squared,6.00779
0,Sex,1.9631
4,Duration,1.419432
9,Duration_Weight,1.189932
11,Weight_Height_Ratio,0.854405
3,Weight,0.726958
12,HR_per_min,0.351597


In [21]:
# Features judged too low-impact to keep
low_impact = [
    "BMI", "Height", "Body_Temp", "HR_per_min",
    "Age_Group_Senior", "Age_Group_Adult", "Age_Group_Mid-Age",
]

# Build a reduced feature frame; X itself is left untouched
X_dropped = X.drop(low_impact, axis="columns")

In [22]:
# Re-split using the reduced feature frame; same 20% hold-out and seed as before
X_train, X_val, y_train, y_val = train_test_split(
    X_dropped,
    y,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Wrap the reduced-feature train/validation splits in CatBoost Pool objects
train_pool = Pool(X_train, y_train, cat_features=cat_features)
val_pool = Pool(X_val, y_val, cat_features=cat_features)

# The test pool must carry the same reduced feature set as the training data.
# No cell above defines `X_test`, so derive it from the loaded `test` frame by
# selecting exactly the training columns (this also removes the low-impact
# columns and fixes the column order; raises KeyError if any are missing).
X_test = test[X_train.columns]
test_pool = Pool(X_test, cat_features=cat_features)

In [24]:
# Refit a CatBoost regressor on the reduced-feature training pool; apart from the
# seed, verbosity and log directory below, hyperparameters are left at defaults
model = CatBoostRegressor(
    train_dir="../logs/catboost_logs",
    verbose=0,
    random_state=42,
)
model.fit(train_pool)
print("✅ Model trained.")

✅ Model trained.


In [25]:
# Evaluate the reduced-feature model on the validation set.
# mean_squared_log_error raises ValueError when given negative values, and an
# unconstrained regressor can emit them, so floor the predictions at 0 before
# scoring (a no-op when every prediction is already non-negative).
y_val_pred = np.clip(model.predict(val_pool), 0, None)
rmsle = np.sqrt(mean_squared_log_error(y_val, y_val_pred))
print(f"✅ FE Baseline CatBoost RMSLE: {rmsle:.5f}")

✅ FE Baseline CatBoost RMSLE: 0.06198


In [26]:
# Write the reduced-feature model's test predictions into the submission frame and save it
submission_path = "../outputs/submission_baseline_catboost_fe_updated.csv"
test_preds = model.predict(test_pool)
sample["Calories"] = test_preds
sample.to_csv(submission_path, index=False)
print(f"✅ Submission saved: {submission_path}")

✅ Submission saved: ../outputs/submission_baseline_catboost_fe_updated.csv


In [27]:
# Record the reduced-feature run's validation RMSLE through the project's log_score helper
log_score(
    model_name="FE CatBoost Baseline Updated",
    score=rmsle,
    notes="feature engineering, removed low importance features, default params",
)

✅ Logged: FE CatBoost Baseline Updated | Score: 0.06198
