In [1]:
# Import Libraries
from init import *  # Adds project root to sys.path
from src import config
from src.utils import log_score

import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error

In [2]:
# Load the competition CSVs (train, test, sample submission) from the raw-data
# folder under config.DATA_PATH. The order of the relative paths matches the
# order of the names being assigned.
train, test, sample = (
    pd.read_csv(config.DATA_PATH + relative_csv)
    for relative_csv in ('raw/train.csv', 'raw/test.csv', 'raw/sample_submission.csv')
)

In [3]:
# Data preparation
# Target: the "Calories" column of the training frame.
y = train["Calories"]

# Features: every training column except the row identifier and the target.
X = train.drop(["id", "Calories"], axis="columns")

# Only the identifier is dropped from the test frame.
X_test = test.drop("id", axis="columns")

# Columns handed to CatBoost as categorical features.
cat_features = ["Sex"]

In [4]:
# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [5]:
# Wrap each split in a CatBoost Pool so the categorical columns are declared
# once per dataset. The test pool carries no label.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
val_pool = Pool(data=X_val, label=y_val, cat_features=cat_features)
test_pool = Pool(data=X_test, cat_features=cat_features)

In [7]:
# Baseline CatBoost regressor: library defaults apart from a fixed seed,
# silenced training output, and a dedicated directory for CatBoost's logs.
model = CatBoostRegressor(
    train_dir="../logs/catboost_logs",
    verbose=0,
    random_state=42,
)

# Fit on the training pool only; the validation pool is not used here.
model.fit(train_pool)
print("✅ Model trained.")

✅ Model trained.


In [8]:
# Evaluate on validation set
# mean_squared_log_error raises ValueError if any prediction is negative, and
# nothing in this notebook constrains the regressor's output to be
# non-negative. Floor the predictions at 0 before scoring; when no prediction
# is negative this is a no-op and the score is unchanged.
y_val_pred = np.clip(model.predict(val_pool), 0, None)

# RMSLE = square root of the mean squared log error.
rmsle = np.sqrt(mean_squared_log_error(y_val, y_val_pred))
print(f"✅ Baseline CatBoost RMSLE: {rmsle:.5f}")

✅ Baseline CatBoost RMSLE: 0.06168


In [9]:
# Save submission
# Predict on the test pool and floor the values at 0: a negative "Calories"
# value is not meaningful, and the RMSLE metric used in this notebook is
# undefined for negative predictions. When no prediction is negative this is
# a no-op.
sample["Calories"] = np.clip(model.predict(test_pool), 0, None)

# Write the filled-in sample frame without the pandas index column.
submission_path = "../outputs/submission_baseline_catboost.csv"
sample.to_csv(submission_path, index=False)
print(f"✅ Submission saved: {submission_path}")

✅ Submission saved: ../outputs/submission_baseline_catboost.csv


In [10]:
# Record the validation RMSLE of this baseline run through the project's
# log_score helper, with a short note describing the setup.
log_score(model_name="CatBoost Baseline", notes="No feature engineering, default params", score=rmsle)

✅ Logged: CatBoost Baseline | Score: 0.06168
