In [1]:
import os
os.chdir('../..')

In [2]:
import logging
import sys

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Avoid duplicate handlers
if not logger.handlers:
    handler = logging.StreamHandler(sys.stdout)  # stdout works better than stderr in Jupyter
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logger.addHandler(handler)

In [9]:
import numpy as np
from sklearn.model_selection import KFold
import time
from sklearn.metrics import root_mean_squared_error, mean_squared_log_error
import lightgbm as lgb


def rmsle(y_true, y_pred):
    """Calculates Root Mean Squared Logarithmic Error safely."""
    y_pred_clipped = np.maximum(y_pred, 0)
    y_true_clipped = np.maximum(y_true, 0)
    return np.sqrt(mean_squared_log_error(y_true_clipped + 1, y_pred_clipped + 1))

class BaseModelWrapper:
    def __init__(self, model_cls, model_params, name):
        self.model_cls = model_cls
        self.model_params = model_params
        self.name = name
        self.models = []
        self.oof_preds = None

    def fit(self, X, y, folds=5):
        start_time = time.time()
        logger.info(f"Starting training of {self.name} model with {folds} folds")
        
        self.oof_preds = np.zeros(len(X))
        self.models = []
        kf = KFold(n_splits=folds, shuffle=True, random_state=42)

        for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
            fold_start = time.time()
            logger.info(f"Training {self.name} - Fold {fold}/{folds}")
            
            X_train = X.iloc[train_idx]
            X_val   = X.iloc[val_idx]
            y_train = y.iloc[train_idx]
            y_val = y.iloc[val_idx]

            model = self.model_cls(**self.model_params)
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
            self.oof_preds[val_idx] = model.predict(X_val)

            # Calculate oof fold validation using RMSE
            fold_rmse = rmsle(y_val, self.oof_preds[val_idx])
            logger.info(f"Fold {fold} RMSLE: {fold_rmse:.4f}")

            self.models.append(model)
            
            fold_time = time.time() - fold_start
            logger.info(f"Completed {self.name} - Fold {fold}/{folds} in {fold_time:.2f} seconds")

        total_time = time.time() - start_time
        
        # Calculate oof validation using RMSE
        rmse = rmsle(y, self.oof_preds)
        logger.info(f"Out-of-fold RMSLE: {rmse:.4f}")

        logger.info(f"Completed training of {self.name} model in {total_time:.2f} seconds")

    def predict(self, X):
        preds = [model.predict(X) for model in self.models]
        return np.mean(np.column_stack(preds), axis=1)

    def retrain_full(self, X, y):
        start_time = time.time()
        logger.info(f"Starting full retraining of {self.name} model")
        
        model = self.model_cls(**self.model_params)
        model.fit(X, y)
        self.models = [model]
        
        total_time = time.time() - start_time
        logger.info(f"Completed full retraining of {self.name} model in {total_time:.2f} seconds")

In [13]:
import pandas as pd

df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

# Transform Sex to 0 and 1
df_train['Sex'] = df_train['Sex'].map({'male': 0, 'female': 1})
df_test['Sex'] = df_test['Sex'].map({'male': 0, 'female': 1})

# Transform target (Calories) with log1p
# df_train['Calories'] = np.log1p(df_train['Calories'])

# Add BMI as a feature by dividing weight by height/100 squared
df_train['BMI'] = df_train['Weight'] / ((df_train['Height']/100) ** 2)
df_test['BMI'] = df_test['Weight'] / ((df_test['Height']/100) ** 2)

# Exercise intensity (heart rate / age)
df_train['Exercise_Intensity'] = df_train['Heart_Rate'] / df_train['Age']
df_test['Exercise_Intensity'] = df_test['Heart_Rate'] / df_test['Age']

# Duration per BMI
df_train['Duration_per_BMI'] = df_train['Duration'] / df_train['BMI']
df_test['Duration_per_BMI'] = df_test['Duration'] / df_test['BMI']

# Basal Metabolic Rate
df_train['BMR'] = 10 * df_train['Weight'] + 6.25 * (df_train['Height']/100) - 5 * df_train['Age'] + 5
df_test['BMR'] = 10 * df_test['Weight'] + 6.25 * (df_test['Height']/100) - 5 * df_test['Age'] + 5

# Body Fat Percentage
df_train['Body_Fat_Percentage'] = (1.2 * df_train['BMI']) + (0.23 * df_train['Age']) - (10.8 * df_train['Sex']) - 5.4
df_test['Body_Fat_Percentage'] = (1.2 * df_test['BMI']) + (0.23 * df_test['Age']) - (10.8 * df_test['Sex']) - 5.4

# Resting Heart Rate
df_train['Resting_Heart_Rate'] = 0.7 * df_train['Heart_Rate'] + (0.3 * df_train['Age']) - 5.4
df_test['Resting_Heart_Rate'] = 0.7 * df_test['Heart_Rate'] + (0.3 * df_test['Age']) - 5.4

# Resting Energy Expenditure
df_train['Resting_Energy_Expenditure'] = df_train['BMR'] * df_train['Body_Fat_Percentage']
df_test['Resting_Energy_Expenditure'] = df_test['BMR'] * df_test['Body_Fat_Percentage']

# Duration per heart rate
df_train['Duration_div_HR'] = df_train['Duration'] / df_train['Heart_Rate']
df_test['Duration_div_HR'] = df_test['Duration'] / df_test['Heart_Rate']


# Add polynomial features using PolynomialFeatures
from sklearn.preprocessing import PolynomialFeatures
# Columns to exclude from polynomial expansion
exclude_cols = ['id', 'Sex', 'Calories']
test_exclude_cols = [col for col in exclude_cols if col != 'Calories']

# Separate out excluded columns
extra_train = df_train[exclude_cols].copy()
extra_test = df_test[test_exclude_cols].copy()

# Apply PolynomialFeatures to numeric columns
numeric_cols = [col for col in df_train.columns if col not in exclude_cols]
poly = PolynomialFeatures(degree=2, include_bias=False)

train_poly = poly.fit_transform(df_train[numeric_cols])
test_poly = poly.transform(df_test[numeric_cols])

# Create DataFrames with new polynomial features
df_train = pd.DataFrame(train_poly, columns=poly.get_feature_names_out(numeric_cols), index=df_train.index)
df_test = pd.DataFrame(test_poly, columns=poly.get_feature_names_out(numeric_cols), index=df_test.index)

df_train = pd.concat([df_train, extra_train], axis=1)
df_test = pd.concat([df_test, extra_test], axis=1)

# Prepare features and target
X_train = df_train.drop(['Calories', 'id'], axis=1)
y_train = df_train['Calories']

In [20]:
import xgboost as xgb

model = BaseModelWrapper(
    xgb.XGBRegressor,
    {
        'objective': 'reg:squarederror',
        # 'eval_metric': 'rmsle',
        # 'early_stopping_rounds': 75,
        'tree_method': "auto",
        'seed': 42,
        'n_estimators': 1500,
        'learning_rate': 0.022,
        'max_depth': 7,
        'subsample': 0.85,
        'colsample_bytree': 0.5,
        'min_child_weight': 1,
        'gamma': 6.73,
        'lambda': 5.64,
        'alpha': 0.117,
        'verbose': 1
    },
    name='xgb'
)

model.fit(X_train, y_train, folds=3)

2025-05-09 20:11:01,140 - INFO - Starting training of xgb model with 3 folds
2025-05-09 20:11:01,200 - INFO - Training xgb - Fold 1/3


Exception ignored on calling ctypes callback function: <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x0000029137E4D820>>
Traceback (most recent call last):
  File "c:\Users\rvhoo\Documents\projects\calorie-expenditure-predictor\.venv\Lib\site-packages\xgboost\core.py", line 585, in _next_wrapper
    def _next_wrapper(self, this: None) -> int:  # pylint: disable=unused-argument

KeyboardInterrupt: 


XGBoostError: [20:11:09] C:\actions-runner\_work\xgboost\xgboost\src\data\quantile_dmatrix.cc:174: Check failed: accumulated_rows == info.num_row_ (1000000 vs. 500000) : 

In [16]:
# save model.oof_preds to a numpy array file
np.save('data/xgb/xgb1_oof_preds.npy', model.oof_preds)

In [21]:
model.retrain_full(X_train, y_train)

2025-05-09 20:11:09,843 - INFO - Starting full retraining of xgb model


Parameters: { "verbose" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


2025-05-09 20:17:08,202 - INFO - Completed full retraining of xgb model in 358.36 seconds


In [23]:
# Save the 'id' column before dropping it
test_ids = df_test['id']

# Drop 'id' column before prediction
df_test = df_test.drop('id', axis=1)

# Make predictions on test data
test_preds = model.predict(df_test)

# Create submission file
submission = pd.DataFrame({
    'id': test_ids,
    'Calories': test_preds
})

# Save submission file
submission.to_csv('submissions/submission_xgb.csv', index=False)