Calorie Burn Prediction - Comprehensive Model Training
Author: Sheryar & Shamoon Waheed
Purpose: Train and compare multiple ML models for calorie prediction

Notebook Overview:
Import libraries and setup
Load and explore data
Feature engineering (MET values)
Train multiple models
Compare performance
Generate visualizations
Save best model

[CODE] - Imports

In [None]:
# Data handling
import pandas as pd
import numpy as np
# Filesystem paths and artifact persistence
from pathlib import Path
import joblib
# Train/test splitting, preprocessing and evaluation metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# Regressors compared in this notebook
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
# Plotting and JSON metadata output
import matplotlib.pyplot as plt
import json

# Confirmation that every import above resolved.
print("All libraries imported successfully!")

Setup Paths and Configuration

In [None]:
# Run banner.
_banner_rule = "=" * 60
print(_banner_rule)
print("CALORIE BURN PREDICTION - MODEL TRAINING")
print(_banner_rule)

# Input CSV and output directory, both expressed relative to the notebook's
# working directory. The output directory is created if it does not exist yet.
DATA_PATH = Path('..') / 'data' / 'enhanced_calories.csv'
MODEL_DIR = Path('..') / 'models'
MODEL_DIR.mkdir(exist_ok=True)

# Workout type -> MET value. Used later to derive the numeric 'MET' feature
# from the 'Workout_Type' column.
MET_MAPPING = dict(
    Pushups=3.8,
    Pullups=4.0,
    Cycling=6.8,
    Hill_Up=9.0,
    Hill_Down=5.0,
    Hill_Straight=7.0,
    Jumping_Jacks=8.0,
    Burpees=8.0,
    Running_in_Place=7.0,
    Walking=3.5,
    Yoga=2.5,
)

# Echo the resolved configuration.
print("Paths configured")
print(f"Data: {DATA_PATH}")
print(f"Models: {MODEL_DIR}")

Load and Explore Data

In [None]:
# Read the dataset from the configured CSV path into `df`.
print("Loading dataset...")
df = pd.read_csv(DATA_PATH)
print(f"✓ Loaded {df.shape[0]} rows")

# Quick exploration: first rows, dimensions, column names, and the total
# number of missing cells across the whole frame.
print("\nDataset Overview:")
print(df.head())
print(f"\nShape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
_missing_cells = df.isna().sum().sum()
print(f"\nMissing values: {_missing_cells}")

Feature Engineering

In [None]:
# Derive the numeric 'MET' feature by looking up each row's workout type in
# MET_MAPPING.
met_values = df['Workout_Type'].map(MET_MAPPING)

# Series.map() yields NaN for every value that is not a key of the mapping
# (including missing workout types). Left unchecked, those NaNs would flow into
# the scaler and the models and only surface later as an unrelated error, so
# fail fast here and name the offending workout types.
_unmapped_rows = met_values.isna()
if _unmapped_rows.any():
    _unknown_types = sorted(df.loc[_unmapped_rows, 'Workout_Type'].astype(str).unique())
    raise ValueError(
        f"{int(_unmapped_rows.sum())} row(s) have a Workout_Type with no entry in "
        f"MET_MAPPING: {_unknown_types}. Extend MET_MAPPING or clean the data "
        "before training."
    )

df['MET'] = met_values
print(f"✓ MET values added")
print(f"\nMET Distribution:")
# One MET value per workout type, highest first.
print(df.groupby('Workout_Type')['MET'].first().sort_values(ascending=False))

Prepare Features and Split Data

In [None]:
print("\nPreparing features...")

# Column roles. The model inputs are the categorical column followed by the
# numeric ones; the target is 'Calories'.
cat_cols = ['Gender']
num_cols = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'MET']
feature_cols = cat_cols + num_cols

X = df[feature_cols]
y = df['Calories']
print(f"✓ Features shape: {X.shape}")

# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical column (dropping the first level).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_cols),
    ]
)

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"✓ Train: {len(X_train)}, Test: {len(X_test)}")

# The preprocessor is fitted on the training rows only and then applied,
# unchanged, to the test rows.
print("\nPreprocessing data...")
X_train_pre = preprocessor.fit_transform(X_train)
X_test_pre = preprocessor.transform(X_test)
print(f"Preprocessed shape: {X_train_pre.shape}")

Train Multiple Models
We'll train three different models and compare their performance on the test set:

Linear Regression (baseline)
Random Forest (ensemble)
XGBoost (gradient boosting)

In [None]:
def fit_and_evaluate(estimator, X_fit, y_fit, X_eval, y_eval):
    """Fit ``estimator`` and score it on the fitting and evaluation data.

    Parameters
    ----------
    estimator : object exposing ``fit(X, y)`` and ``predict(X)``
        The regressor to train; it is fitted in place.
    X_fit, y_fit
        Features and target used for fitting (and for the train R²).
    X_eval, y_eval
        Features and target used for the reported test metrics.

    Returns
    -------
    tuple
        ``(performance, train_predictions)``. ``performance`` is a dict with
        the keys ``'model'``, ``'r2_train'``, ``'r2_test'``, ``'mae'``,
        ``'rmse'`` and ``'predictions'`` (predictions on ``X_eval``).
    """
    estimator.fit(X_fit, y_fit)
    train_predictions = estimator.predict(X_fit)
    eval_predictions = estimator.predict(X_eval)
    performance = {
        'model': estimator,
        'r2_train': r2_score(y_fit, train_predictions),
        'r2_test': r2_score(y_eval, eval_predictions),
        'mae': mean_absolute_error(y_eval, eval_predictions),
        'rmse': np.sqrt(mean_squared_error(y_eval, eval_predictions)),
        'predictions': eval_predictions,
    }
    return performance, train_predictions


print("\nTraining Multiple Models...")
models_performance = {}

# Candidate estimators. Hyperparameters and seeds are fixed so that reruns
# give the same models.
lr_model = LinearRegression()
rf_model = RandomForestRegressor(
    n_estimators=150,
    max_depth=15,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1
)
xgb_model = XGBRegressor(
    n_estimators=150,
    max_depth=7,
    learning_rate=0.1,
    random_state=42,
    n_jobs=-1
)

# (key in models_performance, progress banner, estimator), trained in order.
_candidates = [
    ('Linear Regression', "1. Linear Regression (Baseline)...", lr_model),
    ('Random Forest', "2. Random Forest...", rf_model),
    ('XGBoost', "3. XGBoost...", xgb_model),
]

# Every model goes through the same fit/predict/score routine, so all metrics
# are computed the same way.
_train_predictions = {}
for _label, _banner, _estimator in _candidates:
    print(f"\n{_banner}")
    models_performance[_label], _train_predictions[_label] = fit_and_evaluate(
        _estimator, X_train_pre, y_train, X_test_pre, y_test
    )
    print(f"   R² Test: {models_performance[_label]['r2_test']:.4f}")

# Per-model prediction variables, kept under their original names for any
# later cell that reads them directly.
y_pred_lr_train = _train_predictions['Linear Regression']
y_pred_lr_test = models_performance['Linear Regression']['predictions']
y_pred_rf_train = _train_predictions['Random Forest']
y_pred_rf_test = models_performance['Random Forest']['predictions']
y_pred_xgb_train = _train_predictions['XGBoost']
y_pred_xgb_test = models_performance['XGBoost']['predictions']

print("\nAll models trained!")

Model Comparison

In [None]:
# Tabulate train/test R², MAE and RMSE for every trained model.
print("\nModel Comparison:")
_table_rule = "=" * 70
print(_table_rule)
print(f"{'Model':<20} {'R² Train':<12} {'R² Test':<12} {'MAE':<10} {'RMSE':<10}")
print(_table_rule)
for name, perf in models_performance.items():
    _row = " ".join([
        f"{name:<20}",
        f"{perf['r2_train']:<12.4f}",
        f"{perf['r2_test']:<12.4f}",
        f"{perf['mae']:<10.2f}",
        f"{perf['rmse']:<10.2f}",
    ])
    print(_row)
print(_table_rule)

# Pick the model with the highest test R² (first one wins on ties).
# NOTE(review): selection uses the same test split the metrics are reported on,
# so the reported numbers for the winner are optimistic.
best_model_name, best_model_perf = max(
    models_performance.items(),
    key=lambda entry: entry[1]['r2_test'],
)
model = best_model_perf['model']

# Expose the winner's metrics and test predictions under short names for the
# cells that follow.
r2_train, r2_test, mae, rmse, y_pred_test = (
    best_model_perf[key]
    for key in ('r2_train', 'r2_test', 'mae', 'rmse', 'predictions')
)

print(f"\nBest Model: {best_model_name}")
print(f"   R² Test: {r2_test:.4f}")
print(f"   MAE: {mae:.2f} kcal")

Visualizations

In [None]:
print("\nCreating comprehensive analysis plots...")

fig = plt.figure(figsize=(20, 12))

# Panel 1 (first slot of a 3x3 grid): test R² per model as a bar chart, with
# the selected best model drawn in a different colour from the others.
ax1 = fig.add_subplot(3, 3, 1)
model_names = list(models_performance)
r2_scores = [models_performance[m]['r2_test'] for m in model_names]
colors = ['#51CF66' if m == best_model_name else '#FF6B6B' for m in model_names]
bars = ax1.bar(model_names, r2_scores, color=colors)
ax1.set_ylabel('R² Score')
ax1.set_title('Model Comparison (R² Test Score)')
# Start the y-axis just below the lowest score so small differences are visible.
ax1.set_ylim([min(r2_scores) - 0.01, 1.0])
# Write each score on top of its bar.
for bar, score in zip(bars, r2_scores):
    ax1.text(
        bar.get_x() + bar.get_width()/2.,
        bar.get_height(),
        f'{score:.4f}',
        ha='center',
        va='bottom',
        fontsize=10,
    )
ax1.grid(axis='y', alpha=0.3)

# TODO: the remaining panels are not included in this notebook
# (original placeholder: "continue with all other plots from Code 1").

# Save the figure next to the model artifacts, then display it.
plt.tight_layout()
plt.savefig(MODEL_DIR / 'comprehensive_analysis.png', dpi=200, bbox_inches='tight')
plt.show()
print("Comprehensive analysis saved")

Save Model and Artifacts

In [None]:
print("\nSaving model artifacts...")

# Objects persisted with joblib: (object, file name inside MODEL_DIR,
# confirmation message). The best model, the fitted preprocessor and the MET
# lookup table are stored as separate files.
_artifacts = [
    (model, 'calories_model.pkl', "Model saved"),
    (preprocessor, 'preprocessor.pkl', "Preprocessor saved"),
    (MET_MAPPING, 'met_mapping.pkl', "MET mapping saved"),
]
for _artifact, _filename, _message in _artifacts:
    joblib.dump(_artifact, MODEL_DIR / _filename)
    print(_message)

# Summary of the selected model. Metrics are cast to plain floats so they can
# be serialised as JSON.
metadata = dict(
    best_model=best_model_name,
    r2_test=float(r2_test),
    mae_test=float(mae),
    rmse_test=float(rmse),
)
_metadata_path = MODEL_DIR / 'model_metadata.json'
with _metadata_path.open('w') as f:
    json.dump(metadata, f, indent=2)
print("Metadata saved")

print("\nTRAINING COMPLETE!")
print(f"Model Accuracy: R²={r2_test:.4f}, Error=±{mae:.1f} kcal")