# Calorie Predictor

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('../data/raw_data.csv')

## Preprocessing

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [6]:
df['BMR'] = np.where(
    df['Gender'] == 'Male',
    10 * df['Weight (kg)'] + 6.25 * (df['Height (m)'] * 100) - 5 * df['Age'] + 5,
    10 * df['Weight (kg)'] + 6.25 * (df['Height (m)'] * 100) - 5 * df['Age'] - 161)
df['Weight_Duration'] = df['Weight (kg)'] * df['Session_Duration (hours)']
df['BMI_Duration'] = df['BMI'] * df['Session_Duration (hours)']
df['Workout_Intensity'] = df['Experience_Level'] * df['Workout_Frequency (days/week)']
df['Fat_Category'] = pd.cut(df['Fat_Percentage'], 
                           bins=[0, 15, 25, 35, 100], 
                           labels=['Low', 'Moderate', 'High', 'Very High'])

df.dtypes

Age                                 int64
Gender                             object
Weight (kg)                       float64
Height (m)                        float64
Max_BPM                             int64
Avg_BPM                             int64
Resting_BPM                         int64
Session_Duration (hours)          float64
Calories_Burned                   float64
Workout_Type                       object
Fat_Percentage                    float64
Water_Intake (liters)             float64
Workout_Frequency (days/week)       int64
Experience_Level                    int64
BMI                               float64
BMR                               float64
Weight_Duration                   float64
BMI_Duration                      float64
Workout_Intensity                   int64
Fat_Category                     category
dtype: object

In [9]:
encoding_ct = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(sparse_output=False), ['Gender', 'Workout_Type' ]),
        ('ordinal', OrdinalEncoder(), ['Fat_Category'])
    ], remainder='passthrough'
)


## Modelling

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [14]:
X= df.drop(columns=['Calories_Burned', 'Max_BPM', 'Avg_BPM', 'Resting_BPM'])
y = df['Calories_Burned']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
pipeline = Pipeline(steps=[
    ('preprocessor', encoding_ct),
    ('model', RandomForestRegressor())
])

In [28]:
param_grid = {
    'model__n_estimators': [100, 300, 500],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['sqrt', 'log2']
}

grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV R2:", grid.best_score_)# Cross Validation


Best params: {'model__max_depth': None, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}
Best CV R2: 0.8415003232471318


## Evaluation

In [30]:
y_pred = grid.predict(X_test)
print("R2 Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))

R2 Score: 0.8519913568800711
MSE: 11776.437136643837
MAE: 87.63763698630137


## Deployment

In [31]:
import joblib
best_model = grid.best_estimator_
joblib.dump(best_model, '../models/calorie_predictor_model.pkl')

['../models/calorie_predictor_model.pkl']