# Calorie Predictor

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
df = pd.read_csv('../data/raw_data.csv')

## Preprocessing

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, FunctionTransformer

In [22]:
def featureEngineering(df):
    df = df.copy()
    df["BMR"] = np.where(
        df["gender"] == "Male",
        10 * df["weight"] + 6.25 * (df["height"] * 100) - 5 * df["age"] + 5,
        10 * df["weight"] + 6.25 * (df["height"] * 100) - 5 * df["age"] - 161
    )
    df["weight_duration"] = df["weight"] * df["session_duration"]
    df["bmi_duration"] = df["bmi"] * df["session_duration"]
    df["workout_intensity"] = df["experience_level"] * df["workout_frequency"]
    df["fat_category"] = pd.cut(
        df["fat_percentage"],
        bins=[0, 15, 25, 35, 100],
        labels=["Low", "Moderate", "High", "Very High"]
    )
    return df


In [25]:
#renaming the attributes
df = df.rename(columns={
    "Age": "age",
    "Gender": "gender",
    "Weight (kg)": "weight",
    "Height (m)": "height",
    "Workout_Type": "workout_type",
    "Fat_Percentage": "fat_percentage",
    "Experience_Level": "experience_level",
    "BMI": "bmi",
    "Workout_Intensity": "intensity_level",
    "Session_Duration (hours)": "session_duration",
    "Workout_Frequency (days/week)" : "workout_frequency"
})

In [26]:
df.dtypes

age                        int64
gender                    object
weight                   float64
height                   float64
Max_BPM                    int64
Avg_BPM                    int64
Resting_BPM                int64
session_duration         float64
Calories_Burned          float64
workout_type              object
fat_percentage           float64
Water_Intake (liters)    float64
workout_frequency          int64
experience_level           int64
bmi                      float64
dtype: object

In [27]:
df = featureEngineering(df)

In [33]:
feature_eng_pipe = FunctionTransformer(featureEngineering, validate=False)

In [29]:
encoding_ct = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(sparse_output=False), ['gender', 'workout_type' ]),
        ('ordinal', OrdinalEncoder(), ['fat_category'])
    ], remainder='passthrough'
)


## Modelling

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [31]:
X= df.drop(columns=['Calories_Burned', 'Max_BPM', 'Avg_BPM', 'Resting_BPM', 'Water_Intake (liters)'])
y = df['Calories_Burned']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [40]:
X.dtypes

age                     int64
gender                 object
weight                float64
height                float64
session_duration      float64
workout_type           object
fat_percentage        float64
workout_frequency       int64
experience_level        int64
bmi                   float64
BMR                   float64
weight_duration       float64
bmi_duration          float64
workout_intensity       int64
fat_category         category
dtype: object

In [34]:
pipeline = Pipeline(steps=[
    ('feature_engineering', feature_eng_pipe),
    ('preprocessor', encoding_ct),
    ('model', RandomForestRegressor())
])

In [35]:
param_grid = {
    'model__n_estimators': [100, 300, 500],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['sqrt', 'log2']
}

grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV R2:", grid.best_score_)# Cross Validation


Best params: {'model__max_depth': 20, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}
Best CV R2: 0.8427794468819535


## Evaluation

In [36]:
y_pred = grid.predict(X_test)
print("R2 Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))

R2 Score: 0.8505551855769766
MSE: 11890.70736243491
MAE: 87.8316000761035


## Deployment

In [38]:
import joblib
best_model = grid.best_estimator_
joblib.dump(best_model, '../ml_models/calorie_predictor_model.pkl')

['../ml_models/calorie_predictor_model.pkl']