# FitMetrics Intelligence
### Gym Analytics and Predictive Modeling
---
**Author:** Shivam  
**Dataset:** 973 gym members, 15 features  
**Scope:** EDA, Regression, Classification, Clustering

## 1. Environment Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')

# Shared colour constants used by every figure below.
# ACCENT is the darkest shade and doubles as the first palette entry.
ACCENT = '#2C4A6E'
PALETTE = [ACCENT, '#4A7BA7', '#6BA3BE', '#A8C5DA', '#D4E6F1']

# Apply the seaborn grid theme globally before any plot is drawn.
sns.set_theme(style='whitegrid')
print('Environment ready.')

## 2. Data Loading and Validation

In [None]:
# Read the raw member table; the path is relative to the notebook's folder.
DATA_PATH = '../data/gym_members_exercise_tracking.csv'
df = pd.read_csv(DATA_PATH)
print(f'Shape: {df.shape}')
# Last expression of the cell: shows the first rows as a preview.
df.head()

In [None]:
# Print each validation report under its own heading: column dtypes first,
# then the per-column count of missing values.
validation_reports = (
    ('Data types:', df.dtypes),
    ('\nMissing values:', df.isnull().sum()),
)
for heading, report in validation_reports:
    print(heading)
    print(report)

# The summary statistics are the cell's displayed result.
print('\nBasic statistics:')
df.describe().round(2)

## 3. Feature Engineering

In [None]:
# Heart-rate reserve: gap between maximum and resting BPM
# (the quantity the Karvonen method is built on).
max_bpm = df['Max_BPM']
df['HR_Reserve'] = max_bpm.sub(df['Resting_BPM'])

# Average BPM expressed as a percentage of maximum BPM.
df['HR_Intensity_%'] = df['Avg_BPM'].div(max_bpm).mul(100).round(2)

# Calories burned normalised by session length (calories per hour).
calories = df['Calories_Burned']
df['Calories_Per_Hour'] = calories.div(df['Session_Duration (hours)']).round(2)

# Per-session calories scaled by the number of workout days per week.
df['Weekly_Caloric_Load'] = calories.mul(df['Workout_Frequency (days/week)']).round(2)

# BMI category (WHO)
def bmi_cat(bmi):
    """Return the WHO weight-status category for a BMI value.

    The bands are half-open on the upper side:
    below 18.5 is 'Underweight', [18.5, 25) is 'Normal',
    [25, 30) is 'Overweight' and 30 or above is 'Obese'.

    Args:
        bmi: A numeric BMI value, or a missing value (None / NaN).

    Returns:
        The category name as a string, or None when ``bmi`` is missing.
    """
    # NaN fails every `<` comparison, so without this guard a missing BMI
    # would fall through to the final branch and be labelled 'Obese'.
    if pd.isna(bmi):
        return None
    if bmi < 18.5:
        return 'Underweight'
    if bmi < 25:
        return 'Normal'
    if bmi < 30:
        return 'Overweight'
    return 'Obese'
# Label every member with the band returned by bmi_cat for their BMI.
df['BMI_Category'] = df['BMI'].map(bmi_cat)

# Human-readable name for each numeric experience level.
EXPERIENCE_NAMES = {1: 'Beginner', 2: 'Intermediate', 3: 'Expert'}
df['Experience_Label'] = df['Experience_Level'].map(EXPERIENCE_NAMES)

# Integer encodings for the models: 1 where Gender is 'Male', else 0;
# workout types become pandas category codes.
is_male = df['Gender'].eq('Male')
df['Gender_enc'] = is_male.astype(int)
workout_as_category = df['Workout_Type'].astype('category')
df['WorkoutType_enc'] = workout_as_category.cat.codes

print(f'Engineered features added. Total columns: {len(df.columns)}')

# Summary of the engineered columns (the cell's displayed result).
engineered_cols = ['HR_Reserve', 'HR_Intensity_%', 'Calories_Per_Hour',
                   'Weekly_Caloric_Load', 'BMI_Category']
df[engineered_cols].describe().round(2)

## 4. Exploratory Data Analysis

### 4.1 Member Demographics

In [None]:
# 2x2 overview figure: workout types, experience mix, ages, gender split.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Member Overview: Demographics and Activity', fontsize=16, fontweight='bold', color=ACCENT)
(ax_types, ax_levels), (ax_age, ax_gender) = axes
title_style = {'fontweight': 'bold', 'color': ACCENT}

# Top-left: members per workout type, with the count printed above each bar.
type_counts = df['Workout_Type'].value_counts()
ax_types.bar(type_counts.index, type_counts.values, color=PALETTE[:4], edgecolor='white')
ax_types.set_title('Workout Type Distribution', **title_style)
for position, count in enumerate(type_counts.values):
    ax_types.text(position, count + 3, str(count), ha='center', color=ACCENT)

# Top-right: share of members at each experience level.
level_counts = df['Experience_Label'].value_counts()
ax_levels.pie(
    level_counts.values,
    labels=level_counts.index,
    colors=PALETTE[:3],
    autopct='%1.1f%%',
    wedgeprops={'edgecolor': 'white', 'linewidth': 2},
)
ax_levels.set_title('Experience Level Breakdown', **title_style)

# Bottom-left: age histogram with a dashed marker at the mean age.
mean_age = df['Age'].mean()
ax_age.hist(df['Age'], bins=20, color=PALETTE[1], edgecolor='white')
ax_age.axvline(mean_age, color=PALETTE[0], linestyle='--', linewidth=2,
               label=f"Mean: {mean_age:.0f}")
ax_age.set_title('Age Distribution', **title_style)
ax_age.legend()

# Bottom-right: grouped bars of member counts per workout type and gender.
counts_by_type_gender = df.groupby(['Workout_Type', 'Gender']).size()
gender_table = counts_by_type_gender.unstack()
gender_table.plot(kind='bar', ax=ax_gender, color=[PALETTE[0], PALETTE[2]], edgecolor='white')
ax_gender.set_title('Gender Split by Workout Type', **title_style)
ax_gender.tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.show()

### 4.2 Calorie and Performance Analysis

In [None]:
# 2x2 figure relating calories burned to workout type, duration,
# experience level, and heart-rate profile.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Calorie Burn and Performance Analysis', fontsize=16, fontweight='bold', color=ACCENT)
(ax_violin, ax_scatter), (ax_box, ax_hr) = axes
title_style = {'fontweight': 'bold', 'color': ACCENT}
level_names = ['Beginner', 'Intermediate', 'Expert']

# Top-left: calorie distribution per workout type, highest median first.
median_by_type = df.groupby('Workout_Type')['Calories_Burned'].median()
type_order = median_by_type.sort_values(ascending=False).index
sns.violinplot(
    data=df, x='Workout_Type', y='Calories_Burned', order=type_order,
    hue='Workout_Type', palette=PALETTE[:4], ax=ax_violin,
    inner='quartile', legend=False,
)
ax_violin.set_title('Calories Burned by Workout Type', **title_style)

# Top-right: duration against calories, shaded by experience level.
points = ax_scatter.scatter(
    df['Session_Duration (hours)'], df['Calories_Burned'],
    c=df['Experience_Level'], cmap='Blues', alpha=0.6, s=30,
)
ax_scatter.set_title('Session Duration vs Calories', **title_style)
plt.colorbar(points, ax=ax_scatter, label='Experience Level')

# Bottom-left: calorie spread for each experience label.
sns.boxplot(
    data=df, x='Experience_Label', y='Calories_Burned', order=level_names,
    hue='Experience_Label', palette=PALETTE[:3], ax=ax_box, legend=False,
)
ax_box.set_title('Calories by Experience Level', **title_style)

# Bottom-right: mean resting / average / max BPM as three side-by-side bars
# per experience level.
bpm_means = df.groupby('Experience_Label')[['Resting_BPM', 'Avg_BPM', 'Max_BPM']].mean()
bpm_means = bpm_means.loc[level_names]
centers = np.arange(3)
bar_width = 0.25
bar_specs = (
    (centers - bar_width, 'Resting_BPM', 'Resting BPM', PALETTE[3]),
    (centers, 'Avg_BPM', 'Avg BPM', PALETTE[1]),
    (centers + bar_width, 'Max_BPM', 'Max BPM', PALETTE[0]),
)
for positions, column, legend_label, bar_color in bar_specs:
    ax_hr.bar(positions, bpm_means[column], bar_width, label=legend_label, color=bar_color)
ax_hr.set_xticks(centers)
ax_hr.set_xticklabels(level_names)
ax_hr.set_title('Heart Rate Profile by Experience Level', **title_style)
ax_hr.legend()

plt.tight_layout()
plt.show()

### 4.3 Correlation Matrix and BMI Analysis

In [None]:
# Left panel: correlation matrix of the raw numeric columns.
# Right panel: BMI histograms overlaid for each gender.
fig, (ax_corr, ax_bmi) = plt.subplots(1, 2, figsize=(16, 6))
title_style = {'fontweight': 'bold', 'color': ACCENT}

numeric_columns = [
    'Age', 'Weight (kg)', 'Height (m)', 'Max_BPM', 'Avg_BPM', 'Resting_BPM',
    'Session_Duration (hours)', 'Calories_Burned', 'Fat_Percentage',
    'Water_Intake (liters)', 'Workout_Frequency (days/week)',
    'Experience_Level', 'BMI',
]
correlations = df[numeric_columns].corr()

# Mask the diagonal and upper triangle so each pair is shown only once.
hide_upper = np.triu(np.ones(correlations.shape, dtype=bool))
sns.heatmap(
    correlations, mask=hide_upper, cmap='Blues', annot=True, fmt='.2f',
    linewidths=0.5, ax=ax_corr, annot_kws={'size': 7},
)
ax_corr.set_title('Correlation Matrix', **title_style)
ax_corr.tick_params(axis='x', rotation=45, labelsize=8)

gender_colors = {'Male': PALETTE[0], 'Female': PALETTE[2]}
for gender_name, shade in gender_colors.items():
    bmi_values = df.loc[df['Gender'] == gender_name, 'BMI']
    ax_bmi.hist(bmi_values, bins=25, alpha=0.6, color=shade,
                label=gender_name, edgecolor='white')
ax_bmi.set_title('BMI Distribution by Gender', **title_style)
ax_bmi.legend()

plt.tight_layout()
plt.show()

## 5. Predictive Modeling

### 5.1 Calorie Prediction (Regression)

In [None]:
# Predictors for the calorie regression.
# 'Calories_Per_Hour' is deliberately NOT included: it is computed as
# Calories_Burned / Session_Duration (hours), and the session duration is
# itself a predictor, so keeping it would let the model rebuild the target
# (target leakage) and report an artificially high R2 / low MAE.
MODEL_FEATURES = ['Age','Gender_enc','Weight (kg)','Height (m)','Max_BPM','Avg_BPM',
                  'Resting_BPM','Session_Duration (hours)','Fat_Percentage',
                  'Water_Intake (liters)','Workout_Frequency (days/week)',
                  'Experience_Level','BMI','WorkoutType_enc','HR_Reserve','HR_Intensity_%']

X = df[MODEL_FEATURES]
y = df['Calories_Burned']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_reg = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
rf_reg.fit(X_train, y_train)
y_pred = rf_reg.predict(X_test)

# Report hold-out performance only (never scored on the training rows).
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f'R2 Score:  {r2:.4f}')
print(f'MAE:       {mae:.2f} calories')

In [None]:
# Left: the ten most important regression features.
# Right: predicted against actual calories on the hold-out set.
fig, (ax_importance, ax_fit) = plt.subplots(1, 2, figsize=(14, 5))
title_style = {'fontweight': 'bold', 'color': ACCENT}

# Sorting ascending and keeping the tail puts the strongest feature at the
# top of the horizontal bar chart.
importances = pd.Series(rf_reg.feature_importances_, index=MODEL_FEATURES)
top_ten = importances.sort_values().tail(10)
top_ten.plot(kind='barh', ax=ax_importance, color=PALETTE[1])
ax_importance.set_title(f'Feature Importance | R2={r2:.3f} MAE={mae:.1f}', **title_style)

ax_fit.scatter(y_test, y_pred, alpha=0.4, color=PALETTE[1], s=20)
# Dashed diagonal marks perfect prediction across the observed target range.
lowest, highest = y_test.min(), y_test.max()
ax_fit.plot([lowest, highest], [lowest, highest],
            color=PALETTE[0], linewidth=2, linestyle='--')
ax_fit.set_title('Actual vs Predicted Calories', **title_style)
ax_fit.set_xlabel('Actual')
ax_fit.set_ylabel('Predicted')
ax_fit.text(0.05, 0.92, f'R2 = {r2:.3f}', transform=ax_fit.transAxes, color=ACCENT)

plt.tight_layout()
plt.show()

### 5.2 Experience Level Classification

In [None]:
# Experience_Level is the label being predicted, so it must not also be a
# predictor: MODEL_FEATURES contains it, and training on it would let the
# classifier simply copy the answer (target leakage).
CLF_FEATURES = [name for name in MODEL_FEATURES if name != 'Experience_Level']

# Label ids and their display names, in matching order
# (same mapping used to build Experience_Label).
CLASS_IDS = [1, 2, 3]
CLASS_NAMES = ['Beginner', 'Intermediate', 'Expert']

X_c = df[CLF_FEATURES]
y_c = df['Experience_Level']

# Stratify so the hold-out set keeps the class proportions and every class
# is represented in it.
X_tr2, X_te2, y_tr2, y_te2 = train_test_split(
    X_c, y_c, test_size=0.2, random_state=42, stratify=y_c)

rf_clf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf_clf.fit(X_tr2, y_tr2)
y_pred_c = rf_clf.predict(X_te2)

# 5-fold cross-validated accuracy over the full table.
cv = cross_val_score(rf_clf, X_c, y_c, cv=5)

print(f'CV Accuracy: {cv.mean():.4f} (+/- {cv.std():.4f})')
print()
# Passing labels explicitly pins each display name to its class id.
print(classification_report(y_te2, y_pred_c, labels=CLASS_IDS, target_names=CLASS_NAMES))

In [None]:
# Left: confusion matrix on the hold-out set.
# Right: accuracy of each cross-validation fold with the mean as a dashed line.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Fix the label order explicitly so the rows/columns always line up with the
# tick labels; without `labels`, confusion_matrix uses only the classes that
# appear in the data, which would misalign the names if one were absent.
class_ids = [1, 2, 3]
class_names = ['Beginner', 'Intermediate', 'Expert']
cm = confusion_matrix(y_te2, y_pred_c, labels=class_ids)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0],
            xticklabels=class_names,
            yticklabels=class_names)
axes[0].set_title('Confusion Matrix', fontweight='bold', color=ACCENT)

# Derive the fold axis from the scores themselves instead of assuming five.
n_folds = len(cv)
axes[1].bar(range(1, n_folds + 1), cv, color=PALETTE[1], edgecolor='white')
axes[1].axhline(cv.mean(), color=PALETTE[0], linewidth=2, linestyle='--', label=f'Mean: {cv.mean():.3f}')
axes[1].set_title(f'{n_folds}-Fold Cross-Validation Accuracy', fontweight='bold', color=ACCENT)
axes[1].set_ylim(0, 1.05)
axes[1].legend()

plt.tight_layout()
plt.show()

### 5.3 Member Segmentation (K-Means Clustering)

In [None]:
# Columns that describe a member's activity and body profile for clustering.
CLUSTER_FEATURES = [
    'Calories_Burned',
    'Session_Duration (hours)',
    'Workout_Frequency (days/week)',
    'Fat_Percentage',
    'BMI',
    'Water_Intake (liters)',
]

# Standardise first so no single column dominates the distance metric.
X_cl = df[CLUSTER_FEATURES].copy()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_cl)

# Within-cluster inertia for k = 2..8, used for the elbow plot.
candidate_ks = range(2, 9)
inertias = []
for k in candidate_ks:
    trial = KMeans(n_clusters=k, random_state=42, n_init=10)
    inertias.append(trial.fit(X_scaled).inertia_)

fig, (ax_elbow, ax_segments) = plt.subplots(1, 2, figsize=(14, 5))
title_style = {'fontweight': 'bold', 'color': ACCENT}

ax_elbow.plot(candidate_ks, inertias, marker='o', color=PALETTE[0], linewidth=2)
ax_elbow.axvline(3, color=PALETTE[2], linestyle='--', linewidth=1.5, label='Optimal k=3')
ax_elbow.set_title('Elbow Method', **title_style)
ax_elbow.legend()

# Final model with three clusters; the label is stored on each member row.
km = KMeans(n_clusters=3, random_state=42, n_init=10)
df['Cluster'] = km.fit_predict(X_scaled)

# One scatter layer per cluster; segments are numbered from 1 in the legend.
colors = [PALETTE[0], PALETTE[1], PALETTE[3]]
for cluster_id, segment_color in enumerate(colors):
    members = df.loc[df['Cluster'] == cluster_id]
    ax_segments.scatter(
        members['Session_Duration (hours)'], members['Calories_Burned'],
        color=segment_color, alpha=0.5, s=25, label=f'Segment {cluster_id+1}',
    )
ax_segments.set_title('Segments: Duration vs Calories', **title_style)
ax_segments.set_xlabel('Duration (hours)')
ax_segments.set_ylabel('Calories Burned')
ax_segments.legend()

plt.tight_layout()
plt.show()

In [None]:
# Mean of every clustering feature per cluster.
# CLUSTER_FEATURES already contains 'Calories_Burned', so it is not appended
# again: selecting it twice would duplicate the column in the profile table.
segment_profile = df.groupby('Cluster')[CLUSTER_FEATURES].mean().round(2)

# Name the rows from the actual cluster ids (0-based -> "Segment 1", ...),
# matching the scatter-plot legend, instead of assuming exactly three rows.
segment_profile.index = [f'Segment {cluster_id + 1}' for cluster_id in segment_profile.index]
print('Cluster Profiles:')
segment_profile

## 6. Business Recommendations

1. **Target Segment 1 (High Intensity)** with premium memberships, personal training upsells, and performance tracking tools.
2. **Engage Segment 3 (Low Engagement)** with motivational campaigns, group classes, and flexible scheduling to improve retention.
3. **HIIT and Strength programs** are the highest calorie burners. Marketing these classes can attract fitness-focused members.
4. **Session duration is the top calorie predictor.** Offer extended session passes or late-closing hours to increase output.
5. **Beginners have elevated resting BPM.** Introduce structured onboarding with monitored heart rate zones to reduce injury risk and improve retention.
6. **BMI is not a reliable fitness proxy.** Avoid using it as the primary health metric in member communications.

---
*FitMetrics Intelligence | Author: Shivam*