# FitMetrics Intelligence
## Gym Analytics and Predictive Modeling

**Author:** Shivam  
**Dataset:** 973 gym member records  

---

This notebook provides a complete end-to-end analysis including EDA, three machine learning models, and business recommendations.

In [None]:
import pandas as pd
import numpy as np
import matplotlib
# The backend is deliberately left at its default. Forcing the non-interactive
# 'Agg' backend here would replace Jupyter's inline backend, and every
# plt.show() call in the notebook would then display nothing.
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, classification_report, confusion_matrix
from sklearn.decomposition import PCA
import warnings
# Silences all warnings for the rest of the session to keep cell output clean.
warnings.filterwarnings('ignore')

# Shared colour palette, looked up by role in the plotting cells.
PALETTE = {
    'primary':   '#2D6A9F',
    'secondary': '#4DBFA8',
    'accent':    '#F4A261',
    'danger':    '#E76F51',
    'neutral':   '#8D99AE',
}
sns.set_theme(style='whitegrid', font_scale=1.1)
print("Libraries loaded successfully")

## 1. Data Loading and Profiling

In [None]:
# Read the member records; the path is resolved relative to the working directory.
df = pd.read_csv('../data/gym_members_exercise_tracking.csv')
# Trim whitespace around header names so later column lookups match exactly.
df.columns = df.columns.str.strip()

# Quick profile: dimensions, total count of missing cells, and per-column dtypes.
total_missing = df.isnull().sum().sum()
print(f"Shape: {df.shape}")
print(f"Null values: {total_missing}")
print("\nData types:")
print(df.dtypes)

In [None]:
# Descriptive statistics from DataFrame.describe(), shown to two decimals.
summary_stats = df.describe()
summary_stats.round(2)

In [None]:
# Encode categoricals.
# Each column gets its own LabelEncoder. Refitting a single shared encoder on
# the second column would overwrite the classes learned from the first, leaving
# no way to decode Gender_enc back to its original labels.
gender_encoder = LabelEncoder()
workout_encoder = LabelEncoder()
df['Gender_enc']      = gender_encoder.fit_transform(df['Gender'])
df['WorkoutType_enc'] = workout_encoder.fit_transform(df['Workout_Type'])
# Keep the original name bound to the encoder that was fitted last (Workout_Type).
le = workout_encoder
# Human-readable label for the numeric experience level; unmapped levels become NaN.
df['Exp_Label'] = df['Experience_Level'].map({1:'Beginner', 2:'Intermediate', 3:'Expert'})
print("Encoding complete")
print("\nWorkout type distribution:")
print(df['Workout_Type'].value_counts())

## 2. Exploratory Data Analysis

### 2.1 Calories Burned by Workout Type

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# Workout types ordered by median calories burned, highest first.
order = (
    df.groupby('Workout_Type')['Calories_Burned']
      .median()
      .sort_values(ascending=False)
      .index.tolist()
)
WORKOUT_COLORS = {'Cardio': PALETTE['primary'], 'HIIT': PALETTE['danger'],
                  'Strength': PALETTE['secondary'], 'Yoga': PALETTE['accent']}
# Workout types missing from WORKOUT_COLORS get the neutral colour instead of
# raising KeyError.
colors = [WORKOUT_COLORS.get(w, PALETTE['neutral']) for w in order]
calories_by_type = [df.loc[df['Workout_Type'] == w, 'Calories_Burned'] for w in order]

parts = ax.violinplot([s.values for s in calories_by_type],
                      positions=range(len(order)), showmedians=True, showextrema=False)
for pc, c in zip(parts['bodies'], colors):
    pc.set_facecolor(c)
    pc.set_alpha(0.6)
parts['cmedians'].set_color('#2B2D42')
parts['cmedians'].set_linewidth(2)

# Overlay the raw points with horizontal jitter. A seeded generator keeps the
# figure identical across runs (the global np.random state is unseeded).
jitter_rng = np.random.default_rng(42)
for i, (sub, c) in enumerate(zip(calories_by_type, colors)):
    ax.scatter(jitter_rng.normal(i, 0.06, len(sub)), sub, s=10, color=c, alpha=0.35)

ax.set_xticks(range(len(order)))
ax.set_xticklabels(order, fontsize=12)
ax.set_ylabel('Calories Burned', fontsize=12)
ax.set_title('Caloric Output by Workout Type', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

### 2.2 Session Duration vs Calories by Experience

In [None]:
EXP_COLORS = ['#AED6F1', '#2D86D9', '#1A3A6B']
duration_col = 'Session_Duration (hours)'
# (experience level, colour, legend label), light to dark as the level rises.
experience_styles = list(zip([1, 2, 3], EXP_COLORS, ['Beginner', 'Intermediate', 'Expert']))

fig, ax = plt.subplots(figsize=(10, 6))
for level, shade, legend_name in experience_styles:
    members = df[df['Experience_Level'] == level]
    duration = members[duration_col]
    calories = members['Calories_Burned']
    ax.scatter(duration, calories,
               c=shade, s=50, alpha=0.7, label=legend_name,
               edgecolors='white', linewidths=0.4)
    # Degree-1 least-squares fit for this level, drawn over its own duration range.
    slope, intercept = np.polyfit(duration, calories, 1)
    grid = np.linspace(duration.min(), duration.max(), 100)
    ax.plot(grid, slope * grid + intercept, color=shade, linewidth=2)

ax.set_xlabel('Session Duration (hours)', fontsize=12)
ax.set_ylabel('Calories Burned', fontsize=12)
ax.set_title('Session Duration vs Calories Burned by Experience Level', fontsize=14, fontweight='bold')
ax.legend(title='Experience')
plt.tight_layout()
plt.show()

### 2.3 Correlation Heatmap

In [None]:
# Columns included in the pairwise correlation matrix.
num_cols = [
    'Age', 'Weight (kg)', 'Height (m)',
    'Max_BPM', 'Avg_BPM', 'Resting_BPM',
    'Session_Duration (hours)', 'Calories_Burned', 'Fat_Percentage',
    'Water_Intake (liters)', 'Workout_Frequency (days/week)', 'BMI',
]
corr = df[num_cols].corr()

# Boolean mask covering the diagonal and upper triangle, so only the lower
# triangle of the symmetric matrix is drawn.
mask = np.triu(np.ones(corr.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(12, 9))
heatmap_style = dict(
    annot=True, fmt='.2f', cmap='coolwarm',
    center=0, vmin=-1, vmax=1,
    linewidths=0.5, annot_kws={'size': 8},
)
sns.heatmap(corr, mask=mask, ax=ax, **heatmap_style)
ax.set_title('Feature Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 3. Machine Learning Models

### 3.1 Calorie Burn Regression

In [None]:
# Predictors for the calorie-burn regressor.
feat_reg = [
    'Age', 'Weight (kg)', 'Height (m)',
    'Max_BPM', 'Avg_BPM', 'Resting_BPM',
    'Session_Duration (hours)', 'Fat_Percentage', 'Water_Intake (liters)',
    'Workout_Frequency (days/week)', 'BMI',
    'Gender_enc', 'WorkoutType_enc', 'Experience_Level',
]
X_r = df[feat_reg]
y_r = df['Calories_Burned']

# 80/20 hold-out split with a fixed seed.
X_tr, X_te, y_tr, y_te = train_test_split(X_r, y_r, test_size=0.2, random_state=42)

# fit() returns the estimator itself, so construction and training chain together.
rf_reg = RandomForestRegressor(n_estimators=200, random_state=42).fit(X_tr, y_tr)
y_pred = rf_reg.predict(X_te)

# Scores on the held-out 20%.
r2 = r2_score(y_te, y_pred)
mae = mean_absolute_error(y_te, y_pred)
print(f"R-squared: {r2:.4f}")
print(f"Mean Absolute Error: {mae:.2f} kcal")

In [None]:
# Importances sorted ascending, so the largest bar ends up at the top of the barh chart.
imp = pd.Series(rf_reg.feature_importances_, index=feat_reg).sort_values(ascending=True)

# Features above the median importance are drawn in the 'danger' colour.
median_importance = imp.median()
colors_bar = np.where(imp.values > median_importance,
                      PALETTE['danger'], PALETTE['primary']).tolist()

fig, ax = plt.subplots(figsize=(10, 7))
ax.barh(imp.index, imp.values, color=colors_bar, edgecolor='white', alpha=0.85)
ax.set_xlabel('Feature Importance', fontsize=12)
ax.set_title(f'Feature Importance: Calories Burned Prediction (R-squared = {r2:.3f})', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.show()

### 3.2 Experience Level Classification

In [None]:
# Predictors for the experience-level classifier.
feat_cls = ['Age','Weight (kg)','BMI','Fat_Percentage','Calories_Burned',
            'Session_Duration (hours)','Workout_Frequency (days/week)',
            'Max_BPM','Avg_BPM','Resting_BPM','Water_Intake (liters)','Gender_enc','WorkoutType_enc']
X_cl = df[feat_cls]
y_cl = df['Experience_Level']
# Stratified 80/20 split keeps the class proportions equal in train and test.
X_tr2, X_te2, y_tr2, y_te2 = train_test_split(X_cl, y_cl, test_size=0.2, random_state=42, stratify=y_cl)

rf_cls = RandomForestClassifier(n_estimators=200, random_state=42)
rf_cls.fit(X_tr2, y_tr2)
y_pred2 = rf_cls.predict(X_te2)
acc = (y_pred2 == y_te2).mean()

print(f"Accuracy: {acc*100:.2f}%")
print("\nClassification Report:")
# Pin the label order explicitly. Without `labels`, target_names are matched
# positionally to whichever classes appear in y_te2/y_pred2, so a missing class
# would raise or attach the wrong name to a row.
level_ids = [1, 2, 3]
level_names = ['Beginner','Intermediate','Expert']
print(classification_report(y_te2, y_pred2, labels=level_ids, target_names=level_names))

In [None]:
# Fix the row/column order to levels 1, 2, 3 so the matrix is always 3x3 and
# lines up with the hard-coded tick labels, even if a level is absent from the
# test labels or the predictions.
level_names = ['Beginner','Intermediate','Expert']
cm = confusion_matrix(y_te2, y_pred2, labels=[1, 2, 3])
fig, ax = plt.subplots(figsize=(7, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
            xticklabels=level_names,
            yticklabels=level_names, linewidths=0.5)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
ax.set_title(f'Experience Level Confusion Matrix (Accuracy = {acc*100:.1f}%)', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.show()

### 3.3 Member Segmentation (K-Means)

In [None]:
# Features used to segment members; rows with a missing value in any of them are dropped.
features_cluster = [
    'Age', 'BMI', 'Fat_Percentage',
    'Session_Duration (hours)', 'Calories_Burned', 'Workout_Frequency (days/week)',
]
X_c = df[features_cluster].dropna()

# Standardise the features before the distance-based clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_c)

kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
labels_k = kmeans.fit_predict(X_scaled)

# Two principal components, used for the 2-D scatter; centroids go through the same projection.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)
centers_pca = pca.transform(kmeans.cluster_centers_)
pc1_share, pc2_share = pca.explained_variance_ratio_ * 100

# NOTE(review): K-Means cluster ids carry no inherent meaning, so these persona
# names are tied to whatever ids this seed/data combination produces. Confirm
# them against the cluster profiles before relying on the labels.
cluster_names = {0:'High Performers', 1:'Casual Members', 2:'Intensive Trainers', 3:'Balanced Athletes'}
cluster_colors = [PALETTE['primary'], PALETTE['accent'], PALETTE['danger'], PALETTE['secondary']]

fig, ax = plt.subplots(figsize=(10, 7))
for (cluster_id, persona), shade in zip(cluster_names.items(), cluster_colors):
    members = labels_k == cluster_id
    ax.scatter(X_pca[members, 0], X_pca[members, 1],
               c=shade, s=60, alpha=0.7, label=persona,
               edgecolors='white', linewidths=0.3)
ax.scatter(centers_pca[:, 0], centers_pca[:, 1],
           c='#2B2D42', s=200, marker='X', zorder=5, label='Centroids')
ax.set_xlabel(f'PC1 ({pc1_share:.1f}% variance)')
ax.set_ylabel(f'PC2 ({pc2_share:.1f}% variance)')
ax.set_title('Member Segmentation via K-Means (PCA Projection)', fontsize=14, fontweight='bold')
ax.legend(framealpha=0.8)
plt.tight_layout()
plt.show()

In [None]:
# Attach each row's segment name, then average the clustering features per segment.
segment_labels = list(map(cluster_names.__getitem__, labels_k))
cluster_df = X_c.assign(Segment=segment_labels)

segment_means = cluster_df.groupby('Segment')[features_cluster].mean()
profile = segment_means.round(2)
print("Cluster Profiles (Mean Values):")
profile

## 4. Key Takeaways

- Session Duration and Avg BPM are the dominant drivers of caloric output
- Experience level is highly predictable from the workout, biometric, and demographic features used in Section 3.2
- Four distinct member personas exist with clear business implications
- High water intake correlates with lower body fat percentage
- A workout frequency of 5+ days/week is consistently associated with the best body composition outcomes

---
**Prepared by Shivam**