# Peloton Data - Initial Exploration

This notebook provides an initial exploration of your Peloton workout data.

## Setup

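First, make sure you've run `python scripts/fetch_all_workouts.py` to download your data. This notebook reads the result from `data/raw/workouts_latest.json` in the directory above the notebook's working directory.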

In [None]:
import sys
import json
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Plot appearance: seaborn's dark grid theme and a wide default figure size
sns.set_style('darkgrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Put the directory above the notebook's working directory first on the
# import path
parent_dir = Path.cwd().parent
sys.path.insert(0, str(parent_dir))

## Load Data

In [None]:
# Load the raw workout export (the file scripts/fetch_all_workouts.py is
# expected to produce) into `workouts_raw`.
data_file = Path.cwd().parent / 'data' / 'raw' / 'workouts_latest.json'

# Fail fast when the export is missing. Only printing a warning would leave
# `workouts_raw` undefined (or stale from an earlier run of this cell), so the
# next cell would break with an unrelated NameError or analyse old data.
if not data_file.exists():
    raise FileNotFoundError(
        f"❌ Data file not found: {data_file}\n"
        "Run: python scripts/fetch_all_workouts.py"
    )

# Read as UTF-8 explicitly instead of relying on the platform default encoding
with open(data_file, encoding='utf-8') as f:
    workouts_raw = json.load(f)

print(f"✓ Loaded {len(workouts_raw)} workouts")

## Convert to DataFrame

In [None]:
# Flatten each raw workout record into one row of summary fields and build
# the `df` DataFrame used by the rest of the notebook.
#
# `dict.get(key, default)` only falls back when the key is absent; a key that
# is present with a JSON null comes back as None. The `or` fallbacks below
# cover both cases, so a workout with a null ride, instructor, duration or
# total_work no longer raises AttributeError / TypeError.
workout_columns = [
    'workout_id',
    'created_at',
    'ride_id',
    'ride_title',
    'fitness_discipline',
    'instructor_name',
    'duration_minutes',
    'total_work_kj',
    'device_type',
    'status',
]

workouts = []

for workout in workouts_raw:
    ride = workout.get('ride') or {}
    instructor = ride.get('instructor') or {}

    workouts.append({
        'workout_id': workout.get('id'),
        # kept raw here; converted for the whole column in one call below
        'created_at': workout.get('created_at'),
        'ride_id': ride.get('id'),
        'ride_title': ride.get('title'),
        'fitness_discipline': ride.get('fitness_discipline'),
        'instructor_name': instructor.get('name'),
        # presumably `duration` is in seconds -- TODO confirm against the API
        'duration_minutes': (ride.get('duration') or 0) / 60,
        # presumably `total_work` is in joules -- TODO confirm against the API
        'total_work_kj': (workout.get('total_work') or 0) / 1000,
        'device_type': workout.get('device_type'),
        'status': workout.get('status'),
    })

# Passing `columns` keeps the expected schema even when there are no workouts,
# so the sort below does not fail with a KeyError on an empty frame.
df = pd.DataFrame(workouts, columns=workout_columns)

# `created_at` is interpreted as seconds since the Unix epoch; missing values
# become NaT.
df['created_at'] = pd.to_datetime(df['created_at'], unit='s')

# Sort by date
df = df.sort_values('created_at').reset_index(drop=True)

print(f"DataFrame shape: {df.shape}")
df.head()

## Basic Statistics

In [None]:
# Headline numbers for the whole workout history held in `df`
banner = "=" * 60
first_workout = df['created_at'].min()
last_workout = df['created_at'].max()
total_minutes = df['duration_minutes'].sum()
total_output = df['total_work_kj'].sum()

print(banner)
print("WORKOUT STATISTICS")
print(banner)

# Volume and time span
print(f"\nTotal Workouts: {len(df)}")
print(f"Date Range: {first_workout.date()} to {last_workout.date()}")
print(f"Total Days: {(last_workout - first_workout).days}")

# Totals
print(f"\nTotal Output: {total_output:.1f} kJ")
print(f"Total Time: {total_minutes:.0f} minutes ({total_minutes / 60:.1f} hours)")

# Per-workout averages
print(f"\nAverage Output per Workout: {df['total_work_kj'].mean():.1f} kJ")
print(f"Average Workout Duration: {df['duration_minutes'].mean():.1f} minutes")

## Workouts by Type

In [None]:
# Tally how many workouts fall under each fitness discipline
discipline_counts = df['fitness_discipline'].value_counts()

# Bar chart of the tally, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
discipline_counts.plot(kind='bar', ax=ax)
ax.set_title('Workouts by Type', fontsize=14, fontweight='bold')
ax.set_xlabel('Fitness Discipline')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

# Same numbers as text, below the figure
print("\nWorkouts by Type:", discipline_counts, sep="\n")

## Workout Frequency Over Time

In [None]:
# Workouts per calendar month.
#
# Adds a `year_month` Period column to `df` and plots the number of workouts
# in every month between the first and last recorded workout.
df['year_month'] = df['created_at'].dt.to_period('M')
monthly_counts = df.groupby('year_month').size()

if monthly_counts.empty:
    # Nothing to plot (no workouts, or none with a usable timestamp)
    print("No dated workouts to plot")
else:
    # groupby only returns months that contain at least one workout. Reindex
    # over the full month range so idle months appear as zero-height bars
    # instead of silently disappearing from the time axis.
    all_months = pd.period_range(
        monthly_counts.index.min(),
        monthly_counts.index.max(),
        freq='M',
        name='year_month',
    )
    monthly_counts = monthly_counts.reindex(all_months, fill_value=0)

    fig, ax = plt.subplots(figsize=(14, 6))
    monthly_counts.plot(kind='bar', ax=ax)
    ax.set_title('Workouts per Month', fontsize=14, fontweight='bold')
    ax.set_xlabel('Month')
    ax.set_ylabel('Number of Workouts')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    plt.show()

## Output Over Time

In [None]:
# Total output of cycling workouts over time, with a rolling-mean trend line.

# Number of consecutive workouts (rows, not calendar days) in the rolling mean
ROLLING_WINDOW = 30

# Keep only cycling workouts that recorded some output. Sort chronologically
# here as well, so the rolling mean and the line plot do not depend on `df`
# having been sorted by an earlier cell.
cycling = (
    df[(df['fitness_discipline'] == 'cycling') & (df['total_work_kj'] > 0)]
    .sort_values('created_at')
    .copy()
)

if len(cycling) > 0:
    # min_periods=1 lets the average start at the first workout instead of
    # being NaN until a full window is available
    cycling['rolling_avg'] = (
        cycling['total_work_kj']
        .rolling(window=ROLLING_WINDOW, min_periods=1)
        .mean()
    )

    fig, ax = plt.subplots(figsize=(14, 6))
    ax.scatter(cycling['created_at'], cycling['total_work_kj'], alpha=0.5)
    ax.plot(
        cycling['created_at'],
        cycling['rolling_avg'],
        color='red',
        linewidth=2,
        label=f'{ROLLING_WINDOW}-workout average',
    )

    ax.set_title('Total Output Over Time (Cycling)', fontsize=14, fontweight='bold')
    ax.set_xlabel('Date')
    ax.set_ylabel('Total Output (kJ)')
    ax.legend()
    fig.tight_layout()
    plt.show()
else:
    print("No cycling workouts with output data found")

## Top Instructors

In [None]:
# Ten instructors with the most workouts. value_counts() sorts descending and
# ignores workouts that have no instructor name.
instructor_counts = df['instructor_name'].value_counts().head(10)

if instructor_counts.empty:
    # Plotting an empty series fails, so report the situation instead
    print("No workouts with an instructor found")
else:
    fig, ax = plt.subplots(figsize=(10, 6))
    instructor_counts.plot(kind='barh', ax=ax)
    # barh draws the first row at the bottom; flip the axis so the most
    # frequent instructor is at the top of the chart
    ax.invert_yaxis()
    ax.set_title('Top 10 Instructors', fontsize=14, fontweight='bold')
    ax.set_xlabel('Number of Workouts')
    ax.set_ylabel('Instructor')
    fig.tight_layout()
    plt.show()

    print("\nTop 10 Instructors:")
    print(instructor_counts)

## Workout Duration Distribution

In [None]:
# Histogram of workout durations with the mean marked by a dashed line
mean_duration = df['duration_minutes'].mean()

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['duration_minutes'], bins=30, edgecolor='black')
ax.axvline(
    mean_duration,
    color='red',
    linestyle='--',
    label=f"Mean: {mean_duration:.1f} min",
)
ax.set_title('Workout Duration Distribution', fontsize=14, fontweight='bold')
ax.set_xlabel('Duration (minutes)')
ax.set_ylabel('Frequency')
ax.legend()
fig.tight_layout()
plt.show()

## Next Steps

Now that you have a basic understanding of your data, you can:

1. **Dive deeper into performance metrics**: Analyze output per minute, compare workouts by duration
2. **Time-based analysis**: Look at best times of day, day of week patterns
3. **Instructor analysis**: Compare difficulty and output by instructor
4. **Goal tracking**: Track personal bests, streaks, milestones
5. **Advanced visualizations**: Create dashboards, heatmaps, trend lines

Check out the other notebooks for more analysis!