In [1]:
import sys
import requests
import json
import pandas as pd
import os
from pprint import pprint

In [2]:
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path+"/codes")

import peloton_api_toolkit as api_tool
import peloton_data_toolkit as data_tool

In [3]:
api_base_url = "https://api.onepeloton.com"
session = requests.Session()
userID = api_tool.get_user_id(session)

path_user_workout = f"/api/user/{userID}/workouts"

params_user_workout = {
    'page': 0,
    'limit': 1,
    'joins': 'ride',
    'sort_by': 'created'}

response = session.get(api_base_url + path_user_workout, params = params_user_workout)

response_json = json.loads(response.text)    

In [5]:
pprint(response_json, depth = 1)

{'aggregate_stats': [],
 'count': 1,
 'data': [...],
 'limit': 1,
 'next': {...},
 'page': 0,
 'page_count': 1768,
 'show_next': True,
 'show_previous': False,
 'sort_by': '-device_time_created_at,-pk',
 'summary': {...},
 'total': 1768,
 'total_heart_rate_zone_durations': {...}}


There are total of 1768 workouts, shown in chunk of 10 items in 176 pages.
To extract all workout, we should loop through each page, and save each page as a separate json file in tmp folder.

See `get_all_user_workouts` function [here](~/codes/peloton_api_toolkit.py)

In [7]:
# Example usage (replace with your actual values):
api_base_url = "https://api.onepeloton.com"
session = requests.Session()
user_id = api_tool.get_user_id(session) # Assuming api_tool.get_user_id() is defined

all_workouts_data = api_tool.get_all_user_workouts(api_base_url, user_id, session)

if all_workouts_data:
    print(f"Total workouts retrieved: {len(all_workouts_data)}")
    # Process the all_workouts_data as needed
else:
    print("Failed to retrieve workout data.")

Total workouts retrieved: 1768


Let's save the output for future purposes.

In [9]:
filepath = os.path.join("./all_workouts_data.json")

# to save
with open(filepath, 'w') as f:
    json.dump(all_workouts_data, f, indent=4) 

# to load
# with open(filepath, 'r') as f:
#     all_workouts_data = json.load(f)

The output looks simliar to workout data we extracted in the previous section. Unfortunately for us, this output does not include leaderboard metrics (boo...). We'll explore `performance_metrics` endpoint to get this data in later section, but let's forge on.

Example output `pprint(all_workouts_data[10], depth = 1)`

```
{'created': 1738117901,
 'created_at': 1738117901,
 'device_time_created_at': 1738099901,
 'device_type': 't21n8m2',
 'effort_zones': {...},
 'end_time': 1738119791,
 'fitbit_id': None,
 'fitness_discipline': 'strength',
 'has_leaderboard_metrics': False,
 'has_pedaling_metrics': False,
 'is_outdoor': False,
 'is_splits_personal_record': False,
 'is_total_work_personal_record': False,
 'metrics_type': None,
 'name': 'Strength Workout',
 'platform': 'tiger',
 'ride': {...},
 'service_id': None,
 'start_time': 1738117991,
 'status': 'COMPLETE',
 'strava_id': None,
 'timezone': 'America/New_York',
 'title': None,
 'total_music_audio_buffer_seconds': None,
 'total_music_audio_play_seconds': None,
 'total_video_buffering_seconds': 0,
 'total_video_watch_time_seconds': 0,
 'total_work': 0.0,
 'v2_total_video_buffering_seconds': 7,
 'v2_total_video_watch_time_seconds': 1855,
 'workout_type': 'class'}
```

Informations such as heart rate zone, workout name and description is is nested within the ride and effort_zone sections of the JSON response.  This nested structure makes direct extraction a bit more complex.  To simplify this process, I've created a helper function called extract_json_values. This function takes a dictionary and a "specification" input, which defines the JSON path to the values we want to extract. This allows us to easily navigate the nested structure and retrieve the desired data.

In [92]:
specifications = {
    'workout_id': 'id',
    'ride_id': 'ride.id',
    'instructor_id': 'ride.instructor_id',
    'workout_name': 'ride.title',
    'workout_type': 'name',
    'workout_average_difficulty': 'ride.difficulty_rating_avg',
    'worktout_difficulty_level': 'ride.difficulty_level',    
    'workout_description': 'ride.description',
    'workout_start_time': 'start_time',
    'workout_duration': 'ride.duration',
    'total_work': 'total_work',
    'total_effort_points': 'effort_zones.total_effort_points',
    'heart_rate_z1_duration': 'effort_zones.heart_rate_zone_durations.heart_rate_z1_duration',
    'heart_rate_z2_duration': 'effort_zones.heart_rate_zone_durations.heart_rate_z2_duration',
    'heart_rate_z3_duration': 'effort_zones.heart_rate_zone_durations.heart_rate_z3_duration',
    'heart_rate_z4_duration': 'effort_zones.heart_rate_zone_durations.heart_rate_z4_duration',
    'heart_rate_z5_duration': 'effort_zones.heart_rate_zone_durations.heart_rate_z5_duration',
    'is_total_work_personal_record': 'is_total_work_personal_record'
} 

workout_compiled = []

for workout_data in all_workouts_data:
    output = data_tool.extract_json_values(workout_data, specifications)
    workout_compiled.append(output)

df_workout_compiled = pd.DataFrame(workout_compiled)

# remove sensitive information
display_column_list = list(df_workout_compiled.columns[~df_workout_compiled.columns.str.contains('workout_id')])


df_workout_compiled[display_column_list].head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
ride_id,6a0a65e3bc2e4dcbbac20ae766a8abb3,a0219517a0354cb79161b3b0163046d7,05d9dc4fc17046ebb552d27dabd96cfb,6a0a65e3bc2e4dcbbac20ae766a8abb3,7fe6bb4ae4b442ed822047193bc742ba,16ffab115f524f5a8c539e2eb9931ca1,05d9dc4fc17046ebb552d27dabd96cfb,75b75ca4d7604b868d24a99c97aa498c,7721cefa7a0d4aa58d2166452dbedbd3,6a0a65e3bc2e4dcbbac20ae766a8abb3
instructor_id,c9fa21c2004c4544a7c35c28a6196c77,c9fa21c2004c4544a7c35c28a6196c77,c9fa21c2004c4544a7c35c28a6196c77,c9fa21c2004c4544a7c35c28a6196c77,c9fa21c2004c4544a7c35c28a6196c77,1f4d39cd181c4805a00cd0a53f6c9562,c9fa21c2004c4544a7c35c28a6196c77,7f3de5e78bb44d8591a0f77f760478c3,304389e2bfe44830854e071bffc137c9,c9fa21c2004c4544a7c35c28a6196c77
workout_name,10 min Full Body Stretch,"30 min Density Training: Week 3, Day 3",5 min Full Body Warm Up,10 min Full Body Stretch,"20 min Density Training: Week 3, Day 2",30 min Intervals Row,5 min Full Body Warm Up,10 min Lower Body Stretch,45 min Power Zone Endurance Ride,10 min Full Body Stretch
workout_type,Stretching Workout,Strength Workout,Strength Workout,Stretching Workout,Strength Workout,Rowing Workout,Strength Workout,Stretching Workout,Cycling Workout,Stretching Workout
workout_average_difficulty,3.5994,7.821,3.8109,3.5994,7.4426,7.6909,3.8109,4.1108,6.0165,3.5994
worktout_difficulty_level,,intermediate,,,intermediate,,,,,
workout_description,You're not finished yet. Take this 10-min. str...,This full-body workout uses a series of heavy-...,This 5-min. full body warm up is designed to p...,You're not finished yet. Take this 10-min. str...,This accessory-based workout uses hypertrophy ...,Increase your strength and endurance in this i...,This 5-min. full body warm up is designed to p...,Recovery starts here! Your muscles will thank ...,Train smart with 7 zones of output customized ...,You're not finished yet. Take this 10-min. str...
workout_start_time,1738449690,1738447754,1738447362,1738291058,1738289699,1738287802,1738287294,1738203646,1738200781,1738119790
workout_duration,600,1800,300,600,1200,1800,300,600,2700,600
total_work,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,607471.01,0.0


Unfortunately, a quick exploratory data analysis (EDA) revealed significant missing data: a large portion of heart rate zone information is absent, and over half of the workouts have zero total effort points, indicating incompleteness.

In [93]:
percentage_missing = (df_workout_compiled.isna().sum() / len(df_workout_compiled)) * 100
percentage_zero = ((df_workout_compiled == 0).sum() / len(df_workout_compiled)) * 100
df_percent_missing_or_zero = pd.DataFrame({'pct_value_missing': percentage_missing, 'pct_value_zero': percentage_zero})
df_percent_missing_or_zero

Unnamed: 0,pct_value_missing,pct_value_zero
workout_id,0.0,0.0
ride_id,0.0,0.0
instructor_id,4.524887,0.0
workout_name,0.0,0.0
workout_type,0.0,0.0
workout_average_difficulty,4.524887,0.056561
worktout_difficulty_level,78.61991,0.0
workout_description,4.524887,0.0
workout_start_time,0.0,0.0
workout_duration,0.0,0.056561


I initially suspected inconsistent tracking of certain workout types, like stretching, might explain the missing data. However, the high number of missing values in strength workouts contradicts this, as I consistently wear a heart rate monitor during strength training to track intensity.

In [94]:
missing_effort = df_workout_compiled[df_workout_compiled['heart_rate_z1_duration'].isna()]\
    .groupby('workout_type')['workout_id']\
    .count()

total_workout = df_workout_compiled\
    .groupby('workout_type')['workout_id']\
    .count()

pct_missing_effort = round(missing_effort / total_workout * 100, 2)

df_total_effort_missing = pd.DataFrame({'count_missing': missing_effort, 'total_count': total_workout, 'pct_missing': pct_missing_effort})
df_total_effort_missing

Unnamed: 0_level_0,count_missing,total_count,pct_missing
workout_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bike Bootcamp Workout,1,1,100.0
Cardio Workout,8,17,47.06
Cycling Workout,204,726,28.1
Lanebreak Ride,10,23,43.48
Meditation Workout,18,23,78.26
Rowing Workout,7,31,22.58
Running Workout,9,10,90.0
Strength Workout,206,396,52.02
Stretching Workout,239,474,50.42
Walking Workout,50,66,75.76


Perhaps the `user-workout` endpoint is not as stable as I have hoped. 

In [114]:
workoutIds = list(df_workout_compiled[df_workout_compiled['heart_rate_z1_duration'].isna()].query('workout_type == "Strength Workout"')['workout_id'])
len(workoutIds)

206

In [108]:
api_base_url = "https://api.onepeloton.com"
session = requests.Session()
userID = api_tool.get_user_id(session)

missing_value_workouts = [] 

for workoutId in workoutIds:

    workoutId = workoutId
    path_workout = f"/api/workout/{workoutId}"
        
    response = session.get(api_base_url + path_workout)
    response_json = json.loads(response.text) 
    missing_value_workouts.append(response_json)

In [112]:
specifications = {
    'workout_id': 'id',
    'ride_id': 'ride.id',
    'instructor_id': 'ride.instructor_id',
    'workout_name': 'ride.title',
    'workout_type': 'name',
    'workout_average_difficulty': 'ride.difficulty_rating_avg',
    'worktout_difficulty_level': 'ride.difficulty_level',    
    'workout_description': 'ride.description',
    'workout_start_time': 'start_time',
    'workout_duration': 'ride.duration',
    'total_work': 'total_work',
    'avg_effort_points': 'effort_zones.average_effort_score',
    'heart_rate_z1_duration': 'total_heart_rate_zone_durations.heart_rate_z1_duration',
    'heart_rate_z2_duration': 'total_heart_rate_zone_durations.heart_rate_z2_duration',
    'heart_rate_z3_duration': 'total_heart_rate_zone_durations.heart_rate_z3_duration',
    'heart_rate_z4_duration': 'total_heart_rate_zone_durations.heart_rate_z4_duration',
    'heart_rate_z5_duration': 'total_heart_rate_zone_durations.heart_rate_z5_duration',
    'is_total_work_personal_record': 'is_total_work_personal_record'
} 

workout_compiled = []

for workout_data in missing_value_workouts:
    output = data_tool.extract_json_values(workout_data, specifications)
    workout_compiled.append(output)

df_missing_workout_compiled = pd.DataFrame(workout_compiled)

# remove sensitive information
#display_column_list = list(df_missing_workout_compiled.columns[~df_missing_workout_compiled.columns.str.contains('workout_id')])

percentage_missing = (df_missing_workout_compiled.isna().sum() / len(df_missing_workout_compiled)) * 100
percentage_zero = ((df_missing_workout_compiled == 0).sum() / len(df_missing_workout_compiled)) * 100
df_percent_missing_or_zero = pd.DataFrame({'pct_value_missing': percentage_missing, 'pct_value_zero': percentage_zero})
df_percent_missing_or_zero

Unnamed: 0,pct_value_missing,pct_value_zero
workout_id,0.0,0.0
ride_id,0.0,0.0
instructor_id,0.0,0.0
workout_name,0.0,0.0
workout_type,0.0,0.0
workout_average_difficulty,0.0,0.0
worktout_difficulty_level,34.466019,0.0
workout_description,0.0,0.0
workout_start_time,0.0,0.0
workout_duration,0.0,0.0


Unfortuantely, agumenting heartrate zone `workout` endpoint will still result in about 20% of workout to be missing heart rates.
It is not immediately clear to me why these information may be missing, and I'll keep an eye on this rate.

In [131]:
type_dict = {
    'workout_start_time':'datetime'  
}

df_missing_workout_compiled = data_tool.coerce_columns(df_missing_workout_compiled, type_dict = type_dict, date_unit = 's')


df_heart_rate_missing = df_missing_workout_compiled.query("heart_rate_z1_duration.isna()")
df_heart_rate_missing['year'] = df_heart_rate_missing['workout_start_time'].dt.year
df_heart_rate_missing['month'] = df_heart_rate_missing['workout_start_time'].dt.month

df_heart_rate_missing.groupby(['year', 'month'])\
    .size().reset_index(name='count')\
    .pivot_table(
    index = 'month',
    columns = 'year',
    values = 'count')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_heart_rate_missing['year'] = df_heart_rate_missing['workout_start_time'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_heart_rate_missing['month'] = df_heart_rate_missing['workout_start_time'].dt.month


year,2020,2021,2023,2024,2025
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,,17.0,3.0,2.0,4.0
2,,13.0,2.0,,
3,,1.0,2.0,2.0,
4,7.0,,,1.0,
5,9.0,,,1.0,
6,2.0,,,2.0,
7,,,,1.0,
9,,,,5.0,
10,,,4.0,2.0,
11,,,,1.0,


Another potential endpoint to use is `workout performance_graph`.

In [150]:
workoutIds = list(df_heart_rate_missing.query("year == 2025")['workout_id'])

api_base_url = "https://api.onepeloton.com"
session = requests.Session()
userID = api_tool.get_user_id(session)

missing_value_workouts = [] 

for workoutId in workoutIds:

    workoutId = workoutId
    path_workout = f"/api/workout/{workoutId}/performance_graph"
        
    response = session.get(api_base_url + path_workout)
    response_json = json.loads(response.text) 
    missing_value_workouts.append(response_json)

specifications = {
    'workout_summaries': 'summaries'
} 

workout_compiled = []

for workout_data in missing_value_workouts:
    output = data_tool.extract_json_values(workout_data, specifications)
    workout_compiled.append(output)

df_missing_workout_compiled = pd.DataFrame(workout_compiled)
df_missing_workout_compiled

Unnamed: 0,workout_summaries
0,"[{'display_name': 'Calories', 'display_unit': ..."
1,"[{'display_name': 'Calories', 'display_unit': ..."
2,"[{'display_name': 'Calories', 'display_unit': ..."
3,"[{'display_name': 'Calories', 'display_unit': ..."
