In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scienceplots

# Plot styling. Statement order is kept deliberately: the palette handle is
# captured before the matplotlib style sheets are applied.
sns.set_palette(palette="viridis")
palette = sns.color_palette()
# NOTE(review): 'science' and 'grid' are presumably the style sheets made
# available by the `scienceplots` import — confirm.
plt.style.use(["science", "grid"])

# cleaning physiological cycles

In [3]:
# Load the raw physiological-cycle export and preview its first rows.
physio = pd.read_csv("raw/physiological_cycles.csv")

physio.head(n=5)

Unnamed: 0,Cycle start time,Cycle end time,Cycle timezone,Recovery score %,Resting heart rate (bpm),Heart rate variability (ms),Skin temp (celsius),Blood oxygen %,Day Strain,Energy burned (cal),...,Respiratory rate (rpm),Asleep duration (min),In bed duration (min),Light sleep duration (min),Deep (SWS) duration (min),REM duration (min),Awake duration (min),Sleep need (min),Sleep debt (min),Sleep efficiency %
0,2023-03-22 23:34:04,,UTCZ,65.0,53.0,61.0,34.39,94.85,,,...,14.1,511.0,569.0,288.0,111.0,112.0,58.0,548.0,47.0,89.0
1,2023-03-22 01:32:38,2023-03-22 23:34:04,UTCZ,34.0,63.0,44.0,34.79,94.21,13.5,2252.0,...,15.7,407.0,484.0,248.0,95.0,64.0,77.0,511.0,35.0,84.0
2,2023-03-21 01:54:36,2023-03-22 01:32:38,UTCZ,24.0,63.0,41.0,35.2,95.53,2.5,1014.0,...,16.1,465.0,526.0,254.0,128.0,83.0,61.0,545.0,71.0,88.0
3,2023-03-20 01:32:34,2023-03-20 23:20:32,UTCZ,34.0,60.0,53.0,34.5,97.08,11.1,1989.0,...,15.5,346.0,376.0,186.0,90.0,70.0,30.0,491.0,0.0,92.0
4,2023-03-18 23:20:32,2023-03-20 01:32:34,UTCZ,95.0,51.0,88.0,34.7,96.0,11.5,2286.0,...,14.1,518.0,570.0,293.0,110.0,115.0,52.0,506.0,6.0,91.0


In [4]:
# Clean the physiological-cycle frame: drop the cycle bookkeeping columns,
# parse the sleep timestamps, normalise the headers, keep complete rows only,
# and write the result to disk.
physio = physio.drop(columns=['Cycle start time', 'Cycle end time', 'Cycle timezone'])

date_columns = ['Sleep onset', 'Wake onset']

# Parse the timestamp columns while they still carry their original names.
for col in date_columns:
    physio[col] = pd.to_datetime(physio[col])

# Lower-case the headers, replace spaces with underscores and strip parentheses.
# regex=False forces literal matching on every pandas version; '(' and ')' are
# regex metacharacters, so relying on the version-dependent default is fragile.
physio.columns = (
    physio.columns
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Discard every row that has a missing value in any remaining column.
physio = physio.dropna()

physio.to_csv('physiological_cycles_cleaned.csv', index=False)

# cleaning sleep data

In [61]:
# Load the raw sleep export and drop the cycle bookkeeping columns.
# The drop is a plain reassignment rather than `inplace=True`, so the whole
# cleaning step reads as one non-mutating pipeline.
sleep = (
    pd.read_csv('raw/sleeps.csv')
    .drop(columns=['Cycle start time', 'Cycle end time', 'Cycle timezone'])
)

# Lower-case the headers, replace spaces with underscores and strip parentheses.
# regex=False forces literal matching on every pandas version; '(' and ')' are
# regex metacharacters, so relying on the version-dependent default is fragile.
sleep.columns = (
    sleep.columns
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Discard every row that has a missing value in any remaining column.
sleep = sleep.dropna()

sleep.to_csv('sleeps_cleaned.csv', index=False)

# cleaning workout data

In [5]:
# Load the raw workout export and drop the cycle bookkeeping and GPS-flag columns.
# The drop is a plain reassignment rather than `inplace=True`, so the whole
# cleaning step reads as one non-mutating pipeline.
workout = (
    pd.read_csv('raw/workouts.csv')
    .drop(columns=['Cycle start time', 'Cycle end time', 'Cycle timezone', 'GPS enabled'])
)

# Lower-case the headers, replace spaces with underscores and strip parentheses.
# regex=False forces literal matching on every pandas version; '(' and ')' are
# regex metacharacters, so relying on the version-dependent default is fragile.
workout.columns = (
    workout.columns
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Discard every row that has a missing value in any remaining column.
workout = workout.dropna()

workout.to_csv('workouts_cleaned.csv', index=False)

In [2]:
# Load the raw Strava activity export and report how many rows it contains.
strava = pd.read_csv('raw/strava_activities.csv')

print(len(strava))

# Drop every column whose header contains an HTML '<span' fragment.
columns_to_drop = [col for col in strava.columns if '<span' in col]
strava = strava.drop(columns=columns_to_drop)

# Keep only columns that are populated in at least half of the rows.
# `thresh` is documented as an integer count of non-NA values; (n + 1) // 2 is
# the integer ceiling of n / 2, which selects exactly the same columns as the
# float threshold 0.5 * n because the non-NA count is itself an integer.
strava = strava.dropna(axis=1, thresh=(len(strava) + 1) // 2)

# Lower-case the headers, replace spaces with underscores and strip parentheses.
# regex=False forces literal matching on every pandas version; '(' and ')' are
# regex metacharacters, so relying on the version-dependent default is fragile.
strava.columns = (
    strava.columns
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

def categorize_text(text):
    """Map a free-text description onto one of three coarse labels.

    Matching is case-insensitive and substring-based, checked in this order:

    * contains 'light' or 'restorative'  -> 'restorative'
    * contains 'moderate' or 'optimal'   -> 'optimal'
    * contains 'overreaching'            -> 'overreaching'

    Parameters
    ----------
    text : str or any
        The description to classify. Non-string values (e.g. NaN or None)
        are returned unchanged, so the function can be applied directly to a
        column that contains missing values.

    Returns
    -------
    The matching label, or ``text`` itself when nothing matches.
    """
    if not isinstance(text, str):
        return text

    # Lower-case once instead of once per keyword test.
    lowered = text.lower()

    if 'light' in lowered or 'restorative' in lowered:
        return 'restorative'
    if 'moderate' in lowered or 'optimal' in lowered:
        return 'optimal'
    if 'overreaching' in lowered:
        return 'overreaching'
    return text  # Keep original if no match

# Collapse the free-text activity descriptions into coarse labels; non-string
# values (e.g. NaN) are passed through untouched, then the column is stored as
# a categorical.
strava['activity_description'] = (
    strava['activity_description']
    .apply(lambda x: categorize_text(x) if isinstance(x, str) else x)
    .astype('category')
)

strava['activity_type'] = strava['activity_type'].astype('category')

# NOTE(review): no explicit `format=` is passed, so pandas has to guess the
# date layout. Consider pinning the format once the raw layout is confirmed.
strava['activity_date'] = pd.to_datetime(strava['activity_date'])

strava = strava.drop(columns=[
    'activity_id', 'activity_name', 'media', 'from_upload', 'filename',
    'commute', 'commute.1', 'distance', 'elapsed_time', 'relative_effort',
])

# Discard every row that has a missing value in any remaining column.
strava = strava.dropna()

# Strip the '.1' marker from the remaining headers (presumably the suffix
# pandas adds to duplicated CSV column names — confirm against the raw file).
# regex=False is required for a literal match: with regex matching the '.'
# would act as a wildcard and e.g. 'lap10' would be mangled to 'lap0'.
strava.columns = strava.columns.str.replace('.1', '', regex=False)

strava.to_csv('strava_activities_cleaned.csv', index=False)

165


  strava['activity_date'] = pd.to_datetime(strava['activity_date'])
