# Cycling Analysis

This Jupyter Notebook contains some analysis of several months of data taken off of my Wahoo. I initially wanted to take a look at this data as Strava wasn't showing me my cadence, and ended up having some fun generating additional charts that I couldn't get from Strava.

Note: You will need to open this notebook in nbviewer.jupyter.org to see the plotly plots.

In [1]:
# imports
import pandas as pd
import numpy as np
import plotly.offline as py
import plotly.graph_objs as go
from plotly import tools
import warnings

# libraries for linear regression
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# parsing .fit files
from fitparse import FitFile, FitParseError

# system imports
import glob
import itertools

In [2]:
# settings
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', -1)

# initialize plotly for offline use
py.init_notebook_mode(connected=True)

# supress warnings
warnings.filterwarnings('ignore')

In [3]:
fit_files = glob.glob("./data/*.fit")
csv_file = glob.glob("./data/*.csv")

In [4]:
# loop through our fit files and store the data into a dataframe - only if the csv hasn't been generated

if csv_file:
    sessions = pd.read_csv("./data/sessions_data.csv", parse_dates=["start_time"])
else:
    ind = 0

    for file in fit_files:
        fit_file = FitFile(file)

        # skip files that don't parse properly
        try:
            fit_file.parse()
        except FitParseError:
            continue

        session = [ses for ses in fit_file.get_messages('session')][0]
        session = session.get_values()

        # clean up the hr zone column
        if "time_in_hr_zone" in session.keys():
            session["time_in_hr_zone1"] = session["time_in_hr_zone"][0]
            session["time_in_hr_zone2"] = session["time_in_hr_zone"][1]
            session["time_in_hr_zone3"] = session["time_in_hr_zone"][2]
            session["time_in_hr_zone4"] = session["time_in_hr_zone"][3]
            session["time_in_hr_zone5"] = session["time_in_hr_zone"][4]
            del session["time_in_hr_zone"]

        if ind == 0:
            sessions = pd.DataFrame(columns=session.keys())
        session = pd.DataFrame(session, index=[ind])
        sessions = pd.concat([session, sessions], sort=False)
        ind += 1
    # write to csv so that we only need to parse once
    sessions.to_csv("./data/sessions_data.csv")
    
    

In [5]:
sessions = sessions.sort_values(by="start_time", ascending=False)

In [6]:
# some unit conversions and datatype cleansing
for col in sessions.columns:
    if "speed" in col:
        sessions[col] = sessions[col] * 2.23694
    elif "altitude" in col:
        sessions[col] = sessions[col] * 3.2808
    elif "distance" in col:
        sessions[col] = sessions[col] * 0.000621371
    # fix the cadence datatype
    elif "cadence" in col:
        sessions[col] = pd.to_numeric(sessions[col])
        
sessions["total_ascent"] = pd.to_numeric(sessions["total_ascent"])

# filter out the rides that weren't workouts - I know this because I only wear a hearbeat sensor on workout rides
sessions = sessions[sessions.time_in_hr_zone1.notnull()]

In [7]:
# output our dtypes to ensure everything looks accurate
sessions.dtypes

Unnamed: 0               int64         
event                    object        
event_type               object        
timestamp                object        
start_time               datetime64[ns]
total_elapsed_time       float64       
total_timer_time         float64       
enhanced_avg_speed       float64       
avg_speed                float64       
enhanced_max_speed       float64       
max_speed                float64       
total_distance           float64       
avg_cadence              int64         
max_cadence              int64         
min_heart_rate           float64       
avg_heart_rate           float64       
max_heart_rate           float64       
enhanced_min_altitude    float64       
min_altitude             float64       
enhanced_avg_altitude    float64       
avg_altitude             float64       
enhanced_max_altitude    float64       
max_altitude             float64       
max_neg_grade            float64       
avg_grade                float64       


In [8]:
# output summary statistics
sessions.describe()

Unnamed: 0.1,Unnamed: 0,total_elapsed_time,total_timer_time,enhanced_avg_speed,avg_speed,enhanced_max_speed,max_speed,total_distance,avg_cadence,max_cadence,min_heart_rate,avg_heart_rate,max_heart_rate,enhanced_min_altitude,min_altitude,enhanced_avg_altitude,avg_altitude,enhanced_max_altitude,max_altitude,max_neg_grade,avg_grade,max_pos_grade,total_calories,avg_temperature,max_temperature,total_ascent,total_descent,num_laps,threshold_power,time_in_hr_zone1,time_in_hr_zone2,time_in_hr_zone3,time_in_hr_zone4,time_in_hr_zone5
count,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0
mean,30.947368,9417.657895,6749.5,15.325982,15.325982,31.652583,31.652583,27.962647,72.473684,116.684211,94.631579,154.842105,178.894737,231.279133,231.279133,353.117684,353.117684,544.405592,544.405592,-4.106579,0.116053,5.282632,1646.763158,24.421053,30.026316,210.552632,205.447368,1.0,200.0,329.071474,1164.293526,3240.473526,1937.332553,16.417105
std,18.778866,5276.339124,4280.741472,2.342435,2.342435,16.830624,16.830624,16.682182,8.401212,7.443863,21.023052,14.402564,13.939618,944.082504,944.082504,1039.997377,1039.997377,1266.433861,1266.433861,2.792409,0.305911,3.366061,1115.182119,6.219549,8.128983,352.330365,349.401937,0.0,0.0,457.823415,1294.941541,2262.736471,2295.042843,54.880922
min,0.0,376.0,309.0,6.977016,6.977016,17.779199,17.779199,1.461135,46.0,104.0,36.0,88.0,111.0,-143.04288,-143.04288,-112.85952,-112.85952,-72.83376,-72.83376,-13.1,-0.06,1.32,22.0,12.0,17.0,8.0,8.0,1.0,200.0,0.0,0.0,0.0,0.0,0.0
25%,15.5,4550.5,4014.0,14.931015,14.931015,24.831152,24.831152,18.607685,69.0,112.0,86.0,149.25,177.25,-18.37248,-18.37248,1.96848,1.96848,30.51144,30.51144,-4.8925,0.01,2.7275,975.0,20.25,22.25,37.25,34.5,1.0,200.0,60.7035,381.309,1819.27975,480.302,0.0
50%,32.5,8771.5,4781.5,16.062348,16.062348,27.356658,27.356658,20.85057,74.5,116.5,98.5,158.5,182.0,1.31232,1.31232,27.55872,27.55872,58.72632,58.72632,-3.135,0.015,3.265,1180.0,24.0,30.5,51.5,50.0,1.0,200.0,122.0195,765.685,2533.8035,1529.5515,0.0
75%,47.25,14183.5,8417.75,16.834092,16.834092,30.320044,30.320044,33.138663,77.0,120.0,111.5,162.0,186.75,18.0444,18.0444,61.02288,61.02288,184.05288,184.05288,-2.555,0.0275,7.8975,1977.75,29.0,36.0,224.25,211.5,1.0,200.0,348.8645,1388.5595,4233.0275,2487.11225,0.0
max,63.0,20299.0,18253.0,17.929074,17.929074,121.110169,121.110169,66.370476,86.0,139.0,136.0,180.0,194.0,5482.87296,5482.87296,5777.4888,5777.4888,6335.2248,6335.2248,-0.97,1.6,11.94,5152.0,36.0,48.0,1324.0,1268.0,1.0,200.0,1656.278,6188.227,9268.64,12686.415,289.625


In [9]:
# Linear regression analysis function - this should work on any dataframe with numeric data,
# or on date data for the x_var with the x_time parameter set to True

def linear_regression(df, x_var, y_var, x_time=False):

    shuffled = df.sample(frac=1).reset_index(drop=True)

    X_keep = shuffled[x_var]
    test_size = int(round(X_keep.count()*.2))

    if x_time:
        X = (max(X_keep) - X_keep).dt.total_seconds()
        X = X.values.reshape(-1, 1)
    else:
        X = X_keep
    Y = shuffled[y_var]


    # set up training sets
    X_train = X[:-test_size]
    X_test = X[-test_size:]
    Y_train = Y[:-test_size]
    Y_test = Y[-test_size:]

    regr = linear_model.LinearRegression()

    # train the model
    regr.fit(X_train, Y_train)

    # make predictions with the testing set
    Y_pred = regr.predict(X_test)

    # # The coefficients
    # print('Coefficients: \n', regr.coef_)
    # # The mean squared error
    # print("Mean squared error: %.2f"
    #       % mean_squared_error(Y_test, Y_pred))
    # # Explained variance score: 1 is perfect prediction
    # print('Variance score: %.2f' % r2_score(Y_test, Y_pred))

    Y_pred = regr.predict(X)

    return (X_keep, Y_pred)

In [10]:
def time_depencies(df, x_time, variables):
    data = []
    buttons = []
    visible = [False]*len(variables)*2
    for i, var in enumerate(variables):
        trace = go.Scatter(
            x = sessions[x_time],
            y = sessions[var],
            mode = 'markers',
            visible = not (i > 0)
        )
        data.append(trace)

        # Trend line:
        X, Y = linear_regression(sessions, "start_time", var, x_time=True)

        trace_fit = go.Scatter(
            x = X,
            y = Y,
            mode = 'lines',
            name ='trend',
            visible = not (i > 0)
        )
        data.append(trace_fit)
        
        # Generate the update menu buttons
        new_visible = visible[:]
        new_visible[i*2] = True
        new_visible[i*2+1] = True
        button = dict(label = var,
                       method = 'update',
                       args = [{'visible':new_visible}, {'title': str(var) + " Over Time"}])
        buttons.append(button)
        
    updatemenus = list([
        dict(
             buttons=buttons,
             x=.1,
             xanchor="left",
             y=1.1,
             yanchor="top",
             showactive=False,
        ),
    ])


    layout = go.Layout(
        xaxis = dict(
            title=x_time
        ),
        title=str(variables[0]) + " Over Time",
        updatemenus=updatemenus,
    )
    return go.Figure(data=data, layout=layout)

py.iplot(time_depencies(sessions, "start_time", ["avg_cadence", "avg_speed", "total_distance", "total_ascent"]), filename="trends")

In [11]:
# Plot the fraction of time spend in various HR zones
zones = pd.melt(sessions, id_vars=["start_time"], value_vars=["time_in_hr_zone1", 
                                                              "time_in_hr_zone2", 
                                                              "time_in_hr_zone3",
                                                              "time_in_hr_zone4",
                                                              "time_in_hr_zone5"])


zgpby = zones.groupby(by="variable").value.sum()
zgpby = zgpby/zgpby.sum()
zgpby = zgpby.sort_index(ascending=False)
colors = ["#d90000", "#ff3d3d", "#ff7777", "#ffb2b2", "#ffeded"]

# Rename hr zone values
zgpby = zgpby.rename(lambda x: x.replace("time_in_hr_zone", "HR Zone ") ,axis="index")

trace = go.Pie(
    values = zgpby,
    labels = zgpby.index,
    name = "HR Zones",
    hoverinfo = "label+name+percent",
    hole = .4 ,
    marker = dict(colors=colors),
    sort = False
)

layout = go.Layout(
    annotations = [dict(
        font={"size": 20},
        showarrow=False,
        text="HR<br>Zones",
        x=.5,
        y=.5
    )]
)

data = [trace]

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="hr_zones")

In [12]:
# What about a histogram of speed?

trace1 = go.Histogram( 
    x = sessions.avg_cadence,
    name = "Average Cadence"
)

trace2 = go.Histogram(
    x = sessions.avg_speed,
    name = "Average Speed"
)

layout = go.Layout(
    title = "X"
)

fig = tools.make_subplots(rows=1, cols=2, subplot_titles=(["Average Cadence Distribution", "Average Speed Distribution"]))
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)
fig['layout'].update(height=400, width= 900, xaxis={"title":"RPM"}, xaxis2={"title":"MPH"},
                    yaxis={"title":"Number of Rides"})
py.iplot(fig, filename="hists")

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]



In [13]:
# Finally, let's see how many rides per week I've been on - seems they have been pretty steady - no surprise there

sessions_ts = sessions.set_index("start_time")
sessions_gpby = sessions_ts.groupby(pd.Grouper(freq='W')).avg_speed.count()

trace = go.Scatter(
    x = sessions_gpby.index,
    y = sessions_gpby,
    mode='lines'
)

data = [trace]

layout = go.Layout(
    title="Rides Per Week",
    xaxis=dict(
        title="Week Of"
    ),
    yaxis=dict(
        title="# of Rides"
    )
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename="rides")

In [14]:
# prepare a separate csv file output for a dash app
rename_cols = {
    "avg_cadence":"Average Cadence",
    "avg_speed":"Average Speed",
    "avg_grade":"Average Grade",
    "avg_heart_rate":"Average Heart Rate",
    "start_time":"Ride Date",
    "total_elapsed_time":"Ride Duration"
}

sessions.rename(rename_cols, axis="columns",inplace=True)
sessions[list(rename_cols.values())].to_csv("./data/dash_data.csv", index=False)