In [65]:
# imports
import pandas as pd
import numpy as np
import plotly.offline as py
import plotly.graph_objs as go

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Parsing fit files
from fitparse import FitFile, FitParseError

# system imports
import glob
import itertools

In [2]:
# settings
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', -1)

# initialize plotly for offline use
py.init_notebook_mode(connected=True)

In [3]:
fit_files = glob.glob("./data/*.fit")
csv_file = glob.glob("./data/*.csv")

In [6]:
# loop through our fit files and store the data into a dataframe - only if the csv hasn't been generated

if csv_file:
    sessions = pd.read_csv("./data/sessions_data.csv", parse_dates=True)
else:
    ind = 0

    for file in fit_files:
        fit_file = FitFile(file)

        # skip files that don't parse properly
        try:
            fit_file.parse()
        except FitParseError:
            continue

        session = [ses for ses in fit_file.get_messages('session')][0]
        session = session.get_values()

        # clean up the hr zone column
        if "time_in_hr_zone" in session.keys():
            session["time_in_hr_zone1"] = session["time_in_hr_zone"][0]
            session["time_in_hr_zone2"] = session["time_in_hr_zone"][1]
            session["time_in_hr_zone3"] = session["time_in_hr_zone"][2]
            session["time_in_hr_zone4"] = session["time_in_hr_zone"][3]
            session["time_in_hr_zone5"] = session["time_in_hr_zone"][4]
            del session["time_in_hr_zone"]

        if ind == 0:
            sessions = pd.DataFrame(columns=session.keys())
        session = pd.DataFrame(session, index=[ind])
        sessions = pd.concat([session, sessions], sort=False)
        ind += 1
    # write to csv so that we only need to parse once
    sessions.to_csv("./data/sessions_data.csv")
    
    

In [7]:
sessions = sessions.sort_values(by="start_time", ascending=False)

In [9]:
# some unit conversions and datatype cleansing
for col in sessions.columns:
    if "speed" in col:
        sessions[col] = sessions[col] * 2.23694
    elif "altitude" in col:
        sessions[col] = sessions[col] * 3.2808
    elif "distance" in col:
        sessions[col] = sessions[col] * 0.000621371
    # fix the cadence datatype
    elif "cadence" in col:
        sessions[col] = pd.to_numeric(sessions[col])
        
# filter out the rides that weren't workouts - I know this because I only wear a hearbeat sensor on workout rides
sessions = sessions[sessions.time_in_hr_zone1.notnull()]

In [10]:
sessions.describe()

Unnamed: 0,total_elapsed_time,total_timer_time,enhanced_avg_speed,avg_speed,enhanced_max_speed,max_speed,total_distance,avg_cadence,max_cadence,enhanced_min_altitude,min_altitude,enhanced_avg_altitude,avg_altitude,enhanced_max_altitude,max_altitude,max_neg_grade,avg_grade,max_pos_grade,time_in_hr_zone1,time_in_hr_zone2,time_in_hr_zone3,time_in_hr_zone4,time_in_hr_zone5
count,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0
mean,9417.657895,6749.5,15.325982,15.325982,31.652583,31.652583,27.962647,72.473684,116.684211,231.279133,231.279133,353.117684,353.117684,544.405592,544.405592,-4.106579,0.116053,5.282632,329.071474,1164.293526,3240.473526,1937.332553,16.417105
std,5276.339124,4280.741472,2.342435,2.342435,16.830624,16.830624,16.682182,8.401212,7.443863,944.082504,944.082504,1039.997377,1039.997377,1266.433861,1266.433861,2.792409,0.305911,3.366061,457.823415,1294.941541,2262.736471,2295.042843,54.880922
min,376.0,309.0,6.977016,6.977016,17.779199,17.779199,1.461135,46.0,104.0,-143.04288,-143.04288,-112.85952,-112.85952,-72.83376,-72.83376,-13.1,-0.06,1.32,0.0,0.0,0.0,0.0,0.0
25%,4550.5,4014.0,14.931015,14.931015,24.831152,24.831152,18.607685,69.0,112.0,-18.37248,-18.37248,1.96848,1.96848,30.51144,30.51144,-4.8925,0.01,2.7275,60.7035,381.309,1819.27975,480.302,0.0
50%,8771.5,4781.5,16.062348,16.062348,27.356658,27.356658,20.85057,74.5,116.5,1.31232,1.31232,27.55872,27.55872,58.72632,58.72632,-3.135,0.015,3.265,122.0195,765.685,2533.8035,1529.5515,0.0
75%,14183.5,8417.75,16.834092,16.834092,30.320044,30.320044,33.138663,77.0,120.0,18.0444,18.0444,61.02288,61.02288,184.05288,184.05288,-2.555,0.0275,7.8975,348.8645,1388.5595,4233.0275,2487.11225,0.0
max,20299.0,18253.0,17.929074,17.929074,121.110169,121.110169,66.370476,86.0,139.0,5482.87296,5482.87296,5777.4888,5777.4888,6335.2248,6335.2248,-0.97,1.6,11.94,1656.278,6188.227,9268.64,12686.415,289.625


In [115]:
# Linear regression analysis function

def linear_regression(df, x_var, y_var, x_time=False):

    shuffled = df.sample(frac=1).reset_index(drop=True)

    X_keep = shuffled[x_var]
    test_size = int(round(X_keep.count()*.2))

    if x_time:
        X = (max(X_keep) - X_keep).dt.total_seconds()
        X = X.values.reshape(-1, 1)
    else:
        X = X_keep
    Y = shuffled[y_var]


    # set up training sets
    X_train = X[:-test_size]
    X_test = X[-test_size:]
    Y_train = Y[:-test_size]
    Y_test = Y[-test_size:]

    regr = linear_model.LinearRegression()

    # train the model
    regr.fit(X_train, Y_train)

    # make predictions with the testing set
    Y_pred = regr.predict(X_test)

    # # The coefficients
    # print('Coefficients: \n', regr.coef_)
    # # The mean squared error
    # print("Mean squared error: %.2f"
    #       % mean_squared_error(Y_test, Y_pred))
    # # Explained variance score: 1 is perfect prediction
    # print('Variance score: %.2f' % r2_score(Y_test, Y_pred))

    Y_pred = regr.predict(X)

    return (X_keep, Y_pred)

In [116]:
# Let's group by time and look at how some different statistics vary

#sessions["bubble_display"] = 

#sessions_ts = sessions.set_index("start_time")
#sessions_gpby = sessions_ts.groupby(pd.Grouper(freq='W'))

def time_depencies(df, x_time, variables):
    data = []
    visible = [false]*len(variables)*2 
    for i, var in enumerate(variables):
        trace = go.Scatter(
            x = sessions[x_time],
            y = sessions[var],
            mode = 'markers',
            name ='cadence',
#             marker = dict(
#                 size = sessions.total_distance,
#                 sizemode = "area",
#                 sizeref = 2.*max(sessions.total_distance)/(40.**2),
#                 sizemin = 4
#             )
        )
        data.append(trace)

        X, Y = linear_regression(sessions, "start_time", var, x_time=True)

        trace_fit = go.Scatter(
            x = X,
            y = Y,
            mode = 'lines',
            name ='trend'
        )
        data.append(trace_fit)
        
        # Generate the update menu buttons
        new_visible = visible
        new_visible[i*2] = True
        new_visible[i*2-1] = True
        buttons = dict(label = var,
                       method = 'update',
                       args = [{'visible':new_visible},
                               {'title': str(var) + " Over Time"}])
        
    updatemenus = list([
        dict(
             buttons=buttons,
             x=.5,
             xanchor="left",
             y=1.1,
             yanchor="top",
             showactive=True,
        ),
    ])


    layout = go.Layout(
        title="Cadence Over Time",
        xaxis = dict(
            title=x_time
        ),
        updatemenus=updatemenus
    )
    return go.Figure(data=data, layout=layout)

py.iplot(time_depencies(sessions, "start_time", ["avg_cadence", "avg_speed", "total_distnace", "total_"]))

In [109]:
# Function to dynamically generate correlation plots with numeric data
def generate_crossfilter(df, variables):
    
    trace = go.Scatter(
        x = df[variables[0]],
        y = df[variables[1]],
        mode = 'markers',
        name =str(variables[0]) + " vs. " + str(variables[1]),
    )
    
    data = [trace]

    # Generate update menus
    # X-Axis:
    updatemenu_dicts_x = []
    updatemenu_dicts_y = []
    for var in variables:  
        updatemenu_dict_x = dict(label = var,
                               method = 'update',
                               args = [{
                                   'x':df[var], 'y':df["avg_cadence"]
                               }])
        updatemenu_dict_y = dict(label = var,
                               method = 'update',
                               args = [{
                                   'y':df[var]
                               }])
        updatemenu_dicts_x.append(updatemenu_dict_x)
        updatemenu_dicts_y.append(updatemenu_dict_y)
    
    
    updatemenus = list([
        dict(
             buttons=updatemenu_dicts_y,
             x=.5,
             xanchor="left",
             y=1.1,
             yanchor="top",
             showactive=True,
        ),
        dict(
             buttons=updatemenu_dicts_y,
             x=.05,
             xanchor="left",
             y=1.1,
             yanchor="top",
             showactive=True
        ),
    ])
    
    layout = go.Layout(
#         xaxis = dict(
#             title="X-Axis Selection"
#         ),
#         yaxis = dict(
#             title="Y-Axis Selection"
#         ),
        updatemenus=updatemenus
    )
    
    return (data, layout)
data, layout = generate_crossfilter(sessions, ["avg_speed", "avg_cadence", "total_distance"])
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)    

In [None]:
# Dynamically Generate Correlation Plots for Different variables:
# Speed
# Cadence
# Elevation


In [None]:
# Dynamically Generate Histograms

In [None]:
# finally, how does distance correlate with speed?

In [117]:
# additional study?
sessions

Unnamed: 0,event,event_type,timestamp,start_time,total_elapsed_time,total_timer_time,enhanced_avg_speed,avg_speed,enhanced_max_speed,max_speed,total_distance,avg_cadence,max_cadence,min_heart_rate,avg_heart_rate,max_heart_rate,enhanced_min_altitude,min_altitude,enhanced_avg_altitude,avg_altitude,enhanced_max_altitude,max_altitude,max_neg_grade,avg_grade,max_pos_grade,total_calories,avg_temperature,max_temperature,total_ascent,total_descent,sport,num_laps,threshold_power,time_in_hr_zone1,time_in_hr_zone2,time_in_hr_zone3,time_in_hr_zone4,time_in_hr_zone5
17,session,stop,2018-07-08 01:15:48,2018-07-07 19:37:29,20299.0,16418.0,9.978989,9.978989,42.993987,42.993987,45.510355,51,110,116,149,190,34.12032,34.12032,782.79888,782.79888,1633.18224,1633.18224,-13.1,0.16,10.45,3874,26,39,1324,1268,cycling,1,200,508.114,5342.748,6332.567,3680.217,38.429
59,session,stop,2018-07-06 19:52:41,2018-07-06 18:30:37,4924.0,4758.0,15.886748,15.886748,25.83442,25.83442,20.996188,67,119,101,148,172,-18.37248,-18.37248,9.8424,9.8424,43.96272,43.96272,-1.85,0.02,2.78,1027,30,35,50,41,cycling,1,200,353.174,1690.943,2554.565,148.512,0.0
37,session,stop,2018-07-04 21:11:56,2018-07-04 17:48:46,12190.0,8475.0,15.812929,15.812929,121.110169,121.110169,37.226964,70,139,82,151,183,13.1232,13.1232,40.68192,40.68192,78.7392,78.7392,-3.85,0.01,6.49,1935,25,48,88,70,cycling,1,200,248.254,2576.915,4980.874,644.893,0.0
13,session,stop,2018-07-04 02:46:14,2018-07-04 01:19:45,5189.0,4735.0,16.032149,16.032149,26.530108,26.530108,21.08625,66,109,86,150,171,6.5616,6.5616,32.808,32.808,59.71056,59.71056,-3.61,0.02,5.58,1058,23,25,47,38,cycling,1,200,335.936,1205.063,3008.668,175.129,0.0
42,session,stop,2018-07-01 23:15:31,2018-07-01 18:43:28,16323.0,8246.0,15.233561,15.233561,27.196717,27.196717,34.893555,67,114,88,160,182,5.24928,5.24928,38.71344,38.71344,68.8968,68.8968,-5.05,0.01,6.75,2089,36,40,73,62,cycling,1,200,77.326,968.732,4843.55,2327.905,0.0
30,session,stop,2018-06-30 21:38:57,2018-06-30 17:33:46,14711.0,5510.0,16.390059,16.390059,23.483396,23.483396,25.086742,76,115,87,164,187,12.46704,12.46704,36.0888,36.0888,54.46128,54.46128,-0.97,0.01,2.51,1440,35,41,22,11,cycling,1,200,222.369,233.016,2155.973,2882.015,0.0
36,session,stop,2018-06-24 20:47:42,2018-06-24 16:50:21,14241.0,8022.0,14.126276,14.126276,25.606252,25.606252,31.475933,59,126,109,159,194,-18.37248,-18.37248,0.0,0.0,24.27792,24.27792,-3.31,0.02,4.42,1992,34,42,72,76,cycling,1,200,191.37,1827.998,3336.945,2469.545,169.45
40,session,stop,2018-06-23 02:25:22,2018-06-23 01:14:40,4242.0,4086.0,17.262466,17.262466,27.516599,27.516599,19.591747,71,111,92,162,186,-6.5616,-6.5616,12.46704,12.46704,45.9312,45.9312,-2.08,0.01,2.75,1067,33,36,38,34,cycling,1,200,77.827,382.986,2061.932,1552.351,0.0
48,session,stop,2018-06-17 20:13:20,2018-06-17 18:53:37,4783.0,4314.0,15.783849,15.783849,22.552829,22.552829,18.914527,75,117,94,133,159,0.0,0.0,17.06016,17.06016,44.61888,44.61888,-2.23,0.02,2.56,660,23,26,32,33,cycling,1,200,1352.792,2499.104,443.289,0.0,0.0
38,session,stop,2018-06-16 21:58:16,2018-06-16 18:18:20,13196.0,12221.0,17.190884,17.190884,30.880957,30.880957,58.360811,75,120,119,158,183,-8.53008,-8.53008,64.30368,64.30368,183.06864,183.06864,-5.57,0.02,9.36,3081,29,36,243,243,cycling,1,200,54.678,1686.749,7957.601,2492.968,0.0
