In [1]:
%matplotlib notebook

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sns.set_style('white')

df_run = pd.read_pickle('df_run.p')

df_run.head()

Unnamed: 0_level_0,id,athlete_count,distance,avg_speed,avg_hr,type,elevation_gain,latlng,pace,duration,zone
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2017-07-07 18:00:38,1176094669,1,5022.2,2.589,,Run,21.9,"(55.95, -3.19)",6.437492,32.333333,
2017-07-05 17:50:32,1173071844,1,4977.9,2.469,148.3,Run,20.5,"(55.95, -3.19)",6.750371,33.6,2.0
2017-07-01 16:30:48,1173071487,1,2889.1,2.586,160.1,Run,20.4,"(55.95, -3.19)",6.44496,18.616667,3.0
2017-06-28 19:55:15,1163118846,1,414.8,2.942,,Run,0.0,"(55.95, -3.19)",5.66508,2.35,
2017-06-28 19:30:59,1163118747,1,2732.6,2.598,,Run,75.0,"(55.96, -3.16)",6.415191,17.533333,


In [2]:
df_run['zone'].notnull().sum()

39

In [3]:
df_run['zone'].isnull().sum()

90

Problem: High proportion of Null for Heart Rate

Solution: Use ML to predict zone

In [4]:
# Seperate data
zone_train = df_run[df_run['zone'].notnull()]
zone_test = df_run[df_run['zone'].isnull()]
zone_train.head()

Unnamed: 0_level_0,id,athlete_count,distance,avg_speed,avg_hr,type,elevation_gain,latlng,pace,duration,zone
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2017-07-05 17:50:32,1173071844,1,4977.9,2.469,148.3,Run,20.5,"(55.95, -3.19)",6.750371,33.6,2.0
2017-07-01 16:30:48,1173071487,1,2889.1,2.586,160.1,Run,20.4,"(55.95, -3.19)",6.44496,18.616667,3.0
2017-06-26 17:58:31,1159027781,1,5041.2,2.778,163.3,Run,22.3,"(55.95, -3.19)",5.99952,30.25,4.0
2017-06-19 17:33:55,1159027395,1,4171.9,2.741,152.0,Run,17.8,"(55.95, -3.19)",6.080506,25.366667,2.0
2017-06-13 17:17:31,1138444604,1,3025.8,2.921,161.2,Run,19.9,"(55.95, -3.19)",5.705809,17.266667,3.0


In [5]:
# Drop uneccessary cols
# pace correlates with distance and duration, so drop
col_drop = ['id', 'athlete_count', 'avg_speed', 'type', 'latlng', 'avg_hr', 'pace']

zone_train = zone_train.drop(col_drop, axis=1)
zone_test = zone_test.drop(col_drop, axis=1)
zone_train.head()

Unnamed: 0_level_0,distance,elevation_gain,duration,zone
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-07-05 17:50:32,4977.9,20.5,33.6,2.0
2017-07-01 16:30:48,2889.1,20.4,18.616667,3.0
2017-06-26 17:58:31,5041.2,22.3,30.25,4.0
2017-06-19 17:33:55,4171.9,17.8,25.366667,2.0
2017-06-13 17:17:31,3025.8,19.9,17.266667,3.0


In [6]:
# Check correlation

zone_train.corr()

Unnamed: 0,distance,elevation_gain,duration,zone
distance,1.0,0.56788,0.994291,-0.050964
elevation_gain,0.56788,1.0,0.554582,0.3703
duration,0.994291,0.554582,1.0,-0.053661
zone,-0.050964,0.3703,-0.053661,1.0


In [8]:
zone_train.shape
# Data size too small

(39, 4)

In [36]:
sns.lmplot(x='avg_hr', y='pace', data=df_run)

<IPython.core.display.Javascript object>

<seaborn.axisgrid.FacetGrid at 0x185ac993780>

In [37]:
# Drop outliers


In [38]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [46]:
X = zone_train.drop('zone', axis=1)
y = zone_train['zone']

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size= 0.10,
                                                    random_state=1)

### Using Linear Regression Model

In [47]:
model_lr = LinearRegression().fit(X_train, y_train)
print('Training set accuracy: {:.2f}:'.format(model_lr.score(X_train, y_train)))
print('Test set accuracy: {:.2f}:'.format(model_lr.score(X_test, y_test)))

Training set accuracy: 0.20:
Test set accuracy: 0.35:


In [48]:
model_forest = RandomForestRegressor(n_estimators=6).fit(X_train, y_train)
print('Training set accuracy: {:.2f}:'.format(model_forest.score(X_train, y_train)))
print('Test set accuracy: {:.2f}:'.format(model_forest.score(X_test, y_test)))

Training set accuracy: 0.73:
Test set accuracy: -0.00:


In [49]:
y_test

date
2017-06-26 17:58:31    4.0
2016-10-31 17:11:02    2.0
2017-06-19 17:33:55    2.0
2016-11-26 15:46:12    5.0
Name: zone, dtype: float64

In [50]:
X_test

Unnamed: 0_level_0,distance,elevation_gain,duration
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-06-26 17:58:31,5041.2,22.3,30.25
2016-10-31 17:11:02,3359.6,30.2,20.466667
2017-06-19 17:33:55,4171.9,17.8,25.366667
2016-11-26 15:46:12,8696.3,160.1,54.483333


In [51]:
X_test['predicted zone'] = model_forest.predict(X_test)
X_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0_level_0,distance,elevation_gain,duration,predicted zone
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-06-26 17:58:31,5041.2,22.3,30.25,3.333333
2016-10-31 17:11:02,3359.6,30.2,20.466667,3.666667
2017-06-19 17:33:55,4171.9,17.8,25.366667,3.333333
2016-11-26 15:46:12,8696.3,160.1,54.483333,3.666667


In [None]:
# Time to get a new HRM