In [35]:
import glob

import pandas as pd
import numpy as np

from joblib import Parallel, delayed, Memory

from skcycling.io import bikeread
from skcycling.extraction import acceleration
from skcycling.extraction import gradient_activity
from skcycling.extraction import gradient_elevation
from skcycling.extraction import gradient_heart_rate

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import QuantileTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import cross_validate

In [2]:
# Gather every .fit recording found in the user_2/2014 folder, ordered by
# file name, and preview the first five paths.
path_data = '/home/lemaitre/Documents/data/cycling/user_2/2014/*.fit'
filenames = glob.glob(path_data)
filenames.sort()
filenames[:5]

['/home/lemaitre/Documents/data/cycling/user_2/2014/2014-01-04-11-40-46.fit',
 '/home/lemaitre/Documents/data/cycling/user_2/2014/2014-01-05-08-15-15.fit',
 '/home/lemaitre/Documents/data/cycling/user_2/2014/2014-01-07-15-09-05.fit',
 '/home/lemaitre/Documents/data/cycling/user_2/2014/2014-01-08-15-05-10.fit',
 '/home/lemaitre/Documents/data/cycling/user_2/2014/2014-01-09-15-09-11.fit']

Cache the result of reading each file on disk so that re-running the notebook does not have to parse every file again.

In [3]:
# Wrap `bikeread` in a joblib Memory cache stored in the relative directory
# 'bikereadcache'; `bikeread_cached` is the cached version used below to
# load the activities.
memory = Memory(location='bikereadcache')
bikeread_cached = memory.cache(bikeread, verbose=1)

In [5]:
# Read every file through the cached reader, one joblib task per file;
# n_jobs=-1 lets joblib use all available CPU cores.
data = Parallel(n_jobs=-1)(delayed(bikeread_cached)(f) for f in filenames)

Keep only the activities that contain every required field and have no missing values.

In [8]:
# Retain an activity only when it exposes every required column and its
# table contains no missing value at all.
fields = ['elevation', 'cadence', 'distance', 'heart-rate', 'power', 'speed']

valid_data = [
    activity for activity in data
    if set(fields).issubset(activity.columns)
    and not pd.isnull(activity).any().any()
]

In [9]:
# Number of activities that passed the filter
len(valid_data)

52

In [10]:
# Number of activities that were read, before filtering
len(data)

122

In [11]:
# From here on `data` refers to the filtered activities only. This binds the
# same list object as `valid_data`, so later item assignments into `data`
# are visible through `valid_data` as well.
data = valid_data

In [12]:
# Preview the first rows of the first retained activity
data[0].head()

Unnamed: 0,elevation,cadence,distance,heart-rate,power,speed
2014-01-04 10:40:46,110.4,33.0,2.85,112.0,70.0,2.854
2014-01-04 10:40:47,110.4,35.0,5.96,112.0,51.0,3.11
2014-01-04 10:40:48,110.4,38.0,9.26,113.0,60.0,3.297
2014-01-04 10:40:49,110.4,40.0,12.77,113.0,50.0,3.506
2014-01-04 10:40:50,110.0,50.0,16.46,113.0,63.0,3.669


Compute the acceleration (speed gradient), the elevation gradient, and the heart-rate gradient. In addition, add information about the 10 previous records by computing the difference of selected fields over periods of 1 to 10 records.

In [13]:
# Columns for which lagged differences are computed, and the lags (in number
# of records) to use. Both are identical for every activity, so they are
# defined once under their own names instead of being rebuilt on each
# iteration and overwriting the `fields` list used to filter the activities.
gradient_fields = ['elevation', 'cadence', 'heart-rate', 'speed',
                   'gradient-elevation', 'gradient-heart-rate', 'acceleration']
gradient_periods = range(1, 11)

# NOTE(review): each entry of `data` is replaced by its augmented version, so
# this cell is meant to run once per kernel session; re-running it would
# presumably apply the transforms to already-augmented tables -- confirm.
for activity_idx, activity in enumerate(data):
    # derived signals, in this order: acceleration, elevation gradient,
    # heart-rate gradient
    activity = acceleration(activity)
    activity = gradient_elevation(activity)
    activity = gradient_heart_rate(activity)
    # lagged gradients of the selected columns over the last 10 records
    data[activity_idx] = gradient_activity(activity,
                                           periods=gradient_periods,
                                           columns=gradient_fields)

In [14]:
# Inspect the first activity after feature extraction
data[0]

Unnamed: 0_level_0,original,original,original,original,original,original,original,original,original,gradient_1,...,gradient_9,gradient_9,gradient_9,gradient_10,gradient_10,gradient_10,gradient_10,gradient_10,gradient_10,gradient_10
Unnamed: 0_level_1,elevation,cadence,distance,heart-rate,power,speed,acceleration,gradient-elevation,gradient-heart-rate,elevation,...,gradient-elevation,gradient-heart-rate,acceleration,elevation,cadence,heart-rate,speed,gradient-elevation,gradient-heart-rate,acceleration
2014-01-04 10:40:46,110.4,33.0,2.85,112.0,70.0,2.854,,,,,...,,,,,,,,,,
2014-01-04 10:40:47,110.4,35.0,5.96,112.0,51.0,3.110,,,,0.0,...,,,,,,,,,,
2014-01-04 10:40:48,110.4,38.0,9.26,113.0,60.0,3.297,,,,0.0,...,,,,,,,,,,
2014-01-04 10:40:49,110.4,40.0,12.77,113.0,50.0,3.506,,,,0.0,...,,,,,,,,,,
2014-01-04 10:40:50,110.0,50.0,16.46,113.0,63.0,3.669,,,,-0.4,...,,,,,,,,,,
2014-01-04 10:40:51,110.0,63.0,20.27,114.0,1.0,3.834,0.980,-0.022962,2.0,0.0,...,,,,,,,,,,
2014-01-04 10:40:52,110.0,50.0,24.03,113.0,25.0,3.763,0.653,-0.022136,1.0,0.0,...,,,,,,,,,,
2014-01-04 10:40:53,110.0,50.0,28.13,114.0,251.0,4.099,0.802,-0.021198,1.0,0.0,...,,,,,,,,,,
2014-01-04 10:40:54,110.0,85.0,32.15,115.0,58.0,4.015,0.509,-0.020640,2.0,0.0,...,,,,,,,,,,
2014-01-04 10:40:55,110.0,124.0,36.43,115.0,1.0,4.285,0.616,0.000000,2.0,0.0,...,,,,,,,,,,


In [43]:
# Clean each activity in place: infinite values are first turned into NaN,
# then every NaN is replaced by the mean of its column within that activity.
infinite_values = [np.inf, -np.inf]
for activity in data:
    activity.replace(infinite_values, np.nan, inplace=True)
    column_means = activity.mean()
    activity.fillna(column_means, inplace=True)

In [44]:
# Preview the first activity again after the missing values were filled
data[0].head()

Unnamed: 0_level_0,original,original,original,original,original,original,original,original,original,gradient_1,...,gradient_9,gradient_9,gradient_9,gradient_10,gradient_10,gradient_10,gradient_10,gradient_10,gradient_10,gradient_10
Unnamed: 0_level_1,elevation,cadence,distance,heart-rate,power,speed,acceleration,gradient-elevation,gradient-heart-rate,elevation,...,gradient-elevation,gradient-heart-rate,acceleration,elevation,cadence,heart-rate,speed,gradient-elevation,gradient-heart-rate,acceleration
2014-01-04 10:40:46,110.4,33.0,2.85,112.0,70.0,2.854,-0.001179,0.004354,0.019304,-0.00171,...,-3e-05,-0.005137,-0.002028,-0.016822,-0.069438,0.038875,-0.001875,-2.8e-05,-0.005505,-0.002005
2014-01-04 10:40:47,110.4,35.0,5.96,112.0,51.0,3.11,-0.001179,0.004354,0.019304,0.0,...,-3e-05,-0.005137,-0.002028,-0.016822,-0.069438,0.038875,-0.001875,-2.8e-05,-0.005505,-0.002005
2014-01-04 10:40:48,110.4,38.0,9.26,113.0,60.0,3.297,-0.001179,0.004354,0.019304,0.0,...,-3e-05,-0.005137,-0.002028,-0.016822,-0.069438,0.038875,-0.001875,-2.8e-05,-0.005505,-0.002005
2014-01-04 10:40:49,110.4,40.0,12.77,113.0,50.0,3.506,-0.001179,0.004354,0.019304,0.0,...,-3e-05,-0.005137,-0.002028,-0.016822,-0.069438,0.038875,-0.001875,-2.8e-05,-0.005505,-0.002005
2014-01-04 10:40:50,110.0,50.0,16.46,113.0,63.0,3.669,-0.001179,0.004354,0.019304,-0.4,...,-3e-05,-0.005137,-0.002028,-0.016822,-0.069438,0.038875,-0.001875,-2.8e-05,-0.005505,-0.002005


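Concatenate all the activities and keep track of which activity each record belongs to, so that cross-validation can keep every activity within a single fold and avoid leaking information between the training and test sets.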

In [45]:
# Stack every activity into a single table. The target is the measured power
# and the features are all remaining columns.
data_concat = pd.concat(data)
y = data_concat['original']['power']
X = data_concat.drop('power', axis=1, level=1)
# One group label per row: the position in `data` of the activity the row
# comes from, repeated once for each of that activity's rows.
groups = np.repeat(np.arange(len(data)),
                   [activity.shape[0] for activity in data])

In [46]:
# Display the group label of each row (the variable is `groups`; `group` is
# not defined anywhere in the notebook)
groups

array([ 0,  0,  0, ..., 51, 51, 51])

In [47]:
# Inspect the feature table (the power column has been removed)
X

Unnamed: 0_level_0,original,original,original,original,original,original,original,original,gradient_1,gradient_1,...,gradient_9,gradient_9,gradient_9,gradient_10,gradient_10,gradient_10,gradient_10,gradient_10,gradient_10,gradient_10
Unnamed: 0_level_1,elevation,cadence,distance,heart-rate,speed,acceleration,gradient-elevation,gradient-heart-rate,elevation,cadence,...,gradient-elevation,gradient-heart-rate,acceleration,elevation,cadence,heart-rate,speed,gradient-elevation,gradient-heart-rate,acceleration
2014-01-04 10:40:46,110.4,33.0,2.85,112.0,2.854,-0.001179,0.004354,0.019304,-0.00171,-0.00403,...,-0.000030,-0.005137,-0.002028,-0.016822,-0.069438,0.038875,-0.001875,-0.000028,-0.005505,-0.002005
2014-01-04 10:40:47,110.4,35.0,5.96,112.0,3.110,-0.001179,0.004354,0.019304,0.00000,2.00000,...,-0.000030,-0.005137,-0.002028,-0.016822,-0.069438,0.038875,-0.001875,-0.000028,-0.005505,-0.002005
2014-01-04 10:40:48,110.4,38.0,9.26,113.0,3.297,-0.001179,0.004354,0.019304,0.00000,3.00000,...,-0.000030,-0.005137,-0.002028,-0.016822,-0.069438,0.038875,-0.001875,-0.000028,-0.005505,-0.002005
2014-01-04 10:40:49,110.4,40.0,12.77,113.0,3.506,-0.001179,0.004354,0.019304,0.00000,2.00000,...,-0.000030,-0.005137,-0.002028,-0.016822,-0.069438,0.038875,-0.001875,-0.000028,-0.005505,-0.002005
2014-01-04 10:40:50,110.0,50.0,16.46,113.0,3.669,-0.001179,0.004354,0.019304,-0.40000,10.00000,...,-0.000030,-0.005137,-0.002028,-0.016822,-0.069438,0.038875,-0.001875,-0.000028,-0.005505,-0.002005
2014-01-04 10:40:51,110.0,63.0,20.27,114.0,3.834,0.980000,-0.022962,2.000000,0.00000,13.00000,...,-0.000030,-0.005137,-0.002028,-0.016822,-0.069438,0.038875,-0.001875,-0.000028,-0.005505,-0.002005
2014-01-04 10:40:52,110.0,50.0,24.03,113.0,3.763,0.653000,-0.022136,1.000000,0.00000,-13.00000,...,-0.000030,-0.005137,-0.002028,-0.016822,-0.069438,0.038875,-0.001875,-0.000028,-0.005505,-0.002005
2014-01-04 10:40:53,110.0,50.0,28.13,114.0,4.099,0.802000,-0.021198,1.000000,0.00000,0.00000,...,-0.000030,-0.005137,-0.002028,-0.016822,-0.069438,0.038875,-0.001875,-0.000028,-0.005505,-0.002005
2014-01-04 10:40:54,110.0,85.0,32.15,115.0,4.015,0.509000,-0.020640,2.000000,0.00000,35.00000,...,-0.000030,-0.005137,-0.002028,-0.016822,-0.069438,0.038875,-0.001875,-0.000028,-0.005505,-0.002005
2014-01-04 10:40:55,110.0,124.0,36.43,115.0,4.285,0.616000,0.000000,2.000000,0.00000,39.00000,...,-0.000030,-0.005137,-0.002028,-0.016822,-0.069438,0.038875,-0.001875,-0.000028,-0.005505,-0.002005


In [48]:
# Quantile-transform the features, then fit gradient-boosted trees. Both steps
# are seeded: QuantileTransformer draws a random subsample to estimate the
# quantiles on large inputs, so leaving it unseeded makes the scores vary
# from one run to the next.
pipe = make_pipeline(QuantileTransformer(random_state=42),
                     GradientBoostingRegressor(random_state=42))
# GroupKFold uses `groups` so that all rows of an activity land in the same
# fold; train scores are returned alongside the test scores.
scores = cross_validate(pipe, X, y, groups=groups,
                        scoring=['r2', 'neg_median_absolute_error'],
                        cv=GroupKFold(n_splits=3), n_jobs=1,
                        return_train_score=True)

In [49]:
# Per-fold fit/score times and train/test scores returned by cross_validate
scores

{'fit_time': array([ 192.9417398 ,  193.04851174,  184.49550223]),
 'score_time': array([ 3.6782012 ,  4.08030486,  4.14024472]),
 'test_neg_median_absolute_error': array([-31.02544704, -28.34900344, -30.29612706]),
 'test_r2': array([ 0.64734564,  0.6027255 ,  0.59363491]),
 'train_neg_median_absolute_error': array([-29.22017815, -26.87621544, -27.39974912]),
 'train_r2': array([ 0.62038711,  0.65533043,  0.65137509])}