ENH/IO Read all useful data from FIT file (#19)

scikit-cycling · Jan 8, 2018 · 6f15bf3 · 6f15bf3
1 parent 4d56123
commit 6f15bf3
Show file tree

Hide file tree

Showing 3 changed files with 49 additions and 16 deletions.
diff --git a/skcycling/io/base.py b/skcycling/io/base.py
@@ -8,15 +8,21 @@
 
 from .fit import load_power_from_fit
 
+DROP_OPTIONS = ('columns', 'rows', 'both')
 
-def bikeread(filename):
+
+def bikeread(filename, drop_nan=None):
     """Read power data file.
 
     Parameters
     ----------
     filename : str
         Path to the file to read.
 
+    drop_nan : str {'columns', 'rows', 'both'} or None
+        Whether to drop the columns, the rows, or both when they contain NaN
+        values. By default (None), all data will be kept.
+
     Returns
     -------
     data : DataFrame
@@ -26,18 +32,29 @@ def bikeread(filename):
     --------
     >>> from skcycling.datasets import load_fit
     >>> from skcycling.io import bikeread
-    >>> activity = bikeread(load_fit()[0])
+    >>> activity = bikeread(load_fit()[0], drop_nan='columns')
     >>> activity.head() # doctest : +NORMALIZE_WHITESPACE
-                         power
-    2014-05-07 12:26:22  256.0
-    2014-05-07 12:26:23  185.0
-    2014-05-07 12:26:24  343.0
-    2014-05-07 12:26:25  344.0
-    2014-05-07 12:26:26  389.0
-
+                         cadence  distance  power
+    2014-05-07 12:26:22     45.0      3.05  256.0
+    2014-05-07 12:26:23     42.0      6.09  185.0
+    2014-05-07 12:26:24     44.0      9.09  343.0
+    2014-05-07 12:26:25     45.0     11.94  344.0
+    2014-05-07 12:26:26     48.0     15.03  389.0
     """
+    if drop_nan is not None and drop_nan not in DROP_OPTIONS:
+        raise ValueError('"drop_nan" should be one of {}.'
+                         ' Got {} instead.'.format(DROP_OPTIONS, drop_nan))
+
     df = load_power_from_fit(filename)
 
+    if drop_nan is not None:
+        if drop_nan == 'columns':
+            df.dropna(axis=1, inplace=True)
+        elif drop_nan == 'rows':
+            df.dropna(axis=0, inplace=True)
+        else:
+            df.dropna(axis=1, inplace=True).dropna(axis=0, inplace=True)
+
     # remove possible outliers by clipping the value
     df[df['power'] > 2500.] = np.nan
 

diff --git a/skcycling/io/fit.py b/skcycling/io/fit.py
@@ -5,11 +5,18 @@
 # License: BSD 3 clause
 
 import os
+from collections import defaultdict
+
 import pandas as pd
+import numpy as np
 import six
 
 from fitparse import FitFile
 
+# 'timestamp' will be used as the index of the DataFrame later on
+FIELDS_DATA = ('timestamp', 'power', 'heart-rate', 'cadence', 'distance',
+               'elevation')
+
 
 def check_filename_fit(filename):
     """Method to check if the filename corresponds to a fit file.
@@ -52,7 +59,7 @@ def load_power_from_fit(filename):
 
     Returns
     -------
-    power_rec : ndarray, shape (n_samples)
+    data : DataFrame
         Power records of the ride.
 
     """
@@ -61,8 +68,17 @@ def load_power_from_fit(filename):
     activity.parse()
     records = activity.get_messages(name='record')
 
-    power, timestamp = zip(*[
-        (rec.get_value('power'), rec.get_value('timestamp'))
-        for rec in records])
+    data = defaultdict(list)
+    for rec in records:
+        values = rec.get_values()
+        for key in FIELDS_DATA:
+            data[key].append(values.get(key, np.NaN))
+
+    data = pd.DataFrame(data)
+    if data.empty:
+        raise IOError('The file {} does not contain any data.'.format(
+            filename))
+    data.set_index(FIELDS_DATA[0], inplace=True)
+    del data.index.name
 
-    return pd.DataFrame({'power': power}, index=timestamp)
+    return data
diff --git a/skcycling/io/tests/test_fit.py b/skcycling/io/tests/test_fit.py
@@ -57,8 +57,8 @@ def test_load_power_if_no_record():
     for f in filenames:
         if pattern in f:
             filename = f
-    msg = "There is no data to treat in that file."
-    with pytest.raises(ValueError, message=msg):
+    msg = "does not contain any data."
+    with pytest.raises(IOError, message=msg):
         load_power_from_fit(filename)