In [None]:
import json
import os
import re
import fnmatch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import tqdm
import scipy

from typing import Dict, List, Tuple
from pprint import pprint


%matplotlib inline
sns.set()

## General plan

- Read raw velocity data, make some features out of it

- Try a simple classifier whose features can be interpreted

## 1. Read them all

Assumptions:

- there is a `data` folder in the root of the project (it's not under SC).

- there are 2 types of files, `activity-xyz.json` and `stream-xyz.json`: the first one holds general and aggregated activity metadata, the second one holds more detailed lat/lon, velocity and timestamp data


In [None]:
def get_id(s, prefix):
    """Extract the id part of a ``<prefix>-<id>.json`` file name.

    Parameters
    ----------
    s : str
        File name (or path) to search in.
    prefix : str
        Literal text expected before the dash, e.g. ``"activity"``.

    Returns
    -------
    str or None
        The text captured between ``"<prefix>-"`` and ``".json"``, or ``None``
        when the pattern does not occur in ``s``.
    """
    # The prefix is escaped so that regex metacharacters in it are matched
    # literally; the raw string keeps ``\.`` a regex escape, not a string escape.
    m = re.search(r"%s-(.+)\.json" % re.escape(str(prefix)), s)
    return m.group(1) if m else None
    
def process_strava_activity(**kwargs) -> List[Dict]:
    """Flatten one activity JSON object into a single-record list.

    Expected keyword arguments:
      - ``stream_json``: mapping with the parsed activity JSON,
      - ``cols``: iterable of keys to copy out of ``stream_json``,
      - ``_id``: value stored under the ``'id'`` key of the record.

    Keys listed in ``cols`` but absent from ``stream_json`` are filled with
    ``np.nan``. The ``'id'`` entry is always set from ``_id``, replacing any
    ``'id'`` value copied from the JSON.
    """
    record: Dict = {}
    for column in kwargs['cols']:
        record[column] = kwargs['stream_json'].get(column, np.nan)
    record['id'] = kwargs['_id']
    return [record]

def strava_file_reader(stream_file, data_path, stream_processor) -> List[Dict]:
    """Load one JSON file and hand its content to ``stream_processor``.

    Parameters
    ----------
    stream_file : str
        File name inside ``data_path``; the text before the first ``"-"`` is
        used as the prefix when extracting the id with ``get_id``.
    data_path : str
        Directory containing ``stream_file``.
    stream_processor : callable
        Called as ``stream_processor(stream_json=<parsed JSON>, _id=<id>)``;
        its return value is returned unchanged.

    Returns
    -------
    List[Dict]
        Whatever ``stream_processor`` returns.
    """
    prefix = stream_file.split("-")[0]
    activity_id = get_id(stream_file, prefix)
    # JSON is read explicitly as UTF-8 so the result does not depend on the
    # platform's locale-default encoding.
    with open(os.path.join(data_path, stream_file), 'r', encoding='utf-8') as f:
        return stream_processor(stream_json=json.load(f), _id=activity_id)
    
# Sanity checks: get_id returns the part between "<prefix>-" and ".json".
assert get_id("activity-2342134134.json", "activity") == "2342134134"
assert get_id("stream-999935253.json", "stream") == "999935253"

# 1. Reading Data

In [None]:
# NOTE(review): sys.path[0] is assumed to be the notebook's directory - confirm
# for the kernel in use, otherwise "../data" resolves somewhere unexpected.
data_path = os.path.join(sys.path[0], "../data")

# List the directory once and sort it: os.listdir returns entries in arbitrary
# order, and sorting keeps the row order of the resulting frames reproducible.
_data_files = sorted(os.listdir(data_path))
activity_file_list = fnmatch.filter(_data_files, 'activity-*.json')
stream_file_list = fnmatch.filter(_data_files, 'stream-*.json')

## 1.1 Activities 

In [None]:

# Keys copied out of every activity JSON. 'id' is listed here, but the record's
# 'id' is ultimately set from the file name by process_strava_activity.
original_cols = ['max_speed','commute','distance','has_heartrate','average_speed','device_watts', 'id']

# Extra keys; 'max_speed' appears in both lists, which is harmless because the
# record is a dict keyed by column name.
more_cols = ['premium', 'average_cadence', 'average_temp', 'latlng',
             'average_heartrate', 'average_watts', 'kilojoules', 'max_speed', 'manual', 'moving_time', 'elapsed_time']


def activity_proc(stream_json, _id):
    """Adapter giving process_strava_activity the processor call signature."""
    return process_strava_activity(stream_json=stream_json, _id=_id, cols=original_cols + more_cols)


# Lazily yields one record per activity file, with a notebook progress bar.
strava_activity_gen = (
    record
    for activity_file in tqdm.tqdm_notebook(activity_file_list[:])
    for record in strava_file_reader(activity_file, data_path, activity_proc)
)

activity_df = pd.DataFrame(strava_activity_gen).set_index('id')
activity_df.head(10)

## 1.2 Streams

We can't read all streams in raw format (well we can, but we assume this notebook can be "tried at home")

Therefore we read the stream, convert it into features, and join these stream features with the activity df above somewhere

In [None]:
def process_strava_stream(**kwargs) -> List[Dict]:
    """Turn one stream JSON object into a time-indexed DataFrame.

    Expected keyword arguments:
      - ``stream_json``: mapping that may contain the keys ``'latlng'``
        (sequence of pairs), ``'time'``, ``'distance'`` and
        ``'velocity_smooth'``; a missing key is treated as an empty list,
      - ``_id``: identifier passed through to the result.

    Returns
    -------
    List[Dict]
        A one-element list ``[{'data': frame, 'id': _id}]`` where ``frame`` has
        the columns ``distance``, ``velocity``, ``lat``, ``lon`` and a
        ``TimedeltaIndex`` built from the ``'time'`` values read as seconds.
    """
    stream = kwargs['stream_json']

    latlng = stream.get('latlng', [])
    lats = [point[0] for point in latlng]
    lons = [point[1] for point in latlng]

    relative_time = stream.get('time', [])
    distance = stream.get('distance', [])
    velocity_smooth = stream.get('velocity_smooth', [])

    # pd.to_timedelta replaces the deprecated ``unit`` keyword of the
    # TimedeltaIndex constructor.
    # NOTE(review): rolling the times by one and then dropping the first row
    # indexes sample i at time[i-1]; presumably intentional - confirm.
    time_index = pd.to_timedelta(np.roll(relative_time, shift=1), unit='s')

    data = pd.DataFrame(
        list(zip(distance, velocity_smooth, lats, lons)),
        columns=['distance', 'velocity', 'lat', 'lon'],
        index=time_index,
    )
    # The first row carries the wrapped-around (last) timestamp; discard it.
    data = data.iloc[1:]

    return [{'data': data, 'id': kwargs['_id']}]
    

## 2. Speed time series feature ideas (naive, not very deep in the domain):



### The features from the Velocity Time Series

`we resample the incoming velocities with 10s intervals`

1. number of na-s (disclaimer: it is helpful)

2. slow speed sections (disclaimer: it is helpful)

3. Diff between instant value and some moving average, mean and std of it: (disclaimer: it is helpful)

4. Time lag corresponding to the first zero of the auto-correlation function, based on moving average

   (disclaimer: it is NOT helpful, but sounds smart, we take it)

In [None]:
def find_zero(_y, sampling_rate=1):
    """Locate the first downward zero crossing in a sequence.

    Parameters
    ----------
    _y : sequence of numbers
        Values to scan in order; must support slicing.
    sampling_rate : number, optional
        Multiplier applied to the index that is returned.

    Returns
    -------
    number or None
        ``i * sampling_rate`` for the first index ``i >= 1`` such that
        ``_y[i-1] >= 0`` and ``_y[i] <= 0``; ``None`` if there is no such index.
        NaN values never satisfy either comparison and are skipped.
    """
    # Pair every sample with its direct predecessor. Starting the index at 1
    # keeps it aligned with the current sample and avoids ever reading _y[-1].
    for i, (previous, current) in enumerate(zip(_y, _y[1:]), start=1):
        if previous >= 0 and current <= 0:
            return i * sampling_rate
    return None


def _acf_zero(series, max_lag=2000):
    """Return find_zero() of the autocorrelation of ``series`` over lags 0..max_lag-1."""
    # A lag >= len(series) leaves no overlapping samples, so its autocorrelation
    # is NaN and can never register as a crossing; skip those lags.
    lags = range(min(max_lag, len(series)))
    return find_zero([series.autocorr(lag=lag) for lag in lags])


def featurise(raw_stream: pd.DataFrame, stream_id: str) -> Dict:
    """Compute summary features of the velocity series of one stream.

    Parameters
    ----------
    raw_stream : pd.DataFrame
        Frame with a ``velocity`` column and an index that supports
        ``resample('10s')`` and time-based rolling windows.
    stream_id : str
        Identifier stored in the result under the key ``'sream_id'``.

    Returns
    -------
    Dict
        Feature name -> value. On any exception a warning is printed and an
        empty dict is returned, so one bad stream does not abort a long run.
    """
    try:
        resampled_stream = raw_stream.resample('10s').mean()
        velocity = resampled_stream.velocity

        # Fraction of 10 s bins with no velocity, and with velocity below 1.
        feature_nan_fraction = np.sum(velocity.isnull()) / len(resampled_stream)
        slow_speed_fraction = np.sum(velocity < 1) / len(resampled_stream)

        # Moving averages of the resampled velocity; min_periods asks for at
        # least 4 / 20 / 40 observations before a window yields a value.
        mav1m = velocity.rolling('1min', min_periods=4).mean()
        mav5m = velocity.rolling('5min', min_periods=20).mean()
        mav10m = velocity.rolling('10min', min_periods=40).mean()

        # NOTE(review): the *raw* velocity is compared with moving averages of
        # the *resampled* series, so only timestamps present in both survive
        # the index alignment and dropna() - confirm this is intended.
        velocity_diff_1m = (raw_stream.velocity - mav1m).dropna()
        velocity_diff_5m = (raw_stream.velocity - mav5m).dropna()
        velocity_diff_10m = (raw_stream.velocity - mav10m).dropna()

        return {
            # Key spelling is kept as-is: the consumer indexes on 'sream_id'.
            'sream_id': stream_id,
            'nan_fraction': feature_nan_fraction,
            'slow_speed_fraction': slow_speed_fraction,

            'velocitydiff_std_1m': velocity_diff_1m.std(),
            'velocitydiff_std_5m': velocity_diff_5m.std(),
            'velocitydiff_std_10m': velocity_diff_10m.std(),

            'velocitydiff_mean_1m': velocity_diff_1m.mean(),
            'velocitydiff_mean_5m': velocity_diff_5m.mean(),
            'velocitydiff_mean_10m': velocity_diff_10m.mean(),

            'acf0_1m': _acf_zero(mav1m),
            'acf0_5m': _acf_zero(mav5m),
            'acf0_10m': _acf_zero(mav10m),
        }
    except Exception as e:
        # Deliberate best effort: report and carry on with the next stream.
        print("Warn: %s" % e)
        return {}

In [None]:
# be careful est. calculation time on 8k samples is 6 hours, don't try at home, use pickle
# Each stream file is read, converted to a frame and reduced to one feature row,
# so the raw streams are never all held in memory at once.
feature_rows = (
    featurise(stream['data'], stream['id'])
    for stream_file in tqdm.tqdm_notebook(stream_file_list[:])
    for stream in strava_file_reader(stream_file, data_path, process_strava_stream)
)
stream_features = pd.DataFrame(feature_rows).set_index('sream_id')

# Cache the result as a bz2-compressed pickle.
stream_features.to_pickle("stream_features", compression="bz2")

In [None]:
# Write the activity frame to a bz2-compressed pickle file named "activity_df".
activity_df.to_pickle("activity_df", compression="bz2")