# Time series


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np 
from datetime import timedelta

## Use case - multiple time series signals

### Input: **datetime-indexed, timezone-aware time series dataframes**

[documentation datetime index](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DatetimeIndex.html)

In [3]:
# Load both signals and shift the timestamps of each one by its own random
# offset (drawn from [-125, 125) ms) -- presumably to mimic multimodal data
# whose modalities are not sampled at exactly the same instants.
df_gsr = pd.read_feather('data/gsr.feather')
df_gsr['timestamp'] = df_gsr['timestamp'] + timedelta(milliseconds=np.random.randint(-125, 125))
df_gsr = df_gsr.set_index('timestamp')

df_tmp = pd.read_feather('data/tmp.feather')
df_tmp['timestamp'] = df_tmp['timestamp'] + timedelta(milliseconds=np.random.randint(-125, 125))
df_tmp = df_tmp.set_index('timestamp')

In [4]:
# sanity check: after set_index the frame should carry a DatetimeIndex
type(df_gsr.index)

pandas.core.indexes.datetimes.DatetimeIndex

In [5]:
# (rows, columns) of the GSR frame, followed by a peek at its first two rows
print(df_gsr.shape)
df_gsr.head(2)

(31998, 1)


Unnamed: 0_level_0,EDA
timestamp,Unnamed: 1_level_1
2017-06-13 10:33:40.063000+02:00,0.0
2017-06-13 10:33:40.313000+02:00,0.107451


In [6]:
# (rows, columns) of the temperature frame, followed by a peek at its first two rows
print(df_tmp.shape)
df_tmp.head(2)

(31992, 1)


Unnamed: 0_level_0,TMP
timestamp,Unnamed: 1_level_1
2017-06-13 10:33:40.021000+02:00,382.21
2017-06-13 10:33:40.271000+02:00,382.21


---

### Intermezzo: joining multiple time series with `pd.merge_asof`

[documentation](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.merge_asof.html)

In [7]:
# Both frames are indexed by "timestamp", so that index name is used as the join key.
# Each temperature row (the first 4 rows are skipped) is matched with the GSR row
# whose timestamp is closest; the timestamp is then restored as the index.
df_tot = (
    pd.merge_asof(df_tmp[4:], df_gsr, on='timestamp', direction='nearest')
    .set_index('timestamp')
)
df_tot.head(2)

Unnamed: 0_level_0,TMP,EDA
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-06-13 10:33:41.021000+02:00,31.15,0.153501
2017-06-13 10:33:41.271000+02:00,31.15,0.148384


In [8]:
# Preserve causality by searching backward: every left row is paired with the last
# row of the right DataFrame whose 'on' key is less than or equal to the left row's key.
df_tot = (
    pd.merge_asof(df_tmp[10:], df_gsr, on='timestamp', direction='backward')
    .set_index('timestamp')
)

Be sure to look at the `tolerance` and `direction` arguments of this function!

---

# Classical machine learning -> feature extraction

The most common way to extract time series features is in a **window-strided** manner.

Challenges:
* Always assumes a fixed window size & stride
* No efficient implementations for dataframes (`pd.rolling` assumes the same input<->output dimensions --> no stride possible)
* No support for aggregating multiple time series

--> Writing my own code

In [9]:
import sys 

# time series feature extraction
sys.path.append('time_series/')
from time_series import NumpyFuncWrapper
from time_series.features import NumpyFeatureCalculation
from time_series.features.feature_extraction import NumpyFeatureCalculationRegistry, NumpyFeatureCalculationPipeline
import scipy.stats as ss

# Serialization
import dill as pickle
pickle.settings['recurse']=True # allows to serialize lambda's YAY!

In [10]:
# ------------------------ custom feature extraction functions ------------------------
# 1. One-to-many function: np.quantile sorts the windowed data, so all requested
#    quantiles are computed in a single call (one output column per quantile).
quantiles = [0.25, 0.5, 0.75]
f_quantiles = NumpyFuncWrapper(
    np.quantile,
    col_names=[f'quantile_{q}' for q in quantiles],
    q=quantiles,  # fyi: additional kwargs can be passed along (here q=quantiles)
)

# 2. Wrapped lambdas / numpy functions with an explicit output column name
f_slope = NumpyFuncWrapper(
    # first-degree polynomial fit against the sample index; coefficient 0 is the slope
    lambda x: np.polyfit(np.arange(len(x)), x, 1)[0],
    col_names='slope',
)
f_rms = NumpyFuncWrapper(
    lambda x: np.sqrt(np.mean(x ** 2)),  # root mean square of the window
    col_names='rms',
)
f_area = NumpyFuncWrapper(np.sum, col_names='area')

# Every function that will be applied to each window of a signal
segment_funcs = [
    np.mean, np.std, np.var, np.max, np.min,
    ss.skew, ss.kurtosis,
    f_quantiles, f_slope, f_rms, f_area,
]

## Use case 1: single feature extraction for temperature signal

### Fixed window size

**TMP**

In [11]:
fs_tmp = 4  # sampling rate of the temperature signal -- presumably in Hz, TODO confirm
# Window and stride are expressed in samples: 60 resp. 30 times the sampling rate
tmp_win_size = fs_tmp * 60
tmp_stride_size = fs_tmp * 30
# One feature calculation per function, all sharing the same window size and stride
tmp_feat_extr = NumpyFeatureCalculationRegistry(
    [
        NumpyFeatureCalculation(win_size=tmp_win_size, stride=tmp_stride_size, func=feat_func)
        for feat_func in segment_funcs
    ]
)
tmp_feat_extr

	win: 240, stride: 120: [
		NumpyFeatureCalculation - func: mean,  
		NumpyFeatureCalculation - func: std,  
		NumpyFeatureCalculation - func: var,  
		NumpyFeatureCalculation - func: amax,  
		NumpyFeatureCalculation - func: amin,  
		NumpyFeatureCalculation - func: skew,  
		NumpyFeatureCalculation - func: kurtosis,  
		NumpyFeatureCalculation - func: quantile - kwargs: {'q': [0.25, 0.5, 0.75]} - col_names: ['quantile_0.25', 'quantile_0.5', 'quantile_0.75'],  
		NumpyFeatureCalculation - func: <lambda> - kwargs: {} - col_names: ['slope'],  
		NumpyFeatureCalculation - func: <lambda> - kwargs: {} - col_names: ['rms'],  
		NumpyFeatureCalculation - func: sum - kwargs: {} - col_names: ['area'],  
	]

In [12]:
win_stride_feat_dict = tmp_feat_extr.calculate_features(df_tmp)
# The result is a dict: each key is a (window, stride) tuple and each value is
# the dataframe holding the features computed with that window / stride.
print(win_stride_feat_dict.keys())
# peek at the first feature dataframe (here the only one, as a single window / stride is used)
list(win_stride_feat_dict.values())[0].head(2)

dict_keys([(240, 120)])


Unnamed: 0_level_0,TMP_mean,TMP_std,TMP_var,TMP_amax,TMP_amin,TMP_skew,TMP_kurtosis,TMP_quantile_0.25,TMP_quantile_0.5,TMP_quantile_0.75,TMP_slope,TMP_rms,TMP_area
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017-06-13 10:34:39.771000+02:00,37.011667,44.940998,2019.693284,382.21,31.13,7.550955,55.016935,31.15,31.17,31.17,-0.143703,58.2199,8882.8
2017-06-13 10:35:09.771000+02:00,31.193333,0.0335,0.001122,31.27,31.15,0.52333,-0.798863,31.17,31.19,31.23,0.000417,31.193351,7486.4


### Arbitrary window size

**GSR**

In [13]:
# PoC: every feature function gets a randomly chosen (window size, stride) combination.
# The candidate sizes are multiplied by the sampling rate to obtain sizes in samples.
window_size_s = [60, 120]
stride_size_s = [30, 10, 20]
fs_gsr = 4  # sampling rate of the GSR signal -- presumably in Hz, TODO confirm

import random
# Scale with the GSR sampling rate (fs_gsr), not the temperature one: this registry
# is applied to the GSR signal, so its window / stride must follow that signal's rate.
gsr_feat_extr = NumpyFeatureCalculationRegistry(
    [
        NumpyFeatureCalculation(
            win_size=random.choice(window_size_s) * fs_gsr,
            stride=random.choice(stride_size_s) * fs_gsr,
            func=f,
        )
        for f in segment_funcs
    ]
)
gsr_feat_extr

	win: 480, stride: 120: [
		NumpyFeatureCalculation - func: mean,  
		NumpyFeatureCalculation - func: skew,  
		NumpyFeatureCalculation - func: sum - kwargs: {} - col_names: ['area'],  
	]
	win: 480, stride: 80: [
		NumpyFeatureCalculation - func: std,  
		NumpyFeatureCalculation - func: quantile - kwargs: {'q': [0.25, 0.5, 0.75]} - col_names: ['quantile_0.25', 'quantile_0.5', 'quantile_0.75'],  
		NumpyFeatureCalculation - func: <lambda> - kwargs: {} - col_names: ['rms'],  
	]
	win: 240, stride: 80: [
		NumpyFeatureCalculation - func: var,  
	]
	win: 240, stride: 120: [
		NumpyFeatureCalculation - func: amax,  
	]
	win: 480, stride: 40: [
		NumpyFeatureCalculation - func: amin,  
		NumpyFeatureCalculation - func: kurtosis,  
	]
	win: 240, stride: 40: [
		NumpyFeatureCalculation - func: <lambda> - kwargs: {} - col_names: ['slope'],  
	]

In [14]:
win_stride_feat_dict = gsr_feat_extr.calculate_features(df_gsr)
# Multiple keys are returned: one per distinct (window, stride) pair used by the
# registered feature calculations.
print(win_stride_feat_dict.keys())
# peek at the feature dataframe of the first (window, stride) pair
list(win_stride_feat_dict.values())[0].head(2)

dict_keys([(480, 120), (480, 80), (240, 80), (240, 120), (480, 40), (240, 40)])


Unnamed: 0_level_0,EDA_mean,EDA_skew,EDA_area
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-06-13 10:35:39.813000+02:00,0.19081,3.063073,91.588876
2017-06-13 10:36:09.813000+02:00,0.307291,0.778284,147.49968


### Multiple time series

In [15]:
# Construct the feature extraction pipeline: a higher-order wrapper which aggregates
# the feature calculations of several signals, each registry registered under a name.
feature_extraction = NumpyFeatureCalculationPipeline(
    df_feature_wrappers=[('gsr', gsr_feat_extr), ('tmp', tmp_feat_extr)]
)
feature_extraction

gsr: (
	win: 480, stride: 120: [
		NumpyFeatureCalculation - func: mean,  
		NumpyFeatureCalculation - func: skew,  
		NumpyFeatureCalculation - func: sum - kwargs: {} - col_names: ['area'],  
	]
	win: 480, stride: 80: [
		NumpyFeatureCalculation - func: std,  
		NumpyFeatureCalculation - func: quantile - kwargs: {'q': [0.25, 0.5, 0.75]} - col_names: ['quantile_0.25', 'quantile_0.5', 'quantile_0.75'],  
		NumpyFeatureCalculation - func: <lambda> - kwargs: {} - col_names: ['rms'],  
	]
	win: 240, stride: 80: [
		NumpyFeatureCalculation - func: var,  
	]
	win: 240, stride: 120: [
		NumpyFeatureCalculation - func: amax,  
	]
	win: 480, stride: 40: [
		NumpyFeatureCalculation - func: amin,  
		NumpyFeatureCalculation - func: kurtosis,  
	]
	win: 240, stride: 40: [
		NumpyFeatureCalculation - func: <lambda> - kwargs: {} - col_names: ['slope'],  
	]

)
tmp: (
	win: 240, stride: 120: [
		NumpyFeatureCalculation - func: mean,  
		NumpyFeatureCalculation - func: std,  
		NumpyFeatureCalculation

In [16]:
# Run the pipeline on both signals; the dict keys ('gsr', 'tmp') match the names
# under which the feature registries were added to the pipeline.
df_feat = feature_extraction({'gsr': df_gsr, 'tmp': df_tmp})
df_feat.head(2)

Unnamed: 0_level_0,EDA_slope__w=480_s=120,EDA_amin__w=480_s=80,EDA_kurtosis__w=480_s=80,EDA_var__w=240_s=80,EDA_std__w=240_s=120,EDA_quantile_0.25__w=240_s=120,EDA_quantile_0.5__w=240_s=120,EDA_quantile_0.75__w=240_s=120,EDA_rms__w=240_s=120,EDA_amax__w=480_s=40,...,TMP_kurtosis__w=240_s=40,TMP_quantile_0.25__w=240_s=40,TMP_quantile_0.5__w=240_s=40,TMP_quantile_0.75__w=240_s=40,TMP_slope__w=240_s=40,TMP_rms__w=240_s=40,TMP_area__w=240_s=40,EDA_mean__w=240_s=120,EDA_skew__w=240_s=120,EDA_area__w=240_s=120
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-06-13 10:34:39.813000+02:00,3.3e-05,0.0,8.214156,0.000116,0.105031,0.153501,0.156059,0.158618,0.217807,0.161176,...,55.016935,31.15,31.17,31.17,-0.143703,58.2199,8882.8,0.19081,3.063073,91.588876
2017-06-13 10:34:49.813000+02:00,-8e-06,0.0,8.214156,0.000116,0.105031,0.153501,0.156059,0.158618,0.217807,0.161176,...,55.016935,31.15,31.17,31.17,-0.143703,58.2199,8882.8,0.19081,3.063073,91.588876


# LAYD: Look At Your Data

In [17]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import ipywidgets as widgets
from ipywidgets import interact_manual

In [18]:
# signal name -> raw signal dataframe (looked up by name when plotting)
df_dict = dict(tmp=df_tmp, gsr=df_gsr)

In [19]:
# Multi-select widgets: one listing every feature column of df_feat,
# one listing the raw signal names (the keys of df_dict).
feat_widget = widgets.SelectMultiple(options=df_feat.columns)
sig_widget = widgets.SelectMultiple(options=['gsr', 'tmp'])

In [20]:
@interact_manual
def visuzalize(features=feat_widget, signals=sig_widget):
    """Plot the selected raw signals and the selected features in stacked subplots.

    Every selected signal gets its own row; all selected features share one
    additional row titled 'features'. All rows share the same x-axis.

    Parameters
    ----------
    features : iterable of str
        Selected column names of `df_feat` to plot.
    signals : iterable of str
        Selected keys of `df_dict`, identifying the raw signals to plot.

    Returns
    -------
    plotly.graph_objects.Figure
        The assembled figure; an empty figure when nothing is selected.
    """
    # One row per signal plus (only if features are selected) one 'features' row.
    # The parentheses are required: `a + b if c else []` parses as `(a + b) if c else []`,
    # which would drop the signal rows whenever no feature is selected.
    row_titles = list(signals) + (['features'] if len(features) else [])
    if not row_titles:
        # Nothing selected: avoid rows=0 and the division by zero below.
        return go.Figure()

    fig = make_subplots(
        rows=len(row_titles),
        cols=1,
        shared_xaxes=True,
        vertical_spacing=0.1 / len(row_titles),
        row_titles=row_titles,
    )
    fig.update_layout(height=300 * len(row_titles))

    # first, visualize the "raw" signals
    row_idx = 1
    for sig in signals:
        # skip the first 10 rows and reduce the signal to its 1-second mean
        df_sig = df_dict[sig][10:].resample('1s').mean()
        # iterate in frame order (deterministic trace order), ignoring helper columns
        for col in [c for c in df_sig.columns if c not in ('index', 'timestamp')]:
            fig.add_trace(go.Scattergl(x=df_sig.index, y=df_sig[col], name=col, hoverinfo='skip'), row=row_idx, col=1)
        row_idx += 1

    # then visualize the features (skipping the first 3 rows), all on the last row
    df_f = df_feat[3:]
    for feature in features:
        fig.add_trace(go.Scattergl(x=df_f.index, y=df_f[feature], name=feature, hoverinfo='skip', showlegend=True), row=row_idx, col=1)

    return fig

interactive(children=(SelectMultiple(description='features', options=('EDA_slope__w=480_s=120', 'EDA_amin__w=4…