# Time series


In [1]:
# Automatically reload imported modules before each cell is executed, so that
# edits to local source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import timedelta

import numpy as np
import pandas as pd

## Use case - multiple time series signals

### Input: **datetime-indexed, timezone-aware time series dataframes**

[documentation datetime index](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DatetimeIndex.html)

In [3]:
# Load the raw signals and shift each of them by a small random offset, as
# multimodal recordings are often slightly misaligned with respect to each other.
# A dedicated, seeded generator makes the offsets (and thus every downstream
# result) reproducible on "Restart & Run All" without touching numpy's global
# random state.
_offset_rng = np.random.RandomState(42)


def _load_signal(path, max_offset_ms=125):
    """Read a feather file and index it by its randomly shifted "timestamp" column.

    Parameters
    ----------
    path : str
        Feather file that contains a "timestamp" column.
    max_offset_ms : int
        The offset is drawn from [-max_offset_ms, max_offset_ms) milliseconds.

    Returns
    -------
    pd.DataFrame
        The loaded frame with the shifted "timestamp" column as its index.
    """
    df = pd.read_feather(path)
    # int(...) guarantees a plain Python int, which is what `timedelta` expects
    offset_ms = int(_offset_rng.randint(-max_offset_ms, max_offset_ms))
    df["timestamp"] += timedelta(milliseconds=offset_ms)
    # return a new frame instead of mutating with `inplace=True`
    return df.set_index("timestamp")


df_gsr = _load_signal("data/gsr.feather")
df_tmp = _load_signal("data/tmp.feather")

In [4]:
# the "timestamp" column was set as the index when the data was loaded
type(df_gsr.index)

pandas.core.indexes.datetimes.DatetimeIndex

In [5]:
# shape is (number of samples, number of columns); then show the first two rows
print(df_gsr.shape)
df_gsr.head(2)

(31998, 1)


Unnamed: 0_level_0,EDA
timestamp,Unnamed: 1_level_1
2017-06-13 10:33:39.965000+02:00,0.0
2017-06-13 10:33:40.215000+02:00,0.107451


In [6]:
# shape is (number of samples, number of columns); then show the first two rows
print(df_tmp.shape)
df_tmp.head(2)

(31992, 1)


Unnamed: 0_level_0,TMP
timestamp,Unnamed: 1_level_1
2017-06-13 10:33:40.041000+02:00,382.21
2017-06-13 10:33:40.291000+02:00,382.21


---

### Intermezzo: joining multiple time series with `pd.merge_asof`

[documentation](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.merge_asof.html)

In [7]:
# "timestamp" is the name of the index of both frames and serves as join key.
# Each TMP row (the first 4 are skipped) is paired with the EDA row that is
# closest in time, regardless of whether it lies before or after it.
df_tot = pd.merge_asof(
    left=df_tmp.iloc[4:], right=df_gsr, on="timestamp", direction="nearest"
)
df_tot = df_tot.set_index("timestamp")
df_tot.head(2)

Unnamed: 0_level_0,TMP,EDA
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-06-13 10:33:41.041000+02:00,31.15,0.153501
2017-06-13 10:33:41.291000+02:00,31.15,0.148384


In [8]:
# Preserve causality by using direction="backward": for every row of the left
# frame, the last row of the right frame whose "timestamp" is less than or equal
# to the left row's "timestamp" is selected, so no future sample is ever used.
df_tot = pd.merge_asof(
    left=df_tmp.iloc[10:], right=df_gsr, on="timestamp", direction="backward"
)
df_tot = df_tot.set_index("timestamp")

Be sure to look at the `tolerance` and `direction` arguments of this function!

---

# Classical machine learning -> feature extraction

Most common way to extract time series features: a **window-strided** manner

Challenges:
* Existing approaches always assume a fixed window size & stride
* No efficient implementations for dataframes (`pd.rolling` assumes the same input<->output dimensions --> no stride possible)
* No support for aggregating multiple time series

--> Writing my own code

In [9]:
import sys

# make the local time series feature extraction package importable
sys.path.append("time_series/")
# serialization: note that `dill` is imported under the name `pickle`
import dill as pickle
import scipy.stats as ss

from time_series import FeatureCollection, NumpyFuncWrapper
from time_series.features import FeatureDescriptor

# TODO: add an example for the MultipleFeatureDescriptors class
# from time_series.features import MultipleFeatureDescriptors

pickle.settings["recurse"] = True  # allows dill to serialize lambdas

## Defining functions

In [10]:
# --------------------- some custom feature extraction functions ---------------------
# 1. one-to-many functions: a single call yields several outputs. As computing
#    quantiles sorts the windowed data, you might want to calculate all of them
#    in 1 step; one output name is given per requested quantile.
quantiles = [0.25, 0.5, 0.75]
f_quantiles = NumpyFuncWrapper(
    np.quantile, output_names=[f"quantile_{q}" for q in quantiles], q=quantiles
)  # fyi: you can pass extra kwargs to the wrapper (in this case q=quantiles)


# 2. in-line functions
def slope(x):
    """Return the slope of the least-squares straight line fitted through `x`.

    The fit uses the sample positions 0, 1, ..., len(x) - 1 as the independent
    variable, i.e. the slope is expressed per sample.
    """
    sample_positions = np.arange(len(x))
    # a degree-1 fit returns (slope, intercept), highest power first
    gradient, _intercept = np.polyfit(sample_positions, x, deg=1)
    return gradient


def rms(x):
    """Return the root mean square of `x`.

    Parameters
    ----------
    x : array-like
        The values of one window (numpy array, pandas Series, list, ...).

    Returns
    -------
    float
        sqrt(mean(x ** 2)).
    """
    # Cast to float first: squaring a small integer dtype (e.g. int16) silently
    # overflows, and plain Python sequences do not support `** 2` at all.
    values = np.asarray(x, dtype=float)
    return np.sqrt(np.mean(values ** 2))


# wrap the custom functions and give each of them an explicit output name
f_slope = NumpyFuncWrapper(slope, output_names="slope")
f_rms = NumpyFuncWrapper(rms, output_names="rms")
f_area = NumpyFuncWrapper(np.sum, output_names="area")  # "area" = sum of the window

# All functions that are applied to every window; plain numpy / scipy callables
# and NumpyFuncWrapper instances are mixed in this list.
segment_funcs = [
    np.mean,
    np.std,
    np.var,
    np.max,
    np.min,
    ss.skew,
    ss.kurtosis,
    f_quantiles,
    f_slope,
    f_rms,
    f_area,
]

## Lambdas

Lambdas aren't serializable by default, making them hard to use in other environments (such as multiprocessing pools).

Hence, the `FeatureCollection` class raises a `TypeError` when a lambda is used.

**note**:
* A possible workaround to still use lambdas and multiprocessing is using [dill](https://github.com/uqfoundation/dill) (for serialization) and [pathos](https://github.com/uqfoundation/pathos) (for multiprocessing).

In [11]:
import traceback

In [12]:
# Demonstration: adding a lambda-based feature is rejected with a TypeError.
# Only that expected exception is caught (and its traceback printed) so the
# notebook keeps running, while any other, unexpected error still surfaces.
try:
    fc = FeatureCollection()
    fc.add([FeatureDescriptor(lambda x: np.mean(x), key="gsr", window=10, stride=5)])
except TypeError:
    traceback.print_exc()

Traceback (most recent call last):
  File "<ipython-input-12-97e349b48c52>", line 3, in <module>
    fc.add([FeatureDescriptor(lambda x: np.mean(x), key="gsr", window=10, stride=5)])
  File "/home/jonas/git/gIDLab/predict/time_series/time_series/features/feature_collection.py", line 114, in add
    self._add_feature(feature)
  File "/home/jonas/git/gIDLab/predict/time_series/time_series/features/feature_collection.py", line 69, in _add_feature
    f"\nFunction: {feature.function.output_names} is a lambda, thus not "
TypeError: 
Function: ['<lambda>'] is a lambda, thus not pickle-able. 
	This will give problems with the mulitprocessing based`calculate` function.


## Use case 1: single feature extraction for temperature signal

### Fixed window size

**TMP**

In [13]:
# window and stride are expressed in samples: 60 and 30 times the value of
# `fs_tmp` (4 samples per second, matching the 250 ms spacing of the timestamps)
fs_tmp = 4
tmp_win_size, tmp_stride_size = 60 * fs_tmp, 30 * fs_tmp

# one descriptor per segment function, all with the same window and stride
tmp_feat_extr = FeatureCollection(
    [
        FeatureDescriptor(
            function=func, key="TMP", window=tmp_win_size, stride=tmp_stride_size
        )
        for func in segment_funcs
    ]
)
tmp_feat_extr

FeatureCollection(
	FeatureDescriptor(TMP, 240, 120) 
	FeatureDescriptor(TMP, 240, 120) 
	FeatureDescriptor(TMP, 240, 120) 
	FeatureDescriptor(TMP, 240, 120) 
	FeatureDescriptor(TMP, 240, 120) 
	FeatureDescriptor(TMP, 240, 120) 
	FeatureDescriptor(TMP, 240, 120) 
	FeatureDescriptor(TMP, 240, 120) 
	FeatureDescriptor(TMP, 240, 120) 
	FeatureDescriptor(TMP, 240, 120) 
	FeatureDescriptor(TMP, 240, 120) 
)

In [14]:
%%time
# compute all TMP features and display two random rows of the merged result
tmp_feat_extr.calculate(df_tmp, merge_dfs=True).sample(2)

CPU times: user 46.9 ms, sys: 63.7 ms, total: 111 ms
Wall time: 202 ms


Unnamed: 0_level_0,TMP_mean__w=240_s=120,TMP_std__w=240_s=120,TMP_var__w=240_s=120,TMP_amax__w=240_s=120,TMP_amin__w=240_s=120,TMP_skew__w=240_s=120,TMP_kurtosis__w=240_s=120,TMP_quantile_0.25__w=240_s=120,TMP_quantile_0.5__w=240_s=120,TMP_quantile_0.75__w=240_s=120,TMP_slope__w=240_s=120,TMP_rms__w=240_s=120,TMP_area__w=240_s=120
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017-06-13 11:12:39.791000+02:00,32.997833,0.012529,0.000157,33.03,32.97,0.617194,2.285778,32.99,33.0,33.0,0.000118,32.997836,7919.48
2017-06-13 11:24:09.791000+02:00,33.013833,0.017615,0.00031,33.05,32.99,0.486352,-1.191268,33.0,33.0,33.03,-3e-06,33.013838,7923.32


In [15]:
%%time
# same computation with njobs=1, for a timing comparison with the cell above
# NOTE(review): presumably njobs=1 means a single worker - confirm against
# FeatureCollection.calculate
tmp_feat_extr.calculate(df_tmp, merge_dfs=True, njobs=1).sample(2)

CPU times: user 14.8 ms, sys: 14.4 ms, total: 29.2 ms
Wall time: 322 ms


Unnamed: 0_level_0,TMP_mean__w=240_s=120,TMP_std__w=240_s=120,TMP_var__w=240_s=120,TMP_amax__w=240_s=120,TMP_amin__w=240_s=120,TMP_skew__w=240_s=120,TMP_kurtosis__w=240_s=120,TMP_quantile_0.25__w=240_s=120,TMP_quantile_0.5__w=240_s=120,TMP_quantile_0.75__w=240_s=120,TMP_slope__w=240_s=120,TMP_rms__w=240_s=120,TMP_area__w=240_s=120
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017-06-13 11:15:39.791000+02:00,33.067667,0.014185,0.000201,33.09,33.03,-0.109331,-0.403061,33.05,33.07,33.07,-8e-06,33.06767,7936.24
2017-06-13 10:58:39.791000+02:00,32.5255,0.017741,0.000315,32.55,32.5,-0.306185,-1.061199,32.5,32.53,32.53,0.000111,32.525505,7806.12


### Arbitrary window size

**GSR**

In [16]:
import random

# PoC: every feature gets a randomly selected (window size, stride) combination
window_size_s = [60, 120]  # candidate window sizes, expressed in seconds
stride_size_s = [30, 10, 20]  # candidate strides, expressed in seconds
fs_gsr = 4  # samples per second of the EDA signal

# a dedicated, seeded generator: the same combinations are drawn on every run,
# without affecting the global state of the `random` module
_gsr_rng = random.Random(42)

gsr_feat_extr = FeatureCollection(
    [
        FeatureDescriptor(
            key="EDA",
            # seconds -> samples, using the rate of the EDA signal itself
            # (`fs_tmp` was used here before, leaving `fs_gsr` unused)
            window=_gsr_rng.choice(window_size_s) * fs_gsr,
            stride=_gsr_rng.choice(stride_size_s) * fs_gsr,
            function=f,
        )
        for f in segment_funcs
    ]
)
gsr_feat_extr

FeatureCollection(
	FeatureDescriptor(EDA, 240, 80) 
	FeatureDescriptor(EDA, 240, 120) 
	FeatureDescriptor(EDA, 240, 120) 
	FeatureDescriptor(EDA, 480, 120) 
	FeatureDescriptor(EDA, 240, 120) 
	FeatureDescriptor(EDA, 240, 80) 
	FeatureDescriptor(EDA, 240, 80) 
	FeatureDescriptor(EDA, 240, 120) 
	FeatureDescriptor(EDA, 240, 80) 
	FeatureDescriptor(EDA, 240, 120) 
	FeatureDescriptor(EDA, 240, 80) 
)

In [17]:
%%time
gsr_feat_extr.calculate(df_gsr, merge_dfs=True).sample(2)
# gsr_feat_extr.get_results(merge_dfs=True)

CPU times: user 49.9 ms, sys: 60.9 ms, total: 111 ms
Wall time: 292 ms


Unnamed: 0_level_0,EDA_mean__w=240_s=80,EDA_skew__w=240_s=80,EDA_kurtosis__w=240_s=80,EDA_slope__w=240_s=80,EDA_area__w=240_s=80,EDA_std__w=240_s=120,EDA_var__w=240_s=120,EDA_amin__w=240_s=120,EDA_quantile_0.25__w=240_s=120,EDA_quantile_0.5__w=240_s=120,EDA_quantile_0.75__w=240_s=120,EDA_rms__w=240_s=120,EDA_amax__w=480_s=120
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017-06-13 11:47:59.715000+02:00,0.15963,-0.411834,0.185136,-1.4e-05,38.311242,,,,,,,,
2017-06-13 10:58:19.715000+02:00,0.15857,5.292978,32.354198,0.000328,38.056699,,,,,,,,


### Multiple time series

In [18]:
# Construct the combined FeatureCollection: a higher-order wrapper which
# aggregates the feature descriptors of the EDA and the TMP collection
feature_extraction = FeatureCollection(feature_desc_list=[gsr_feat_extr, tmp_feat_extr])

In [19]:
# compute the features of both signals in one call; the signals are passed as a
# list and the result is merged into a single dataframe
df_feat = feature_extraction.calculate([df_gsr, df_tmp], merge_dfs=True)

# LAYD: Look At Your Data

In [20]:
import ipywidgets as widgets
import plotly.graph_objects as go
from ipywidgets import interact_manual
from plotly.subplots import make_subplots

In [21]:
# signal name (as offered in the signal selection widget) -> raw signal dataframe
df_dict = {"tmp": df_tmp, "gsr": df_gsr}

In [22]:
# multi-select boxes: one lists the computed feature columns, the other the
# names of the raw signals (the sorted keys of `df_dict`, i.e. "gsr" and "tmp")
feat_widget = widgets.SelectMultiple(options=df_feat.columns)
sig_widget = widgets.SelectMultiple(options=sorted(df_dict))

In [23]:
@interact_manual
def visuzalize(features=feat_widget, signals=sig_widget):
    """Plot the selected raw signals and features on a shared time axis.

    Every selected signal gets its own subplot row; all selected features share
    one additional row at the bottom.

    Parameters
    ----------
    features : sequence of str
        Column names of `df_feat` to plot.
    signals : sequence of str
        Keys of `df_dict` to plot.

    Returns
    -------
    plotly.graph_objects.Figure
        The assembled figure; an empty figure when nothing is selected.
    """
    # The parentheses matter: without them the conditional expression applies to
    # the whole sum, so the signal rows were dropped (and rows=0 was requested)
    # whenever no feature was selected.
    row_titles = list(signals) + (["features"] if len(features) else [])
    if not row_titles:
        # nothing selected: avoid rows=0 and the division by zero below
        return go.Figure()

    fig = make_subplots(
        rows=len(row_titles),
        cols=1,
        shared_xaxes=True,
        vertical_spacing=0.1 / len(row_titles),
        row_titles=row_titles,
    )
    fig.update_layout(height=300 * len(row_titles))

    # first, visualize the "raw" signals: one subplot row per signal
    for row_idx, sig in enumerate(signals, start=1):
        # skip the first 10 samples and average the signal per second
        df_sig = df_dict[sig].iloc[10:].resample("1s").mean()
        # iterate in column order (not over a set) for a deterministic trace order
        signal_cols = [c for c in df_sig.columns if c not in ("index", "timestamp")]
        for col in signal_cols:
            fig.add_trace(
                go.Scattergl(x=df_sig.index, y=df_sig[col], name=col, hoverinfo="skip"),
                row=row_idx,
                col=1,
            )

    # then visualize the features, which all share the last row
    feature_row = len(row_titles)
    df_f = df_feat.iloc[3:]  # the first 3 feature rows are left out of the plot
    for feature in features:
        fig.add_trace(
            go.Scattergl(
                x=df_f.index,
                y=df_f[feature],
                name=feature,
                hoverinfo="skip",
                showlegend=True,
            ),
            row=feature_row,
            col=1,
        )

    return fig

interactive(children=(SelectMultiple(description='features', options=('EDA_mean__w=240_s=80', 'EDA_skew__w=240…