In [1]:
# !pip install memory_profiler

In [1]:
# %load_ext autoreload
%load_ext memory_profiler
# %autoreload 2

In [2]:
import numpy as np
import pandas as pd

# Series Processing pipeline

`TODO`

# Feature extraction

The most classical way to extract features from time series is in a **strided-window** manner.


**Challenges**:
1. Existing solutions often assume a **single** stride & window size for all features to be calculated.  
  This raises 2 problems:
   * There is no clean interface for multiple stride-window feature calculation.
   * You are responsible for the efficient execution, i.e., you need to do the bookkeeping so that feature-calculations on the same stride-window pair are executed on the same time-series expansion (so that it needs to be expanded only once).
2. No easy support for aggregation of multivariate series, each possibly having different sampling rates
    _(e.g., Polysomnography data, Wearable data)_
3. No efficient implementations for timestamped data (e.g., pd.Series, pd.DataFrames with a timeindex):
    * `pd.rolling` assumes same input<->output dimensions, hence no stride possible:
    See:  
    https://pandas.pydata.org/docs/reference/api/pandas.Series.rolling.html
    https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rolling.html
    * `tsfresh` has a significant memory and time overhead for `Strided-Rolling` (feature-calculation)
    https://tsfresh.readthedocs.io/en/latest/api/tsfresh.utilities.html#tsfresh.utilities.dataframe_functions.roll_time_series  
    even more:
        * There is no convenient way to retain the time-index
        * It inherently makes the single window-stride assumption for the features.  
4. Little focus on serialization (+parallelization) of local-scope objects  
   To the best of our knowledge, no existing time series library takes this into consideration, thus hampering deployment in different environments.


**What tslib does**:

= Intuitive interface for **multiple** stride-window feature calculation on multiple (possibly differently sampled) time-series signals. <a style="color:orange">_(solving 1 & 2.)_</a>

Providing the following features:
* A single registry, in which all bookkeeping is done, enabling efficient processing <a style="color:orange">(solves 1)</a>
* Maintains the time-index (after feature calculation) <a style="color:orange">(solves 3)</a>
* Serialization of `lambda's` and local-scope methods <a style="color:orange">(solves 4.)</a>

---    

**Current assumptions**:
* The time series are assumed to be sampled at fixed rates, i.e., we assume that there are no gaps in the time series (and thus do not check for them).
    * The Series processing serves as an option to mitigate this

**Future work(s)**:
* Big-Data: Perform feature-extraction in batch -> batch-like generators in stridedrolling objects.
* Time-wise strided rolling?

In [3]:
import sys

# Serialization
import dill as pickle
import scipy.stats as ss

# load our library
sys.path.append("time_series/")
from time_series import FeatureCollection, NumpyFuncWrapper
from time_series.features import FeatureDescriptor, MultipleFeatureDescriptors

pickle.settings["recurse"] = True  # allows to serialize lambda's YAY!

## Defining functions

Functions are defined by making use of the [NumpyFuncWrapper](time_series/features/function_wrapper.py) class.  

The `NumpyFuncWrapper` interface is easy and convenient; you define:

|      attribute 	|          type         	| info                                                     	|
|---------------:	|:---------------------:	|----------------------------------------------------------	|
|         `func` 	|        Callable       	| The wrapped function that will operate on `numpy` arrays 	|
| `output_names` 	| Union[List[str], str] 	| The name of the outputs of `func`                        	|
|     `**kwargs` 	|        Optional       	| Additional keyword-arguments for the `func`              	|

In [4]:
# --------------------- some custom feature extraction functions ---------------------
# -- 1. one-to-many functions
#    To compute quantiles, you need to sort the windowed data, which is a rather
#    expensive operation: n*log(n). Hence, you might want to calculate all your desired
#    quantiles in a single function-wrapper, returning multiple outputs.

quantiles = [0.25, 0.5, 0.75]
f_quantiles = NumpyFuncWrapper(
    func=np.quantile,  # the wrapped function that will operate on numpy arrays
    output_names=[f"quantile_{q}" for q in quantiles],  # one output name per quantile
    q=quantiles,  # optional - additional keyword arguments that are passed to `func`
)


# -- 2. in-line functions
#    You can define your functions locally; these will serialize flawlessly
def slope(x):
    """Return the slope of the straight line fitted (least squares) through `x`.

    The values of `x` are fitted against their positional index
    (0, 1, ..., len(x) - 1), so the slope is expressed per sample.
    """
    sample_positions = np.arange(0, len(x))
    linear_coefficients = np.polyfit(sample_positions, x, 1)
    # `np.polyfit` orders coefficients from the highest degree down: [slope, intercept]
    return linear_coefficients[0]


f_slope = NumpyFuncWrapper(slope, output_names="slope")

# -- 3. Lambdas
#    Or even use lambdas and functions from other modules
f_rms = NumpyFuncWrapper(lambda x: np.sqrt(np.mean(x ** 2)), output_names="rms")
f_area = NumpyFuncWrapper(np.sum, output_names="area")


# (For convenience) we store all functions in a single list: both plain callables
# (numpy / scipy.stats functions) and the `NumpyFuncWrapper` objects constructed above
segment_funcs = [
    np.mean,
    np.std,
    np.var,
    np.max,
    np.min,
    ss.skew,
    ss.kurtosis,
    f_quantiles,
    f_slope,
    f_rms,
    f_area,
]
segment_funcs

[<function numpy.mean(a, axis=None, dtype=None, out=None, keepdims=<no value>)>,
 <function numpy.std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=<no value>)>,
 <function numpy.var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=<no value>)>,
 <function numpy.amax(a, axis=None, out=None, keepdims=<no value>, initial=<no value>, where=<no value>)>,
 <function numpy.amin(a, axis=None, out=None, keepdims=<no value>, initial=<no value>, where=<no value>)>,
 <function scipy.stats.stats.skew(a, axis=0, bias=True, nan_policy='propagate')>,
 <function scipy.stats.stats.kurtosis(a, axis=0, fisher=True, bias=True, nan_policy='propagate')>,
 NumpyFuncWrapper(quantile, ['quantile_0.25', 'quantile_0.5', 'quantile_0.75'], {'q': [0.25, 0.5, 0.75]}),
 NumpyFuncWrapper(slope, ['slope'], {}),
 NumpyFuncWrapper(<lambda>, ['rms'], {}),
 NumpyFuncWrapper(sum, ['area'], {})]

## Use case: single series feature extraction

The defined functions above will be encapsulated in a [FeatureDescriptor](time_series/features/feature.py) object.

A `FeatureDescriptor` describes a feature, and has 4 main attributes:

|  attribute 	|                  type                 	| info                                                                                                             	|
|-----------:	|:-------------------------------------:	|------------------------------------------------------------------------------------------------------------------	|
| `function` 	| Union[Callable, <br>NumpyFuncWrapper] 	| The `function` that calculates this feature.                                                                     	|
|      `key` 	|                 string                	| The signal key; i.e., the `pd.DataFrame` column name or <br> `pd.Series` name on which the function will operate.     	|
|   `window` 	|                  int                  	| The window size on which this feature will be applied, <br> expressed in the number of samples from the input signal. 	|
|   `stride` 	|                  int                  	| The stride of the window rolling process, also as a <br> number of samples of the input signal.                       	|

**note**: [MultipleFeatureDescriptors](time_series/features/feature.py) is actually a factory for `FeatureDescriptor` objects.

### Example 1: Fixed window size and strides

**note**: this functionality is exposed by most existing time-series libraries.

In this example, we will use the _temperature_ signal from a wearable

In [5]:
# define the sample frequency, and express the window & stride in number of samples
fs_tmp = 4  # sample frequency of the temperature signal: 4Hz
tmp_win_size = 60 * fs_tmp  # window of 60s -> 240 samples
tmp_stride_size = 30 * fs_tmp  # stride of 30s -> 120 samples


# A single window-stride combination, applied to every function in `segment_funcs`
tmp_feat_extr = FeatureCollection(
    feature_desc_list=[
        MultipleFeatureDescriptors(
            signal_keys=["TMP"],
            windows=[tmp_win_size],
            strides=[tmp_stride_size],
            functions=segment_funcs,  # The list of functions we constructed earlier
        )
    ]
)
tmp_feat_extr

TMP: (
	win: 240   , stride: 120: [
		FeatureDescriptor - func: NumpyFuncWrapper(mean, ['mean'], {}),
		FeatureDescriptor - func: NumpyFuncWrapper(std, ['std'], {}),
		FeatureDescriptor - func: NumpyFuncWrapper(var, ['var'], {}),
		FeatureDescriptor - func: NumpyFuncWrapper(amax, ['amax'], {}),
		FeatureDescriptor - func: NumpyFuncWrapper(amin, ['amin'], {}),
		FeatureDescriptor - func: NumpyFuncWrapper(skew, ['skew'], {}),
		FeatureDescriptor - func: NumpyFuncWrapper(kurtosis, ['kurtosis'], {}),
		FeatureDescriptor - func: NumpyFuncWrapper(quantile, ['quantile_0.25', 'quantile_0.5', 'quantile_0.75'], {'q': [0.25, 0.5, 0.75]}),
		FeatureDescriptor - func: NumpyFuncWrapper(slope, ['slope'], {}),
		FeatureDescriptor - func: NumpyFuncWrapper(<lambda>, ['rms'], {}),
		FeatureDescriptor - func: NumpyFuncWrapper(sum, ['area'], {}),
	]
)

In [6]:
df_tmp = pd.read_feather("data/tmp.feather").set_index("timestamp")
df_tmp.sample(2)

Unnamed: 0_level_0,TMP
timestamp,Unnamed: 1_level_1
2017-06-13 11:19:38.750000+02:00,33.11
2017-06-13 11:30:30.500000+02:00,32.77


* **TODO**: describe output column names

In [7]:
%%time
# not multiprocessing
tmp_feat_extr.calculate(df_tmp, merge_dfs=True, njobs=1).sample(2)

CPU times: user 53.9 ms, sys: 21 ms, total: 74.9 ms
Wall time: 286 ms


Unnamed: 0_level_0,TMP_mean__w=240_s=120,TMP_std__w=240_s=120,TMP_var__w=240_s=120,TMP_amax__w=240_s=120,TMP_amin__w=240_s=120,TMP_skew__w=240_s=120,TMP_kurtosis__w=240_s=120,TMP_quantile_0.25__w=240_s=120,TMP_quantile_0.5__w=240_s=120,TMP_quantile_0.75__w=240_s=120,TMP_slope__w=240_s=120,TMP_rms__w=240_s=120,TMP_area__w=240_s=120
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017-06-13 11:48:39.750000+02:00,32.772333,0.016367,0.000268,32.81,32.73,-0.217714,0.308911,32.77,32.77,32.79,-0.000148,32.772337,7865.36
2017-06-13 11:17:09.750000+02:00,33.072667,0.012893,0.000166,33.11,33.05,0.241676,0.209071,33.07,33.07,33.075,-6.3e-05,33.072669,7937.44


In [8]:
%%time
# multiprocessing
tmp_feat_extr.calculate(df_tmp, merge_dfs=True).sample(2)

CPU times: user 54.4 ms, sys: 94.4 ms, total: 149 ms
Wall time: 212 ms


Unnamed: 0_level_0,TMP_mean__w=240_s=120,TMP_std__w=240_s=120,TMP_var__w=240_s=120,TMP_amax__w=240_s=120,TMP_amin__w=240_s=120,TMP_skew__w=240_s=120,TMP_kurtosis__w=240_s=120,TMP_quantile_0.25__w=240_s=120,TMP_quantile_0.5__w=240_s=120,TMP_quantile_0.75__w=240_s=120,TMP_slope__w=240_s=120,TMP_rms__w=240_s=120,TMP_area__w=240_s=120
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017-06-13 11:04:09.750000+02:00,32.8085,0.058305,0.003399,32.89,32.66,-1.233843,0.833926,32.805,32.83,32.84,0.00073,32.808552,7874.04
2017-06-13 10:52:39.750000+02:00,32.257,0.033332,0.001111,32.29,32.18,-0.828322,-0.329838,32.23,32.27,32.29,0.000416,32.257017,7741.68


In [9]:
%%memit
tmp_feat_extr.calculate(df_tmp, merge_dfs=True)

peak memory: 136.24 MiB, increment: 0.29 MiB


### Example 2: Using multiple window sizes and strides

In this example, we will use multiple stride-window-size combinations on a wearable's **Galvanic Skin Response** (GSR) signal, also known as ElectroDermal Activity (EDA).

In [10]:
# PoC: every feature function gets a randomly selected (window size, stride) combination
import random

window_size_s = [60, 120]  # candidate window sizes, in seconds
stride_size_s = [30, 10, 20]  # candidate strides, in seconds
fs_gsr = 4  # sample frequency of the GSR signal, in Hz

# A dedicated, seeded generator makes the selected combinations reproducible across
# kernel restarts, without touching the global `random` state.
rng = random.Random(42)

gsr_feat_extr = FeatureCollection(
    [
        FeatureDescriptor(
            key="EDA",
            # seconds -> number of samples, using the sample frequency of *this* signal
            window=rng.choice(window_size_s) * fs_gsr,
            stride=rng.choice(stride_size_s) * fs_gsr,
            function=f,
        )
        for f in segment_funcs
    ]
)
gsr_feat_extr

EDA: (
	win: 240   , stride: 80: [
		FeatureDescriptor - func: NumpyFuncWrapper(mean, ['mean'], {}),
		FeatureDescriptor - func: NumpyFuncWrapper(std, ['std'], {}),
		FeatureDescriptor - func: NumpyFuncWrapper(var, ['var'], {}),
	]
	win: 480   , stride: 80: [
		FeatureDescriptor - func: NumpyFuncWrapper(amax, ['amax'], {}),
		FeatureDescriptor - func: NumpyFuncWrapper(kurtosis, ['kurtosis'], {}),
		FeatureDescriptor - func: NumpyFuncWrapper(<lambda>, ['rms'], {}),
	]
	win: 240   , stride: 120: [
		FeatureDescriptor - func: NumpyFuncWrapper(amin, ['amin'], {}),
		FeatureDescriptor - func: NumpyFuncWrapper(skew, ['skew'], {}),
	]
	win: 480   , stride: 120: [
		FeatureDescriptor - func: NumpyFuncWrapper(quantile, ['quantile_0.25', 'quantile_0.5', 'quantile_0.75'], {'q': [0.25, 0.5, 0.75]}),
	]
	win: 240   , stride: 40: [
		FeatureDescriptor - func: NumpyFuncWrapper(slope, ['slope'], {}),
	]
	win: 480   , stride: 40: [
		FeatureDescriptor - func: NumpyFuncWrapper(sum, ['area'], {}),
	]
)

In [11]:
df_gsr = pd.read_feather("data/gsr.feather").set_index("timestamp")
df_gsr.sample(2)

Unnamed: 0_level_0,EDA
timestamp,Unnamed: 1_level_1
2017-06-13 10:55:51.250000+02:00,0.153767
2017-06-13 11:16:22+02:00,0.120509


In [12]:
%%time
gsr_feat_extr.calculate(df_gsr, merge_dfs=True).sample(2)

CPU times: user 89.4 ms, sys: 9.21 ms, total: 98.6 ms
Wall time: 185 ms


Unnamed: 0_level_0,EDA_mean__w=240_s=80,EDA_std__w=240_s=80,EDA_var__w=240_s=80,EDA_amax__w=480_s=80,EDA_amin__w=240_s=120,EDA_skew__w=240_s=120,EDA_kurtosis__w=480_s=80,EDA_quantile_0.25__w=480_s=120,EDA_quantile_0.5__w=480_s=120,EDA_quantile_0.75__w=480_s=120,EDA_slope__w=240_s=40,EDA_rms__w=480_s=80,EDA_area__w=480_s=40
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017-06-13 12:39:19.750000+02:00,0.195687,0.002882,8e-06,0.221564,,,3.757767,,,,1.7e-05,0.192694,92.470188
2017-06-13 11:20:29.750000+02:00,,,,,,,,,,,-2.1e-05,,53.601223


In [13]:
%%memit
gsr_feat_extr.calculate(df_gsr, merge_dfs=True)

peak memory: 138.79 MiB, increment: 2.93 MiB


## Use case: Multiple series feature extraction

In [14]:
# Construct a FeatureCollection out of the two collections that were defined above
#   = higher-order wrapper which aggregates their feature descriptors
multimodal_feature_extraction = FeatureCollection(
    feature_desc_list=[gsr_feat_extr, tmp_feat_extr]
)
multimodal_feature_extraction

EDA: (
	win: 240   , stride: 80: [
		FeatureDescriptor - func: NumpyFuncWrapper(mean, ['mean'], {}),
		FeatureDescriptor - func: NumpyFuncWrapper(std, ['std'], {}),
		FeatureDescriptor - func: NumpyFuncWrapper(var, ['var'], {}),
	]
	win: 480   , stride: 80: [
		FeatureDescriptor - func: NumpyFuncWrapper(amax, ['amax'], {}),
		FeatureDescriptor - func: NumpyFuncWrapper(kurtosis, ['kurtosis'], {}),
		FeatureDescriptor - func: NumpyFuncWrapper(<lambda>, ['rms'], {}),
	]
	win: 240   , stride: 120: [
		FeatureDescriptor - func: NumpyFuncWrapper(amin, ['amin'], {}),
		FeatureDescriptor - func: NumpyFuncWrapper(skew, ['skew'], {}),
	]
	win: 480   , stride: 120: [
		FeatureDescriptor - func: NumpyFuncWrapper(quantile, ['quantile_0.25', 'quantile_0.5', 'quantile_0.75'], {'q': [0.25, 0.5, 0.75]}),
	]
	win: 240   , stride: 40: [
		FeatureDescriptor - func: NumpyFuncWrapper(slope, ['slope'], {}),
	]
	win: 480   , stride: 40: [
		FeatureDescriptor - func: NumpyFuncWrapper(sum, ['area'], {}),
	]
)
T

In [15]:
%%memit
multimodal_feature_extraction.calculate([df_gsr, df_tmp], merge_dfs=True)

peak memory: 139.46 MiB, increment: 3.49 MiB


In [16]:
%%time
df_feat = multimodal_feature_extraction.calculate([df_gsr, df_tmp], merge_dfs=True)
df_feat.sample(2)

CPU times: user 158 ms, sys: 6.63 ms, total: 165 ms
Wall time: 215 ms


Unnamed: 0_level_0,EDA_mean__w=240_s=80,EDA_std__w=240_s=80,EDA_var__w=240_s=80,EDA_amax__w=480_s=80,EDA_amin__w=240_s=120,EDA_skew__w=240_s=120,EDA_kurtosis__w=480_s=80,EDA_quantile_0.25__w=480_s=120,EDA_quantile_0.5__w=480_s=120,EDA_quantile_0.75__w=480_s=120,...,TMP_amax__w=240_s=120,TMP_amin__w=240_s=120,TMP_skew__w=240_s=120,TMP_kurtosis__w=240_s=120,TMP_quantile_0.25__w=240_s=120,TMP_quantile_0.5__w=240_s=120,TMP_quantile_0.75__w=240_s=120,TMP_slope__w=240_s=120,TMP_rms__w=240_s=120,TMP_area__w=240_s=120
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-06-13 12:44:39.750000+02:00,0.158596,0.004143,1.7e-05,0.166559,0.142255,-1.570967,-0.163657,0.147372,0.156326,0.158884,...,31.41,31.33,-0.158906,-0.842109,31.35,31.37,31.39,0.000255,31.369675,7528.72
2017-06-13 12:19:49.750000+02:00,,,,,,,,,,,...,,,,,,,,,,


## Serialization

Serialization is mandatory to store and share your pipelines.
`TODO`

In [17]:
multimodal_feature_extraction.serialize("data/example_serialization.pkl")

## Other packages

### tsfresh

https://tsfresh.readthedocs.io/en/latest/api/tsfresh.utilities.html#tsfresh.utilities.dataframe_functions.roll_time_series

In [None]:
from tsfresh.feature_extraction import extract_features
from tsfresh.utilities.dataframe_functions import roll_time_series

In [None]:
# define the window-size and stride (expressed in number of samples)
# we use the largest window and the smallest stride that were defined above
window = 480
stride = 40

df_gsr_id = df_gsr.reset_index(drop=False).copy()  # `timestamp` becomes a regular column
df_gsr_id["id"] = 1  # constant id column; passed as `column_id` to tsfresh below
df_gsr_id.sample(2)

**Note**: This outputs merely one expansion with a fixed window and stride.

In [21]:
%%memit
tsf_out = roll_time_series(
    df_gsr_id,
    column_id="id",
    max_timeshift=window,
    min_timeshift=window,
    rolling_direction=stride,
)

Rolling: 100%|██████████| 80/80 [00:01<00:00, 51.69it/s]


peak memory: 373.57 MiB, increment: 133.35 MiB


In [22]:
%%time
roll_time_series(
    df_gsr_id.reset_index(drop=True),
    column_id="id",
    max_timeshift=window,
    min_timeshift=window,
    rolling_direction=stride,
).sample(2)

Rolling: 100%|██████████| 80/80 [00:01<00:00, 50.77it/s]


CPU times: user 1.89 s, sys: 442 ms, total: 2.33 s
Wall time: 2.34 s


Unnamed: 0,timestamp,EDA,id,sort
81134,2017-06-13 11:03:10.750000+02:00,0.21133,"(1, 7237)",7083
199035,2017-06-13 11:44:14.750000+02:00,0.151209,"(1, 17037)",16939


In [23]:
%%memit
tsf_feats = extract_features(tsf_out.drop(columns="timestamp"), column_id="id")

Feature Extraction: 100%|██████████| 79/79 [01:13<00:00,  1.08it/s]


peak memory: 702.43 MiB, increment: 305.52 MiB


In [24]:
# NOTE: some additional logic is still needed to re-attach the timestamps to the features

In [25]:
%%time
extract_features(
    roll_time_series(
        df_gsr_id.reset_index(drop=True),
        column_id="id",
        max_timeshift=window,
        min_timeshift=window,
        rolling_direction=stride,
    ).drop(columns="timestamp"),
    column_id="id",
).sample(2)

Rolling: 100%|██████████| 80/80 [00:01<00:00, 56.32it/s]
Feature Extraction: 100%|██████████| 79/79 [01:13<00:00,  1.08it/s]


CPU times: user 5.25 s, sys: 1.45 s, total: 6.7 s
Wall time: 1min 17s


Unnamed: 0,Unnamed: 1,EDA__variance_larger_than_standard_deviation,EDA__has_duplicate_max,EDA__has_duplicate_min,EDA__has_duplicate,EDA__sum_values,EDA__abs_energy,EDA__mean_abs_change,EDA__mean_change,EDA__mean_second_derivative_central,EDA__median,...,sort__permutation_entropy__dimension_5__tau_1,sort__permutation_entropy__dimension_6__tau_1,sort__permutation_entropy__dimension_7__tau_1,sort__query_similarity_count__query_None__threshold_0.0,"sort__matrix_profile__feature_""min""__threshold_0.98","sort__matrix_profile__feature_""max""__threshold_0.98","sort__matrix_profile__feature_""mean""__threshold_0.98","sort__matrix_profile__feature_""median""__threshold_0.98","sort__matrix_profile__feature_""25""__threshold_0.98","sort__matrix_profile__feature_""75""__threshold_0.98"
1,30957,0.0,0.0,0.0,1.0,71.245141,10.60511,0.001655,-6.1e-05,4e-06,0.146092,...,-0.0,-0.0,-0.0,,0.0,0.0,0.0,0.0,0.0,0.0
1,19877,0.0,0.0,1.0,1.0,111.884515,26.125586,0.002033,8e-06,5e-06,0.235635,...,-0.0,-0.0,-0.0,,0.0,0.0,0.0,0.0,0.0,0.0


### Seglearn

https://dmbee.github.io/seglearn/

In [None]:
# !pip install -U seglearn

In [27]:
from numpy.random import rand
from seglearn.pipe import Pype
from seglearn.transform import FeatureRep, Segment
from seglearn.base import TS_Data
from seglearn.util import ts_stats, check_ts_data

In [28]:
s = Segment(width=480, step=40, order="F")

In [29]:
%%time
s.fit_transform(np.column_stack(df_gsr['EDA']), y=None)
s.transform(np.column_stack(df_tmp['TMP']), y=None)

CPU times: user 216 ms, sys: 9.13 ms, total: 225 ms
Wall time: 223 ms


(array([[382.21, 382.21, 382.21, ...,  31.35,  31.35,  31.35],
        [ 31.13,  31.13,  31.13, ...,  31.37,  31.37,  31.37],
        [ 31.15,  31.15,  31.15, ...,  31.37,  31.37,  31.37],
        ...,
        [ 31.39,  31.39,  31.39, ...,  31.35,  31.35,  31.35],
        [ 31.41,  31.41,  31.41, ...,  31.37,  31.37,  31.37],
        [ 31.39,  31.39,  31.39, ...,  31.35,  31.35,  31.35]]),
 None,
 None)

Speed seems to be of the same order of magnitude, but the time index is gone.

In [30]:
f_extr_pype = Pype([
    ("segment", Segment(width=480, step=40, order="F")),
    ("features", FeatureRep(features="default")),
])



In [33]:
f_extr_pype.fit_transform(np.column_stack(df_gsr['EDA']), y=None)

TypeError: object of type 'NoneType' has no len()

`TODO`: https://dmbee.github.io/seglearn/auto_examples/plot_feature_rep_mix_example.html#sphx-glr-auto-examples-plot-feature-rep-mix-example-py
still need to further look into this

# Serialization

## Series Pipeline

`TODO`

## Feature extraction

In [None]:
# restart the kernel: `os._exit(0)` terminates the kernel process immediately, so the
# cells below start from a fresh kernel that holds none of the definitions made above.
# NOTE(review): this also interrupts a "Run All" at this cell; the remaining cells
# presumably have to be run manually afterwards -- confirm.
import os

os._exit(0)

In [None]:
import pickle
import sys

import pandas as pd

time_series_dir = "../time_series/"
data_dir = "data/"

sys.path.append(time_series_dir)

In [2]:
# Load the pipeline that was serialized before the kernel restart.
# NOTE(security): unpickling can execute arbitrary code; only load files that you
# created yourself or that come from a trusted source.
with open(f"{data_dir}example_serialization.pkl", "rb") as f:
    multimodal_feature_extraction = pickle.load(f)

# Re-load the raw signals; the `timestamp` column becomes the (time) index
df_gsr = pd.read_feather(f"{data_dir}gsr.feather").set_index("timestamp")
df_tmp = pd.read_feather(f"{data_dir}tmp.feather").set_index("timestamp")

**note**: This is truly amazing, we do not need to redefine which local funcs were used;  
We only need a python kernel which knows the paths to the modules that are used in the serialization.

In [3]:
df_feat = multimodal_feature_extraction.calculate([df_gsr, df_tmp], merge_dfs=True)
df_feat.sample(2)

Unnamed: 0_level_0,EDA_mean__w=240_s=80,EDA_std__w=240_s=80,EDA_var__w=240_s=80,EDA_amax__w=480_s=80,EDA_amin__w=240_s=120,EDA_skew__w=240_s=120,EDA_kurtosis__w=480_s=80,EDA_quantile_0.25__w=480_s=120,EDA_quantile_0.5__w=480_s=120,EDA_quantile_0.75__w=480_s=120,...,TMP_amax__w=240_s=120,TMP_amin__w=240_s=120,TMP_skew__w=240_s=120,TMP_kurtosis__w=240_s=120,TMP_quantile_0.25__w=240_s=120,TMP_quantile_0.5__w=240_s=120,TMP_quantile_0.75__w=240_s=120,TMP_slope__w=240_s=120,TMP_rms__w=240_s=120,TMP_area__w=240_s=120
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-06-13 11:34:09.750000+02:00,,,,,0.16528,-1.550274,,0.171676,0.172955,0.174234,...,32.77,32.73,-0.000387,-6.7e-05,32.75,32.75,32.75,-5.2e-05,32.749335,7859.84
2017-06-13 11:15:29.750000+02:00,,,,,,,,,,,...,,,,,,,,,,


# Bonus - Get LAYD: Look At Your Data

And as a bonus for running/reading this notebook, you get some nice visualization code for,
of course, time series.

In [6]:
import ipywidgets as widgets
import plotly.graph_objects as go
from ipywidgets import interact_manual
from plotly.subplots import make_subplots

In [7]:
df_dict = {"tmp": df_tmp, "gsr": df_gsr}

In [8]:
feat_widget = widgets.SelectMultiple(options=df_feat.columns)
sig_widget = widgets.SelectMultiple(options=["gsr", "tmp"])

In [None]:
@interact_manual
def visuzalize(features=feat_widget, signals=sig_widget):
    """Plot the selected raw signals and the selected features on a shared x-axis.

    Each selected signal gets its own subplot row; all selected features are drawn
    together in one additional row named "features".

    Parameters
    ----------
    features :
        The selected column names of `df_feat`.
    signals :
        The selected keys of `df_dict`.

    Returns
    -------
    The constructed plotly figure, or `None` when nothing is selected.
    """
    # One row per signal, plus one "features" row when at least one feature is selected.
    # The parentheses are required: without them the conditional expression applies to
    # the whole sum, which empties the list (dropping the signal rows as well) whenever
    # no feature is selected.
    row_titles = list(signals) + (["features"] if len(features) else [])
    if not row_titles:
        # Nothing selected: a figure with 0 rows cannot be constructed (and the
        # spacing computation below would divide by zero).
        return None

    fig = make_subplots(
        rows=len(row_titles),
        cols=1,
        shared_xaxes=True,
        vertical_spacing=0.1 / len(row_titles),
        row_titles=row_titles,
    )
    fig.update_layout(height=300 * len(row_titles))

    # first, visualize the "raw" signals
    row_idx = 1
    for sig in signals:
        # Skip the first 10 rows and downsample to 1-second means.
        # NOTE(review): the reason for dropping the first 10 rows is not visible here.
        df_sig = df_dict[sig][10:].resample("1s").mean()
        # Iterate in column order (instead of over a set) so the trace order is stable
        signal_cols = [c for c in df_sig.columns if c not in ("index", "timestamp")]
        for col in signal_cols:
            fig.add_trace(
                go.Scattergl(x=df_sig.index, y=df_sig[col], name=col, hoverinfo="skip"),
                row=row_idx,
                col=1,
            )
        row_idx += 1

    # then visualize the features; `row_idx` now points at the row after the signals
    for feature in features:
        df_ff = df_feat[feature].dropna()
        fig.add_trace(
            go.Scattergl(
                connectgaps=True,
                x=df_ff.index,
                y=df_ff,
                name=feature,
                hoverinfo="skip",
                mode="markers",
                showlegend=True,
            ),
            row=row_idx,
            col=1,
        )

    return fig