In [1]:
import numpy as np
import pandas as pd
import scipy.stats as ss
from typing import List

# Data loading

In [2]:
# Load your data: one parquet file per modality, each indexed by its "timestamp" column
df_tmp = pd.read_parquet("data/empatica/tmp.parquet").set_index("timestamp")
df_gsr = pd.read_parquet("data/empatica/gsr.parquet").set_index("timestamp")
df_acc = pd.read_parquet("data/empatica/acc.parquet").set_index("timestamp")

# !!! The data MUST be datetime-indexed.
# Check every frame (not only df_tmp), since all three are used further on.
assert all(
    isinstance(df.index, pd.DatetimeIndex) for df in (df_tmp, df_gsr, df_acc)
), "df_tmp, df_gsr and df_acc must all have a pd.DatetimeIndex"

print(df_tmp.head(3))   # ~4Hz
print('-'*60)
print(df_acc.head(6))  # ~32Hz
print('-'*60)
print(df_gsr.head(3))   # ~4Hz

                                         TMP
timestamp                                   
2017-06-13 14:22:13+02:00         382.209991
2017-06-13 14:22:13.250000+02:00  382.209991
2017-06-13 14:22:13.500000+02:00  382.209991
------------------------------------------------------------
                                  ACC_x  ACC_y  ACC_z
timestamp                                            
2017-06-13 14:22:13+02:00             0      5     63
2017-06-13 14:22:13.031250+02:00      0      5     63
2017-06-13 14:22:13.062500+02:00      0      5     63
2017-06-13 14:22:13.093750+02:00      0      5     63
2017-06-13 14:22:13.125000+02:00      0      5     63
2017-06-13 14:22:13.156250+02:00     -1      5     63
------------------------------------------------------------
                                       EDA
timestamp                                 
2017-06-13 14:22:13+02:00         0.000000
2017-06-13 14:22:13.250000+02:00  0.400309
2017-06-13 14:22:13.500000+02:00  0.475767


**note**: The `ACC` signal is sampled at a different sample frequency.

> So we deal with `multivariate data` of which each modality has a different sample-frequency.

# Series Processing pipeline

In [3]:
try:
    from tsflex.processing import SeriesProcessor, SeriesPipeline
    from tsflex.processing.logger import get_processor_logs
except ImportError:
    # Only a failed import triggers the fallback (a bare `except:` would also
    # hide unrelated errors): make the parent directory importable and retry.
    import sys
    sys.path.append('../')
    from tsflex.processing import SeriesProcessor, SeriesPipeline
    from tsflex.processing.logger import get_processor_logs

> **note**: processing is applied to one or multiple flat series as-is and does not use window-stride parameters.<br>The **output** of a processing step should be `one or multiple` existing **series** (which will be updated in the internal representation) and/or `one or multiple` new series.

**[link to docs](https://predict-idlab.github.io/tsflex/processing/index.html#getting-started)**

![series uml](https://raw.githubusercontent.com/predict-idlab/tsflex/main/docs/_static/series_uml.png)

As shown above, there are 2 relevant classes for processing.

1. [SeriesPipeline](https://predict-idlab.github.io/tsflex/processing/#tsflex.processing.SeriesPipeline): serves as a pipeline, holding the to-be-applied processing steps
2. [SeriesProcessor](https://predict-idlab.github.io/tsflex/processing/#tsflex.processing.SeriesProcessor): an instance of this class describes a processing step.<br>
   Processors are defined by:<br>
   * `function`: the Callable processing-function - e.g. scipy.signal.detrend
   * `series_names`: the name(s) of the series on which the processing function should be applied
   * `**kwargs`: the keyword arguments for the function.

The snippets below show how the SeriesPipeline & SeriesProcessor interplay:

In [4]:
# --------------------- some custom signal processing functions ---------------------
from scipy.signal import savgol_filter

def clip_quantiles(sig: pd.Series, lower_q=0.01, upper_q=0.99) -> np.ndarray:
    """Clip ``sig`` to the range spanned by two of its own quantiles.

    Values below the ``lower_q`` quantile are raised to that quantile and
    values above the ``upper_q`` quantile are lowered to it.

    Note that this function induces a data leakage: both bounds are computed
    over the complete input series.
    """
    lower_bound, upper_bound = np.quantile(sig, q=[lower_q, upper_q])
    return np.clip(sig, lower_bound, upper_bound)

def smv(*sigs) -> pd.Series:
    """Combine several series into one new series: their element-wise Euclidean norm.

    Parameters
    ----------
    *sigs : pd.Series
        The named input series (e.g. ``ACC_x``, ``ACC_y``, ``ACC_z``). The values
        are combined positionally; the index of the first series is reused.

    Returns
    -------
    pd.Series
        ``sqrt(sum(sig ** 2 for sig in sigs))``, named ``"<prefixes>_SMV"`` where
        ``<prefixes>`` are the unique name parts before the first ``_``, joined
        with ``|`` in order of first appearance.
    """
    # dict.fromkeys de-duplicates like a set but keeps first-seen order, so the
    # generated name is deterministic when the inputs have different prefixes.
    sig_prefixes = dict.fromkeys(sig.name.split('_')[0] for sig in sigs)
    result = np.sqrt(np.sum([np.square(sig) for sig in sigs], axis=0))
    return pd.Series(result, index=sigs[0].index, name='|'.join(sig_prefixes) + '_' + 'SMV')

In [5]:
# -- 1.  Create the series processors (with their keyword arguments)
# Each processor binds a function to the series name(s) it should be applied to.
clipper_tmp = SeriesProcessor(clip_quantiles, series_names="TMP", lower_q=0, upper_q=0.999)
savgol_eda = SeriesProcessor(savgol_filter, "EDA", window_length=5, polyorder=2)
# A list of names: the repr below shows the function being applied to each series separately
savgol_acc = SeriesProcessor(savgol_filter, ["ACC_x", "ACC_y", "ACC_z"], window_length=33, polyorder=2)
# A tuple of names: the three series are passed together to one `smv` call,
# which returns a new series named "ACC_SMV"
smv_processor = SeriesProcessor(smv, ("ACC_x", "ACC_y", "ACC_z"))
# Operates on the series created by `smv_processor`, so it must come after it
clipper_smv = SeriesProcessor(clip_quantiles, "ACC_SMV")

# -- 2. Create the series pipeline, which holds the list of processors
series_pipe = SeriesPipeline(
    processors=[clipper_tmp, savgol_eda, savgol_acc, smv_processor, clipper_smv]
)
# Display the pipeline's repr as the cell output
series_pipe

[
	clip_quantiles {'lower_q': 0, 'upper_q': 0.999} :  ('TMP',)
	savgol_filter {'window_length': 5, 'polyorder': 2} :  ('EDA',)
	savgol_filter {'window_length': 33, 'polyorder': 2} :  ('ACC_x',) ('ACC_y',) ('ACC_z',)
	smv {} :  ('ACC_x', 'ACC_y', 'ACC_z')
	clip_quantiles {} :  ('ACC_SMV',)
]

In [6]:
# 3. Process the data
out_data : List[pd.Series] = series_pipe.process(
    [df_gsr, df_acc, df_tmp]
)

# List the names of the returned series: the processed input series
# plus the newly created ACC_SMV series
for series in out_data:
    print(series.name)

EDA
ACC_x
ACC_y
ACC_z
TMP
ACC_SMV


## Processing function logging

The `SeriesPipeline` can also log the duration of its `SeriesProcessor` steps.

To do so, one needs to set the `logging_file_path` argument and then call the `get_processor_logs()` function with this corresponding path to see the output dataframe.

In [7]:
# Run the pipeline again, this time writing the processor logs to a file;
# the processed output is discarded (`_`) as only the log is of interest here.
_ = series_pipe.process([df_gsr, df_acc, df_tmp], logging_file_path="example_processing_logs.log")



In [8]:
# Load the log file written by the previous cell and display it
get_processor_logs("example_processing_logs.log")

Unnamed: 0,log_time,function,series_names,output_names,duration,duration %
0,2023-08-16 12:45:29.484,clip_quantiles,"(TMP,)",TMP,0 days 00:00:00.002462031,7.67
1,2023-08-16 12:45:29.486,savgol_filter,"(EDA,)",EDA,0 days 00:00:00.001066185,3.32
2,2023-08-16 12:45:29.505,savgol_filter,"(ACC_x,), (ACC_y,), (ACC_z,)","ACC_x, ACC_y, ACC_z",0 days 00:00:00.019186704,59.79
3,2023-08-16 12:45:29.510,smv,"(ACC_x, ACC_y, ACC_z)",ACC_SMV,0 days 00:00:00.004629862,14.43
4,2023-08-16 12:45:29.514,clip_quantiles,"(ACC_SMV,)",ACC_SMV,0 days 00:00:00.004746801,14.79


This is especially useful to see which processing step is the bottleneck.

# Feature extraction

In [9]:
try:
    from tsflex.features import FeatureCollection, FuncWrapper
    from tsflex.features import FeatureDescriptor, MultipleFeatureDescriptors
    from tsflex.features.logger import get_feature_logs
except ImportError:
    # Only a failed import triggers the fallback (a bare `except:` would also
    # hide unrelated errors): make the parent directory importable and retry.
    import sys
    sys.path.append('../')
    from tsflex.features import FeatureCollection, FuncWrapper
    from tsflex.features import FeatureDescriptor, MultipleFeatureDescriptors
    from tsflex.features.logger import get_feature_logs

## Defining functions

**[link to docs](https://predict-idlab.github.io/tsflex/features/index.html#getting-started)**

![features uml](https://raw.githubusercontent.com/predict-idlab/tsflex/main/docs/_static/features_uml.png)

As shown above, there are 3 relevant classes for feature-extraction.

1. [FeatureCollection](https://predict-idlab.github.io/tsflex/features/#tsflex.features.FeatureCollection): serves as a registry, holding the to-be-calculated _features_
2. [FeatureDescriptor](https://predict-idlab.github.io/tsflex/features/#tsflex.features.FeatureDescriptor): an instance of this class describes a _feature_. <br>Features are defined by:
      * `series_name`: the name(s) of the series which this feature will use. 
      * `function`: the _Callable_ feature-function - e.g. _np.mean_
      * `window`: the _time-based_ window -  e.g. _"1hour"_
      * `stride`: the _time-based_ stride - e.g. _"2days"_
3. [FuncWrapper](https://predict-idlab.github.io/tsflex/features/#tsflex.features.FuncWrapper): a wrapper around _Callable_ functions, intended for advanced feature function definitions, such as:
    * features with multiple output columns
    * passing _**kwargs_ to feature functions


**Note**: this library does `not` provide any feature-functions as:
* There already exist many other feature extraction libraries such as numpy, scipy, tsfresh with which `tsflex` integrates.
* (Relevant) features depend on the objective and the signal modalities, making feature methods very problem-specific.
* Finally, as can be seen in the example below, our `FuncWrapper`'s `func`-attribute is versatile enough to wrap the end-user's desired features.

In [10]:
# --------------------- some custom feature extraction functions ---------------------
# -- 1. one-to-many functions
#    To compute quantiles, you need to sort the windowed data, which is a rather
#    expensive operation O(n*log(n)). Hence, you might want to calculate all your
#    desired quantiles in a single function-wrapper, returning multiple outputs.

quantiles = [0.25, 0.5, 0.75]
f_quantiles = FuncWrapper(
    func=np.quantile,  # the wrapped function that will operate on numpy arrays
    output_names=[f"quantile_{q}" for q in quantiles],  # the output column names
    q=quantiles,  # optional - additional function-related kwargs
)


# -- 2. in-line functions
#    You can define your functions locally; these will serialize flawlessly
def slope(x):
    """Return the slope of the least-squares straight line fitted through ``x``.

    The sample positions ``0 .. len(x) - 1`` are used as the x-axis, so the
    result is expressed per sample (timestamps are not taken into account).
    """
    positions = np.arange(len(x))
    gradient, _intercept = np.polyfit(positions, x, deg=1)
    return gradient

f_slope = FuncWrapper(slope, output_names="slope")

# -- 3. Lambda's
#    Or even use lambda's and other modules' functions
f_rms = FuncWrapper(lambda x: np.sqrt(np.mean(x ** 2)), output_names="rms")
f_area = FuncWrapper(np.sum, output_names="area")


# (For convenience) we store the feature functions - plain callables as well as
# the constructed `FuncWrapper`s - in a single list
segment_funcs = [
    np.mean,
    np.std,
    np.var,
    np.max,
    np.min,
    ss.skew,  # use other libraries such as scipy
    ss.kurtosis,
    f_quantiles,
    f_slope,
    f_rms,
    f_area,
]
segment_funcs

[<function numpy.mean(a, axis=None, dtype=None, out=None, keepdims=<no value>, *, where=<no value>)>,
 <function numpy.std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=<no value>, *, where=<no value>)>,
 <function numpy.var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=<no value>, *, where=<no value>)>,
 <function numpy.amax(a, axis=None, out=None, keepdims=<no value>, initial=<no value>, where=<no value>)>,
 <function numpy.amin(a, axis=None, out=None, keepdims=<no value>, initial=<no value>, where=<no value>)>,
 <function scipy.stats._stats_py.skew(a, axis=0, bias=True, nan_policy='propagate', *, keepdims=False)>,
 <function scipy.stats._stats_py.kurtosis(a, axis=0, fisher=True, bias=True, nan_policy='propagate', *, keepdims=False)>,
 FuncWrapper(quantile, ['quantile_0.25', 'quantile_0.5', 'quantile_0.75'], {'q': [0.25, 0.5, 0.75]}),
 FuncWrapper(slope, ['slope'], {}),
 FuncWrapper(<lambda>, ['rms'], {}),
 FuncWrapper(sum, ['area'], {})]

## Single series feature extraction

The defined functions above will be encapsulated in a [FeatureDescriptor](https://predict-idlab.github.io/tsflex/features/index.html#tsflex.features.FeatureDescriptor) object.

A `FeatureDescriptor` describes a feature, and has 4 main attributes:

<center>

## FeatureDescriptor constructor args:

|  attribute 	|                  type                 	| info                                                                                                             	|
|-----------:	|:-------------------------------------:	|------------------------------------------------------------------------------------------------------------------	|
| `function` 	| Union[Callable, <br>FuncWrapper] 	| The `function` that calculates this feature.                                                                     	|
| `series_name` 	|                 Union[str, Tuple[str, ...]]                	| The series name(s); i.e., the `pd.DataFrame` column name or <br> `pd.Series` name on which the function will operate.     	|
|   `window` 	|                  Union[str, pd.Timedelta]               	| The window size on which this feature will be applied, <br> expressed in a **time-based** manner. 	|
|   `stride` 	|                  Union[str, pd.Timedelta]                  	| The stride of the window rolling process, also <br> expressed in a **time-based** manner. 	|

</center>


**note**: [MultipleFeatureDescriptors](https://predict-idlab.github.io/tsflex/features/index.html#tsflex.features.MultipleFeatureDescriptors) is actually a factory for `FeatureDescriptor` objects.

### Fixed window size & stride

**note**: this functionality is exposed by most existing time-series libraries (often in a sample-based manner).

In this example, we will use the _temperature_ signal from a wearable

In [11]:
# Peek at two randomly drawn rows of the temperature data
df_tmp.sample(2)

Unnamed: 0_level_0,TMP
timestamp,Unnamed: 1_level_1
2017-06-13 14:28:02.500000+02:00,30.35
2017-06-13 15:49:32.500000+02:00,33.68


Note how the `TMP` column name is used as the series name in the `FeatureCollection`.

In [12]:
# Describe the TMP features: every function of `segment_funcs` is applied
# on a 60s window that moves forward with a 30s stride
tmp_feat_extr = FeatureCollection(
    feature_descriptors=[
        MultipleFeatureDescriptors(
            functions=segment_funcs,  # The list of functions we constructed earlier
            series_names=["TMP"],
            windows='60s',
            strides='30s',
        )
    ]
)

# The FeatureCollection's __repr__() gives a nice overview of the structure
tmp_feat_extr

TMP: (
	win: 1m    : [
		FeatureDescriptor - func: FuncWrapper(mean, ['mean'], {})    stride: ['30s'],
		FeatureDescriptor - func: FuncWrapper(std, ['std'], {})    stride: ['30s'],
		FeatureDescriptor - func: FuncWrapper(var, ['var'], {})    stride: ['30s'],
		FeatureDescriptor - func: FuncWrapper(amax, ['amax'], {})    stride: ['30s'],
		FeatureDescriptor - func: FuncWrapper(amin, ['amin'], {})    stride: ['30s'],
		FeatureDescriptor - func: FuncWrapper(skew, ['skew'], {})    stride: ['30s'],
		FeatureDescriptor - func: FuncWrapper(kurtosis, ['kurtosis'], {})    stride: ['30s'],
		FeatureDescriptor - func: FuncWrapper(quantile, ['quantile_0.25', 'quantile_0.5', 'quantile_0.75'], {'q': [0.25, 0.5, 0.75]})    stride: ['30s'],
		FeatureDescriptor - func: FuncWrapper(slope, ['slope'], {})    stride: ['30s'],
		FeatureDescriptor - func: FuncWrapper(<lambda>, ['rms'], {})    stride: ['30s'],
		FeatureDescriptor - func: FuncWrapper(sum, ['area'], {})    stride: ['30s'],
	]
)

In [13]:
# To extract the features, we just call the collection's `calculate()` function
extracted_feats = tmp_feat_extr.calculate(
    data=out_data,    # The signals on which features are calculated
    # NOTE: out_data holds the updated (processed) TMP signal
    return_df=True,  # If true, an outer merge on the feature-outputs will be performed
    n_jobs=2         # If > 1, the feature extraction is parallelized
)

# Show two randomly drawn rows of the resulting feature dataframe
extracted_feats.sample(2)

Unnamed: 0_level_0,TMP__amax__w=1m,TMP__amin__w=1m,TMP__area__w=1m,TMP__kurtosis__w=1m,TMP__mean__w=1m,TMP__quantile_0.25__w=1m,TMP__quantile_0.5__w=1m,TMP__quantile_0.75__w=1m,TMP__rms__w=1m,TMP__skew__w=1m,TMP__slope__w=1m,TMP__std__w=1m,TMP__var__w=1m
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017-06-13 14:38:43+02:00,32.790001,32.290001,7803.44043,-1.432049,32.514336,32.330002,32.51,32.66,32.514774,0.209048,0.002374,0.16899,0.028558
2017-06-13 15:17:13+02:00,34.610001,34.43,8278.279297,-0.480542,34.492832,34.450001,34.470001,34.529999,34.492874,0.847968,0.00068,0.05345,0.002857


### Multiple `time-based` window sizes and strides

_In this example, we use **multiple** stride-window-size combinations on a wearable's ElectroDermal Activity (EDA)_

Note that we do not use int-based window-stride combinations, but `time-based` ones. Also take a closer look at the `__repr__` string.

In [14]:
# PoC: for every feature function, we pick a random window-size / stride combination
window_size_s = ['30s', '120s', '90s', '1h']
stride_size_s = ['15s', '30s']

import random

# A dedicated, seeded generator keeps the "random" configuration identical
# across kernel restarts (so the outputs and logs further down stay consistent)
# without touching the global `random` state.
window_stride_rng = random.Random(42)

gsr_feat_extr = FeatureCollection(
    [
        FeatureDescriptor(
            series_name="EDA",
            window=window_stride_rng.choice(window_size_s),
            stride=window_stride_rng.choice(stride_size_s),
            function=f,
        )
        for f in segment_funcs
    ]
)

# the __repr__ string outputs the windows & strides in a time-string representation :)
print(gsr_feat_extr)
print('-'*60)
gsr_feat_extr.calculate(df_gsr, return_df=True, show_progress=False, n_jobs=None).sample(2)

EDA: (
	win: 1h    : [
		FeatureDescriptor - func: FuncWrapper(mean, ['mean'], {})    stride: ['30s'],
	]
	win: 1m30s : [
		FeatureDescriptor - func: FuncWrapper(std, ['std'], {})    stride: ['30s'],
		FeatureDescriptor - func: FuncWrapper(var, ['var'], {})    stride: ['30s'],
		FeatureDescriptor - func: FuncWrapper(kurtosis, ['kurtosis'], {})    stride: ['15s'],
		FeatureDescriptor - func: FuncWrapper(slope, ['slope'], {})    stride: ['15s'],
		FeatureDescriptor - func: FuncWrapper(sum, ['area'], {})    stride: ['15s'],
	]
	win: 2m    : [
		FeatureDescriptor - func: FuncWrapper(amax, ['amax'], {})    stride: ['15s'],
		FeatureDescriptor - func: FuncWrapper(amin, ['amin'], {})    stride: ['15s'],
	]
	win: 30s   : [
		FeatureDescriptor - func: FuncWrapper(skew, ['skew'], {})    stride: ['15s'],
		FeatureDescriptor - func: FuncWrapper(quantile, ['quantile_0.25', 'quantile_0.5', 'quantile_0.75'], {'q': [0.25, 0.5, 0.75]})    stride: ['30s'],
		FeatureDescriptor - func: FuncWrapper(<lambda

Unnamed: 0_level_0,EDA__amax__w=2m,EDA__amin__w=2m,EDA__area__w=1m30s,EDA__kurtosis__w=1m30s,EDA__mean__w=1h,EDA__quantile_0.25__w=30s,EDA__quantile_0.5__w=30s,EDA__quantile_0.75__w=30s,EDA__rms__w=30s,EDA__skew__w=30s,EDA__slope__w=1m30s,EDA__std__w=1m30s,EDA__var__w=1m30s
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017-06-13 16:12:13+02:00,1.220547,1.012079,398.828674,7.343093,2.056634,1.082421,1.095849,1.106721,1.095335,-0.420529,-8.9e-05,0.016636,0.000277
2017-06-13 14:52:58+02:00,0.950591,0.777933,306.619629,-1.330574,,,,,0.814525,0.054707,-0.000287,,


**note**: The `NaN` values in the above pd.DataFrame are caused by the outer merge which we do to retain the time-indices. The features were extracted with different windows & strides, so only a few features share the same time-indices.
<br><br>
If we set `return_df=False`, a `List[pd.Series]` will be returned

In [15]:
# With return_df=False no merge is performed: each feature output is returned as its own series
feat_list : List[pd.Series] = gsr_feat_extr.calculate(df_gsr, return_df=False, show_progress=False, n_jobs=None)
print(len(feat_list))
# Show three randomly drawn rows of the first feature series
feat_list[0].sample(3)

11


Unnamed: 0_level_0,EDA__amax__w=2m
timestamp,Unnamed: 1_level_1
2017-06-13 15:55:28+02:00,4.517331
2017-06-13 16:05:58+02:00,1.47122
2017-06-13 15:51:43+02:00,4.738588


## Rounded start index feature extraction

When working with data that contain 'unclean' timestamps, it is possible to prettify the results by using the `exact_time` parameter.

Here you can see some data that have a very small resolution.

In [17]:
# Drop the first 30 samples so that the data no longer starts on a whole second
df_acc.iloc[30:].head(5)

Unnamed: 0_level_0,ACC_x,ACC_y,ACC_z
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-06-13 14:22:13.937500+02:00,1,6,63
2017-06-13 14:22:13.968750+02:00,1,6,63
2017-06-13 14:22:14+02:00,-1,4,63
2017-06-13 14:22:14.031250+02:00,-2,4,63
2017-06-13 14:22:14.062500+02:00,0,5,63


In [19]:
# A single feature: the standard deviation of ACC_x over a 30s window, moved in 10s strides
fc = FeatureCollection(
    feature_descriptors=[
        FeatureDescriptor(
            function=np.std,
            series_name="ACC_x",
            window='30s',
            stride='10s',
        ),
    ]
)

fc

ACC_x: (
	win: 30s   : [
		FeatureDescriptor - func: FuncWrapper(std, ['std'], {})    stride: ['10s'],
	]
)

In [22]:
# Without rounding, the output timestamps keep the sub-second offset of the first sample.
# `[0]` picks the first (here: only) entry of the returned result.
fc.calculate(df_acc.iloc[30:])[0]

Unnamed: 0_level_0,ACC_x__std__w=30s
timestamp,Unnamed: 1_level_1
2017-06-13 14:22:43.937500+02:00,0.771801
2017-06-13 14:22:53.937500+02:00,0.799006
2017-06-13 14:23:03.937500+02:00,0.659779
2017-06-13 14:23:13.937500+02:00,0.624957
2017-06-13 14:23:23.937500+02:00,0.700799
...,...
2017-06-13 16:27:13.937500+02:00,18.077957
2017-06-13 16:27:23.937500+02:00,8.099124
2017-06-13 16:27:33.937500+02:00,14.798578
2017-06-13 16:27:43.937500+02:00,14.624782


You can, for example, have a resolution of 1 second as seen below.

**note:**
This can give (slightly) different results compared to not rounding the timestamps. The difference also depends on the resolution used.

In [21]:
# Same extraction, with the rounding resolution given as a time-string (1 second)
fc.calculate(df_acc.iloc[30:], exact_time='1s')[0]

Unnamed: 0_level_0,ACC_x__std__w=30s
timestamp,Unnamed: 1_level_1
2017-06-13 14:22:44+02:00,0.770055
2017-06-13 14:22:54+02:00,0.798542
2017-06-13 14:23:04+02:00,0.660551
2017-06-13 14:23:14+02:00,0.624135
2017-06-13 14:23:24+02:00,0.700799
...,...
2017-06-13 16:27:14+02:00,18.065915
2017-06-13 16:27:24+02:00,8.160273
2017-06-13 16:27:34+02:00,14.800646
2017-06-13 16:27:44+02:00,14.624782


For time indices, you can also use a timedelta object.

In [25]:
# Same extraction, with the rounding resolution given as a pd.Timedelta (10 seconds)
fc.calculate(df_acc.iloc[30:], exact_time=pd.Timedelta(10, 'seconds'))[0]

Unnamed: 0_level_0,ACC_x__std__w=30s
timestamp,Unnamed: 1_level_1
2017-06-13 14:22:50+02:00,0.783929
2017-06-13 14:23:00+02:00,0.798185
2017-06-13 14:23:10+02:00,0.665362
2017-06-13 14:23:20+02:00,0.631974
2017-06-13 14:23:30+02:00,0.683508
...,...
2017-06-13 16:27:20+02:00,11.929327
2017-06-13 16:27:30+02:00,14.153599
2017-06-13 16:27:40+02:00,14.686610
2017-06-13 16:27:50+02:00,14.468111


Or when setting it to `False`, it rounds using the LCM of strides and window. (In this case, the first index will be rounded to a multiple of 30 seconds).

In [27]:
# Same extraction, passing a boolean instead of an explicit rounding resolution
fc.calculate(df_acc.iloc[30:], exact_time=False)[0]

Unnamed: 0_level_0,ACC_x__std__w=30s
timestamp,Unnamed: 1_level_1
2017-06-13 14:23:00+02:00,0.798185
2017-06-13 14:23:10+02:00,0.665362
2017-06-13 14:23:20+02:00,0.631974
2017-06-13 14:23:30+02:00,0.683508
2017-06-13 14:23:40+02:00,0.629029
...,...
2017-06-13 16:27:20+02:00,11.929327
2017-06-13 16:27:30+02:00,14.153599
2017-06-13 16:27:40+02:00,14.686610
2017-06-13 16:27:50+02:00,14.468111


This functionality can also be used in sequence data, where the indices are numeric. In that case, the values of window, stride should be numeric and the value of exact_time can either be a `bool`, `int` or `float`.

The full description of this parameter can be found in this table:

| index datatype | rounding datatype | return datatype | extra info                                                        |
| :------------- | :---------------- | --------------: | ----------------------------------------------------------------: |
| int            | int               |             int | round `index` to nearest multiple of `rounding`                   |
|                | float             |           float | round `index` to nearest multiple of `rounding`                   |
|                | bool              |           float | round `index` to lcm of `window` and/or `stride`                  |
| float          | int               |           float | round `index` to nearest multiple of `rounding`                   |
|                | float             |           float | round `index` to nearest multiple of `rounding`                   |
|                | bool              |           float | round `index` to lcm of `window` and/or `stride`                  |
| pd.timestamp   | str               |    pd.timestamp | round `index` to resolution of `rounding` (e.g. '10s', '2m', 'h') |
|                | bool              |    pd.timestamp | round `index` to lcm of `window` and/or `stride`                  |
|                | pd.timedelta      |    pd.timestamp | round `index` to nearest multiple of `rounding`.                  |


## Multiple series feature extraction

In [16]:
# Construct the multimodal FeatureCollection
#   = a higher-order wrapper which aggregates the feature descriptors
#     of the EDA and TMP collections defined earlier
multimodal_feature_extraction = FeatureCollection(
    feature_descriptors=[gsr_feat_extr, tmp_feat_extr]
)

print(multimodal_feature_extraction)
print('-'*60)

# NOTE(review): the features are calculated on the raw df_gsr / df_tmp here,
# not on the processed `out_data` - confirm that this is intended.
df_feat = multimodal_feature_extraction.calculate(
    [df_gsr, df_tmp], return_df=True
)
df_feat.sample(2)

EDA: (
	win: 1h    : [
		FeatureDescriptor - func: FuncWrapper(mean, ['mean'], {})    stride: ['30s'],
	]
	win: 1m30s : [
		FeatureDescriptor - func: FuncWrapper(std, ['std'], {})    stride: ['30s'],
		FeatureDescriptor - func: FuncWrapper(var, ['var'], {})    stride: ['30s'],
		FeatureDescriptor - func: FuncWrapper(kurtosis, ['kurtosis'], {})    stride: ['15s'],
		FeatureDescriptor - func: FuncWrapper(slope, ['slope'], {})    stride: ['15s'],
		FeatureDescriptor - func: FuncWrapper(sum, ['area'], {})    stride: ['15s'],
	]
	win: 2m    : [
		FeatureDescriptor - func: FuncWrapper(amax, ['amax'], {})    stride: ['15s'],
		FeatureDescriptor - func: FuncWrapper(amin, ['amin'], {})    stride: ['15s'],
	]
	win: 30s   : [
		FeatureDescriptor - func: FuncWrapper(skew, ['skew'], {})    stride: ['15s'],
		FeatureDescriptor - func: FuncWrapper(quantile, ['quantile_0.25', 'quantile_0.5', 'quantile_0.75'], {'q': [0.25, 0.5, 0.75]})    stride: ['30s'],
		FeatureDescriptor - func: FuncWrapper(<lambda

Unnamed: 0_level_0,EDA__amax__w=2m,EDA__amin__w=2m,EDA__area__w=1m30s,EDA__kurtosis__w=1m30s,EDA__mean__w=1h,EDA__quantile_0.25__w=30s,EDA__quantile_0.5__w=30s,EDA__quantile_0.75__w=30s,EDA__rms__w=30s,EDA__skew__w=30s,...,TMP__kurtosis__w=1m,TMP__mean__w=1m,TMP__quantile_0.25__w=1m,TMP__quantile_0.5__w=1m,TMP__quantile_0.75__w=1m,TMP__rms__w=1m,TMP__skew__w=1m,TMP__slope__w=1m,TMP__std__w=1m,TMP__var__w=1m
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-06-13 15:39:28+02:00,3.183709,1.596755,921.28009,0.894696,,,,,2.297914,-2.049654,...,,,,,,,,,,
2017-06-13 15:40:13+02:00,2.855021,1.596755,827.42395,1.085358,1.263708,2.137228,2.169202,2.221959,2.179614,0.264285,...,-1.026206,34.577831,34.549999,34.57,34.610001,34.577858,0.256684,-0.000537,0.040788,0.001664


## Logging

The `FeatureCollection` can also log the duration of its features.

To do so, one needs to set the `logging_file_path` argument and then call the `get_feature_logs()` function with this corresponding path to see the output dataframe.

In [17]:
# Calculate the features on the processed series while writing the feature logs to a file;
# the trailing `;` suppresses the cell output.
multimodal_feature_extraction.calculate(out_data, logging_file_path="example_feature_logs.log");

In [18]:
# Load the log file written by the previous cell and display it
get_feature_logs("example_feature_logs.log")

Unnamed: 0,log_time,function,series_names,window,stride,output_names,duration,duration %
0,2023-05-05 12:17:13.103,sum,"(EDA,)",1h,"(30s,)",EDA__area__w=1h,0 days 00:00:00.003769857,0.21
1,2023-05-05 12:17:13.106,mean,"(EDA,)",1h,"(30s,)",EDA__mean__w=1h,0 days 00:00:00.006607244,0.37
2,2023-05-05 12:17:13.144,var,"(EDA,)",30s,"(15s,)",EDA__var__w=30s,0 days 00:00:00.032368199,1.82
3,2023-05-05 12:17:13.150,amax,"(EDA,)",1m30s,"(15s,)",EDA__amax__w=1m30s,0 days 00:00:00.010895506,0.61
4,2023-05-05 12:17:13.153,amin,"(EDA,)",1m30s,"(15s,)",EDA__amin__w=1m30s,0 days 00:00:00.007409622,0.42
5,2023-05-05 12:17:13.158,std,"(EDA,)",30s,"(15s,)",EDA__std__w=30s,0 days 00:00:00.058665547,3.3
6,2023-05-05 12:17:13.160,<lambda>,"(EDA,)",1m30s,"(30s,)",EDA__rms__w=1m30s,0 days 00:00:00.014571842,0.82
7,2023-05-05 12:17:13.173,mean,"(TMP,)",1m,"(30s,)",TMP__mean__w=1m,0 days 00:00:00.009052561,0.51
8,2023-05-05 12:17:13.177,quantile,"(EDA,)",30s,"(30s,)","EDA__quantile_0.25__w=30s, EDA__quantile_0.5__...",0 days 00:00:00.044969643,2.53
9,2023-05-05 12:17:13.186,amax,"(TMP,)",1m,"(30s,)",TMP__amax__w=1m,0 days 00:00:00.003757103,0.21


This is especially useful when optimizing, to see which features are the bottleneck.

## Use case: batch-based feature extraction

In [19]:
from tsflex.chunking import chunk_data

* maybe execute this on a highdimensional series, like the `sleep data`

In [20]:
# Split the data into chunks of at most 10 minutes.
# NOTE(review): `fs_dict` lists a sample frequency for both "EDA" and "TMP", but only
# df_tmp is passed as data - presumably df_gsr was meant to be included; confirm.
same_range_chunks = chunk_data(
    data=[df_tmp],
    fs_dict={"EDA": 4, "TMP": 4},
    max_chunk_dur='10min'
)

## Serialization

Serialization is mandatory to store and share your pipelines.
`TODO`

In [21]:
# Store the multimodal feature collection at the given path
multimodal_feature_extraction.serialize("data/example_serialization.pkl")

# Bonus - Get LAYD: Look At Your Data

And as a bonus for running/reading this notebook, you get some nice visualization code,
for time series, of course.

In [24]:
# !pip install ipywidgets
# !pip install plotly
import ipywidgets as widgets
import plotly.graph_objects as go
from ipywidgets import interact_manual
from plotly.subplots import make_subplots

In [25]:
# Maps the selectable signal names onto their (raw) dataframes
df_dict = {"tmp": df_tmp, "gsr": df_gsr}
# Multi-select widgets: one listing the feature columns, one listing the signal names
feat_widget = widgets.SelectMultiple(options=df_feat.columns)
sig_widget = widgets.SelectMultiple(options=["gsr", "tmp"])


@interact_manual
def visuzalize(features=feat_widget, signals=sig_widget):
    """Plot the selected signals and features on vertically stacked subplots.

    Every selected signal gets its own row; all selected features share one
    extra row at the bottom. The rows share their x-axis.

    Parameters
    ----------
    features :
        The selected column names of ``df_feat``.
    signals :
        The selected keys of ``df_dict``.

    Returns
    -------
    The result of ``fig.show()``, or ``None`` when nothing is selected.
    """
    # One row per signal, plus a single "features" row when at least one feature
    # is selected. The parentheses matter: without them the conditional applies
    # to the whole sum, and selecting only signals would yield zero rows.
    row_titles = list(signals) + (["features"] if len(features) else [])
    if not row_titles:
        # Nothing selected: there is nothing to draw, and the spacing
        # computation below would divide by zero.
        print("Select at least one signal or feature to visualize.")
        return None

    fig = make_subplots(
        rows=len(row_titles),
        cols=1,
        shared_xaxes=True,
        vertical_spacing=0.1 / len(row_titles),
        row_titles=row_titles,
    )
    fig.update_layout(height=300 * len(row_titles))

    # first, visualize the "raw" signals
    row_idx = 1
    for sig in signals:
        # skip the first 10 samples and downsample to 1-second means
        df_sig = df_dict[sig][10:].resample("1s").mean()
        for col in set(df_sig.columns).difference(["index", "timestamp"]):
            fig.add_trace(
                go.Scattergl(x=df_sig.index, y=df_sig[col], name=col, hoverinfo="skip"),
                row=row_idx,
                col=1,
            )
        row_idx += 1

    # then visualize the features; after the loop above `row_idx` points at the
    # last row, i.e. the shared "features" row
    for feature in features:
        df_ff = df_feat[feature].dropna()
        fig.add_trace(
            go.Scattergl(
                connectgaps=True,
                x=df_ff.index,
                y=df_ff,
                name=feature,
                hoverinfo="skip",
                mode="markers",
                showlegend=True,
            ),
            row=row_idx,
            col=1,
        )

    return fig.show()

interactive(children=(SelectMultiple(description='features', options=('EDA__amax__w=1m30s', 'EDA__amin__w=1m30…