# Feature extraction with tsfresh transformer

In this tutorial, we show how to use `sktime` together with [`tsfresh`](https://tsfresh.readthedocs.io) to extract features from time series, so that the extracted features can then be passed to any scikit-learn estimator.

## Preliminaries
This notebook requires `tsfresh`. If you have not installed it yet, uncomment the install command in the cell below and run it:

In [1]:
# Optional: uncomment the next line to install or upgrade tsfresh.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# %pip install --upgrade tsfresh

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sktime.datasets import load_basic_motions
from sktime.datasets import load_gunpoint
from sktime.transformers.series_as_features.summarize import \
    TSFreshFeatureExtractor

## Univariate time series classification data

For more details on the data set, see the [univariate time series classification notebook](https://github.com/alan-turing-institute/sktime/blob/master/examples/01_classification_univariate.ipynb).

In [3]:
# Load the GunPoint data as a (features, labels) pair.
X, y = load_gunpoint(return_X_y=True)
# Hold out a test split; the fixed seed keeps the split, and every result
# derived from it further down, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

Unnamed: 0,dim_0
0,0 -0.64789 1 -0.64199 2 -0.63819 3...
1,0 -0.64443 1 -0.64540 2 -0.64706 3...
2,0 -0.77835 1 -0.77828 2 -0.77715 3...
3,0 -0.75006 1 -0.74810 2 -0.74616 3...
4,0 -0.59954 1 -0.59742 2 -0.59927 3...


In [None]:
# Preview the first five training instances; each cell of `dim_0` holds a whole series.
X_train.head(n=5)

In [4]:
# List the distinct class labels in the training split: two labels, i.e. a
# binary classification task.
np.unique(y_train)

array(['1', '2'], dtype='<U1')

## Using tsfresh to extract features

In [5]:
# Fit the tsfresh extractor on the training series and compute the feature
# table. Extracted columns are named `<input column>__<feature name>`.
t = TSFreshFeatureExtractor(default_fc_parameters="efficient", show_warnings=False)
Xt = t.fit_transform(X_train)
Xt.head()

Feature Extraction: 100%|██████████| 5/5 [00:03<00:00,  1.40it/s]


variable,dim_0__abs_energy,dim_0__absolute_sum_of_changes,"dim_0__agg_autocorrelation__f_agg_""mean""__maxlag_40","dim_0__agg_autocorrelation__f_agg_""median""__maxlag_40","dim_0__agg_autocorrelation__f_agg_""var""__maxlag_40","dim_0__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""max""","dim_0__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""mean""","dim_0__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""min""","dim_0__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""var""","dim_0__agg_linear_trend__attr_""intercept""__chunk_len_50__f_agg_""max""",...,dim_0__symmetry_looking__r_0.9500000000000001,dim_0__time_reversal_asymmetry_statistic__lag_1,dim_0__time_reversal_asymmetry_statistic__lag_2,dim_0__time_reversal_asymmetry_statistic__lag_3,dim_0__value_count__value_-1,dim_0__value_count__value_0,dim_0__value_count__value_1,dim_0__variance,dim_0__variance_larger_than_standard_deviation,dim_0__variation_coefficient
0,148.99956,23.28697,0.447178,0.494247,0.090215,-0.947333,-1.108709,-0.877764,-0.028691,0.182893,...,1.0,0.005725,0.022142,0.050016,0.0,0.0,0.0,0.99333,0.0,-1115664.0
1,149.000133,24.087908,0.481521,0.53736,0.090385,-1.203152,-1.207365,-0.802284,-0.113191,-0.546756,...,1.0,0.003163,0.005665,0.008716,0.0,0.0,0.0,0.993334,0.0,2577573.0
2,148.999183,27.02718,0.50356,0.563602,0.07432,-1.067355,-1.159071,-0.894303,-0.086171,-0.031793,...,1.0,0.000531,-0.001739,-0.001876,0.0,0.0,0.0,0.993328,0.0,-688934.4
3,148.999458,26.741264,0.492886,0.556587,0.0812,-1.168309,-1.141256,-0.816465,-0.109551,-0.329178,...,1.0,0.004883,0.008887,0.006236,0.0,0.0,0.0,0.99333,0.0,-790999.4
4,148.999167,24.23109,0.346763,0.35825,0.12687,-1.038005,-0.931154,-0.610837,-0.097404,-0.489313,...,1.0,-0.002373,-0.003888,-0.004423,0.0,0.0,0.0,0.993328,0.0,-695343.0


## Using tsfresh with sktime

In [6]:
# Chain feature extraction and classification into one estimator, so the same
# extraction step runs on the training data in `fit` and on the test data in `score`.
classifier = make_pipeline(
    TSFreshFeatureExtractor(default_fc_parameters="efficient", show_warnings=False),
    # Seed the forest so the reported accuracy is reproducible across runs.
    RandomForestClassifier(random_state=42),
)
classifier.fit(X_train, y_train)
# Accuracy on the held-out test split.
classifier.score(X_test, y_test)

Feature Extraction: 100%|██████████| 5/5 [00:03<00:00,  1.44it/s]
Feature Extraction: 100%|██████████| 5/5 [00:10<00:00,  2.19s/it]


0.96

## Multivariate time series classification data

In [7]:
# Load the BasicMotions data as a (features, labels) pair.
X, y = load_basic_motions(return_X_y=True)
# Hold out a test split; the fixed seed keeps the split, and the outputs shown
# below, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5
0,0 0.079106 1 0.079106 2 -0.903497 3...,0 0.394032 1 0.394032 2 -3.666397 3...,0 0.551444 1 0.551444 2 -0.282844 3...,0 0.351565 1 0.351565 2 -0.095881 3...,0 0.023970 1 0.023970 2 -0.319605 3...,0 0.633883 1 0.633883 2 0.972131 3...
1,0 0.377751 1 0.377751 2 2.952965 3...,0 -0.610850 1 -0.610850 2 0.970717 3...,0 -0.147376 1 -0.147376 2 -5.962515 3...,0 -0.103872 1 -0.103872 2 -7.593275 3...,0 -0.109198 1 -0.109198 2 -0.697804 3...,0 -0.037287 1 -0.037287 2 -2.865789 3...
2,0 -0.813905 1 -0.813905 2 -0.424628 3...,0 0.825666 1 0.825666 2 -1.305033 3...,0 0.032712 1 0.032712 2 0.826170 3...,0 0.021307 1 0.021307 2 -0.372872 3...,0 0.122515 1 0.122515 2 -0.045277 3...,0 0.775041 1 0.775041 2 0.383526 3...
3,0 0.289855 1 0.289855 2 -0.669185 3...,0 0.284130 1 0.284130 2 -0.210466 3...,0 0.213680 1 0.213680 2 0.252267 3...,0 -0.314278 1 -0.314278 2 0.018644 3...,0 0.074574 1 0.074574 2 0.007990 3...,0 -0.079901 1 -0.079901 2 0.237040 3...
4,0 -0.123238 1 -0.123238 2 -0.249547 3...,0 0.379341 1 0.379341 2 0.541501 3...,0 -0.286006 1 -0.286006 2 0.208420 3...,0 -0.098545 1 -0.098545 2 -0.023970 3...,0 0.058594 1 0.058594 2 0.175783 3...,0 -0.074574 1 -0.074574 2 0.114525 3...


In [None]:
# Multivariate input: preview the first five training instances, with one
# column per dimension (`dim_0` ... `dim_5`).
X_train.head(n=5)

In [8]:
# Same extractor configuration as for the univariate data; features are now
# computed for every input column, so the result has `dim_0__...` through
# `dim_5__...` columns.
t = TSFreshFeatureExtractor(
    default_fc_parameters="efficient",
    show_warnings=False,
)
Xt = t.fit_transform(X_train)
# Show the first five rows of the extracted feature table.
Xt.head(n=5)

Feature Extraction: 100%|██████████| 5/5 [00:11<00:00,  2.37s/it]


variable,dim_0__abs_energy,dim_0__absolute_sum_of_changes,"dim_0__agg_autocorrelation__f_agg_""mean""__maxlag_40","dim_0__agg_autocorrelation__f_agg_""median""__maxlag_40","dim_0__agg_autocorrelation__f_agg_""var""__maxlag_40","dim_0__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""max""","dim_0__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""mean""","dim_0__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""min""","dim_0__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""var""","dim_0__agg_linear_trend__attr_""intercept""__chunk_len_50__f_agg_""max""",...,dim_5__symmetry_looking__r_0.9500000000000001,dim_5__time_reversal_asymmetry_statistic__lag_1,dim_5__time_reversal_asymmetry_statistic__lag_2,dim_5__time_reversal_asymmetry_statistic__lag_3,dim_5__value_count__value_-1,dim_5__value_count__value_0,dim_5__value_count__value_1,dim_5__variance,dim_5__variance_larger_than_standard_deviation,dim_5__variation_coefficient
0,10.629914,22.690124,0.039365,0.029099,0.008885,1.021608,0.068493,-0.493076,0.172195,1.6382,...,1.0,0.019919,-0.005089,-0.02841,0.0,0.0,0.0,0.260379,0.0,9.377847
1,48.609672,38.102336,0.013049,-0.018868,0.022497,2.735651,0.069951,-0.537672,0.972,4.310925,...,1.0,0.011348,-0.07149,-0.125565,0.0,1.0,0.0,0.437148,0.0,-6.456339
2,10.309371,24.348966,-0.003053,-0.025593,0.031389,0.455456,-0.080829,-0.742865,0.154155,0.535213,...,1.0,0.001142,0.006904,0.01876,0.0,0.0,0.0,0.249788,0.0,15.676867
3,6.150112,19.595197,0.009984,0.018959,0.011848,0.183837,-0.084124,-0.391862,0.037496,0.289855,...,1.0,0.002119,0.003174,-0.002938,0.0,3.0,0.0,0.061584,0.0,8.401754
4,8.912128,11.658267,-0.014735,-0.007014,0.043948,-0.03043,-0.274553,-0.528818,0.026664,0.126049,...,1.0,-0.000509,-0.000807,-0.000318,0.0,2.0,0.0,0.026815,0.0,12.272156
