# Feature extraction with tsfresh transformer

In this tutorial, we show how to use sktime together with [tsfresh](https://tsfresh.readthedocs.io) to first extract features from time series, so that any scikit-learn estimator can then be applied to the extracted (tabular) features.

## Preliminaries
tsfresh must be installed to run this notebook. If you have not installed it yet, uncomment the command in the cell below and run it:

In [1]:
# Optional setup step: uncomment the next line to install or upgrade tsfresh.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# %pip install --upgrade tsfresh

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sktime.datasets import load_basic_motions
from sktime.datasets import load_arrow_head
from sktime.transformers.series_as_features.summarize import \
    TSFreshFeatureExtractor
from sktime.forecasting.base import ForecastingHorizon
from sklearn.ensemble import RandomForestRegressor
from sktime.forecasting.compose import ReducedTimeSeriesRegressionForecaster
from sklearn.pipeline import make_pipeline
from sktime.datasets import load_airline
from sktime.forecasting.model_selection import temporal_train_test_split

## Univariate time series classification data

For more details on the data set, see the [univariate time series classification notebook](https://github.com/alan-turing-institute/sktime/blob/master/examples/02_classification_univariate.ipynb).

In [3]:
# Load the univariate ArrowHead classification problem as (X, y).
X, y = load_arrow_head(return_X_y=True)

# Split with scikit-learn's default train/test proportions. The seed is fixed
# so that the split -- and every result derived from it further down -- is the
# same on each "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(158, 1) (158,) (53, 1) (53,)


In [4]:
# Peek at the first rows of the training panel; the single column `dim_0`
# stores one whole time series per cell.
X_train.head(n=5)

Unnamed: 0,dim_0
124,0 -1.8963 1 -1.8777 2 -1.8509 3 ...
139,0 -1.8493 1 -1.8615 2 -1.8273 3 ...
69,0 -1.7998 1 -1.7987 2 -1.7942 3 ...
82,0 -1.9692 1 -1.9847 2 -1.9578 3 ...
134,0 -1.8955 1 -1.8728 2 -1.8633 3 ...


In [5]:
# multiclass (not binary) classification task: list the distinct class labels
# that occur in the training set
np.unique(y_train)

array(['0', '1', '2'], dtype=object)

## Using tsfresh to extract features

In [6]:
# Turn every series of the training panel into one row of tsfresh features.
# "efficient" presumably selects tsfresh's predefined "efficient" feature set
# -- see the tsfresh settings documentation to confirm what it contains.
t = TSFreshFeatureExtractor(
    default_fc_parameters="efficient",
    show_warnings=False,
)
Xt = t.fit_transform(X_train)
Xt.head()

  warn("Found non-unique index, replaced with unique index.")
Feature Extraction: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:11<00:00,  2.31s/it]


Unnamed: 0,dim_0__variance_larger_than_standard_deviation,dim_0__has_duplicate_max,dim_0__has_duplicate_min,dim_0__has_duplicate,dim_0__sum_values,dim_0__abs_energy,dim_0__mean_abs_change,dim_0__mean_change,dim_0__mean_second_derivative_central,dim_0__median,...,dim_0__fourier_entropy__bins_2,dim_0__fourier_entropy__bins_3,dim_0__fourier_entropy__bins_5,dim_0__fourier_entropy__bins_10,dim_0__fourier_entropy__bins_100,dim_0__permutation_entropy__dimension_3__tau_1,dim_0__permutation_entropy__dimension_4__tau_1,dim_0__permutation_entropy__dimension_5__tau_1,dim_0__permutation_entropy__dimension_6__tau_1,dim_0__permutation_entropy__dimension_7__tau_1
0,0.0,1.0,0.0,1.0,-0.000425,249.999671,0.326655,0.005714,-8.2e-05,0.029368,...,0.08151,0.08151,0.162765,0.254761,1.347846,1.495494,2.274216,2.85807,3.303602,3.6891
1,0.0,0.0,0.0,1.0,0.000348,249.999793,0.335123,0.005528,-1.1e-05,0.10295,...,0.08151,0.08151,0.193641,0.300661,1.43115,1.492702,2.277894,2.856353,3.317181,3.708545
2,0.0,0.0,0.0,1.0,-8e-05,249.998516,0.334229,0.004226,-0.0002,-0.024066,...,0.08151,0.08151,0.127671,0.138673,1.175797,1.574929,2.472788,3.211234,3.750249,4.129999
3,0.0,1.0,0.0,1.0,0.000109,250.000212,0.349702,0.005176,-1.1e-05,0.004516,...,0.08151,0.092513,0.138673,0.184769,1.122911,1.515526,2.345801,3.039532,3.602549,4.011818
4,0.0,0.0,0.0,1.0,4e-06,250.000421,0.30897,0.005309,-0.000186,0.081687,...,0.08151,0.08151,0.173767,0.254761,1.205061,1.503986,2.315625,2.967582,3.471518,3.889273


## Using tsfresh with sktime

In [7]:
# Chain feature extraction and a tabular classifier into a single estimator:
# the pipeline first converts each series into tsfresh features and then fits
# the random forest on the resulting feature table.
classifier = make_pipeline(
    TSFreshFeatureExtractor(default_fc_parameters="efficient", show_warnings=False),
    # Seeded so that, for a given train/test split, the score is reproducible.
    RandomForestClassifier(random_state=42),
)
classifier.fit(X_train, y_train)

# Score the fitted pipeline on the held-out series (the cell's displayed value).
classifier.score(X_test, y_test)

  warn("Found non-unique index, replaced with unique index.")
Feature Extraction: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:11<00:00,  2.33s/it]
  warn("Found non-unique index, replaced with unique index.")
Feature Extraction: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00,  1.02it/s]


0.8113207547169812

## Multivariate time series classification data

In [8]:
# Load the multivariate BasicMotions classification problem as (X, y).
# NOTE: this rebinds X, y and the train/test variables used for ArrowHead above.
X, y = load_basic_motions(return_X_y=True)

# Fixed seed: the same series land in the training and test sets on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(60, 6) (60,) (20, 6) (20,)


In [9]:
# Multivariate input: one column per dimension (dim_0 ... dim_5), each cell
# storing a whole series for that dimension.
X_train.head(n=5)

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5
37,0 -0.046089 1 -0.283051 2 -0.587748 3...,0 -0.738026 1 -0.314572 2 3.388108 3...,0 0.179667 1 -0.724257 2 -0.223563 3...,0 0.364882 1 -1.163894 2 -2.543521 3...,0 -0.237040 1 -0.101208 2 0.402169 3...,0 0.386189 1 -0.165129 2 -0.897557 3...
1,0 -0.247409 1 -0.247409 2 -0.771290 3...,0 -0.060459 1 -0.060459 2 -0.047618 3...,0 -0.608565 1 -0.608565 2 -0.294411 3...,0 -0.023970 1 -0.023970 2 -0.269001 3...,0 0.101208 1 0.101208 2 0.111862 3...,0 0.071911 1 0.071911 2 0.135832 3...
17,0 3.789469 1 3.789469 2 1.78594...,0 -1.353556 1 -1.353556 2 -10.69460...,0 -0.685072 1 -0.685072 2 -4.465480 3...,0 -0.021307 1 -0.021307 2 2.753927 3...,0 -0.159802 1 -0.159802 2 -0.820319 3...,0 0.133169 1 0.133169 2 2.974987 3...
7,0 -0.352746 1 -0.352746 2 -1.354561 3...,0 0.316845 1 0.316845 2 0.490525 3...,0 -0.473779 1 -0.473779 2 1.454261 3...,0 -0.327595 1 -0.327595 2 -0.269001 3...,0 0.106535 1 0.106535 2 0.021307 3...,0 0.197090 1 0.197090 2 0.460763 3...
39,0 1.211973 1 1.211973 2 -0.605948 3...,0 -0.247107 1 -0.247107 2 -3.855673 3...,0 0.327837 1 0.327837 2 7.113185 3...,0 0.058594 1 0.058594 2 0.900220 3...,0 -0.527348 1 -0.527348 2 -1.326360 3...,0 -0.042614 1 -0.042614 2 -0.095881 3...


In [10]:
# Apply the extractor to the multivariate panel. The resulting feature columns
# are prefixed with the name of the dimension they were computed from
# (e.g. dim_0__..., dim_5__...).
t = TSFreshFeatureExtractor(
    default_fc_parameters="efficient",
    show_warnings=False,
)
Xt = t.fit_transform(X_train)
Xt.head()

  warn("Found non-unique index, replaced with unique index.")
Feature Extraction: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:18<00:00,  3.79s/it]


Unnamed: 0,dim_0__variance_larger_than_standard_deviation,dim_0__has_duplicate_max,dim_0__has_duplicate_min,dim_0__has_duplicate,dim_0__sum_values,dim_0__abs_energy,dim_0__mean_abs_change,dim_0__mean_change,dim_0__mean_second_derivative_central,dim_0__median,...,dim_5__fourier_entropy__bins_2,dim_5__fourier_entropy__bins_3,dim_5__fourier_entropy__bins_5,dim_5__fourier_entropy__bins_10,dim_5__fourier_entropy__bins_100,dim_5__permutation_entropy__dimension_3__tau_1,dim_5__permutation_entropy__dimension_4__tau_1,dim_5__permutation_entropy__dimension_5__tau_1,dim_5__permutation_entropy__dimension_6__tau_1,dim_5__permutation_entropy__dimension_7__tau_1
0,1.0,0.0,0.0,1.0,362.222132,4714.701692,3.254709,0.077807,0.046355,1.367126,...,0.223718,0.493681,0.840916,1.339775,2.946071,1.683833,2.814142,3.772351,4.32948,4.513799
1,0.0,0.0,0.0,1.0,-30.777978,10.812641,0.115899,-0.001302,0.000166,-0.312134,...,0.165443,0.192626,0.288342,0.288342,1.152688,1.616751,2.59862,3.407283,4.035698,4.360755
2,1.0,0.0,0.0,1.0,456.363177,14668.442452,8.609941,-0.103845,0.003627,10.290202,...,0.165443,0.192626,0.192626,0.356468,1.923853,1.538814,2.523494,3.444948,4.027225,4.375502
3,0.0,0.0,0.0,1.0,-17.42876,7.940863,0.177152,0.002326,-0.000244,-0.152038,...,0.223718,0.26116,0.26116,0.424177,1.889808,1.556425,2.42499,3.29674,3.888758,4.230903
4,1.0,0.0,0.0,1.0,589.535839,8508.951625,4.636855,0.019771,-0.004861,4.086704,...,0.096509,0.26116,0.413917,0.836011,2.632203,1.736243,2.921233,3.781926,4.323973,4.499051


## Univariate time series regression data

In [11]:
# Forecasting by reduction to regression on the airline series.
# NOTE: this rebinds y, y_train and y_test from the classification sections above.
y = load_airline()
y_train, y_test = temporal_train_test_split(y)

# Tabular regressor used inside the reduction: tsfresh features followed by a
# random forest. The forest is seeded so that forecasts are reproducible.
regressor = make_pipeline(
    TSFreshFeatureExtractor(show_warnings=False, disable_progressbar=True),
    RandomForestRegressor(random_state=42),
)

# window_length=12 is presumably the number of past observations handed to the
# regressor per training example -- TODO confirm against the sktime docs.
forecaster = ReducedTimeSeriesRegressionForecaster(regressor, window_length=12)
forecaster.fit(y_train)

# Absolute horizon: predict exactly at the index values of the test series.
fh = ForecastingHorizon(y_test.index, is_relative=False)
y_pred = forecaster.predict(fh)