# Feature extraction with tsfresh transformer

In this tutorial, we show how to use sktime together with [tsfresh](https://tsfresh.readthedocs.io) to first extract features from time series, so that any scikit-learn estimator can then be applied to the extracted features.

## Preliminaries
This tutorial requires tsfresh. If you have not installed it yet, uncomment and run the cell below:

In [1]:
# Uncomment the next line to install/upgrade tsfresh. `%pip` (rather than
# `!pip`) installs into the environment of the running kernel.
# %pip install --upgrade tsfresh

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sktime.datasets import load_basic_motions
from sktime.datasets import load_arrow_head
from sktime.transformers.series_as_features.summarize import \
    TSFreshFeatureExtractor

## Univariate time series classification data

For more details on the data set, see the [univariate time series classification notebook](https://github.com/alan-turing-institute/sktime/blob/master/examples/02_classification_univariate.ipynb).

In [3]:
# Load the univariate ArrowHead data and hold out a test set.
X, y = load_arrow_head(return_X_y=True)
# Fix the seed so the split (and every result derived from it) is
# reproducible under Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(158, 1) (158,) (53, 1) (53,)


In [4]:
# Preview the training data: a single column (`dim_0`) in which, as the
# output shows, each entry holds a whole series rather than a scalar.
X_train.head()

Unnamed: 0,dim_0
19,0 -1.8822 1 -1.8806 2 -1.8658 3 ...
65,0 -1.6931 1 -1.7078 2 -1.6863 3 ...
25,0 -1.8541 1 -1.8339 2 -1.8187 3 ...
164,0 -1.8055 1 -1.7985 2 -1.7606 3 ...
15,0 -1.8295 1 -1.8238 2 -1.8101 3 ...


In [5]:
# multiclass classification task: list the distinct class labels
np.unique(y_train)

array(['0', '1', '2'], dtype=object)

## Using tsfresh to extract features

In [6]:
# Turn each time series into a row of tsfresh features (using the
# "efficient" feature set); the resulting columns are named
# `<dimension>__<feature>`.
t = TSFreshFeatureExtractor(default_fc_parameters="efficient", show_warnings=False)
Xt = t.fit_transform(X_train)
Xt.head()

  warn("Found non-unique index, replaced with unique index.")
Feature Extraction: 100%|██████████| 5/5 [00:22<00:00,  4.59s/it]


Unnamed: 0,dim_0__variance_larger_than_standard_deviation,dim_0__has_duplicate_max,dim_0__has_duplicate_min,dim_0__has_duplicate,dim_0__sum_values,dim_0__abs_energy,dim_0__mean_abs_change,dim_0__mean_change,dim_0__mean_second_derivative_central,dim_0__median,...,dim_0__fourier_entropy__bins_2,dim_0__fourier_entropy__bins_3,dim_0__fourier_entropy__bins_5,dim_0__fourier_entropy__bins_10,dim_0__fourier_entropy__bins_100,dim_0__permutation_entropy__dimension_3__tau_1,dim_0__permutation_entropy__dimension_4__tau_1,dim_0__permutation_entropy__dimension_5__tau_1,dim_0__permutation_entropy__dimension_6__tau_1,dim_0__permutation_entropy__dimension_7__tau_1
0,0.0,0.0,0.0,1.0,8.6e-05,249.999359,0.318336,0.006488,-7.7e-05,0.14572,...,0.046288,0.092513,0.092513,0.204643,1.153087,1.540983,2.410511,3.124671,3.642789,3.998297
1,0.0,0.0,0.0,0.0,-0.000316,249.998476,0.365702,0.006306,-0.000164,-0.24211,...,0.08151,0.08151,0.08151,0.173767,1.185667,1.563736,2.395978,3.102661,3.601835,3.965827
2,0.0,0.0,0.0,1.0,-0.000134,250.00026,0.341566,0.005547,-0.000139,0.097725,...,0.08151,0.08151,0.08151,0.173767,1.244553,1.534137,2.371258,3.082351,3.592476,3.996974
3,0.0,0.0,0.0,1.0,0.000612,250.000226,0.306761,0.005337,-0.000103,0.11441,...,0.08151,0.092513,0.173767,0.219798,1.166994,1.451753,2.211597,2.856107,3.349555,3.700527
4,0.0,0.0,0.0,1.0,0.000216,250.000316,0.339828,0.005761,-6.4e-05,0.058163,...,0.08151,0.08151,0.08151,0.173767,1.320115,1.542581,2.393299,3.099352,3.610727,3.985086


## Using tsfresh with sktime

In [7]:
# Chain the tsfresh feature extractor with a scikit-learn classifier so the
# whole thing can be fitted and scored directly on the time series data.
classifier = make_pipeline(
    TSFreshFeatureExtractor(default_fc_parameters="efficient", show_warnings=False),
    # Seed the forest so the reported accuracy does not change between runs.
    RandomForestClassifier(random_state=42),
)
classifier.fit(X_train, y_train)
classifier.score(X_test, y_test)

  warn("Found non-unique index, replaced with unique index.")
Feature Extraction: 100%|██████████| 5/5 [00:13<00:00,  2.62s/it]
  warn("Found non-unique index, replaced with unique index.")
Feature Extraction: 100%|██████████| 5/5 [00:03<00:00,  1.29it/s]


0.8679245283018868

## Multivariate time series classification data

In [8]:
# Load the multivariate BasicMotions data and hold out a test set.
X, y = load_basic_motions(return_X_y=True)
# Fix the seed so the split (and every result derived from it) is
# reproducible under Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(60, 6) (60,) (20, 6) (20,)


In [9]:
# multivariate input data: one column per dimension (`dim_0` ... `dim_5`),
# each entry again holding a whole series
X_train.head()

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5
21,0 0.648833 1 0.648833 2 0.076985 3...,0 -0.996722 1 -0.996722 2 -0.897264 3...,0 -0.644136 1 -0.644136 2 0.970515 3...,0 -0.101208 1 -0.101208 2 -0.407496 3...,0 0.055931 1 0.055931 2 -0.157139 3...,0 -0.031960 1 -0.031960 2 -0.343575 3...
20,0 -0.071819 1 -0.071819 2 -0.360728 3...,0 0.354963 1 0.354963 2 -2.704719 3...,0 0.275074 1 0.275074 2 0.892838 3...,0 -1.033389 1 -1.033389 2 0.066584 3...,0 0.743081 1 0.743081 2 -0.271664 3...,0 -0.825646 1 -0.825646 2 0.122515 3...
37,0 -0.046089 1 -0.283051 2 -0.587748 3...,0 -0.738026 1 -0.314572 2 3.388108 3...,0 0.179667 1 -0.724257 2 -0.223563 3...,0 0.364882 1 -1.163894 2 -2.543521 3...,0 -0.237040 1 -0.101208 2 0.402169 3...,0 0.386189 1 -0.165129 2 -0.897557 3...
13,0 2.580342 1 2.580342 2 -7.26891...,0 -0.850954 1 -0.850954 2 -6.06223...,0 -0.150030 1 -0.150030 2 0.96421...,0 -0.005327 1 -0.005327 2 0.002663 3...,0 0.050604 1 0.050604 2 -0.364882 3...,0 0.311615 1 0.311615 2 -0.772378 3...
18,0 0.951708 1 0.951708 2 6.22747...,0 -1.304853 1 -1.304853 2 -1.22245...,0 -0.944935 1 -0.944935 2 0.682350 3...,0 -0.386189 1 -0.386189 2 -0.346238 3...,0 0.308951 1 0.308951 2 0.298298 3...,0 0.098545 1 0.098545 2 -1.408924 3...


In [10]:
# Same extractor as in the univariate case, now applied to data with several
# dimensions; the output columns carry the originating dimension as a prefix.
t = TSFreshFeatureExtractor(
    default_fc_parameters="efficient",
    show_warnings=False,
)
Xt = t.fit_transform(X_train)
Xt.head()

  warn("Found non-unique index, replaced with unique index.")
Feature Extraction: 100%|██████████| 5/5 [00:21<00:00,  4.21s/it]


Unnamed: 0,dim_0__variance_larger_than_standard_deviation,dim_0__has_duplicate_max,dim_0__has_duplicate_min,dim_0__has_duplicate,dim_0__sum_values,dim_0__abs_energy,dim_0__mean_abs_change,dim_0__mean_change,dim_0__mean_second_derivative_central,dim_0__median,...,dim_5__fourier_entropy__bins_2,dim_5__fourier_entropy__bins_3,dim_5__fourier_entropy__bins_5,dim_5__fourier_entropy__bins_10,dim_5__fourier_entropy__bins_100,dim_5__permutation_entropy__dimension_3__tau_1,dim_5__permutation_entropy__dimension_4__tau_1,dim_5__permutation_entropy__dimension_5__tau_1,dim_5__permutation_entropy__dimension_6__tau_1,dim_5__permutation_entropy__dimension_7__tau_1
0,1.0,1.0,0.0,1.0,57.045746,172.027276,0.807892,0.001584,0.003131,0.4221,...,0.165443,0.165443,0.165443,0.165443,1.241657,1.494736,2.333086,3.047524,3.577109,3.928619
1,1.0,0.0,0.0,1.0,113.745549,356.056167,1.297926,0.004898,-0.002951,1.043021,...,0.096509,0.096509,0.26116,0.288342,1.515164,1.54343,2.424844,3.185694,3.766752,4.154904
2,1.0,0.0,0.0,1.0,362.222132,4714.701692,3.254709,0.077807,0.046355,1.367126,...,0.223718,0.493681,0.840916,1.339775,2.946071,1.683833,2.814142,3.772351,4.32948,4.513799
3,1.0,0.0,0.0,1.0,321.402722,10764.169856,6.780527,0.136657,0.091813,7.22729,...,0.165443,0.165443,0.192626,0.192626,1.339437,1.556814,2.559083,3.37957,3.95365,4.375502
4,1.0,0.0,0.0,1.0,292.068012,11792.713884,8.246383,-0.139636,0.018494,6.285126,...,0.096509,0.096509,0.26116,0.26116,0.985953,1.623656,2.64476,3.475038,4.10673,4.395817
