# Feature extraction with tsfresh transformer

In this tutorial, we show how to use sktime together with [tsfresh](https://tsfresh.readthedocs.io) to first extract features from time series, so that any scikit-learn estimator can then be trained on the extracted features.

## Preliminaries
You have to install tsfresh if you haven't already. To install it, uncomment the cell below:

In [1]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# %pip install --upgrade tsfresh

In [2]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

from sktime.datasets import load_arrow_head, load_basic_motions
from sktime.transformations.panel.tsfresh import TSFreshFeatureExtractor

## Univariate time series classification data

For more details on the data set, see the [univariate time series classification notebook](https://github.com/alan-turing-institute/sktime/blob/main/examples/02_classification_univariate.ipynb).

In [3]:
# Load the ArrowHead data set as (X, y) and hold out a test split.
# A fixed random_state keeps the split -- and every result derived from it
# further down -- identical across "Restart & Run All" executions.
X, y = load_arrow_head(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(158, 1) (158,) (53, 1) (53,)


In [4]:
# Peek at the nested data format: one row per series, with the whole series
# stored inside the single dim_0 cell.
X_train.head()

Unnamed: 0,dim_0
37,0 -2.022016 1 -2.016564 2 -2.00742...
26,0 -2.255120 1 -2.233663 2 -2.22919...
7,0 -1.633596 1 -1.643175 2 -1.61367...
0,0 -1.963009 1 -1.957825 2 -1.95614...
42,0 -1.992052 1 -2.014400 2 -1.96105...


In [5]:
# Multiclass classification task: the labels take three distinct values.
np.unique(y_train)

array(['0', '1', '2'], dtype='<U1')

## Using tsfresh to extract features

In [6]:
# Extract tsfresh features from every training series. "efficient" names one
# of tsfresh's predefined feature-calculator settings (see the tsfresh docs for
# what it includes). The result is a plain tabular frame: one row per series
# and one column per feature, named dim_0__<feature>.
t = TSFreshFeatureExtractor(default_fc_parameters="efficient", show_warnings=False)
Xt = t.fit_transform(X_train)
Xt.head()

  warn(
Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 158/158 [00:16<00:00,  9.86it/s]


Unnamed: 0,dim_0__variance_larger_than_standard_deviation,dim_0__has_duplicate_max,dim_0__has_duplicate_min,dim_0__has_duplicate,dim_0__sum_values,dim_0__abs_energy,dim_0__mean_abs_change,dim_0__mean_change,dim_0__mean_second_derivative_central,dim_0__median,...,dim_0__permutation_entropy__dimension_6__tau_1,dim_0__permutation_entropy__dimension_7__tau_1,dim_0__query_similarity_count__query_None__threshold_0.0,"dim_0__matrix_profile__feature_""min""__threshold_0.98","dim_0__matrix_profile__feature_""max""__threshold_0.98","dim_0__matrix_profile__feature_""mean""__threshold_0.98","dim_0__matrix_profile__feature_""median""__threshold_0.98","dim_0__matrix_profile__feature_""25""__threshold_0.98","dim_0__matrix_profile__feature_""75""__threshold_0.98",dim_0__mean_n_absolute_max__number_of_maxima_7
0,0.0,0.0,0.0,1.0,-2.252e-07,250.000002,0.066732,6e-06,-5.2e-05,-0.054243,...,2.446627,2.761424,0.0,2.1929,10.928014,7.167912,9.183038,2.986499,9.919575,2.00619
1,0.0,0.0,0.0,1.0,3.075e-07,250.000001,0.070874,6.2e-05,-8e-06,-0.121147,...,2.752452,3.133561,0.0,1.218768,7.021585,2.900469,2.774175,2.043043,3.569393,2.238317
2,0.0,0.0,0.0,1.0,-2.424e-07,250.0,0.050352,-2.7e-05,2.5e-05,0.079668,...,2.377966,2.621832,0.0,1.978227,11.703229,5.454042,5.137846,2.705232,7.90301,1.634829
3,0.0,0.0,0.0,1.0,2.098e-07,250.000001,0.054993,0.000215,1.9e-05,-0.150448,...,2.509489,2.810846,0.0,1.9946,11.369093,5.456374,6.182347,3.315364,6.78629,1.936245
4,0.0,0.0,0.0,1.0,1.37e-07,250.0,0.057174,9.3e-05,-4.8e-05,-0.185412,...,2.341616,2.5723,0.0,1.806383,11.244796,6.309328,7.787622,3.151034,8.403381,1.972535


## Using tsfresh with sktime

In [7]:
# Chain feature extraction and a random forest into a single estimator, so
# the tsfresh features are computed inside fit() and again inside score().
classifier = make_pipeline(
    TSFreshFeatureExtractor(default_fc_parameters="efficient", show_warnings=False),
    # Seed the forest so that, for a given train/test split, the fitted model
    # and its score do not change from run to run.
    RandomForestClassifier(random_state=42),
)
classifier.fit(X_train, y_train)
# Mean accuracy on the held-out test series.
classifier.score(X_test, y_test)

  warn(
Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 158/158 [00:18<00:00,  8.72it/s]
  warn(
Feature Extraction: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 53/53 [00:05<00:00,  8.85it/s]


0.8113207547169812

## Multivariate time series classification data

In [8]:
# Load the BasicMotions data set as (X, y) and hold out a test split.
# A fixed random_state makes the split reproducible, so the previews and
# extracted features shown below refer to the same series on every run.
X, y = load_basic_motions(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(60, 6) (60,) (20, 6) (20,)


In [9]:
# Multivariate input: six columns (dim_0 ... dim_5), each cell holding one series.
X_train.head()

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5
10,0 0.300413 1 0.300413 2 -1.96499...,0 0.727580 1 0.727580 2 -0.30055...,0 0.878731 1 0.878731 2 -1.226914 3...,0 -0.082565 1 -0.082565 2 -0.631219 3...,0 -0.055931 1 -0.055931 2 0.039951 3...,0 0.668507 1 0.668507 2 0.130505 3...
39,0 1.211973 1 1.211973 2 -0.605948 3...,0 -0.247107 1 -0.247107 2 -3.855673 3...,0 0.327837 1 0.327837 2 7.113185 3...,0 0.058594 1 0.058594 2 0.900220 3...,0 -0.527348 1 -0.527348 2 -1.326360 3...,0 -0.042614 1 -0.042614 2 -0.095881 3...
36,0 -1.801504 1 -1.801504 2 -0.480725 3...,0 2.344990 1 2.344990 2 -0.994385 3...,0 0.281253 1 0.281253 2 0.378807 3...,0 0.716447 1 0.716447 2 -0.870923 3...,0 0.162466 1 0.162466 2 0.095881 3...,0 0.921527 1 0.921527 2 -0.474080 3...
19,0 -0.218596 1 -0.218596 2 0.388206 3...,0 0.191299 1 0.191299 2 0.22179...,0 -0.228349 1 -0.228349 2 0.779480 3...,0 0.042614 1 0.042614 2 -0.745744 3...,0 0.055931 1 0.055931 2 0.482070 3...,0 -0.058594 1 -0.058594 2 0.356892 3...
27,0 -0.255266 1 -0.255266 2 -0.792226 3...,0 -0.154748 1 -0.154748 2 -1.176848 3...,0 -0.273293 1 -0.273293 2 -0.709993 3...,0 -0.050604 1 -0.050604 2 -0.237040 3...,0 0.015980 1 0.015980 2 -0.314278 3...,0 0.013317 1 0.013317 2 0.170456 3...


In [10]:
# Same extractor, now on multivariate data: features are computed separately
# for each dimension, giving columns dim_0__<feature> through dim_5__<feature>
# in a single tabular frame.
t = TSFreshFeatureExtractor(default_fc_parameters="efficient", show_warnings=False)
Xt = t.fit_transform(X_train)
Xt.head()

  warn(
Feature Extraction: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 360/360 [00:28<00:00, 12.64it/s]


Unnamed: 0,dim_0__variance_larger_than_standard_deviation,dim_0__has_duplicate_max,dim_0__has_duplicate_min,dim_0__has_duplicate,dim_0__sum_values,dim_0__abs_energy,dim_0__mean_abs_change,dim_0__mean_change,dim_0__mean_second_derivative_central,dim_0__median,...,dim_5__permutation_entropy__dimension_6__tau_1,dim_5__permutation_entropy__dimension_7__tau_1,dim_5__query_similarity_count__query_None__threshold_0.0,"dim_5__matrix_profile__feature_""min""__threshold_0.98","dim_5__matrix_profile__feature_""max""__threshold_0.98","dim_5__matrix_profile__feature_""mean""__threshold_0.98","dim_5__matrix_profile__feature_""median""__threshold_0.98","dim_5__matrix_profile__feature_""25""__threshold_0.98","dim_5__matrix_profile__feature_""75""__threshold_0.98",dim_5__mean_n_absolute_max__number_of_maxima_7
0,1.0,0.0,1.0,1.0,419.211878,15733.291175,7.433115,-0.141187,-0.03583,10.172421,...,3.638942,4.031663,0.0,0.722229,1.914229,1.111113,1.118647,0.931036,1.20831,9.367462
1,1.0,0.0,0.0,1.0,589.535839,8508.951625,4.071747,0.019771,-0.004861,4.086704,...,4.181624,4.395817,0.0,0.730343,2.773211,1.618466,1.653693,1.193309,2.118037,12.389248
2,1.0,0.0,0.0,1.0,412.62596,5716.535296,3.523823,0.022801,0.0,2.097393,...,4.101222,4.395817,0.0,0.736316,2.514378,1.432725,1.437321,1.159282,1.602725,9.46829
3,1.0,0.0,0.0,1.0,550.888556,13352.996557,6.155899,0.068802,0.0,8.23406,...,3.475322,3.868891,0.0,0.803139,2.619395,1.360248,1.281866,1.046278,1.642261,9.872361
4,1.0,1.0,0.0,1.0,54.248883,176.508713,0.856079,-0.00926,-0.000214,0.487162,...,3.124753,3.583662,0.0,0.94769,2.294251,1.356783,1.251071,1.169921,1.443343,2.117001


## Using tsfresh for forecasting
You can also use tsfresh to do univariate forecasting. To find out more about forecasting, check out our forecasting tutorial notebook.

In [11]:
from sklearn.ensemble import RandomForestRegressor

from sktime.datasets import load_airline
from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.compose import make_reduction
from sktime.forecasting.model_selection import temporal_train_test_split

# Load the airline series and split it into a training and a test part with
# sktime's temporal splitter.
y = load_airline()
y_train, y_test = temporal_train_test_split(y)

# Regressor used inside the reduction: tsfresh features followed by a random
# forest. The forest is seeded so the forecast is reproducible; the tsfresh
# progress bar is switched off (presumably to keep the cell output quiet).
regressor = make_pipeline(
    TSFreshFeatureExtractor(show_warnings=False, disable_progressbar=True),
    RandomForestRegressor(random_state=42),
)
# Wrap the regressor as a forecaster via reduction to time series regression,
# using a window length of 12 observations (see the make_reduction docs for
# how the windows are built).
forecaster = make_reduction(
    regressor, scitype="time-series-regressor", window_length=12
)
forecaster.fit(y_train)

# Absolute forecasting horizon built from the index of the test series, so
# predictions are requested for exactly those time points.
fh = ForecastingHorizon(y_test.index, is_relative=False, freq=y.index.freqstr)
y_pred = forecaster.predict(fh)