# Feature extraction with tsfresh transformer

In this tutorial, we show how to use sktime with [tsfresh](https://tsfresh.readthedocs.io) to extract features from time series, so that any scikit-learn estimator can then be applied to the extracted features.

## Preliminaries
You need to install tsfresh if you haven't already. To install it, uncomment and run the cell below:

In [1]:
# %pip install --upgrade tsfresh

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sktime.datasets import load_basic_motions
from sktime.datasets import load_arrow_head
from sktime.transformations.panel.summarize import \
    TSFreshFeatureExtractor

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


## Univariate time series classification data

For more details on the data set, see the [univariate time series classification notebook](https://github.com/alan-turing-institute/sktime/blob/main/examples/02_classification_univariate.ipynb).

In [3]:
# Load the ArrowHead data set and hold out a test set.
# The split is seeded so that re-running the notebook from a fresh kernel
# yields the same train/test partition.
X, y = load_arrow_head(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(158, 1) (158,) (53, 1) (53,)


In [4]:
# Peek at the first training rows: a single column (dim_0), one series per row
X_train.head()

Unnamed: 0,dim_0
105,0 -1.6758 1 -1.6742 2 -1.6674 3 ...
120,0 -2.0683 1 -2.0494 2 -1.9998 3 ...
42,0 -1.9921 1 -2.0144 2 -1.9611 3 ...
49,0 -1.7647 1 -1.7483 2 -1.6966 3 ...
63,0 -2.0341 1 -2.0299 2 -1.9880 3 ...


In [5]:
# multiclass classification task: list the distinct class labels
np.unique(y_train)

array(['0', '1', '2'], dtype=object)

## Using tsfresh to extract features

In [6]:
# Turn every training series into one row of tsfresh features, using the
# "efficient" feature set, then preview the resulting table.
t = TSFreshFeatureExtractor(
    default_fc_parameters="efficient",
    show_warnings=False,
)
Xt = t.fit_transform(X_train)
Xt.head()

  warn("Found non-unique index, replaced with unique index.")
Feature Extraction: 100%|██████████| 5/5 [00:11<00:00,  2.20s/it]


Unnamed: 0,dim_0__variance_larger_than_standard_deviation,dim_0__has_duplicate_max,dim_0__has_duplicate_min,dim_0__has_duplicate,dim_0__sum_values,dim_0__abs_energy,dim_0__mean_abs_change,dim_0__mean_change,dim_0__mean_second_derivative_central,dim_0__median,...,dim_0__fourier_entropy__bins_2,dim_0__fourier_entropy__bins_3,dim_0__fourier_entropy__bins_5,dim_0__fourier_entropy__bins_10,dim_0__fourier_entropy__bins_100,dim_0__permutation_entropy__dimension_3__tau_1,dim_0__permutation_entropy__dimension_4__tau_1,dim_0__permutation_entropy__dimension_5__tau_1,dim_0__permutation_entropy__dimension_6__tau_1,dim_0__permutation_entropy__dimension_7__tau_1
0,0.0,0.0,0.0,1.0,-7.7e-05,250.000532,0.313268,0.005533,-1.395582e-05,0.010407,...,0.08151,0.092513,0.138673,0.138673,0.926719,1.454743,2.223152,2.831917,3.346462,3.758022
1,0.0,0.0,0.0,1.0,-2e-06,249.99916,0.320786,0.006955,-0.000160241,0.34959,...,0.046288,0.092513,0.204643,0.204643,1.276476,1.512406,2.39586,3.181019,3.804647,4.229748
2,0.0,0.0,0.0,1.0,-0.000408,249.999669,0.368617,0.004858,-1.39759e-05,-0.18541,...,0.08151,0.08151,0.138673,0.184769,1.268258,1.52631,2.327442,3.066192,3.616213,3.993559
3,0.0,0.0,0.0,0.0,8e-05,249.99932,0.346602,0.00466,-3.176707e-05,-0.065735,...,0.08151,0.08151,0.08151,0.127671,1.107978,1.575622,2.462761,3.236553,3.85944,4.323118
4,0.0,0.0,0.0,1.0,-0.000536,249.99913,0.356139,0.004699,2.008032e-07,-0.15709,...,0.08151,0.08151,0.127671,0.208796,1.329414,1.478852,2.255589,2.920289,3.503391,3.960527


## Using tsfresh with sktime

In [7]:
# Chain the tsfresh feature extractor with a scikit-learn classifier so that
# the pair behaves like a single estimator working directly on the series.
# The forest is seeded so that the reported score is reproducible.
classifier = make_pipeline(
    TSFreshFeatureExtractor(default_fc_parameters="efficient", show_warnings=False),
    RandomForestClassifier(random_state=42),
)
classifier.fit(X_train, y_train)
# Accuracy on the held-out test set
classifier.score(X_test, y_test)

  warn("Found non-unique index, replaced with unique index.")
Feature Extraction: 100%|██████████| 5/5 [00:12<00:00,  2.43s/it]
  warn("Found non-unique index, replaced with unique index.")
Feature Extraction: 100%|██████████| 5/5 [00:03<00:00,  1.32it/s]


0.8113207547169812

## Multivariate time series classification data

In [8]:
# Load the BasicMotions data set and hold out a test set.
# The split is seeded so that re-running the notebook from a fresh kernel
# yields the same train/test partition.
X, y = load_basic_motions(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(60, 6) (60,) (20, 6) (20,)


In [9]:
# multivariate input data: one column per dimension (dim_0 to dim_5)
X_train.head()

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5
7,0 -0.352746 1 -0.352746 2 -1.354561 3...,0 0.316845 1 0.316845 2 0.490525 3...,0 -0.473779 1 -0.473779 2 1.454261 3...,0 -0.327595 1 -0.327595 2 -0.269001 3...,0 0.106535 1 0.106535 2 0.021307 3...,0 0.197090 1 0.197090 2 0.460763 3...
21,0 -0.171905 1 -0.171905 2 -0.397472 3...,0 0.206276 1 0.206276 2 -3.217950 3...,0 -0.308410 1 -0.308410 2 -0.035401 3...,0 -0.189099 1 -0.189099 2 0.857606 3...,0 0.079901 1 0.079901 2 0.135832 3...,0 0.055931 1 0.055931 2 0.391516 3...
17,0 0.324449 1 0.324449 2 9.29442...,0 -0.977516 1 -0.977516 2 -6.96322...,0 -1.260218 1 -1.260218 2 -2.498493 3...,0 -0.788358 1 -0.788358 2 2.434323 3...,0 0.316941 1 0.316941 2 -0.079901 3...,0 0.588605 1 0.588605 2 6.535916 3...
18,0 0.951708 1 0.951708 2 6.22747...,0 -1.304853 1 -1.304853 2 -1.22245...,0 -0.944935 1 -0.944935 2 0.682350 3...,0 -0.386189 1 -0.386189 2 -0.346238 3...,0 0.308951 1 0.308951 2 0.298298 3...,0 0.098545 1 0.098545 2 -1.408924 3...
16,0 1.370472 1 1.370472 2 8.98811...,0 -1.054298 1 -1.054298 2 7.71701...,0 -0.451409 1 -0.451409 2 -6.073897 3...,0 -0.306288 1 -0.306288 2 0.458100 3...,0 -0.423476 1 -0.423476 2 0.761725 3...,0 0.292971 1 0.292971 2 2.159995 3...


In [10]:
# Same extraction as in the univariate case, now applied to the multivariate
# training data; the resulting feature names are prefixed with the dimension.
t = TSFreshFeatureExtractor(
    default_fc_parameters="efficient",
    show_warnings=False,
)
Xt = t.fit_transform(X_train)
Xt.head()

  warn("Found non-unique index, replaced with unique index.")
Feature Extraction: 100%|██████████| 5/5 [00:24<00:00,  4.82s/it]


Unnamed: 0,dim_0__variance_larger_than_standard_deviation,dim_0__has_duplicate_max,dim_0__has_duplicate_min,dim_0__has_duplicate,dim_0__sum_values,dim_0__abs_energy,dim_0__mean_abs_change,dim_0__mean_change,dim_0__mean_second_derivative_central,dim_0__median,...,dim_5__fourier_entropy__bins_2,dim_5__fourier_entropy__bins_3,dim_5__fourier_entropy__bins_5,dim_5__fourier_entropy__bins_10,dim_5__fourier_entropy__bins_100,dim_5__permutation_entropy__dimension_3__tau_1,dim_5__permutation_entropy__dimension_4__tau_1,dim_5__permutation_entropy__dimension_5__tau_1,dim_5__permutation_entropy__dimension_6__tau_1,dim_5__permutation_entropy__dimension_7__tau_1
0,0.0,0.0,0.0,1.0,-17.42876,7.940863,0.177152,0.002326,-0.000244,-0.152038,...,0.223718,0.26116,0.26116,0.424177,1.889808,1.556425,2.42499,3.29674,3.888758,4.230903
1,1.0,0.0,0.0,1.0,158.749791,535.495127,1.369504,0.035368,-0.004945,1.331803,...,0.096509,0.26116,0.26116,0.26116,1.34819,1.487768,2.33372,3.087472,3.753782,4.171911
2,1.0,0.0,0.0,1.0,505.902373,13876.020277,7.436936,-0.174782,-0.087916,9.463268,...,0.096509,0.192626,0.192626,0.288342,0.61267,1.533172,2.40487,3.130376,3.719884,4.120886
3,1.0,0.0,0.0,1.0,292.068012,11792.713884,8.246383,-0.139636,0.018494,6.285126,...,0.096509,0.096509,0.26116,0.26116,0.985953,1.623656,2.64476,3.475038,4.10673,4.395817
4,1.0,0.0,0.0,1.0,525.281957,16841.431717,9.634983,0.122669,0.091556,10.755665,...,0.096509,0.096509,0.26116,0.26116,1.629072,1.569105,2.571916,3.406333,4.023954,4.346007


## Using tsfresh for forecasting
You can also use tsfresh for univariate forecasting, by reducing the forecasting task to a regression problem on sliding windows of the series. To find out more about forecasting, check out our forecasting tutorial notebook.

In [7]:
from sktime.datasets import load_airline
from sktime.forecasting.model_selection import temporal_train_test_split
from sklearn.ensemble import RandomForestRegressor
from sktime.forecasting.compose import ReducedTimeSeriesRegressionForecaster
from sktime.forecasting.base import ForecastingHorizon

# Load the airline series and split it in temporal order
y = load_airline()
y_train, y_test = temporal_train_test_split(y)

# Regressor working on tsfresh features; the forest is seeded so that the
# forecast is reproducible across runs.
regressor = make_pipeline(
    TSFreshFeatureExtractor(show_warnings=False, disable_progressbar=True),
    RandomForestRegressor(random_state=42),
)
# Wrap the regressor into a forecaster using a window length of 12
forecaster = ReducedTimeSeriesRegressionForecaster(regressor, window_length=12)
forecaster.fit(y_train)

# Forecast the time points of the test series (absolute horizon)
fh = ForecastingHorizon(y_test.index, is_relative=False)
y_pred = forecaster.predict(fh)
# Show the first predictions so the cell has a visible result
y_pred.head()