# Feature extraction with tsfresh transformer

In this tutorial, we show how to use sktime with [tsfresh](https://tsfresh.readthedocs.io) to first extract tabular features from time series, so that any scikit-learn estimator can then be applied to them.

## Preliminaries
tsfresh must be installed to run this notebook. If you haven't installed it yet, uncomment and run the cell below:

In [1]:
# Uncomment the next line to install or upgrade tsfresh. The %pip magic
# installs into the environment of the running kernel.
# %pip install --upgrade tsfresh

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sktime.datasets import load_basic_motions
from sktime.datasets import load_arrow_head
from sktime.transformers.series_as_features.summarize import \
    TSFreshFeatureExtractor

## Univariate time series classification data

For more details on the data set, see the [univariate time series classification notebook](https://github.com/alan-turing-institute/sktime/blob/master/examples/02_classification_univariate.ipynb).

In [3]:
# Load the arrow-head data as a (features, labels) pair and hold out a test set.
# A fixed random_state makes the train/test split identical on every run, so the
# cells that depend on it are reproducible under "Restart & Run All".
X, y = load_arrow_head(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(158, 1) (158,) (53, 1) (53,)


In [4]:
# Preview the first training rows. Judging by the displayed output, each entry
# of the single column dim_0 holds an entire time series rather than a scalar.
X_train.head()

Unnamed: 0,dim_0
53,0 -2.0146 1 -2.0107 2 -1.9874 3 ...
67,0 -1.8025 1 -1.7615 2 -1.7225 3 ...
167,0 -1.7471 1 -1.7295 2 -1.7300 3 ...
169,0 -1.7767 1 -1.7786 2 -1.7501 3 ...
57,0 -1.8031 1 -1.8010 2 -1.7880 3 ...


In [5]:
# multiclass classification task (not binary): list the distinct class labels
np.unique(y_train)

array(['0', '1', '2'], dtype=object)

## Using tsfresh to extract features

In [6]:
# Build the extractor with tsfresh's "efficient" feature settings, fit it on the
# training series and preview the resulting feature table.
t = TSFreshFeatureExtractor(
    default_fc_parameters="efficient",
    show_warnings=False,
)
Xt = t.fit_transform(X_train)
Xt.head()

  warn("Found non-unique index, replaced with unique index.")
Feature Extraction: 100%|██████████| 5/5 [00:09<00:00,  2.00s/it]


variable,dim_0__abs_energy,dim_0__absolute_sum_of_changes,"dim_0__agg_autocorrelation__f_agg_""mean""__maxlag_40","dim_0__agg_autocorrelation__f_agg_""median""__maxlag_40","dim_0__agg_autocorrelation__f_agg_""var""__maxlag_40","dim_0__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""max""","dim_0__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""mean""","dim_0__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""min""","dim_0__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""var""","dim_0__agg_linear_trend__attr_""intercept""__chunk_len_50__f_agg_""max""",...,dim_0__symmetry_looking__r_0.9500000000000001,dim_0__time_reversal_asymmetry_statistic__lag_1,dim_0__time_reversal_asymmetry_statistic__lag_2,dim_0__time_reversal_asymmetry_statistic__lag_3,dim_0__value_count__value_-1,dim_0__value_count__value_0,dim_0__value_count__value_1,dim_0__variance,dim_0__variance_larger_than_standard_deviation,dim_0__variation_coefficient
0,249.999787,93.03565,0.230672,0.260563,0.075874,-0.161724,-0.51381,-1.405556,0.157462,0.748515,...,1.0,0.055958,0.012402,-0.011028,0.0,0.0,0.0,0.996015,0.0,2020156.0
1,249.999587,79.563034,0.315412,0.361389,0.088638,-0.266584,-0.573227,-1.092258,0.099126,0.859708,...,1.0,0.046156,0.008396,-0.01665,0.0,0.0,0.0,0.996014,0.0,1704077.0
2,249.999384,79.02282,0.316026,0.349829,0.081266,-0.282467,-0.595313,-1.139504,0.099352,0.683231,...,1.0,0.038116,0.003989,-0.020627,0.0,0.0,0.0,0.996013,0.0,16699950.0
3,250.000845,82.01538,0.26879,0.287141,0.083965,-0.285361,-0.523476,-1.19342,0.100949,0.647236,...,1.0,0.049885,0.008217,-0.016709,0.0,0.0,0.0,0.996019,0.0,19269220.0
4,249.999931,82.909338,0.277398,0.31288,0.089202,-0.283003,-0.559214,-1.229867,0.107138,0.68244,...,1.0,0.052606,0.013682,-0.002932,0.0,0.0,0.0,0.996016,0.0,-1311516.0


## Using tsfresh with sktime

In [7]:
# Chain feature extraction and classification into one estimator, so the tsfresh
# features are computed inside fit() and score() on whatever data is passed in.
# random_state pins the forest's own randomness (bootstrap samples and feature
# subsets), removing one source of run-to-run variation in the reported score.
classifier = make_pipeline(
    TSFreshFeatureExtractor(default_fc_parameters="efficient", show_warnings=False),
    RandomForestClassifier(random_state=42),
)
classifier.fit(X_train, y_train)
classifier.score(X_test, y_test)

  warn("Found non-unique index, replaced with unique index.")
Feature Extraction: 100%|██████████| 5/5 [00:09<00:00,  1.97s/it]
  warn("Found non-unique index, replaced with unique index.")
Feature Extraction: 100%|██████████| 5/5 [00:03<00:00,  1.53it/s]


0.8301886792452831

## Multivariate time series classification data

In [8]:
# Load the basic-motions data as a (features, labels) pair and hold out a test
# set. A fixed random_state makes the train/test split identical on every run,
# so the cells that depend on it are reproducible under "Restart & Run All".
X, y = load_basic_motions(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(60, 6) (60,) (20, 6) (20,)


In [9]:
# Multivariate input data: preview the first training rows. The displayed output
# shows one column per dimension (dim_0 ... dim_5).
X_train.head()

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5
17,0 3.789469 1 3.789469 2 1.78594...,0 -1.353556 1 -1.353556 2 -10.69460...,0 -0.685072 1 -0.685072 2 -4.465480 3...,0 -0.021307 1 -0.021307 2 2.753927 3...,0 -0.159802 1 -0.159802 2 -0.820319 3...,0 0.133169 1 0.133169 2 2.974987 3...
24,0 0.383922 1 0.383922 2 -0.272575 3...,0 0.302612 1 0.302612 2 -1.381236 3...,0 -0.398075 1 -0.398075 2 -0.681258 3...,0 0.071911 1 0.071911 2 -0.761725 3...,0 0.175783 1 0.175783 2 -0.114525 3...,0 -0.087891 1 -0.087891 2 -0.503377 3...
15,0 -0.159076 1 -0.159076 2 -0.97770...,0 0.376722 1 0.376722 2 0.38349...,0 -0.445368 1 -0.445368 2 1.695360 3...,0 -0.029297 1 -0.029297 2 -0.255684 3...,0 0.029297 1 0.029297 2 0.375536 3...,0 -0.047941 1 -0.047941 2 0.516694 3...
23,0 -0.647511 1 -0.647511 2 -0.156391 3...,0 -0.111979 1 -0.111979 2 -0.159968 3...,0 -0.739682 1 -0.739682 2 0.441646 3...,0 0.202416 1 0.202416 2 -0.615239 3...,0 0.165129 1 0.165129 2 0.007990 3...,0 0.074574 1 0.074574 2 0.127842 3...
28,0 0.369660 1 0.369660 2 -0.635316 3...,0 -0.645952 1 -0.645952 2 -4.169368 3...,0 0.063500 1 0.063500 2 -0.315898 3...,0 -0.101208 1 -0.101208 2 0.122515 3...,0 -0.029297 1 -0.029297 2 -0.205080 3...,0 0.045277 1 0.045277 2 0.197090 3...


In [10]:
# Same extractor configuration as for the univariate data, now fitted on the
# multivariate training set; preview the resulting feature table.
t = TSFreshFeatureExtractor(
    default_fc_parameters="efficient",
    show_warnings=False,
)
Xt = t.fit_transform(X_train)
Xt.head()

  warn("Found non-unique index, replaced with unique index.")
Feature Extraction: 100%|██████████| 5/5 [00:18<00:00,  3.75s/it]


variable,dim_0__abs_energy,dim_0__absolute_sum_of_changes,"dim_0__agg_autocorrelation__f_agg_""mean""__maxlag_40","dim_0__agg_autocorrelation__f_agg_""median""__maxlag_40","dim_0__agg_autocorrelation__f_agg_""var""__maxlag_40","dim_0__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""max""","dim_0__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""mean""","dim_0__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""min""","dim_0__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""var""","dim_0__agg_linear_trend__attr_""intercept""__chunk_len_50__f_agg_""max""",...,dim_5__symmetry_looking__r_0.9500000000000001,dim_5__time_reversal_asymmetry_statistic__lag_1,dim_5__time_reversal_asymmetry_statistic__lag_2,dim_5__time_reversal_asymmetry_statistic__lag_3,dim_5__value_count__value_-1,dim_5__value_count__value_0,dim_5__value_count__value_1,dim_5__variance,dim_5__variance_larger_than_standard_deviation,dim_5__variation_coefficient
0,14668.442452,852.384132,-0.019733,-0.067975,0.140029,18.527246,4.206907,-17.096802,154.651004,20.67378,...,1.0,15.053465,48.867108,70.182013,0.0,0.0,0.0,26.184878,1.0,149.517043
1,354.117244,114.057871,-0.023083,-0.032382,0.030085,3.329963,1.457159,-0.272249,1.32189,6.289511,...,1.0,0.15313,0.718881,1.765228,0.0,0.0,0.0,2.662879,1.0,-20.101505
2,20089.782616,936.012458,-0.031604,-0.070448,0.144797,24.032611,6.174375,-16.526685,200.755496,27.548164,...,1.0,5.090285,19.718272,76.965414,0.0,1.0,0.0,34.822337,1.0,-24.347572
3,380.452882,130.339733,-0.008718,-0.041681,0.048424,5.132551,1.462494,-0.902601,3.773765,7.581589,...,1.0,0.16112,0.378758,1.095259,0.0,0.0,0.0,2.663761,1.0,35.197925
4,123.411342,83.323817,-0.021147,-0.008992,0.074056,2.027459,0.411596,-0.957229,0.800756,2.808579,...,1.0,0.149593,0.361487,0.451426,0.0,0.0,0.0,1.521051,1.0,13.044029
