# sktime pipeline and ensemble walkthrough

[github lookup](https://github.com/alan-turing-institute/sktime/blob/dev/examples/pipeline_walkthrough.ipynb)

### Preliminaries

In [1]:
import sktime
# Record which sktime build this notebook ran against: its version string and
# the path of the module file that was actually imported.
print(sktime.__version__)
print(sktime.__file__)

0.1.dev
/Users/mloning/.conda/envs/sktime/lib/python3.7/site-packages/sktime/__init__.py


In [2]:
from sktime.transformers.compose import RowwiseTransformer
from sktime.transformers.compose import TSColumnTransformer
from sktime.transformers.compose import Tabulariser
from sktime.transformers.series_to_series import RandomIntervalSegmenter
from sktime.pipeline import TSPipeline
from sktime.pipeline import TSFeatureUnion
from sktime.classifiers.ensemble import TimeSeriesForestClassifier
from sktime.datasets import load_gunpoint
from sktime.utils.time_series import time_series_slope

from statsmodels.tsa.stattools import acf
from statsmodels.tsa.ar_model import AR
from sklearn.preprocessing import FunctionTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import numpy as np
import pandas as pd

### Load data

In [3]:
# Load the GunPoint data once per split; return_X_y=True makes each call
# unpack into a feature object and a label object.
X_train, y_train = load_gunpoint(split='TRAIN', return_X_y=True)
X_test, y_test = load_gunpoint(split='TEST', return_X_y=True)

# Sanity check: show the shapes of all four objects.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(50, 1) (150, 1) (50,) (150,)


### Time-series forest classifier (TSF)
Specify a time-series tree classifier as a modular pipeline using series-to-primitive features.

In [4]:
# Pipeline for a single time-series tree; the steps are listed in the order
# segment -> transform -> clf.
steps = [
    # Segmentation step (random intervals, going by the class name).
    ('segment', RandomIntervalSegmenter(n_intervals='sqrt')),
    # Feature step: three row-wise transformers (mean, standard deviation,
    # slope) combined in one feature union. validate=False switches off
    # FunctionTransformer's own input validation.
    ('transform', TSFeatureUnion([
        ('mean', RowwiseTransformer(FunctionTransformer(func=np.mean, validate=False))),
        ('std', RowwiseTransformer(FunctionTransformer(func=np.std, validate=False))),
        ('slope', RowwiseTransformer(FunctionTransformer(func=time_series_slope, validate=False)))
    ])),
    # Final estimator: a decision tree with default settings.
    ('clf', DecisionTreeClassifier())
]
# NOTE: `steps` and `base_estimator` are reassigned further down in the RISE
# section, so these names refer to the TSF pipeline only until then.
base_estimator = TSPipeline(steps)

We can directly fit and evaluate the single tree (pipeline).

In [5]:
# Fit the single-tree pipeline on the training split.
base_estimator.fit(X_train, y_train)
# Last expression of the cell, so the test-split score is shown as the cell output.
base_estimator.score(X_test, y_test)

0.7266666666666667

To improve prediction performance, we can use the single tree as the base estimator in an ensemble 

In [6]:
# Ensemble of 100 estimators built from the single-tree pipeline defined above.
# oob_score=True is set so that the next cell can read `tsf.oob_score_` after fitting.
tsf = TimeSeriesForestClassifier(base_estimator=base_estimator, 
                                 n_estimators=100,
                                 criterion='entropy',
                                 bootstrap=True, 
                                 oob_score=True)

Fit and optionally report out-of-bag score

In [7]:
# Fit the ensemble on the training split.
tsf.fit(X_train, y_train)
# `oob_score` is the constructor flag; the fitted attribute `oob_score_` is
# read only when that flag is truthy.
if tsf.oob_score:
    print(tsf.oob_score_)

1.0


Evaluate on test set

In [8]:
# Score the fitted ensemble on the held-out test split (displayed as the cell output).
tsf.score(X_test, y_test)

0.94

### Random interval spectral ensemble (RISE)

Define helper functions for extracting the following series-to-series features 
* estimated autocorrelation,
* fitted auto-regressive coefficients,  
* Fourier transform, 
* power spectrum.

In [9]:
def ar_coefs(x, maxlag=100):
    """Fit an auto-regressive model to a series and return its fitted parameters.

    Parameters
    ----------
    x : sequence supporting ``len``
        Series to fit; handed to ``AR`` as ``endog``.
    maxlag : int, default=100
        Upper bound on the lag order. The order actually used is the smaller
        of ``maxlag`` and ``len(x) - 1``.

    Returns
    -------
    The ``params`` attribute of the fitted AR result.
    """
    # Cap the lag order at one less than the number of observations.
    lag_order = np.minimum(len(x) - 1, maxlag)
    # The trend is removed explicitly at fit time (trend='nc'); otherwise the
    # trend term is dropped only in some cases.
    # NOTE(review): presumably this keeps the number of returned coefficients
    # consistent across series - confirm.
    fitted = AR(endog=x).fit(maxlag=lag_order, trend='nc')
    return fitted.params

def acf_coefs(x, maxlag=100):
    """Estimate the autocorrelation function of a series.

    Parameters
    ----------
    x : sequence supporting ``len``
        Series whose autocorrelation is estimated.
    maxlag : int, default=100
        Upper bound on the number of lags. The number actually requested is
        the smaller of ``maxlag`` and ``len(x) - 1``.

    Returns
    -------
    Whatever ``acf`` returns for the requested number of lags.
    """
    # Never request more lags than the series length minus one.
    lag_count = np.minimum(len(x) - 1, maxlag)
    coefficients = acf(x, nlags=lag_count)
    return coefficients

def powerspectrum(x, **kwargs):
    """Return the first half of the power spectrum of a series.

    Parameters
    ----------
    x : array-like
        Input passed to ``np.fft.fft``.
    **kwargs
        Accepted but not used.

    Returns
    -------
    numpy.ndarray
        Squared magnitudes (real**2 + imag**2) of the first
        ``n // 2`` Fourier coefficients, where ``n`` is the length of the
        transform along its first axis.
    """
    coeffs = np.fft.fft(x)
    re, im = coeffs.real, coeffs.imag
    # Squared magnitude of each complex coefficient.
    power = re * re + im * im
    # Keep only the first half of the spectrum.
    n_keep = power.shape[0] // 2
    return power[:n_keep]

Specify the base estimator for the ensemble. Before passing the extracted series-to-series features to the final estimator, we have to turn the transformed data into a tabular format whose columns contain only primitives (e.g. individual auto-correlation coefficients).

In [10]:
# Pipeline for a single RISE tree; the steps are listed in the order
# segment -> transform -> tabularise -> clf.
# NOTE: this reassigns `steps` and `base_estimator`, replacing the TSF pipeline
# bound to those names earlier in the notebook.
steps = [
    # Segmentation step, configured with n_intervals=1 and min_length=5.
    ('segment', RandomIntervalSegmenter(n_intervals=1, min_length=5)),
    # Feature step: the three helper functions defined above (ar_coefs,
    # acf_coefs, powerspectrum), each applied row-wise and combined in one
    # feature union.
    ('transform', TSFeatureUnion([
        ('ar', RowwiseTransformer(FunctionTransformer(func=ar_coefs, validate=False))),
        ('acf', RowwiseTransformer(FunctionTransformer(func=acf_coefs, validate=False))),
        ('ps', RowwiseTransformer(FunctionTransformer(func=powerspectrum, validate=False)))
    ])),
    # The helpers return whole series rather than single values, so the output
    # is tabularised before it reaches the final estimator.
    ('tabularise', Tabulariser()),
    # Final estimator: a decision tree with default settings.
    ('clf', DecisionTreeClassifier())

]
base_estimator = TSPipeline(steps)

Set up ensemble

In [11]:
# Ensemble of 100 estimators built from the RISE base pipeline. Unlike the TSF
# ensemble earlier in the notebook, criterion, bootstrap and oob_score are not
# passed here, so they keep the class defaults.
rise = TimeSeriesForestClassifier(base_estimator=base_estimator,
                                  n_estimators=100)

Fit 

In [12]:
# Fit the ensemble on the training split.
rise.fit(X_train, y_train)
# NOTE(review): `rise` was constructed without an explicit oob_score argument,
# so whether this branch prints depends on the class default; the recorded run
# shows no printed output for this cell.
if rise.oob_score:
    print(rise.oob_score_)

Evaluate

In [13]:
# Score the fitted ensemble on the held-out test split (displayed as the cell output).
rise.score(X_test, y_test)

0.98