# SKTIME

GitHub: https://github.com/alan-turing-institute/sktime

Documentation: https://www.sktime.org/en/stable/index.html


In [22]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sktime.classification.compose import ColumnEnsembleClassifier
from sktime.classification.dictionary_based import BOSSEnsemble
from sktime.classification.interval_based import TimeSeriesForestClassifier
# from sktime.classification.shapelet_based import MrSEQLClassifier
from sktime.datasets import load_basic_motions
from sktime.transformations.panel.compose import ColumnConcatenator

import matplotlib.pyplot as plt
%matplotlib inline

In [107]:
from sktime.classification.shapelet_based import ShapeletTransformClassifier 

In [108]:
ShapeletTransformClassifier

sktime.classification.shapelet_based._stc.ShapeletTransformClassifier

In [106]:
!cat /home/us3r/projects/automl-time-series/venv/lib/python3.8/site-packages/sktime/classification/shapelet_based/__init__.py

# -*- coding: utf-8 -*-
"""Shapelet based time series classifiers."""
__all__ = ["ShapeletTransformClassifier"]

from sktime.classification.shapelet_based._stc import ShapeletTransformClassifier


# Load Dataset

In [11]:
target = 'activity'
ts_column = 'datetime'

In [14]:
train

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",subject,activity,activity.1
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.112754,0.030400,-0.464761,-0.018446,-0.841247,0.179941,-0.058627,1,STANDING,0
1,0.278419,-0.016411,-0.123520,-0.998245,-0.975300,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317,1,STANDING,0
2,0.279653,-0.019467,-0.113462,-0.995380,-0.967187,-0.978944,-0.996520,-0.963668,-0.977469,-0.938692,...,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118,1,STANDING,0
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.982750,-0.989302,-0.938692,...,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663,1,STANDING,0
4,0.276629,-0.016570,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,0.123320,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892,1,STANDING,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7347,0.299665,-0.057193,-0.181233,-0.195387,0.039905,0.077078,-0.282301,0.043616,0.060410,0.210795,...,-0.190437,0.829718,0.206972,-0.425619,-0.791883,0.238604,0.049819,30,WALKING_UPSTAIRS,0
7348,0.273853,-0.007749,-0.147468,-0.235309,0.004816,0.059280,-0.322552,-0.029456,0.080585,0.117440,...,0.064907,0.875679,-0.879033,0.400219,-0.771840,0.252676,0.050053,30,WALKING_UPSTAIRS,0
7349,0.273387,-0.017011,-0.045022,-0.218218,-0.103822,0.274533,-0.304515,-0.098913,0.332584,0.043999,...,0.052806,-0.266724,0.864404,0.701169,-0.779133,0.249145,0.040811,30,WALKING_UPSTAIRS,0
7350,0.289654,-0.018843,-0.158281,-0.219139,-0.111412,0.268893,-0.310487,-0.068200,0.319473,0.101702,...,-0.101360,0.700740,0.936674,-0.589479,-0.785181,0.246432,0.025339,30,WALKING_UPSTAIRS,0


In [15]:
# Train dataset
# Data source: https://www.kaggle.com/datasets/uciml/human-activity-recognition-with-smartphones
train = pd.read_csv("../data/human-activity-recognition/train.csv")

# Convert labels to binary
train[target] = pd.DataFrame(np.where(train['Activity']=='WALKING_DOWNSTAIRS', 1, 0))

# Drop unused columns
train.drop(['subject', 'Activity'], axis=1, inplace=True)

# Create a new date column
train[ts_column] = pd.date_range(start='1/1/2022', periods=len(train), freq='S')
train.head(3)

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",activity,datetime
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.710304,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627,0,2022-01-01 00:00:00
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317,0,2022-01-01 00:00:01
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118,0,2022-01-01 00:00:02


In [17]:
# The max value will be incremented by 1 second and used in test dataset (see below)
train[ts_column].max()

Timestamp('2022-01-01 02:02:31')

In [18]:
# Test dataset
test = pd.read_csv("../data/human-activity-recognition/test.csv")

# Convert labels to binary
test[target] = pd.DataFrame(np.where(test['Activity']=='WALKING_DOWNSTAIRS', 1, 0))

# Drop unused columns
test.drop(['subject', 'Activity'], axis=1, inplace=True)

# Create a new date column
test[ts_column] = pd.date_range(start='1/1/2022', periods=len(test), freq='S')
test.head(3)

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",activity,datetime
0,0.257178,-0.023285,-0.014654,-0.938404,-0.920091,-0.667683,-0.952501,-0.925249,-0.674302,-0.894088,...,-0.705974,0.006462,0.16292,-0.825886,0.271151,-0.720009,0.276801,-0.057978,0,2022-01-01 00:00:00
1,0.286027,-0.013163,-0.119083,-0.975415,-0.967458,-0.944958,-0.986799,-0.968401,-0.945823,-0.894088,...,-0.594944,-0.083495,0.0175,-0.434375,0.920593,-0.698091,0.281343,-0.083898,0,2022-01-01 00:00:01
2,0.275485,-0.02605,-0.118152,-0.993819,-0.969926,-0.962748,-0.994403,-0.970735,-0.963483,-0.93926,...,-0.640736,-0.034956,0.202302,0.064103,0.145068,-0.702771,0.280083,-0.079346,0,2022-01-01 00:00:02


In [19]:
print('Train dataset shape:', train.shape)
print('Test dataset shape:', test.shape)

Train dataset shape: (7352, 563)
Test dataset shape: (2947, 563)


# Train Model

sktime offers three main ways of solving multivariate time series classification problems:
* Concatenation 
* Column-wise ensembling 
* Bespoke estimator-specific methods

Reference: [Multivariate time series classification with sktime](https://www.sktime.org/en/v0.8.1/examples/03_classification_multivariate.html)

In [71]:
from sktime.datatypes._panel._convert import from_2d_array_to_nested
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [91]:
train.columns

Index(['tBodyAcc-mean()-X', 'tBodyAcc-mean()-Y', 'tBodyAcc-mean()-Z',
       'tBodyAcc-std()-X', 'tBodyAcc-std()-Y', 'tBodyAcc-std()-Z',
       'tBodyAcc-mad()-X', 'tBodyAcc-mad()-Y', 'tBodyAcc-mad()-Z',
       'tBodyAcc-max()-X',
       ...
       'fBodyBodyGyroJerkMag-kurtosis()', 'angle(tBodyAccMean,gravity)',
       'angle(tBodyAccJerkMean),gravityMean)',
       'angle(tBodyGyroMean,gravityMean)',
       'angle(tBodyGyroJerkMean,gravityMean)', 'angle(X,gravityMean)',
       'angle(Y,gravityMean)', 'angle(Z,gravityMean)', 'activity', 'datetime'],
      dtype='object', length=563)

In [96]:
features = train.columns.difference([target, 'datetime'])

X = train[features]
X = from_2d_array_to_nested(X)
y = train[target]

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, shuffle=False)

print('Dataset shape:', X_train.shape, y_train.shape, X_val.shape, y_val.shape)

Dataset shape: (5514, 1) (5514,) (1838, 1) (1838,)


In [73]:
X2, y2 = load_basic_motions(return_X_y=True)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state=42)
print(X2_train.shape, y2_train.shape, X2_test.shape, y2_test.shape)

(60, 6) (60,) (20, 6) (20,)


In [93]:
from numpy.random import default_rng

from sktime.datatypes._panel._convert import from_2d_array_to_nested

rng = default_rng()
X_2d = rng.standard_normal((50, 20))
print(f"The tabular data has the shape {X_2d.shape}")

The tabular data has the shape (50, 20)


In [94]:
X_nested = from_2d_array_to_nested(X_2d)
print(f"X_nested is a nested DataFrame: {is_nested_dataframe(X_nested)}")
print(f"The cell contains a {type(X_nested.iloc[0,0])}.")
print(f"The nested DataFrame has shape {X_nested.shape}")
X_nested.head()

X_nested is a nested DataFrame: True
The cell contains a <class 'pandas.core.series.Series'>.
The nested DataFrame has shape (50, 1)


Unnamed: 0,0
0,0 0.596395 1 -0.630929 2 -0.281962 3...
1,0 0.216178 1 1.002912 2 -0.293288 3...
2,0 -0.179428 1 -0.237563 2 0.986768 3...
3,0 0.640440 1 -0.451651 2 0.649068 3...
4,0 -0.199285 1 -0.076133 2 1.982288 3...


## Column ensembling

In [None]:
%%time 

clf = ColumnEnsembleClassifier(
    estimators=[
        ("TSF0", TimeSeriesForestClassifier(n_estimators=100), [0]),
        ("BOSSEnsemble3", BOSSEnsemble(max_ensemble_size=5), [0]),
    ]
)

clf.fit(X_train, y_train)

print('Validation accuracy score', clf.score(X_val, y_val))

```
TypeError: loop of ufunc does not support argument 0 of type float which has no callable sqrt method
```

In [None]:
steps = [
    ("concatenate", ColumnConcatenator()),
    ("classify", TimeSeriesForestClassifier(n_estimators=100)),
]

clf = Pipeline(steps)
clf.fit(X_train, y_train)


In [111]:
clf.score(X_train, y_train)

1.0

In [110]:
clf.score(X_val, y_val)

0.9836779107725789

In [112]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = clf.predict(X_val)

# print accuracy
print("Accuracy: ", accuracy_score(y_val, y_pred))

# print precision, recall, F1-score per each class/tag
print('Classification report:')
print(classification_report(y_val, y_pred))

# print confusion matrix, check documentation for sorting rows/columns
print('Confusion matrix')
print(confusion_matrix(y_val, y_pred))

Accuracy:  0.9836779107725789
Classification report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1588
           1       0.99      0.89      0.94       250

    accuracy                           0.98      1838
   macro avg       0.98      0.95      0.96      1838
weighted avg       0.98      0.98      0.98      1838

Confusion matrix
[[1585    3]
 [  27  223]]
