In [1]:
# General imports
from sklearn.tree import DecisionTreeClassifier
from sktime.datasets import load_gunpoint
from sktime.pipeline import Pipeline, FeatureUnion
import pandas as pd
import numpy as np

# Imports for the current sktime implementation (i.e. pd.Series)
from sklearn.preprocessing import FunctionTransformer
from sktime.transformers.segment import RandomIntervalSegmenter as BaseIntervalSegmenter
from sktime.transformers.compose import RowwiseTransformer, Tabularizer
from sktime.utils.time_series import time_series_slope
from sktime.utils.data_container import tabularize

# Imports for the awkwardarray-based implementation
from awkwardarray.transformers import \
    RandomIntervalSegmenter as AwkIntervalSegmenter, \
    FeatureUnionTransformer as AwkFeatureUnionTransformer, \
    UniversalFunctionTransformer as AwkUniversalFunctionTransformer, \
    GenericFunctionTransformer as AwkGenericFunctionTransformer, TabularTransformer
from awkwardarray.utils import awkward_build, awkward_slope_func, awkward_tabularize

# Imports for the extensionarray-based implementation
from extensionarray.timeframe import TimeFrame
from extensionarray.reimplement import \
    RandomIntervalSegmenter as ExtIntervalSegmenter, \
    UniversalFunctionTransformer as ExtUniversalFunctionTransformer, \
    GenericFunctionTransformer as ExtGenericFunctionTransformer, \
    extarray_slope_func

## Prepare the data for each implementation


In [2]:
# Stack 20 copies of the gunpoint dataset to obtain a reasonably sized dataset
replicas = [load_gunpoint(return_X_y=False) for _ in range(20)]
X = pd.concat(replicas)
# Split off the class labels; X keeps only the remaining column(s)
y = X['class_val']
X = X.drop('class_val', axis=1)


In [3]:
# Container for the current implementation: a copy of the loaded frame
X_base = X.copy()

print(X_base.dtypes)
print("Class of a single cell: {}".format(X_base.iloc[0, 0].__class__))
print("Representation in the console: ", X_base.head(2), sep="\n")

dim_0    object
dtype: object
Class of a single cell: <class 'pandas.core.series.Series'>
Representation in the console: 
                                               dim_0
0  0     -0.64789
1     -0.64199
2     -0.63819
3...
1  0     -0.64443
1     -0.64540
2     -0.64706
3...


In [4]:
# Build the awkward-array container from the same source frame
X_awk = awkward_build(X)

print("Representation in the console: ", X_awk, sep="\n")

Representation in the console: 
[[[-0.64789 -0.64199 -0.63819 ... -0.64043 -0.63867 -0.63866]] [[-0.64443 -0.6454 -0.64706 ... -0.63493 -0.6345 -0.6316]] [[-0.77835 -0.77828 -0.77715 ... -0.7042 -0.70761 -0.70712]] ... [[-0.77913 -0.77838 -0.77574 ... -0.50501 -0.50373 -0.50438]] [[-0.70303 -0.70262 -0.7025 ... -0.64113 -0.64141 -0.64211]] [[-1.4357 -1.4323 -1.4329 ... -1.4355 -1.4353 -1.4309]]]


In [5]:
# Build the extensionarray-backed TimeFrame from the tabularized 'dim_0' column
dim_0_values = tabularize(X['dim_0'], return_array=True)
X_ext = TimeFrame(data={'dim_0': dim_0_values})

print(X_ext.dtypes)
print("Class of a single cell: {}".format(X_ext.iloc[0, 0].__class__))
print("Representation in the console: ", X_ext.head(2), sep="\n")


dim_0    timeseries
dtype: object
Class of a single cell: <class 'numpy.ndarray'>
Representation in the console: 
                                               dim_0
0  [-0.64789  -0.64199  -0.63819  -0.63826  -0.63...
1  [-0.64443  -0.6454   -0.64706  -0.64749  -0.64...


## Tabularize

In [7]:
# Tabularize the baseline container with sktime's Tabularizer.
# X_base (the copy dedicated to the pd.Series-based implementation) is used here,
# as in the other baseline cells, rather than the shared source frame X.
base_tabularizer = Tabularizer(check_input=False)
X_base_tab = base_tabularizer.fit_transform(X_base)

print(X_base_tab.__class__)
print(X_base_tab.dtypes)
print("Representation in the console: ")
# Only show the first two rows and five columns to keep the output short
print(X_base_tab.iloc[0:2, 0:5])

<class 'pandas.core.frame.DataFrame'>
dim_0__0      float64
dim_0__1      float64
dim_0__2      float64
dim_0__3      float64
dim_0__4      float64
               ...   
dim_0__145    float64
dim_0__146    float64
dim_0__147    float64
dim_0__148    float64
dim_0__149    float64
Length: 150, dtype: object
Representation in the console: 
   dim_0__0  dim_0__1  dim_0__2  dim_0__3  dim_0__4
0  -0.64789  -0.64199  -0.63819  -0.63826  -0.63835
1  -0.64443  -0.64540  -0.64706  -0.64749  -0.64691


In [8]:
# Tabularize the awkward-array container with its dedicated transformer
awk_tabularizer = TabularTransformer()
X_awk_tab = awk_tabularizer.fit_transform(X_awk)

print(X_awk_tab.__class__)
print("Representation in the console: ", X_awk_tab, sep="\n")

<class 'awkward.array.jagged.JaggedArray'>
Representation in the console: 
[[-0.64789 -0.64199 -0.63819 ... -0.64043 -0.63867 -0.63866] [-0.64443 -0.6454 -0.64706 ... -0.63493 -0.6345 -0.6316] [-0.77835 -0.77828 -0.77715 ... -0.7042 -0.70761 -0.70712] ... [-0.77913 -0.77838 -0.77574 ... -0.50501 -0.50373 -0.50438] [-0.70303 -0.70262 -0.7025 ... -0.64113 -0.64141 -0.64211] [-1.4357 -1.4323 -1.4329 ... -1.4355 -1.4353 -1.4309]]


In [9]:
# Tabularize the TimeFrame through its own tabularise() method
# NOTE(review): the recorded output still shows a single 'dim_0' column of arrays;
# confirm that tabularise() expands the series as intended.
X_ext_tab = X_ext.tabularise()

print(X_ext_tab.__class__)
print("Representation in the console: ", X_ext_tab.iloc[0:2, 0:5], sep="\n")


<class 'pandas.core.frame.DataFrame'>
Representation in the console: 
                                               dim_0
0  [-0.64789  -0.64199  -0.63819  -0.63826  -0.63...
1  [-0.64443  -0.6454   -0.64706  -0.64749  -0.64...


## Segmentation

In [11]:
# Apply the sktime random interval segmenter (3 intervals) to the baseline container
base_segmenter = BaseIntervalSegmenter(n_intervals=3)
X_base_seg = base_segmenter.fit_transform(X_base)

print(X_base_seg.__class__)
print(X_base_seg.dtypes)
# Note: the cell type is now np.ndarray instead of pd.Series
print("Class of a single cell: {}".format(X_base_seg.iloc[0, 0].__class__))
print("Representation in the console: ", X_base_seg.iloc[0:2, 0:3], sep="\n")

<class 'pandas.core.frame.DataFrame'>
dim_0_48_59      object
dim_0_141_150    object
dim_0_8_116      object
dtype: object
Class of a single cell: <class 'numpy.ndarray'>
Representation in the console: 
                                         dim_0_48_59  \
0  [-0.64448, -0.64889, -0.65766, -0.6612, -0.649...   
1  [-0.50726, -0.47291, -0.39633, -0.33415, -0.2,...   

                                       dim_0_141_150  \
0  [-0.63972, -0.63973, -0.64018, -0.63923, -0.63...   
1  [-0.64143, -0.63927, -0.6378, -0.63768, -0.635...   

                                         dim_0_8_116  
0  [-0.64505, -0.64712, -0.64915, -0.65125, -0.65...  
1  [-0.6353, -0.63538, -0.63411, -0.63372, -0.632...  


In [12]:
# Apply the awkward-array random interval segmenter (3 intervals)
awk_segmenter = AwkIntervalSegmenter(n_intervals=3)
X_awk_seg = awk_segmenter.fit_transform(X_awk)

print(X_awk_seg.__class__)
print("Representation in the console: ", X_awk_seg, sep="\n")

<class 'awkward.array.jagged.JaggedArray'>
Representation in the console: 
[[[1.8206 1.8172 1.8316 ... 1.2954 1.1717 1.0295] [-0.66345 -0.66219 -0.66234 ... -0.24145 -0.42928 -0.54788] [-0.28314 -0.17745 0.15825 ... -0.65258 -0.64332 -0.63887]] [[1.9097 1.9175 1.919 ... 0.46326 0.22508 -0.012778] [-0.60895 -0.60814 -0.60875 ... -0.80119 -0.81305 -0.8142] [0.69455 0.80781 0.92774 ... -0.64982 -0.6472 -0.6455]] [[1.6966 1.6942 1.6887 ... 0.64692 0.47169 0.30269] [-0.78045 -0.777 -0.76997 ... -0.45182 -0.49966 -0.53307] [1.1356 1.3415 1.52 ... -0.58864 -0.62929 -0.66251]] ... [[1.7845 1.7775 1.7747 ... 0.52741 0.44066 0.24082] [-0.71048 -0.71123 -0.71224 ... -0.71966 -0.73853 -0.73307] [0.89838 1.0403 1.2154 ... -0.5186 -0.51439 -0.50992]] [[1.8415 1.8388 1.8358 ... -0.34491 -0.43918 -0.50599] [-0.69749 -0.69596 -0.69521 ... -0.60391 -0.60818 -0.6079] [1.7226 1.8053 1.83 ... -0.68814 -0.71272 -0.72431]] [[0.78779 0.78659 0.78636 ... 0.77869 0.777 0.7711] [-1.2089 -1.0004 -0.75366 ... 0.78

In [13]:
# Apply the extensionarray random interval segmenter (3 intervals) to the TimeFrame
ext_segmenter = ExtIntervalSegmenter(n_intervals=3)
X_ext_seg = ext_segmenter.fit_transform(X_ext)

print(X_ext_seg.__class__)
# Note: the segment columns keep the timeseries dtype with all attached functionality
print(X_ext_seg.dtypes)
print("Class of a single cell: {}".format(X_ext_seg.iloc[0, 0].__class__))
print("Representation in the console: ", X_ext_seg.iloc[0:2, 0:3], sep="\n")

<class 'pandas.core.frame.DataFrame'>
dim_0_26_54     timeseries
dim_0_54_92     timeseries
dim_0_58_126    timeseries
dtype: object
Class of a single cell: <class 'numpy.ndarray'>
Representation in the console: 
                                         dim_0_26_54  \
0  [-0.66141 -0.66145 -0.66037 -0.65911 -0.65974 ...   
1  [-0.61068  -0.61113  -0.61108  -0.61171  -0.61...   

                                         dim_0_54_92  \
0  [-0.53743 -0.46503 -0.35853 -0.28314 -0.17745 ...   
1  [0.10084 0.31489 0.52512 0.69455 0.80781 0.927...   

                                        dim_0_58_126  
0  [-0.17745   0.15825   0.35028   0.48241   0.60...  
1  [ 0.80781   0.92774   1.0859    1.2465    1.41...  


## Functions (mean)

In [15]:
# Apply np.mean to the segmented baseline data through sktime's row-wise wrapper
base_mean_trans = RowwiseTransformer(FunctionTransformer(func=np.mean, validate=False))
X_base_mean = base_mean_trans.fit_transform(X_base_seg)

print(X_base_mean.__class__)
print(X_base_mean.dtypes)
# Note: each cell is now a scalar (numpy.float64 in the recorded output), no longer an np.ndarray
print("Class of a single cell: " + str(X_base_mean.iloc[0,0].__class__))
print("Representation in the console: ")
print(X_base_mean.iloc[0:2, 0:3])

<class 'pandas.core.frame.DataFrame'>
dim_0_48_59      float64
dim_0_141_150    float64
dim_0_8_116      float64
dtype: object
Class of a single cell: <class 'numpy.float64'>
Representation in the console: 
   dim_0_48_59  dim_0_141_150  dim_0_8_116
0    -0.517604      -0.639582     0.251101
1     0.040733      -0.636440     0.252787


In [16]:
# Apply the "mean" universal function to the segmented awkward array
awk_mean_trans = AwkUniversalFunctionTransformer("mean")
X_awk_mean = awk_mean_trans.fit_transform(X_awk_seg)

print(X_awk_mean.__class__)
print("Representation in the console: ", X_awk_mean, sep="\n")

<class 'awkward.array.jagged.JaggedArray'>
Representation in the console: 
[[1.7153444444444446 0.4642138181818183 0.712687690140845] [1.4547915555555557 0.46386372727272734 0.629937633802817] [1.419971111111111 0.4927193522727272 0.714291690140845] ... [1.4083251851851855 0.42903604545454554 0.6772048169014085] [1.0359294814814815 0.45374573863636364 0.5839254366197184] [0.782517777777778 0.6708634545454545 0.6203691126760562]]


In [17]:
# Apply the "mean" universal function to the segmented TimeFrame
ext_mean_trans = ExtUniversalFunctionTransformer("mean")
X_ext_mean = ext_mean_trans.fit_transform(X_ext_seg)

print(X_ext_mean.__class__)
# Note: after taking the mean the columns are float64 in the recorded output, no longer timeseries
print(X_ext_mean.dtypes)
print("Class of a single cell: " + str(X_ext_mean.iloc[0,0].__class__)) 
print("Representation in the console: ")
print(X_ext_mean.iloc[0:2, 0:3])


<class 'pandas.core.frame.DataFrame'>
dim_0_26_54     float64
dim_0_54_92     float64
dim_0_58_126    float64
dtype: object
Class of a single cell: <class 'numpy.float64'>
Representation in the console: 
   dim_0_26_54  dim_0_54_92  dim_0_58_126
0    -0.651426     1.299906      0.767149
1    -0.543285     1.571880      0.666525


**Note:** Both new implementations (awkwardarray-based and extensionarray-based) use a separate GenericFunctionTransformer to apply a custom slope function. It is not shown here because its output has the same form as the mean output above. It should also be possible to create a single FunctionTransformer that handles both universal functions and generic functions (e.g. through a simple if/else dispatch).



## Run the full benchmarking

In [18]:
# Execute the benchmark script comparison_benchmark.py from the current directory.
# The timing lists it prints appear in the output below
# (presumably seconds per repetition; TODO confirm against the script).
%run ./comparison_benchmark.py



Awkward Array Tabularize: [5.891200000000651e-05, 6.193600000001354e-05, 5.741999999999692e-05, 6.314999999997184e-05]
Awkward Array Segmenter: [0.004103032000000013, 0.004390236000000023, 0.003684504000000004, 0.004158588000000023]
Awkward Array Mean: [0.0005607819999999819, 0.00045519800000001, 0.001895409999999984, 0.0005494759999999843]
Awkward Array Std: [0.005631224000000046, 0.008903218000000023, 0.009458136000000011, 0.013094786000000056]
Awkward Array Slope: [0.00892844199999999, 0.019455030000000022, 0.009823215999999989, 0.009848724000000005]
Awkward Array Union: [7.081399999997017e-05, 7.490799999999353e-05, 7.828599999996299e-05, 8.325999999996724e-05]
Awkward Array Classifier: [0.0012223339999999894, 0.001209265999999971, 0.0011577139999999985, 0.0014075400000000116]
Awkward Array Pipeline: [0.09762096200000002, 0.09844961999999996, 0.09631554600000002, 0.10025390999999999]

Extension Array Tabularize: [0.0003310059999999737, 0.00032560000000003697, 0.0003266279999999710