In [1]:
# General imports
from sklearn.tree import DecisionTreeClassifier
from sktime.datasets import load_gunpoint
from sktime.pipeline import Pipeline, FeatureUnion
import pandas as pd
import numpy as np

# Imports for the current sktime implementation (i.e. pd.Series)
from sklearn.preprocessing import FunctionTransformer
from sktime.transformers.segment import RandomIntervalSegmenter as BaseIntervalSegmenter
from sktime.transformers.compose import RowwiseTransformer, Tabularizer
from sktime.utils.time_series import time_series_slope
from sktime.utils.data_container import tabularize

# Imports for the awkwardarray-based implementation
from awkwardarray.transformers import \
    RandomIntervalSegmenter as AwkIntervalSegmenter, \
    FeatureUnionTransformer as AwkFeatureUnionTransformer, \
    UniversalFunctionTransformer as AwkUniversalFunctionTransformer, \
    GenericFunctionTransformer as AwkGenericFunctionTransformer, TabularTransformer
from awkwardarray.utils import awkward_build, awkward_slope_func, awkward_tabularize

# Imports for the extensionarray-based implementation
from extensionarray.timeframe import TimeFrame
from extensionarray.timeseries import TimeSeries
from extensionarray.reimplement import \
    RandomIntervalSegmenter as ExtIntervalSegmenter, \
    UniversalFunctionTransformer as ExtUniversalFunctionTransformer, \
    GenericFunctionTransformer as ExtGenericFunctionTransformer, \
    extarray_slope_func

## Prepare the data for each implementation


In [2]:
# Stack 20 copies of the GunPoint data so the benchmark input is reasonably large.
# NOTE: concat keeps each copy's own row labels (ignore_index is not set),
# so the resulting index contains repeated labels.
X = pd.concat(
    [load_gunpoint(return_X_y=False) for _ in range(20)]
)

# Split the label column off the frame: pop returns it and removes it from X.
y = X.pop('class_val')


In [3]:
# Baseline: the current sktime representation, taken as a copy of the source frame.
X_base = X.copy()

# Show the column dtypes, the class of one cell, and a two-row preview.
print(X_base.dtypes)
print("Class of a single cell: ", X_base.iloc[0, 0].__class__, sep="")
print("Representation in the console: ")
print(X_base.head(2))

dim_0    object
dtype: object
Class of a single cell: <class 'pandas.core.series.Series'>
Representation in the console: 
                                               dim_0
0  0     -0.64789
1     -0.64199
2     -0.63819
3...
1  0     -0.64443
1     -0.64540
2     -0.64706
3...


In [4]:
# Build the awkward-array representation from the same source frame.
X_awk = awkward_build(X)

print("Representation in the console: ", X_awk, sep="\n")

Representation in the console: 
[[[-0.64789 -0.64199 -0.63819 ... -0.64043 -0.63867 -0.63866]] [[-0.64443 -0.6454 -0.64706 ... -0.63493 -0.6345 -0.6316]] [[-0.77835 -0.77828 -0.77715 ... -0.7042 -0.70761 -0.70712]] ... [[-0.77913 -0.77838 -0.77574 ... -0.50501 -0.50373 -0.50438]] [[-0.70303 -0.70262 -0.7025 ... -0.64113 -0.64141 -0.64211]] [[-1.4357 -1.4323 -1.4329 ... -1.4355 -1.4353 -1.4309]]]


In [5]:
# Build the extension-array representation: a TimeFrame with a single 'dim_0'
# column filled from the array form of the tabularized source column.
X_ext = TimeFrame(
    data={
        'dim_0': tabularize(X['dim_0'], return_array=True),
    }
)

# Show the column dtypes, the class of one cell, and a two-row preview.
print(X_ext.dtypes)
print("Class of a single cell: ", X_ext.iloc[0, 0].__class__, sep="")
print("Representation in the console: ")
print(X_ext.head(2))


dim_0    timeseries
dtype: object
Class of a single cell: <class 'numpy.ndarray'>
Representation in the console: 
                                               dim_0
0  [-0.64789  -0.64199  -0.63819  -0.63826  -0.63...
1  [-0.64443  -0.6454   -0.64706  -0.64749  -0.64...


## Tabularize

In [6]:
# Tabularize the baseline representation.
# X_base (the copy made for the current implementation) is used rather than the
# shared source frame X, so every baseline step reads from the same input.
base_tabularizer = Tabularizer(check_input=False)
X_base_tab = base_tabularizer.fit_transform(X_base)

# Show the result class, the per-column dtypes, and the first 2 rows / 5 columns.
print(X_base_tab.__class__)
print(X_base_tab.dtypes)
print("Representation in the console: ")
print(X_base_tab.iloc[0:2, 0:5])

<class 'pandas.core.frame.DataFrame'>
dim_0__0      float64
dim_0__1      float64
dim_0__2      float64
dim_0__3      float64
dim_0__4      float64
               ...   
dim_0__145    float64
dim_0__146    float64
dim_0__147    float64
dim_0__148    float64
dim_0__149    float64
Length: 150, dtype: object
Representation in the console: 
   dim_0__0  dim_0__1  dim_0__2  dim_0__3  dim_0__4
0  -0.64789  -0.64199  -0.63819  -0.63826  -0.63835
1  -0.64443  -0.64540  -0.64706  -0.64749  -0.64691


In [7]:
# Tabularize the awkward-array data with the awkward-based transformer.
awk_tabularizer = TabularTransformer()
X_awk_tab = awk_tabularizer.fit_transform(X_awk)

# Show the result class followed by its console representation.
print(
    X_awk_tab.__class__,
    "Representation in the console: ",
    X_awk_tab,
    sep="\n",
)

<class 'awkward.array.jagged.JaggedArray'>
Representation in the console: 
[[-0.64789 -0.64199 -0.63819 ... -0.64043 -0.63867 -0.63866] [-0.64443 -0.6454 -0.64706 ... -0.63493 -0.6345 -0.6316] [-0.77835 -0.77828 -0.77715 ... -0.7042 -0.70761 -0.70712] ... [-0.77913 -0.77838 -0.77574 ... -0.50501 -0.50373 -0.50438] [-0.70303 -0.70262 -0.7025 ... -0.64113 -0.64141 -0.64211] [-1.4357 -1.4323 -1.4329 ... -1.4355 -1.4353 -1.4309]]


In [8]:
# The TimeFrame is tabularized through its own method; no transformer object is built here.
X_ext_tab = X_ext.tabularise()

# Show the result class and the first 2 rows / 5 columns.
print(X_ext_tab.__class__, "Representation in the console: ", sep="\n")
print(X_ext_tab.iloc[:2, :5])


<class 'pandas.core.frame.DataFrame'>
Representation in the console: 
   dim_0_0  dim_0_1  dim_0_2  dim_0_3  dim_0_4
0 -0.64789 -0.64199 -0.63819 -0.63826 -0.63835
1 -0.64443 -0.64540 -0.64706 -0.64749 -0.64691


## Segmentation

In [9]:
# Cut three random intervals out of every baseline series (seeded for repeatability).
base_segmenter = BaseIntervalSegmenter(
    n_intervals=3,
    random_state=2,
)
X_base_seg = base_segmenter.fit_transform(X_base)

print(X_base_seg.__class__)
print(X_base_seg.dtypes)
# Note: the cell class has changed from pd.Series to np.ndarray after segmentation.
print("Class of a single cell: ", X_base_seg.iloc[0, 0].__class__, sep="")
print("Representation in the console: ")
print(X_base_seg.iloc[:2, :3])

<class 'pandas.core.frame.DataFrame'>
dim_0_15_60     object
dim_0_72_149    object
dim_0_22_128    object
dtype: object
Class of a single cell: <class 'numpy.ndarray'>
Representation in the console: 
                                         dim_0_15_60  \
0  [-0.66099, -0.66156, -0.66226, -0.66191, -0.66...   
1  [-0.6287, -0.62747, -0.62409, -0.62038, -0.615...   

                                        dim_0_72_149  \
0  [1.8102, 1.8206, 1.8172, 1.8316, 1.8316, 1.829...   
1  [1.9125, 1.9097, 1.9175, 1.919, 1.919, 1.9252,...   

                                        dim_0_22_128  
0  [-0.66219, -0.66234, -0.66171, -0.66139, -0.66...  
1  [-0.60814, -0.60875, -0.60932, -0.60964, -0.61...  


In [10]:
# Cut three random intervals out of every awkward-array series.
# NOTE(review): unlike the other two segmenters, no random_state is passed here,
# so the intervals are presumably not reproducible between runs — confirm whether
# AwkIntervalSegmenter supports seeding.
awk_segmenter = AwkIntervalSegmenter(n_intervals=3)
X_awk_seg = awk_segmenter.fit_transform(X_awk)

# Show the result class followed by its console representation.
print(
    X_awk_seg.__class__,
    "Representation in the console: ",
    X_awk_seg,
    sep="\n",
)

<class 'awkward.array.jagged.JaggedArray'>
Representation in the console: 
[[[-0.66464 -0.66412 -0.6643 ... -0.63656 -0.63317 -0.63304] [-0.67374 -0.66839 -0.66464 ... -0.63972 -0.63973 -0.64018] [-0.63656 -0.63317 -0.63304 ... -0.63303 -0.63522 -0.63447]] [[-0.67507 -0.66984 -0.66321 ... -0.64535 -0.64491 -0.64518] [-0.69251 -0.68554 -0.67507 ... -0.64143 -0.63927 -0.6378] [-0.64535 -0.64491 -0.64518 ... -0.64499 -0.64382 -0.64296]] [[-0.57186 -0.55389 -0.53961 ... -0.69088 -0.70717 -0.72228] [-0.60814 -0.58754 -0.57186 ... -0.71871 -0.71353 -0.71002] [-0.69088 -0.70717 -0.72228 ... -0.73874 -0.73924 -0.73841]] ... [[-0.54054 -0.5337 -0.53461 ... -0.50954 -0.51249 -0.51424] [-0.5578 -0.5497 -0.54054 ... -0.51271 -0.51012 -0.51187] [-0.50954 -0.51249 -0.51424 ... -0.51596 -0.51494 -0.51681]] [[-0.45623 -0.50398 -0.55318 ... -0.72823 -0.72739 -0.72803] [-0.40864 -0.42688 -0.45623 ... -0.64071 -0.6398 -0.63955] [-0.72823 -0.72739 -0.72803 ... -0.71407 -0.70227 -0.68975]] [[0.17029 0.0273

In [11]:
# Cut three random intervals out of every TimeFrame series (same seed as the baseline).
ext_segmenter = ExtIntervalSegmenter(
    n_intervals=3,
    random_state=2,
)
X_ext_seg = ext_segmenter.fit_transform(X_ext)

print(X_ext_seg.__class__)
# Note: the segments remain TimeSeries columns (original note: with all attached functionality).
print(X_ext_seg.dtypes)
print("Class of a single cell: ", X_ext_seg.iloc[0, 0].__class__, sep="")
print("Representation in the console: ")
print(X_ext_seg.iloc[:2, :3])

<class 'pandas.core.frame.DataFrame'>
dim_0_15_60     timeseries
dim_0_72_149    timeseries
dim_0_22_128    timeseries
dtype: object
Class of a single cell: <class 'numpy.ndarray'>
Representation in the console: 
                                         dim_0_15_60  \
0  [-0.66099 -0.66156 -0.66226 -0.66191 -0.66274 ...   
1  [-0.6287   -0.62747  -0.62409  -0.62038  -0.61...   

                                        dim_0_72_149  \
0  [ 1.8102    1.8206    1.8172    1.8316    1.83...   
1  [ 1.9125    1.9097    1.9175    1.919     1.91...   

                                        dim_0_22_128  
0  [-0.66219  -0.66234  -0.66171  -0.66139  -0.66...  
1  [-0.60814  -0.60875  -0.60932  -0.60964  -0.61...  


## Functions (mean)

In [12]:
# Baseline mean: wrap np.mean in a FunctionTransformer and apply it through RowwiseTransformer.
base_mean_trans = RowwiseTransformer(
    FunctionTransformer(func=np.mean, validate=False)
)
X_base_mean = base_mean_trans.fit_transform(X_base_seg)

# Show the result class, the column dtypes, the class of one cell, and a small preview.
print(X_base_mean.__class__)
print(X_base_mean.dtypes)
print("Class of a single cell: ", X_base_mean.iloc[0, 0].__class__, sep="")
print("Representation in the console: ")
print(X_base_mean.iloc[:2, :3])

<class 'pandas.core.frame.DataFrame'>
dim_0_15_60     float64
dim_0_72_149    float64
dim_0_22_128    float64
dtype: object
Class of a single cell: <class 'numpy.float64'>
Representation in the console: 
   dim_0_15_60  dim_0_72_149  dim_0_22_128
0    -0.604105      0.298917      0.267474
1    -0.413653      0.112757      0.264326


In [13]:
# Awkward-array mean: the function is selected by name ("mean").
awk_mean_trans = AwkUniversalFunctionTransformer("mean")
X_awk_mean = awk_mean_trans.fit_transform(X_awk_seg)

# Show the result class followed by its console representation.
print(
    X_awk_mean.__class__,
    "Representation in the console: ",
    X_awk_mean,
    sep="\n",
)

<class 'awkward.array.jagged.JaggedArray'>
Representation in the console: 
[[-0.6501045454545454 -0.6449611538461538 -0.6340714285714286] [-0.6544218181818182 -0.6510542307692307 -0.6447428571428571] [-0.61485 -0.6699869230769231 -0.7241957142857143] ... [-0.5215654545454546 -0.519678076923077 -0.5138542857142857] [-0.6432827272727273 -0.6376080769230769 -0.71622] [-0.6670011818181818 -1.0160997307692308 -1.4685857142857144]]


In [14]:
# Extension-array mean: the function is selected by name ("mean").
ext_mean_trans = ExtUniversalFunctionTransformer("mean")
X_ext_mean = ext_mean_trans.fit_transform(X_ext_seg)

# Show the result class, the column dtypes, the class of one cell, and a small preview.
print(X_ext_mean.__class__)
print(X_ext_mean.dtypes)
print("Class of a single cell: ", X_ext_mean.iloc[0, 0].__class__, sep="")
print("Representation in the console: ")
print(X_ext_mean.iloc[:2, :3])


<class 'pandas.core.frame.DataFrame'>
dim_0_15_60     float64
dim_0_72_149    float64
dim_0_22_128    float64
dtype: object
Class of a single cell: <class 'numpy.float64'>
Representation in the console: 
   dim_0_15_60  dim_0_72_149  dim_0_22_128
0    -0.604105      0.298917      0.267474
1    -0.413653      0.112757      0.264326


**Note:** The example above only shows the application of numpy's universal functions. Both new implementations use a separate GenericFunctionTransformer to apply a custom slope function. This is not shown here, since it has the same output structure. It should also be possible to create a single FunctionTransformer that handles both universal functions and generic functions (e.g. through a simple if-else dispatch).



## Run the full benchmarking

In [16]:
# Run the external benchmark script; the path is relative to the current working directory.
# NOTE(review): the labelled number lists it prints are presumably per-run timings — confirm in the script.
%run ./comparison_benchmark.py



Awkward Array Tabularize: [5.456199999912315e-05, 4.581600000051367e-05, 5.961800000022777e-05, 4.822999999987587e-05]
Awkward Array Segmenter: [0.0038566760000003342, 0.0039109199999995785, 0.003465754000000061, 0.0037042080000003353]
Awkward Array Mean: [0.0023629619999996974, 0.0007000980000009349, 0.0020150280000007115, 0.0028593320000004497]
Awkward Array Std: [0.005291322000000491, 0.0092077740000002, 0.008265671999999995, 0.0044292859999995925]
Awkward Array Slope: [0.0076036940000005875, 0.014474813999999014, 0.016675112000000353, 0.014450940000000401]
Awkward Array Union: [9.919999999965511e-05, 6.53639999995903e-05, 8.403199999975186e-05, 7.364199999983612e-05]
Awkward Array Classifier: [0.0013443999999992683, 0.0012758040000005622, 0.00139108199999896, 0.0013827140000000782]
Awkward Array Pipeline: [0.09462375800000018, 0.09275876400000016, 0.09302262199999972, 0.09370050199999923]

Extension Array Tabularize: [0.0016493000000002667, 0.0017025239999998122, 0.001646096000000