In [64]:
# General imports
from sklearn.tree import DecisionTreeClassifier
from sktime.datasets import load_gunpoint
from sktime.pipeline import Pipeline, FeatureUnion
import pandas as pd
import numpy as np

# Imports for the current sktime implementation (i.e. pd.Series)
from sklearn.preprocessing import FunctionTransformer
from sktime.transformers.segment import RandomIntervalSegmenter as BaseIntervalSegmenter
from sktime.transformers.compose import RowwiseTransformer, Tabularizer
from sktime.utils.time_series import time_series_slope
from sktime.utils.data_container import tabularize

# Imports for the awkwardarray-based implementation
from awkwardarray.transformers import \
    RandomIntervalSegmenter as AwkIntervalSegmenter, \
    FeatureUnionTransformer as AwkFeatureUnionTransformer, \
    UniversalFunctionTransformer as AwkUniversalFunctionTransformer, \
    GenericFunctionTransformer as AwkGenericFunctionTransformer, TabularTransformer
from awkwardarray.utils import awkward_build, awkward_slope_func, awkward_tabularize

# Imports for the extensionarray-based implementation
from extensionarray.timeframe import TimeFrame
from extensionarray.reimplement import \
    RandomIntervalSegmenter as ExtIntervalSegmenter, \
    UniversalFunctionTransformer as ExtUniversalFunctionTransformer, \
    GenericFunctionTransformer as ExtGenericFunctionTransformer, \
    extarray_slope_func

## Prepare the data for each implementation


In [2]:
# The gunpoint dataset is small, so stack several copies of it to obtain a
# reasonably sized dataset.
N_COPIES = 20
X = pd.concat([load_gunpoint(return_X_y=False) for _ in range(N_COPIES)])
# Split off the target: pop() returns the 'class_val' column and removes it from X.
y = X.pop('class_val')


In [61]:
# Baseline container used by the current implementation: a DataFrame whose
# cells each hold a complete series (see the printed cell class below).
X_base = X.copy()
base_cell = X_base.iloc[0, 0]

print(X_base.dtypes)
print("Class of a single cell: " + str(type(base_cell)))
print("Representation in the console: ")
print(X_base.head(2))

dim_0    object
dtype: object
Class of a single cell: <class 'pandas.core.series.Series'>
Representation in the console: 
                                               dim_0
0  0     -0.64789
1     -0.64199
2     -0.63819
3...
1  0     -0.64443
1     -0.64540
2     -0.64706
3...


In [10]:
# Build the awkward-array container from the nested DataFrame
X_awk = awkward_build(X)

# One print call, newline-separated: header first, then the array itself
print("Representation in the console: ", X_awk, sep="\n")

[[[-0.64789 -0.64199 -0.63819 ... -0.64043 -0.63867 -0.63866]] [[-0.64443 -0.6454 -0.64706 ... -0.63493 -0.6345 -0.6316]] [[-0.77835 -0.77828 -0.77715 ... -0.7042 -0.70761 -0.70712]] ... [[-0.77913 -0.77838 -0.77574 ... -0.50501 -0.50373 -0.50438]] [[-0.70303 -0.70262 -0.7025 ... -0.64113 -0.64141 -0.64211]] [[-1.4357 -1.4323 -1.4329 ... -1.4355 -1.4353 -1.4309]]]


In [62]:
# Build the extension-array container: tabularize the 'dim_0' column into a
# plain array first, then wrap that array in a TimeFrame.
dim_0_values = tabularize(X['dim_0'], return_array=True)
X_ext = TimeFrame(data={'dim_0': dim_0_values})
ext_cell = X_ext.iloc[0, 0]

print(X_ext.dtypes)
print("Class of a single cell: " + str(type(ext_cell)))
print("Representation in the console: ")
print(X_ext.head(2))


dim_0    timeseries
dtype: object
Class of a single cell: <class 'numpy.ndarray'>
Representation in the console: 
                                               dim_0
0  [-0.64789  -0.64199  -0.63819  -0.63826  -0.63...
1  [-0.64443  -0.6454   -0.64706  -0.64749  -0.64...


In [None]:
## Tabularize

In [32]:
# Flatten the nested baseline frame into a plain DataFrame.
# The input is X_base (the copy prepared for the current implementation), so
# that every "base" step of this comparison starts from the same container,
# in line with the segmentation step further down.
base_tabularizer = Tabularizer(check_input=False)
X_base_tab = base_tabularizer.fit_transform(X_base)

print(X_base_tab.__class__)
print(X_base_tab.dtypes)
print("Representation in the console: ")
print(X_base_tab.iloc[0:2, 0:5])

<class 'pandas.core.frame.DataFrame'>
dim_0__0      float64
dim_0__1      float64
dim_0__2      float64
dim_0__3      float64
dim_0__4      float64
               ...   
dim_0__145    float64
dim_0__146    float64
dim_0__147    float64
dim_0__148    float64
dim_0__149    float64
Length: 150, dtype: object
   dim_0__0  dim_0__1  dim_0__2  dim_0__3  dim_0__4
0  -0.64789  -0.64199  -0.63819  -0.63826  -0.63835
1  -0.64443  -0.64540  -0.64706  -0.64749  -0.64691


In [18]:
# Tabularize the awkward-array container with its dedicated transformer
awk_tabularizer = TabularTransformer()
X_awk_tab = awk_tabularizer.fit_transform(X_awk)

print(type(X_awk_tab))
print("Representation in the console: ", X_awk_tab, sep="\n")

<class 'awkward.array.jagged.JaggedArray'>
[[-0.64789 -0.64199 -0.63819 ... -0.64043 -0.63867 -0.63866] [-0.64443 -0.6454 -0.64706 ... -0.63493 -0.6345 -0.6316] [-0.77835 -0.77828 -0.77715 ... -0.7042 -0.70761 -0.70712] ... [-0.77913 -0.77838 -0.77574 ... -0.50501 -0.50373 -0.50438] [-0.70303 -0.70262 -0.7025 ... -0.64113 -0.64141 -0.64211] [-1.4357 -1.4323 -1.4329 ... -1.4355 -1.4353 -1.4309]]


In [24]:
# The TimeFrame is tabularized through its own tabularise() method rather
# than through a separate transformer object.
X_ext_tab = X_ext.tabularise()
ext_tab_preview = X_ext_tab.iloc[0:2, 0:5]

print(type(X_ext_tab))
print("Representation in the console: ")
print(ext_tab_preview)


<class 'pandas.core.frame.DataFrame'>
   dim_0_0  dim_0_1  dim_0_2  dim_0_3  dim_0_4
0 -0.64789 -0.64199 -0.63819 -0.63826 -0.63835
1 -0.64443 -0.64540 -0.64706 -0.64749 -0.64691


In [28]:
## Segmentation

In [56]:
# Segment the baseline container into 3 intervals per series.
# NOTE(review): no seed is passed to the segmenter, so the chosen intervals
# (and hence the column names) presumably differ between runs — confirm.
base_segmenter = BaseIntervalSegmenter(n_intervals=3)
X_base_seg = base_segmenter.fit_transform(X_base)
base_seg_cell = X_base_seg.iloc[0, 0]

print(type(X_base_seg))
print(X_base_seg.dtypes)
# Note: the cell class has changed from pd.Series to np.ndarray
print("Class of a single cell: " + str(type(base_seg_cell)))
print("Representation in the console: ")
print(X_base_seg.iloc[0:2, 0:3])

<class 'pandas.core.frame.DataFrame'>
dim_0_117_146    object
dim_0_87_118     object
dim_0_135_141    object
dtype: object
Class of a single cell: <class 'numpy.ndarray'>
                                       dim_0_117_146  \
0  [-0.68631, -0.67374, -0.66839, -0.66464, -0.66...   
1  [-0.7005, -0.69251, -0.68554, -0.67507, -0.669...   

                                        dim_0_87_118  \
0  [1.8292, 1.8224, 1.8137, 1.8066, 1.7817, 1.768...   
1  [1.7656, 1.6632, 1.5312, 1.3838, 1.2475, 1.114...   

                                       dim_0_135_141  
0  [-0.63579, -0.63628, -0.6354, -0.63607, -0.637...  
1  [-0.64283, -0.64348, -0.64357, -0.64187, -0.64...  


In [27]:
# Segment the awkward-array container into 3 intervals per series
awk_segmenter = AwkIntervalSegmenter(n_intervals=3)
X_awk_seg = awk_segmenter.fit_transform(X_awk)

print(type(X_awk_seg))
print("Representation in the console: ", X_awk_seg, sep="\n")

<class 'awkward.array.jagged.JaggedArray'>
[[-0.64789 -0.64199 -0.63819 ... -0.64043 -0.63867 -0.63866] [-0.64443 -0.6454 -0.64706 ... -0.63493 -0.6345 -0.6316] [-0.77835 -0.77828 -0.77715 ... -0.7042 -0.70761 -0.70712] ... [-0.77913 -0.77838 -0.77574 ... -0.50501 -0.50373 -0.50438] [-0.70303 -0.70262 -0.7025 ... -0.64113 -0.64141 -0.64211] [-1.4357 -1.4323 -1.4329 ... -1.4355 -1.4353 -1.4309]]


In [53]:
# Segment the TimeFrame container into 3 intervals per series
ext_segmenter = ExtIntervalSegmenter(n_intervals=3)
X_ext_seg = ext_segmenter.fit_transform(X_ext)

print(X_ext_seg.__class__)
print(X_ext_seg.dtypes) # Note: the segments remain TimeSeries with all attached functionality
# The class object must be converted with str() before concatenation;
# adding a class directly to a str raises a TypeError.
print("Class of a single cell: " + str(X_ext_seg.iloc[0,0].__class__))
print("Representation in the console: ")
print(X_ext_seg.iloc[0:2, 0:3])

<class 'pandas.core.frame.DataFrame'>
dim_0_122_143    timeseries
dim_0_71_77      timeseries
dim_0_138_150    timeseries
dtype: object
<class 'numpy.ndarray'>
                                       dim_0_122_143  \
0  [-0.6643  -0.6612  -0.65935 -0.65258 -0.64332 ...   
1  [-0.66321 -0.65961 -0.65295 -0.64982 -0.6472  ...   

                                   dim_0_71_77  \
0  [1.7971 1.8102 1.8206 1.8172 1.8316 1.8316]   
1  [1.9094 1.9125 1.9097 1.9175 1.919  1.919 ]   

                                       dim_0_138_150  
0  [-0.63607 -0.63755 -0.63926 -0.63972 -0.63973 ...  
1  [-0.64187 -0.64157 -0.64114 -0.64143 -0.63927 ...  


In [None]:
## Functions (mean)

In [68]:
# Apply np.mean to every segment of the baseline container, row by row
mean_func_trans = FunctionTransformer(func=np.mean, validate=False)
base_mean_trans = RowwiseTransformer(mean_func_trans)
X_base_mean = base_mean_trans.fit_transform(X_base_seg)
base_mean_cell = X_base_mean.iloc[0, 0]

print(type(X_base_mean))
print(X_base_mean.dtypes)
# Note: each cell is now a single numpy.float64 value instead of an array
print("Class of a single cell: " + str(type(base_mean_cell)))
print("Representation in the console: ")
print(X_base_mean.iloc[0:2, 0:3])

<class 'pandas.core.frame.DataFrame'>
dim_0_117_146    float64
dim_0_87_118     float64
dim_0_135_141    float64
dtype: object
Class of a single cell: <class 'numpy.float64'>
Representation in the console: 
   dim_0_117_146  dim_0_87_118  dim_0_135_141
0      -0.645997      0.499598      -0.636725
1      -0.651753      0.005404      -0.642410


In [65]:
# Apply the "mean" universal function to the awkward-array segments
awk_mean_trans = AwkUniversalFunctionTransformer("mean")
X_awk_mean = awk_mean_trans.fit_transform(X_awk_seg)

print(type(X_awk_mean))
print("Representation in the console: ", X_awk_mean, sep="\n")

<class 'awkward.array.jagged.JaggedArray'>
Representation in the console: 
[[-0.63931 -0.659584 1.527622307692308] [-0.6364700000000001 -0.6633417142857142 1.0755012307692309] [-0.703695 -0.6655291428571429 1.147088846153846] ... [-0.50953 -0.5284925714285714 1.0813631538461537] [-0.6385149999999999 -0.6135448571428572 0.6340556153846154] [-1.43735 -0.7737629428571429 0.7811576923076923]]


In [67]:
# Apply the "mean" universal function to the TimeFrame segments
ext_mean_trans = ExtUniversalFunctionTransformer("mean")
X_ext_mean = ext_mean_trans.fit_transform(X_ext_seg)
ext_mean_cell = X_ext_mean.iloc[0, 0]

print(type(X_ext_mean))
# Note: after taking the mean the columns are plain float64, no longer timeseries
print(X_ext_mean.dtypes)
print("Class of a single cell: " + str(type(ext_mean_cell)))
print("Representation in the console: ")
print(X_ext_mean.iloc[0:2, 0:3])


<class 'pandas.core.frame.DataFrame'>
dim_0_122_143    float64
dim_0_71_77      float64
dim_0_138_150    float64
dtype: object
Class of a single cell: <class 'numpy.float64'>
Representation in the console: 
   dim_0_122_143  dim_0_71_77  dim_0_138_150
0      -0.640853     1.818050      -0.639093
1      -0.646031     1.914517      -0.637712


**Note:** Both new implementations use a separate `GenericFunctionTransformer` to apply a custom slope function. It is not shown here because its output has the same form as the output of the mean transformers above. It should also be possible to create a single `FunctionTransformer` that handles both universal functions and generic functions (e.g. through a simple if-else dispatch).



## Run the full benchmark

In [69]:
# Execute the benchmark script that sits next to this notebook.
# NOTE(review): the output recorded for this cell ends with
# "AttributeError: 'Series' object has no attribute 'time_index'", i.e. the
# script aborted part-way through. Fix the script before relying on the timings.
%run ./comparison_benchmark.py


Awkward Array Tabularize: [6.817999999384484e-05, 6.693499999528285e-05, 8.150000001023727e-05, 6.53149999834568e-05, 5.734999999731372e-05, 6.3905000001796e-05, 6.859000000076776e-05, 6.573000000571483e-05, 6.51049999987663e-05, 6.201999999575492e-05, 5.875000001651642e-05, 5.680500000835309e-05, 6.108000000040192e-05, 6.52949999903285e-05, 6.395000000338768e-05, 6.545500000356696e-05, 5.6259999996655094e-05, 6.725500002175977e-05, 5.752000001848501e-05, 7.399500000246917e-05]
Awkward Array Detabularize: [0.0001596299999846451, 0.0001224800000045434, 0.0001412400000162961, 0.00014536500000303932, 0.00011879500000304688, 0.00015437499998824933, 0.00013801499999317458, 0.0001462049999872761, 0.00012991500000225642, 0.00013178500000776694, 0.00013113999998495273, 0.00013593999999557128, 0.00013593500000297354, 0.0001557450000063909, 0.00012319500001467532, 0.00013513000001239562, 0.00011666500001865643, 0.00014235999999527847, 0.00014779999999063874, 0.0001262300000007599]
Awkward Array

AttributeError: 'Series' object has no attribute 'time_index'