In [33]:
import pandas as pd
from sktime.datasets import load_basic_motions, load_arrow_head
from sktime.utils.data_container import nested_to_3d_numpy, from_nested_to_long
import numpy as np
import awkward1 as ak

In [34]:
import numpy as np
from sktime.utils._testing.series_as_features import \
    make_classification_problem

from benchmarks.benchmark import ak_3d_arr
from benchmarks.benchmark import ak_record_arr
from benchmarks.benchmark import np_arr


def _mean(X, axis=-1):
    return np.mean(X, axis=axis)


X, y = make_classification_problem(n_instances=100, n_timepoints=100)

expected = _mean(np_arr(X))


## Using pandas with multi-indexing

In [2]:
X, y = load_basic_motions(return_X_y=True)
X = X.reset_index(drop=True)
nested_to_3d_numpy(X).shape

(80, 6, 100)

In [4]:
# nested format 
X.head()

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5
0,0 0.079106 1 0.079106 2 -0.903497 3...,0 0.394032 1 0.394032 2 -3.666397 3...,0 0.551444 1 0.551444 2 -0.282844 3...,0 0.351565 1 0.351565 2 -0.095881 3...,0 0.023970 1 0.023970 2 -0.319605 3...,0 0.633883 1 0.633883 2 0.972131 3...
1,0 0.377751 1 0.377751 2 2.952965 3...,0 -0.610850 1 -0.610850 2 0.970717 3...,0 -0.147376 1 -0.147376 2 -5.962515 3...,0 -0.103872 1 -0.103872 2 -7.593275 3...,0 -0.109198 1 -0.109198 2 -0.697804 3...,0 -0.037287 1 -0.037287 2 -2.865789 3...
2,0 -0.813905 1 -0.813905 2 -0.424628 3...,0 0.825666 1 0.825666 2 -1.305033 3...,0 0.032712 1 0.032712 2 0.826170 3...,0 0.021307 1 0.021307 2 -0.372872 3...,0 0.122515 1 0.122515 2 -0.045277 3...,0 0.775041 1 0.775041 2 0.383526 3...
3,0 0.289855 1 0.289855 2 -0.669185 3...,0 0.284130 1 0.284130 2 -0.210466 3...,0 0.213680 1 0.213680 2 0.252267 3...,0 -0.314278 1 -0.314278 2 0.018644 3...,0 0.074574 1 0.074574 2 0.007990 3...,0 -0.079901 1 -0.079901 2 0.237040 3...
4,0 -0.123238 1 -0.123238 2 -0.249547 3...,0 0.379341 1 0.379341 2 0.541501 3...,0 -0.286006 1 -0.286006 2 0.208420 3...,0 -0.098545 1 -0.098545 2 -0.023970 3...,0 0.058594 1 0.058594 2 0.175783 3...,0 -0.074574 1 -0.074574 2 0.114525 3...


In [7]:
X.iloc[0, 0]

0     0.079106
1     0.079106
2    -0.903497
3     1.116125
4     1.638200
        ...   
95   -0.167918
96   -0.227670
97   -0.193271
98   -0.193271
99   -0.205150
Length: 100, dtype: float64

### Wide format: multi-index columns
* assumes instances share time index

### Long format: multi-index over rows
* does not work with time-constant variables
* does not work with type-heterogeneous data

In [4]:
long = from_nested_to_long(X)
long = long.rename(columns={"index": "instance", "column": "variable", "time_index": "time"})
long.time = long.time.astype(int)
long = long.set_index(["instance", "variable", "time"])

# sorting seems necessary to avoid performance issues in later slicing operations and for prettier display of index 
long = long.sort_index()
long.shape

(48000, 1)

In [164]:
# multi-index df
long.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,value
instance,variable,time,Unnamed: 3_level_1
0,dim_0,0,0.079106
0,dim_0,1,0.079106
0,dim_0,2,-0.903497
0,dim_0,3,1.116125
0,dim_0,4,1.6382


In [166]:
type(long.index)

pandas.core.indexes.multi.MultiIndex

In [168]:
long.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 48000 entries, (0, 'dim_0', 0) to (79, 'dim_5', 99)
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   value   48000 non-null  float64
dtypes: float64(1)
memory usage: 1.8+ MB


In [167]:
long.to_numpy().shape

(48000, 1)

#### loc indexing
loc indexing works as expected but requires knowledge of index labels (i.e. instance names, variable names, time point labels)

In [162]:
long.loc[0, "dim_0", 3:5]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,value
instance,variable,time,Unnamed: 3_level_1
0,dim_0,3,1.116125
0,dim_0,4,1.6382
0,dim_0,5,1.003448


In [163]:
# but this requires knowing the variable names, so the following does not work:
# long.loc[0, 0, 3:5]

#### iloc indexing
iloc indexing does *not* work as desired, in the sense that it does not return first instance, first variable, 3rd-5th time point

```python
long.iloc[0, 0, 3:5]
```

In [16]:
# instead returns simply first rows
long.iloc[1:4]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,value
instance,variable,time,Unnamed: 3_level_1
0,dim_0,1,0.079106
0,dim_0,2,-0.903497
0,dim_0,3,1.116125


#### Workaround to still be able to use iloc

* we would want `.iloc[0]` to return the first instance, not the first row, and `.iloc[0, 0, 0]` to return the first row
* could we rewrite iloc to behave like this? 

In [154]:
# go from iloc to loc values
instances = long.index.get_level_values("instance").unique()
variables = long.index.get_level_values("variable").unique()
timepoints = long.index.get_level_values("time").unique()

instances_loc = instances[11:13]
variables_loc = variables[2:4]
timepoints_loc = timepoints[3:6]

a = long.loc[instances_loc, variables_loc, timepoints_loc]
a

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,value
instance,variable,time,Unnamed: 3_level_1
11,dim_2,3,-3.230503
11,dim_2,4,-3.833633
11,dim_2,5,-9.051298
11,dim_3,3,2.972324
11,dim_3,4,3.803296
11,dim_3,5,1.984213
12,dim_2,3,-0.526095
12,dim_2,4,-3.895602
12,dim_2,5,-5.755945
12,dim_3,3,0.22106


In [155]:
# operations over levels: average over time within each (instance, variable) pair.
# `DataFrame.mean(level=...)` was deprecated in pandas 1.3 and removed in 2.0;
# grouping by the index levels is the supported equivalent
# (sort=False keeps the rows in their existing order).
a.groupby(level=["instance", "variable"], sort=False).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,value
instance,variable,Unnamed: 2_level_1
11,dim_2,-5.371811
11,dim_3,2.919944
12,dim_2,-3.392547
12,dim_3,2.090747


---
## Awkward array
* like 3d numpy, but support for ragged arrays with varying number of time points across instances and/or variables
* in its simplest form, does not keep track of time index
* question: how much more complicated/less efficient will it be if we keep track of index?

In [2]:
from sktime.datasets import load_arrow_head, load_basic_motions
from sktime.utils.data_container import tabularize, nested_to_3d_numpy
import awkward1 as ak
import numpy as np
import pandas as pd

## Using awkward array

### Time-homogeneous panel data: simple 3d ak.array

#### Univariate panel data

In [47]:
equal, target = load_arrow_head(return_X_y=True)
X = nested_to_3d_numpy(equal)
X.shape

(211, 1, 251)

In [30]:
%timeit np.mean(X, axis=-1)

52.8 µs ± 161 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [48]:
X = ak.Array(X)
X

<Array [[[-1.96, -1.96, ... -1.6, -1.62]]] type='211 * 1 * 251 * float64'>

In [50]:
X[0, 0].shape

(251,)

In [32]:
%timeit np.mean(X, axis=-1)

2.69 ms ± 9.31 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [356]:
X.shape

(211,)

In [357]:
# temporal slice
X[:, :, :20]

<Array [[[-1.96, -1.96, ... -0.903, -0.861]]] type='211 * 1 * 20 * float64'>

In [358]:
layout = X.layout
type(layout)

awkward1._ext.NumpyArray

In [92]:
layout.shape

[211, 1, 251]

In [95]:
layout.form

{
    "class": "NumpyArray",
    "inner_shape": [
        1,
        251
    ],
    "itemsize": 8,
    "format": "d",
    "primitive": "float64"
}

In [96]:
layout.format

'd'

In [99]:
layout.ndim

3

#### Multivariate panel data

In [20]:
X, target = load_basic_motions(return_X_y=True)
X = nested_to_3d_numpy(X)
X.shape

(80, 6, 100)

In [516]:
X = ak.Array(X)
X

<Array [[[0.0791, 0.0791, ... -0.722, -1.78]]] type='80 * 6 * 100 * float64'>

In [517]:
X[5:20, 3:5, 45:75]

<Array [[[-0.00799, 0.0186, ... 3.6, 3.6]]] type='15 * 2 * 30 * float64'>

### Time-heterogeneous panel data

#### Univariate unequal length data

In [5]:
X, target = load_arrow_head(return_X_y=True)
unequal = X.copy()
l = unequal.iloc[:, 0].tolist()
l[0] = l[0][:200]
unequal = pd.DataFrame(pd.Series([pd.Series(s) for s in l]))
assert unequal.iloc[0, 0].shape[0] != unequal.iloc[1, 0].shape[0]
unequal.head()

Unnamed: 0,0
0,0 -1.9630 1 -1.9578 2 -1.9561 3 ...
1,0 -1.7746 1 -1.7740 2 -1.7766 3 ...
2,0 -1.8660 1 -1.8420 2 -1.8350 3 ...
3,0 -2.0738 1 -2.0733 2 -2.0446 3 ...
4,0 -1.7463 1 -1.7413 2 -1.7227 3 ...


* why is the 2nd dimension "var" now and not "1"? 

In [53]:
X = ak.Array([[i] for i in unequal.iloc[:, 0].tolist()])
X

<Array [[[-1.96, -1.96, ... -1.6, -1.62]]] type='211 * var * var * float64'>

In [40]:
np.all([isinstance(j, list) for j in [[i] for i in unequal.iloc[:, 0].tolist()]])

True

In [43]:
X.shape

(211,)

In [44]:
layout = X.layout

In [45]:
layout.form

{
    "class": "ListOffsetArray64",
    "offsets": "i64",
    "content": {
        "class": "ListOffsetArray64",
        "offsets": "i64",
        "content": "float64"
    }
}

In [46]:
layout.offsets

<Index64 i="[0 1 2 3 4 ... 207 208 209 210 211]" offset="0" length="212" at="0x7fc2a3d44a00"/>

In [47]:
# temporal slicing
X[:, :, :20]

<Array [[[-1.96, -1.96, ... -0.903, -0.861]]] type='211 * var * var * float64'>

In [48]:
X[0, 0, :] # unequal length array

<Array [-1.96, -1.96, -1.96, ... 1.45, 1.47] type='200 * float64'>

In [52]:
np.testing.assert_almost_equal(ak.mean(X, axis=2)[0][0], np.mean(unequal[0][0]))

#### Multivariate unequal-length data

In [361]:
X, y = load_basic_motions(return_X_y=True)
unequal = X.copy()
l = unequal.iloc[:, 1].tolist()
l[0] = l[0][:50]
unequal.iloc[:, 1] = pd.Series([pd.Series(s) for s in l])

In [362]:
print(unequal.iloc[:, 1].iloc[0].shape, unequal.iloc[:, 1].iloc[1].shape)

(50,) (100,)


In [363]:
# Build the array from `unequal` (the frame in which one `dim_1` series was
# shortened to 50 time points in the cell above), not from the untouched
# equal-length `X`; otherwise the unequal-length case is never exercised.
n_instances, n_variables = unequal.shape
il = []

for i in range(n_instances):
    xl = []
    for v in range(n_variables):
        # get time series
        series = unequal.iloc[i, v]

        # separate values/index
        values = series.to_numpy()
        index = series.index.to_numpy()

        # one record per (instance, variable): parallel arrays of values and index
        x = {"values": values, "index": index}
        xl.append(x)
    il.append(xl)
arr = ak.Array(il)

In [366]:
arr[1:50, 2:4, :20].values

<Array [[[-0.147, -0.147, ... 0.0772, 0.152]]] type='49 * var * var * float64'>

### Keeping track of the time index: ak.RecordArray

#### Univariate unequal length panel data 

In [6]:
X = unequal.copy()
X.head()

Unnamed: 0,0
0,0 -1.9630 1 -1.9578 2 -1.9561 3 ...
1,0 -1.7746 1 -1.7740 2 -1.7766 3 ...
2,0 -1.8660 1 -1.8420 2 -1.8350 3 ...
3,0 -2.0738 1 -2.0733 2 -2.0446 3 ...
4,0 -1.7463 1 -1.7413 2 -1.7227 3 ...


In [10]:
n_instances, n_variables = X.shape
print("n_instances: ", n_instances, "\nn_variables: ", n_variables)

# One record per (instance, variable) cell: the series' values and its index
# are stored side by side as two separate arrays.
instances = [
    [
        {
            "values": X.iloc[row, col].to_numpy(),
            "index": X.iloc[row, col].index.to_numpy(),
        }
        for col in range(n_variables)
    ]
    for row in range(n_instances)
]
arr = ak.Array(instances)

n_instances:  211 
n_variables:  1


In [11]:
arr

<Array [[{values: [-1.96, -1.96, ... 250]}]] type='211 * var * {"values": var * ...'>

In [12]:
arr.layout

<ListOffsetArray64>
    <offsets><Index64 i="[0 1 2 3 4 ... 207 208 209 210 211]" offset="0" length="212" at="0x7f92bc59aa00"/></offsets>
    <content><RecordArray>
        <field index="0" key="values">
            <ListOffsetArray64>
                <offsets><Index64 i="[0 200 451 702 953 ... 51906 52157 52408 52659 52910]" offset="0" length="212" at="0x7f92bc59ca00"/></offsets>
                <content><NumpyArray format="d" shape="52910" data="-1.963 -1.9578 -1.9561 -1.9383 -1.8967 ... -1.5136 -1.5504 -1.5816 -1.5953 -1.6208" at="0x7f92c39bc000"/></content>
            </ListOffsetArray64>
        </field>
        <field index="1" key="index">
            <ListOffsetArray64>
                <offsets><Index64 i="[0 200 451 702 953 ... 51906 52157 52408 52659 52910]" offset="0" length="212" at="0x7f92bc5a0a00"/></offsets>
                <content><NumpyArray format="l" shape="52910" data="0 1 2 3 4 ... 246 247 248 249 250" at="0x7f92c03c9000"/></content>
            </ListOffsetArray

In [13]:
# returns series of first 20 observations, incl both values and time index/points
arr[0, 0, :20]

<Record ... 12, 13, 14, 15, 16, 17, 18, 19]} type='{"values": var * float64, "in...'>

In [14]:
# returns values of first 20 time points
arr[0, 0, :20, "values"]

<Array [-1.96, -1.96, ... -0.969, -0.832] type='20 * float64'>

In [15]:
# works the same
arr[0, 0, :20].values

<Array [-1.96, -1.96, ... -0.969, -0.832] type='20 * float64'>

In [16]:
# returns index of first 20 time points
arr[0, 0, :20, "index"]

<Array [0, 1, 2, 3, 4, ... 15, 16, 17, 18, 19] type='20 * int64'>

In [17]:
# works the same
arr[0, 0, :20].index 

<Array [0, 1, 2, 3, 4, ... 15, 16, 17, 18, 19] type='20 * int64'>

In [527]:
# transform to pd.Index
pd.Index(np.array(arr[0, 0, :20].index))

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
            19],
           dtype='int64')

* this works, but note that "index" and "values" are not linked: deleting an entry from "index" won't delete the corresponding entry from "values"
* can we transform a one-dimensional ak.array to a `pd.Series` or `pd.Index`?

In [529]:
pd.Series(np.array(arr[0, 0, :20].values))

0    -1.96300
1    -1.95780
2    -1.95610
3    -1.93830
4    -1.89670
5    -1.86990
6    -1.83870
7    -1.81230
8    -1.73640
9    -1.67330
10   -1.62310
11   -1.58590
12   -1.54380
13   -1.45680
14   -1.37870
15   -1.29250
16   -1.21700
17   -1.10900
18   -0.96869
19   -0.83160
dtype: float64

In [334]:
arr.layout

<ListOffsetArray64>
    <offsets><Index64 i="[0 1 2 3 4 ... 207 208 209 210 211]" offset="0" length="212" at="0x7fd2623dc200"/></offsets>
    <content><RecordArray>
        <field index="0" key="values">
            <ListOffsetArray64>
                <offsets><Index64 i="[0 200 451 702 953 ... 51906 52157 52408 52659 52910]" offset="0" length="212" at="0x7fd2623e0200"/></offsets>
                <content><NumpyArray format="d" shape="52910" data="-1.963 -1.9578 -1.9561 -1.9383 -1.8967 ... -1.5136 -1.5504 -1.5816 -1.5953 -1.6208" at="0x7fd2382b5000"/></content>
            </ListOffsetArray64>
        </field>
        <field index="1" key="index">
            <ListOffsetArray64>
                <offsets><Index64 i="[0 200 451 702 953 ... 51906 52157 52408 52659 52910]" offset="0" length="212" at="0x7fd2623e4200"/></offsets>
                <content><NumpyArray format="l" shape="52910" data="0 1 2 3 4 ... 246 247 248 249 250" at="0x7fd238329000"/></content>
            </ListOffsetArray

#### Linking index with value
* based on conversation with Jim

In [3]:
def make_ak_array(Xl, n_instances, n_variables):
    """Convert a nested list of series into an awkward array of time/value records.

    Parameters
    ----------
    Xl : list
        Nested list indexed as ``Xl[instance][variable]``. Each entry must
        provide ``.to_numpy()`` and ``.index.to_numpy()`` (e.g. a ``pd.Series``).
    n_instances : int
        Number of leading instances of ``Xl`` to convert.
    n_variables : int
        Number of leading variables per instance to convert.

    Returns
    -------
    ak.Array
        Array indexed as ``[instance, variable, time point]`` whose innermost
        items are ``{"time": ..., "value": ...}`` records, so every value
        stays paired with its index label.
    """
    assert isinstance(Xl, list)

    def _series_to_records(series):
        # pair each observation with the index label it was recorded at
        return [
            {"time": stamp, "value": observation}
            for stamp, observation in zip(series.index.to_numpy(), series.to_numpy())
        ]

    nested = [
        [_series_to_records(Xl[row][col]) for col in range(n_variables)]
        for row in range(n_instances)
    ]
    return ak.Array(nested)

In [21]:
def make_heterogeneous_time_ak_array(X, min_n_timepoints=10, random_state=None):
    """Randomly truncate every series of a nested frame and return an awkward array.

    Each series in ``X`` is cut to a random length drawn uniformly from
    ``[min_n_timepoints, n_timepoints)``, so the result has series of
    differing lengths. ``X`` itself is not modified.

    Parameters
    ----------
    X : pd.DataFrame
        Nested frame (rows are instances, columns are variables). Every cell
        must hold a series with as many time points as ``X.iloc[0, 0]``.
    min_n_timepoints : int, default=10
        Smallest length a truncated series may have.
    random_state : int or None, default=None
        Seed for a private ``np.random.RandomState``. With ``None`` the
        global ``np.random`` state is used, which is the previous behaviour.

    Returns
    -------
    arr
        Whatever ``make_ak_array`` returns for the truncated series.

    Raises
    ------
    ValueError
        If ``min_n_timepoints`` is not in ``[1, n_timepoints)``, i.e. there
        is no valid shorter length to draw.
    """
    n_instances, n_variables = X.shape
    n_timepoints = X.iloc[0, 0].shape[0]
    if not 1 <= min_n_timepoints < n_timepoints:
        raise ValueError(
            f"min_n_timepoints must be in [1, {n_timepoints}), "
            f"but found: {min_n_timepoints}"
        )

    # the numpy module and a RandomState expose the same choice/randint API
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # convert data into nested list; only references are collected, so the
    # cells of X are never changed by the cutting below
    Xl = []
    for i in range(n_instances):
        variables = []
        for v in range(n_variables):
            series = X.iloc[i, v]
            assert series.shape == (n_timepoints,)
            variables.append(series)
        Xl.append(variables)

    # select instances, variables and time points for cutting; both selections
    # are permutations of the full range, so every series gets truncated
    instances_to_cut = rng.choice(np.arange(n_instances), replace=False,
                                  size=n_instances)
    variables_to_cut = rng.choice(np.arange(n_variables), replace=False,
                                  size=n_variables)
    timepoints_to_cut = rng.randint(min_n_timepoints, n_timepoints,
                                    size=(len(instances_to_cut), len(variables_to_cut)))

    # cut
    for ii, i in enumerate(instances_to_cut):
        for vv, v in enumerate(variables_to_cut):
            timepoint = timepoints_to_cut[ii, vv]
            Xl[i][v] = Xl[i][v].iloc[:timepoint]
            assert len(Xl[i][v]) == timepoint

    # create array
    arr = make_ak_array(Xl, n_instances=n_instances, n_variables=n_variables)

    # quick check: the array holds exactly the drawn number of time points
    for ii, i in enumerate(instances_to_cut):
        for vv, v in enumerate(variables_to_cut):
            assert len(arr[int(i), int(v)]) == timepoints_to_cut[ii, vv]

    return arr

In [22]:
X, _ = load_basic_motions(return_X_y=True)
arr = make_heterogeneous_time_ak_array(X)
arr

<Array [[[{time: 0, ... value: 0.959}]]] type='80 * var * var * {"time": int64, ...'>

In [24]:
arr[0, 1, :10, "value"]

<Array [0.394, 0.394, -3.67, ... 1.96, 1.96] type='10 * float64'>

In [25]:
arr[0, 1, :10, "time"]

<Array [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] type='10 * int64'>

In [27]:
# first 10 time points of one variable for the first 5 instances
# NOTE(review): the variable index was missing in this cell (a syntax error);
# an integer index matches the recorded output type, 0 is assumed - confirm
arr[:5, 0, :10]

<Array [[{time: 0, ... value: 0.238}]] type='5 * var * {"time": int64, "value": ...'>

In [6]:
arr.layout

<ListOffsetArray64>
    <offsets><Index64 i="[0 6 12 18 24 ... 456 462 468 474 480]" offset="0" length="81" at="0x7fba7c234a00"/></offsets>
    <content><ListOffsetArray64>
        <offsets><Index64 i="[0 96 146 170 205 ... 25734 25788 25828 25894 25971]" offset="0" length="481" at="0x7fba7c23ba00"/></offsets>
        <content><RecordArray>
            <field index="0" key="time">
                <NumpyArray format="l" shape="25971" data="0 1 2 3 4 ... 72 73 74 75 76" at="0x7fba83767000"/>
            </field>
            <field index="1" key="value">
                <NumpyArray format="d" shape="25971" data="0.079106 0.079106 -0.903497 1.11613 1.6382 ... -1.35299 -1.1346 -1.98155 -1.60601 -6.79426" at="0x7fba8379b000"/>
            </field>
        </RecordArray></content>
    </ListOffsetArray64></content>
</ListOffsetArray64>

In [7]:
# multidimensional slicing works
arr[0, 0, :10]

<Array [{time: 0, ... value: -0.12}] type='10 * {"time": int64, "value": float64}'>

In [8]:
arr[3:6, 2:5, 0, "value"]

<Array [[0.214, -0.314, ... 0.0746, 0.16]] type='3 * var * float64'>

In [9]:
# applying functions works
ak.mean(arr["value"], axis=-1)

<Array [[-0.0812, 0.279, ... -0.364, -0.592]] type='80 * var * ?float64'>

In [10]:
# subsetting based on time index doesn't work directly:
# Python evaluates `"time" == 10` (a str compared with an int) to False before
# the array ever sees it, so this is arr[0, 0, False]; judging by the output it
# is treated like position 0 and returns the first record, not the one at time 10
arr[0, 0, "time" == 10]

<Record {time: 0, value: 0.0791} type='{"time": int64, "value": float64}'>

In [11]:
# but this works as expected
arr[0, 0, arr[0, 0, "time"] == 10]

<Array [{time: 10, value: 0.667}] type='1 * {"time": int64, "value": float64}'>

In [12]:
# and this also works for subsetting based on values
arr[0, 0, arr[0, 0, "value"] < 0]

<Array [{time: 2, ... value: -0.168}] type='75 * {"time": int64, "value": float64}'>

In [13]:
ak.to_numpy(arr[0, 0, "time"])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95])

In [20]:
arr[0, 0, 1, "value"]

0.079106

### Dictionary based transform and histogram data

In [401]:
from sktime.transformers.series_as_features.dictionary_based import SAX

X, target = load_arrow_head(return_X_y=True)

sax = SAX(save_words=True)
Xt = sax.fit_transform(X)

In [402]:
Xt.shape

(211, 1)

In [403]:
# check if unequal length
for i in range(1, X.shape[0]):
    if not len(Xt.iloc[0, 0]) == len(Xt.iloc[i, 0]):
        print(i, len(Xt.iloc[i, 0]))
        break

1 62


In [404]:
# print single series
a = pd.DataFrame(Xt.iloc[i, 0]).reset_index().rename(columns={0: "counts"})
a

Unnamed: 0,index,counts
0,431,21
1,1455,32
2,1471,26
3,447,6
4,1727,1
...,...,...
57,12180,1
58,44868,1
59,64848,1
60,63824,1


In [277]:
# one 2-row array per instance: row 0 holds the series' index, row 1 its counts
ll = [
    np.column_stack([
    Xt.iloc[i, 0].index.to_numpy(), # index
    Xt.iloc[i, 0].to_numpy() # counts
    ]).T
for i in range(Xt.shape[0])]
a = ak.Array(ll)
# reuse `a` for both lookups instead of building the same array a second time
print("index:", a[1, 0, :3], "counts:", a[1, 1, :3])

index: [431, 1455, 1471] counts: [21, 32, 26]


In [276]:
# alternatively: one record per instance with named "index" and "counts" fields
ll = [{
    "index": Xt.iloc[i, 0].index.to_numpy(), # index
    "counts": Xt.iloc[i, 0].to_numpy() # counts
}
for i in range(Xt.shape[0])]
a = ak.Array(ll)
# reuse `a` for both lookups instead of building the same array a second time
print("index:", a[1, "index", :3], "counts:", a[1, "counts", :3])

index: [431, 1455, 1471] counts: [21, 32, 26]


In [192]:
r = ak.Array([[i] for i in Xt.iloc[:, 0].tolist()])
r

<Array [[[3, 25, 19, 14, 10, ... 1, 1, 1, 1]]] type='211 * var * var * int64'>

In [205]:
r[0, 0, :]

<Array [3, 25, 19, 14, 10, ... 1, 1, 2, 1, 1] type='88 * int64'>

### Univariate data (single-instance data)

In [541]:
from sktime.datasets import load_airline
y = load_airline()

# date/time index
y.index = pd.date_range(start="1960", periods=len(y), freq="M")
y.index

DatetimeIndex(['1960-01-31', '1960-02-29', '1960-03-31', '1960-04-30',
               '1960-05-31', '1960-06-30', '1960-07-31', '1960-08-31',
               '1960-09-30', '1960-10-31',
               ...
               '1971-03-31', '1971-04-30', '1971-05-31', '1971-06-30',
               '1971-07-31', '1971-08-31', '1971-09-30', '1971-10-31',
               '1971-11-30', '1971-12-31'],
              dtype='datetime64[ns]', length=144, freq='M')

In [542]:
pd.DatetimeIndex(y.index.to_native_types())  # freq is lost, otherwise restores index

DatetimeIndex(['1960-01-31', '1960-02-29', '1960-03-31', '1960-04-30',
               '1960-05-31', '1960-06-30', '1960-07-31', '1960-08-31',
               '1960-09-30', '1960-10-31',
               ...
               '1971-03-31', '1971-04-30', '1971-05-31', '1971-06-30',
               '1971-07-31', '1971-08-31', '1971-09-30', '1971-10-31',
               '1971-11-30', '1971-12-31'],
              dtype='datetime64[ns]', length=144, freq=None)

In [543]:
# freq can be inferred automatically in some cases
# or we keep track of it somewhere in the estimator
pd.infer_freq(y.index.to_native_types())

'M'

In [544]:
arr = ak.Array(y)
arr.shape

(144,)

In [545]:
# uniform representation across learning tasks: [instance, variable, time points, values/index]
arr = ak.Array([[{"values": y.to_numpy(), "index": y.index.to_native_types()}]])
arr.shape

(1,)

In [546]:
arr[0, 0, :20, "values"]  # [instance, variable, time points, values/index]

<Array [112, 118, 132, 129, ... 149, 170, 170] type='20 * int64'>

In [547]:
arr[0, 0, :20, "index"]  # [instance, variable, time points, values/index]

<Array ['1960-01-31', ... '1961-08-31'] type='20 * string'>

In [548]:
ak.pandas.df(arr) # returns unexpected result

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,values,index
entry,subentry,subsubentry,subsubsubentry,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,0,0,112,49
0,0,0,1,112,57
0,0,0,2,112,54
0,0,0,3,112,48
0,0,0,4,112,45
0,0,...,...,...,...
0,0,143,5,432,49
0,0,143,6,432,50
0,0,143,7,432,45
0,0,143,8,432,51


In [549]:
# alternatively, for single-instances settings: [variable, time points, values/index]
arr = ak.Array([{"values": y.to_numpy(), "index": y.index.to_native_types()}])
arr.shape

(1,)

In [550]:
arr[0, :20, "values"] # [variable, time points, index/values]

<Array [112, 118, 132, 129, ... 149, 170, 170] type='20 * int64'>

In [566]:
# index based slicing
mask = arr[0, :, "index"] == "1965-01-31"
arr[0, mask, "values"]

<Array [204] type='1 * int64'>

* for a one-dimensional representation, if we want to keep track of the index, we cannot use an ak.Array but would have to use an ak.Record

In [505]:
# for univariate data we can use ak.Record
arr = ak.Record({"values": y.to_numpy(), "index": y.index.to_native_types()})
arr.values

<Array [112, 118, 132, 129, ... 461, 390, 432] type='144 * int64'>

In [513]:
arr.index

<Array ['1960-12-31', ... '2103-12-31'] type='144 * string'>

In [512]:
# or just a series
pd.Series(np.array(arr.values), index=pd.DatetimeIndex(np.array(arr.index)))

1960-12-31    112
1961-12-31    118
1962-12-31    132
1963-12-31    129
1964-12-31    121
             ... 
2099-12-31    606
2100-12-31    508
2101-12-31    461
2102-12-31    390
2103-12-31    432
Length: 144, dtype: int64

* conversion to pd.DataFrame and pd.Series seems a bit tricky still, with an intermediate step to numpy

## Benchmarking

In [123]:
from benchmark import make_ak_array
import awkward1 as ak
import numpy as np
from sklearn.base import clone
from sktime.classification.interval_based import TimeSeriesForest
from sktime.datasets import load_arrow_head
from tsf_awkward import TimeSeriesForestAwkward
from sktime.utils.data_container import tabularize

In [124]:
X_train, y_train = load_arrow_head(split="train", return_X_y=True)
X_test, y_test = load_arrow_head(split="test", return_X_y=True)

In [118]:
def slope_np(Y):
    """Least-squares slope of every row of ``Y`` against the positions 1..n.

    Parameters
    ----------
    Y : np.ndarray
        2d array; ``Y.shape[1]`` is the number of time points per row.

    Returns
    -------
    np.ndarray
        One slope per row, computed as ``cov(x, y) / var(x)`` with
        ``x = 1, ..., Y.shape[1]``.
    """
    n_timepoints = Y.shape[1]
    x = np.arange(n_timepoints) + 1

    mean_xy = np.mean(x * Y, axis=1)
    mean_x_times_mean_y = np.mean(x) * np.mean(Y, axis=1)
    var_x = (x * x).mean() - x.mean() ** 2

    return (mean_xy - mean_x_times_mean_y) / var_x

X = tabularize(X_train).to_numpy()
a = slope_np(X)
assert a.shape == (X_train.shape[0],)

def slope_ak(Y):
    """Least-squares slope of every row of ``Y`` against the positions 1..n.

    Counterpart of ``slope_np`` for array-likes that are not plain numpy
    arrays: the number of time points is read from the first row, the
    product term uses ``np.array(Y)``, and the row means are taken on ``Y``
    itself via ``np.mean``.

    Parameters
    ----------
    Y : array-like
        Rows of equal length; must support ``Y[0].shape``, ``np.array(Y)``
        and ``np.mean(Y, axis=1)``.

    Returns
    -------
    One slope per row, ``cov(x, y) / var(x)`` with ``x = 1, ..., n``.
    """
    n_timepoints = Y[0].shape[0]
    x = np.arange(n_timepoints) + 1

    mean_xy = np.mean(x * np.array(Y), axis=1)
    mean_x_times_mean_y = np.mean(x) * np.mean(Y, axis=1)
    var_x = (x * x).mean() - x.mean() ** 2

    return (mean_xy - mean_x_times_mean_y) / var_x

X = make_ak_array(X_train)[:, 0, "value"]
b = slope_ak(X)
# check the shape of the awkward-based result `b`; it is converted to numpy
# so the check does not depend on the container type slope_ak returns
assert np.asarray(b).shape == (X_train.shape[0],)

# both implementations must agree
np.testing.assert_array_almost_equal(a, b)

In [119]:
X = tabularize(X_train).to_numpy()
%timeit slope_np(X)

69.8 µs ± 2.3 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [120]:
X = make_ak_array(X_train)[:, 0, "value"]
%timeit b = slope_ak(X)

29.8 ms ± 1.61 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [121]:
X = tabularize(X_train).to_numpy()
%lprun -f slope_np slope_np(X)

Timer unit: 1e-06 s

Total time: 0.00025 s
File: <ipython-input-118-39fb83790250>
Function: slope_np at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def slope_np(Y):
     2         1         17.0     17.0      6.8      x = np.arange(Y.shape[1]) + 1
     3         1        232.0    232.0     92.8      slope = (np.mean(x * Y, axis=1) - np.mean(x) * np.mean(Y, axis=1)) / ((x * x).mean() - x.mean() ** 2)
     4         1          1.0      1.0      0.4      return slope

In [122]:
X = make_ak_array(X_train)[:, 0, "value"]
%lprun -f slope_ak slope_ak(X)

Timer unit: 1e-06 s

Total time: 0.088384 s
File: <ipython-input-118-39fb83790250>
Function: slope_ak at line 10

Line #      Hits         Time  Per Hit   % Time  Line Contents
    10                                           def slope_ak(Y):
    11         1         95.0     95.0      0.1      x = np.arange(Y[0].shape[0]) + 1
    12         1      88288.0  88288.0     99.9      slope = (np.mean(x * np.array(Y), axis=1) - np.mean(x) * np.mean(Y, axis=1)) / ((x * x).mean() - x.mean() ** 2)
    13         1          1.0      1.0      0.0      return slope

In [100]:
%load_ext line_profiler

In [129]:
X = tabularize(X_train).to_numpy()
a = X.mean(axis=1)
%timeit X.mean(axis=1)

X = make_ak_array(X_train)[:, 0, "value"]
b = ak.mean(X, axis=1)
%timeit ak.mean(X, axis=1)
np.testing.assert_array_almost_equal(a, b)

10.3 µs ± 185 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
510 µs ± 8.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [130]:
X = tabularize(X_train).to_numpy()
a = np.std(X, axis=1)
%timeit np.std(X, axis=1)

X = make_ak_array(X_train)[:, 0, "value"]
b = ak.std(X, axis=1)
%timeit ak.std(X, axis=1)
np.testing.assert_array_almost_equal(a, b)

36.9 µs ± 138 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
2.48 ms ± 7.33 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Alternatives

In [311]:
arr = ak.Array([
     [[{
         "series": [
             {"time": t, "value": v} for t, v in zip(np.arange(10), np.repeat(1.0, 10))],
         "meta": "meta"
     }],   
     [{
         "series": [
             {"time": t, "value": v} for t, v in zip(np.arange(10), np.repeat(2.0, 10))],
         "meta": "meta"
     }]], 
     [[{
         "series": [
             {"time": t, "value": v} for t, v in zip(np.arange(10), np.repeat(3.0, 10))],
         "meta": "meta"
     }],   
     [{
         "series": [
             {"time": t, "value": v} for t, v in zip(np.arange(10), np.repeat(4.0, 10))],
         "meta": "meta"
     }]]
])

In [312]:
arr[0, 0, "series", :3, "value"]

<Array [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]] type='1 * var * float64'>

In [314]:
# works, but does not allow to use arbitrary objects because "meta" is still stored as an array
arr[1, 1, "meta", 0]

'meta'

In [None]:




array = ak.Array([{"patient": "A", "data": [{"variable": "pressure", "series": [{"idx": 0, "time": 0.0, "value": 22.8}]}, {"variable": "temperature", "series": [{"idx": 0, "time": 0.0, "value": 98.6}]}]}, {"patient": "B", "data": [{"variable": "pressure", "series": [{"idx": 0, "time": 0.0, "value": 22.8}]}, {"variable": "temperature", "series": [{"idx": 0, "time": 0.0, "value": 98.6}]}]}])





array = ak.Array([("A", [{"variable": "pressure", "series": [{"idx": 0, "time": 0.0, "value": 22.8}]}, {"variable": "temperature", "series": [{"idx": 0, "time": 0.0, "value": 98.6}]}]), ("B", [{"variable": "pressure", "series": [{"idx": 0, "time": 0.0, "value": 22.8}]}, {"variable": "temperature", "series": [{"idx": 0, "time": 0.0, "value": 98.6}]}])])


array[:, "1", "series", :, 0].tolist()
[[{'idx': 0, 'time': 0.0, 'value': 22.8}, {'idx': 0, 'time': 0.0, 'value': 98.6}], [{'idx': 0, 'time': 0.0, 'value': 22.8}, {'idx': 0, 'time': 0.0, 'value': 98.6}]]
array.slot1[:, "series", :, 0].tolist()
[[{'idx': 0, 'time': 0.0, 'value': 22.8}, {'idx': 0, 'time': 0.0, 'value': 98.6}], [{'idx': 0, 'time': 0.0, 'value': 22.8}, {'idx': 0, 'time': 0.0, 'value': 98.6}]]
array.slot0
<Array ['A', 'B'] type='2 * string'>


array = ak.Array([{"patient": "A", "data": [{"variable": "pressure", "series": [{"idx": 0, "time": 0.0, "value": 22.8}]}, {"variable": "temperature", "series": [{"idx": 0, "time": 0.0, "value": 98.6}]}]}, {"patient": "B", "data": [{"variable": "pressure", "series": [{"idx": 0, "time": 0.0, "value": 22.8}]}, {"variable": "temperature", "series": [{"idx": 0, "time": 0.0, "value": 98.6}]}]}])



array[:, "data", 0]
<Array [{variable: 'pressure', ... ] type='2 * {"variable": string, "series": va...'>
array[:, "data", 0].tolist()
[{'variable': 'pressure', 'series': [{'idx': 0, 'time': 0.0, 'value': 22.8}]}, {'variable': 'pressure', 'series': [{'idx': 0, 'time': 0.0, 'value': 22.8}]}]
array[:, "data", 0, "series", "value"].tolist()
[[22.8], [22.8]]
ak.mean(array[:, "data", 0, "series", "value"])
22.8
ak.mean(array[:, "data", 0, "series", "value"], axis=None)
22.8
ak.mean(array[:, "data", 0, "series", "value"], axis=-1)
<Array [22.8, 22.8] type='2 * ?float64'>
