In [7]:
import random
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sktime.datasets import load_basic_motions
from sktime.transformations.series_as_features.compose import ColumnConcatenator
from sktime.classification.compose import TimeSeriesForestClassifier

# Ordinary situation

In [15]:
# Load the BasicMotions dataset and split it into train and test parts.
X, y = load_basic_motions(return_X_y=True)
# A fixed random_state makes the split (and every result that depends on it)
# reproducible after Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(60, 6) (60,) (20, 6) (20,)


In [16]:
# Baseline: concatenate all dimensions into one long series per sample,
# then classify with a time series forest.
concatenator = ColumnConcatenator()
forest = TimeSeriesForestClassifier(n_estimators=100)
steps = [('concatenate', concatenator), ('classify', forest)]
clf = Pipeline(steps).fit(X_train, y_train)
clf.score(X_test, y_test)

1.0

# If the time series have unequal lengths -> error inside the algorithm

In [12]:
# randomly cut the data series
def random_cut(df, min_cut=3, max_cut=5):
    """Randomly shorten every time series stored in a nested DataFrame, in place.

    Each cell of ``df`` must hold one series object that supports ``len`` and
    ``tolist`` (e.g. a ``pd.Series``). Between ``min_cut`` and ``max_cut``
    trailing points (both inclusive) are dropped from every cell, so the cells
    end up with unequal lengths. Every cell is replaced by a new ``pd.Series``.

    Rows are addressed by position and columns by their actual labels, so the
    function works on frames whose index is not ``0..n-1`` (for example the
    shuffled subsets returned by ``train_test_split``) and on columns that are
    not named ``dim_<i>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Nested DataFrame with one time series per cell. Modified in place.
    min_cut : int, default 3
        Smallest number of trailing points to drop.
    max_cut : int, default 5
        Largest number of trailing points to drop.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``min_cut`` is negative or greater than ``max_cut``.
    """
    if not 0 <= min_cut <= max_cut:
        raise ValueError("expected 0 <= min_cut <= max_cut")

    for column in df.columns:
        # An object array filled cell by cell keeps each Series as one element;
        # pandas/NumPy never get a chance to expand the ragged data.
        shortened = np.empty(len(df), dtype=object)
        for position, ts in enumerate(df[column]):
            values = ts.tolist()
            longest = len(values) - min_cut
            # Always keep at least one point.
            shortest = max(len(values) - max_cut, 1)
            if longest >= shortest:
                values = values[:random.randint(shortest, longest)]
            # Otherwise the series is too short to cut and is kept whole.
            shortened[position] = pd.Series(values)
        df[column] = shortened

In [13]:
# Reload a fresh copy of BasicMotions so the cut below starts from intact data.
X, y = load_basic_motions(return_X_y=True)
# A fixed random_state keeps the split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(60, 6) (60,) (20, 6) (20,)


In [14]:
# Make the series lengths unequal in both the train and the test part.
for df in [X_train, X_test]:
    random_cut(df)

# Fitting on unequal-length series is expected to fail; catch the failure and
# display it so the notebook keeps running.
try:
    steps = [
        ('concatenate', ColumnConcatenator()),
        ('classify', TimeSeriesForestClassifier(n_estimators=100))]
    clf = Pipeline(steps)
    clf.fit(X_train, y_train)
    clf.score(X_test, y_test)
except (IndexError, ValueError) as e:
    # The recorded failure is a ValueError, which the original
    # `except IndexError` did not catch. Report the real exception type
    # instead of a hard-coded label.
    print(f"{type(e).__name__}: {e}")

  


ValueError: NumPy boolean array indexing assignment cannot assign 0 input values to the 2 output values where the mask is true

# Let's investigate the error

There are two errors. The first one comes from the way np.hstack stacks columns whose cells hold arrays of unequal size.

In [13]:
# All ok: every row has the same length, so both inputs are regular 3x2
# arrays and hstack joins them side by side.
np.hstack([ [[1,1],[2,2],[3,3]], [[4,4],[5,5],[6,6]] ]).shape

(3, 4)

In [10]:
# This throws an error: the first input has rows of unequal length, so it
# cannot be combined with the regular second input. The failure is the point
# of this cell, so catch it and display it instead of aborting the run.
# (The exact message wording depends on the NumPy version.)
try:
    np.hstack([ [[1,1,6],[2,2],[3,3]], [[4,4],[5,5],[6,6]] ]).shape
except ValueError as e:
    print(f"ValueError: {e}")

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)

In [12]:
# ...and this one does not raise at all: with both inputs ragged, the result
# is a flat array of shape (6,) instead of a horizontally stacked 2-D array.
# NOTE(review): newer NumPy releases may refuse to build arrays from ragged
# nested lists - confirm against the installed version.
np.hstack([ [[1,1,6],[2,2],[3,3]], [[4,4],[5,5, 2],[6,6]] ]).shape

(6,)

The second error may appear if you use univariate (single-dimension) time series.

In [17]:
# Here the error is in the algorithm itself - index out of range.

from sktime.datasets import load_gunpoint

X_train, y_train = load_gunpoint(split='TRAIN', return_X_y=True)
X_test, y_test = load_gunpoint(split='TEST', return_X_y=True)

# Make the series lengths unequal in both parts.
for df in [X_train, X_test]:
    random_cut(df)

clf = TimeSeriesForestClassifier(n_estimators=100)
# The failure is what this cell demonstrates, so catch it and display it
# instead of letting it abort a Restart & Run All.
try:
    clf.fit(X_train, y_train)
    clf.score(X_test, y_test)
except (IndexError, ValueError) as e:
    print(f"{type(e).__name__}: {e}")

IndexError: too many indices for array

# Now resizing comes into play

In [8]:
from sktime.transformations.resizing import TSResizeTransform

X_train, y_train = load_basic_motions(split='TRAIN', return_X_y=True)
X_test, y_test = load_basic_motions(split='TEST', return_X_y=True)

# Shorten every series by a random amount so the lengths differ.
for df in (X_train, X_test):
    random_cut(df)

# Resizing every series to 50 points first gives the later steps
# equal-length input again.
resizer = TSResizeTransform(50)
steps = [
    ('transform', resizer),
    ('concatenate', ColumnConcatenator()),
    ('classify', TimeSeriesForestClassifier(n_estimators=100)),
]
clf = Pipeline(steps).fit(X_train, y_train)
clf.score(X_test, y_test)



0.975

In [None]:
# code 


import numpy as np
import pandas as pd
from scipy import interpolate

from sktime.transformations.base import BaseTransformer




class TSResizeTransform(BaseTransformer):
    """Resize every time series in a nested DataFrame to one common length.

    Each cell is treated as a series sampled at evenly spaced positions on
    [0, 1]. It is re-sampled at ``length`` evenly spaced positions on the same
    interval with ``scipy.interpolate.interp1d`` (linear interpolation by
    default), so the first and last points of every series are preserved.
    """

    def __init__(self, length):
        """
        Parameters
        ----------
        length : int
            Number of points every resized series will have; must be positive.
        """
        assert length > 0, "length must be a positive integer"
        self.length = length
        # The one-argument form `super(TSResizeTransform)` builds an unbound
        # super object and never reaches the base class initializer; the
        # zero-argument form does.
        super().__init__()

    def __resizeCell(self, cell):
        """Return one series re-sampled to ``self.length`` points as a NumPy array."""
        values = np.asarray(cell)
        if len(values) == 1:
            # interp1d needs at least two points; a single observation is
            # extended as a constant series.
            return np.full(self.length, values[0], dtype=float)
        f = interpolate.interp1d(np.linspace(0, 1, len(values)), values)
        return f(np.linspace(0, 1, self.length))

    def __resizeCol(self, coll):
        """Resize every cell of one DataFrame column."""
        return coll.apply(self.__resizeCell)

    def transform(self, X, y=None):
        """Resize the time series in each cell of a DataFrame and return the result.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_samples, n_features]
            Nested DataFrame with a time series in every cell.
        y : ignored
            Present for API compatibility only.

        Returns
        -------
        Xt : pandas DataFrame
            DataFrame with the same number of rows and columns as ``X``; every
            cell holds a NumPy array of ``self.length`` interpolated values.
        """
        return X.apply(self.__resizeCol)