
Commit

[ENH] Improve vectorized metric calculation and deprecate VectorizedDF.__getitem__ and VectorizedDF.get_iloc_indexer (#4228)

Follow-up to #4195
Contributes to #4139

This PR implements `BaseForecastingErrorMetric._evaluate_vectorized` using `VectorizedDF.vectorize_est`.
Removes the last reference to `VectorizedDF.__getitem__`.
Random access is not needed, and developers should use `__iter__` for iteration instead (implemented in #4195).
Also, the unused method `get_iloc_indexer` is marked as deprecated and is scheduled for removal in a future version.
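
Below is a minimal, hedged sketch of the iteration pattern this change standardizes on. The data is hypothetical, and the `VectorizedDF` constructor call is an assumption based on its documented defaults; other versions may require different arguments:

```python
import pandas as pd

from sktime.datatypes._vectorize import VectorizedDF

# hierarchical data in sktime "pd-multiindex" format: (instance, time) row index
idx = pd.MultiIndex.from_product(
    [["a", "b"], pd.RangeIndex(3)], names=["instance", "time"]
)
X = pd.DataFrame({"value": range(6)}, index=idx)

# assumption: the frame is accepted directly; iterate_as="Series" (the default)
# yields one pandas.DataFrame per instance
vdf = VectorizedDF(X, iterate_as="Series")

# preferred access pattern (implemented in #4195): iteration via __iter__
sub_frames = list(vdf)

# deprecated pattern removed by this PR: random access via vdf[i] (__getitem__)
```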
hoesler committed Mar 7, 2023
1 parent 925f55e commit d85102f
Showing 2 changed files with 35 additions and 133 deletions.
123 changes: 17 additions & 106 deletions sktime/datatypes/_vectorize.py
@@ -9,10 +9,10 @@


import itertools
from itertools import product

import numpy as np
import pandas as pd
from deprecated.sphinx import deprecated

from sktime.datatypes._check import check_is_scitype, mtype
from sktime.datatypes._convert import convert_to
@@ -47,11 +47,11 @@ class VectorizedDF:
Methods
-------
self[i] or self.__getitem__(i)
Returns i-th Series/Panel (depending on iterate_as) in X
iter(self) or self.__iter__()
Iterates over each Series/Panel (depending on iterate_as) in X
as pandas.DataFrame with Index or MultiIndex (in sktime pandas format)
len(self) or self.__len__
returns number of Series/Panel in X
len(self) or self.__len__()
returns number of Series/Panels in X
get_iter_indices()
Returns pandas.(Multi)Index that are iterated over
reconstruct(self, df_list, convert_back=False)
@@ -189,6 +189,12 @@ def get_iter_indices(self):
"""
return self.iter_indices

# TODO: remove in v0.18.0
@deprecated(
version="0.16.2",
reason="get_iloc_indexer will be removed in v0.18.0",
category=FutureWarning,
)
def get_iloc_indexer(self, i: int):
"""Get iloc row/column indexer for i-th list element.
@@ -210,109 +216,10 @@ def get_iloc_indexer(self, i: int):
col_n = len(col_ix)
return (i // col_n, i % col_n)

def _iter_indices(self, X=None):
"""Get indices that are iterated over in vectorization.
Allows specifying `X` other than self, in which case indices are references
to row and column indices of `X`.
Parameters
----------
X : `None`, `VectorizedDF`, or pd.DataFrame; optional, default=self
must be in one of the `sktime` time series formats, with last column time
if not `self`, the highest levels of row or column index in `X`
must agree with those indices of `self` that are non-trivially vectorized
Returns
-------
list of pair of `pandas.Index` or `pandas.MultiIndex`
iterable with unique indices that are iterated over
use to reconstruct data frame after iteration
`i`-th element of list selects rows/columns in `i`-th iterate sub-DataFrame
first element of pair are rows, second element are columns selected
references are `loc` references, to rows and columns of `X` (default=self)
"""
if X is None:
X = self.X_multiindex
elif isinstance(X, VectorizedDF):
X = X.X_multiindex

row_ix, col_ix = self.get_iter_indices()

if row_ix is None and col_ix is None:
ret = [(X.index, X.columns)]
elif row_ix is None:
ret = product([X.index], col_ix)
elif col_ix is None:
ret = product(row_ix, [X.columns])
else: # if row_ix and col_ix are both not None
ret = product(row_ix, col_ix)
return list(ret)

def __len__(self):
"""Return number of indices to iterate over."""
return np.prod(self.shape)

def __getitem__(self, i: int):
"""Return the i-th element iterated over in vectorization."""
row_ind, col_ind = self._get_item_indexer(i=i)
return self._get_X_at_index(row_ind=row_ind, col_ind=col_ind)

def _get_X_at_index(self, row_ind=None, col_ind=None, X=None):
"""Return subset of self, at row_ind and col_ind.
Parameters
----------
row_ind : `None`, or `pd.Index` coercible; optional, default=None
col_ind : `None`, or `pd.Index` coercible; optional, default=None
X : `None`, `VectorizedDF`, or pd.DataFrame; optional, default=self
must be in one of the `sktime` time series formats, with last column time
Returns
-------
`pd.DataFrame`, loc-subset of `X` to `row_ind` at rows, and `col_ind` at cols
* if `row_ind` or `col_ind` are `None`, rows/cols are not subsetted
* if `X` is `VectorizedDF`, it is replaced by `X.X_multiindex` (`pandas` form)
* the `freq` attribute of the last index level is preserved in subsetting
"""
if X is None:
X = self.X_multiindex
elif isinstance(X, VectorizedDF):
X = X.X_multiindex

if col_ind is None and row_ind is None:
return X
elif col_ind is None:
res = X.loc[row_ind]
elif row_ind is None:
res = X[col_ind]
else:
res = X.loc[row_ind, col_ind]
res = _enforce_index_freq(res)
return res.copy()

def _get_item_indexer(self, i: int, X=None):
"""Get the i-th indexer from _iter_indices.
Parameters
----------
X : `None`, `VectorizedDF`, or pd.DataFrame; optional, default=self
must be in one of the `sktime` time series formats, with last column time
if not `self`, the highest levels of row or column index in `X`
must agree with those indices of `self` that are non-trivially vectorized
Returns
-------
self._iter_indices(X=X)[i], tuple elements coerced to pd.Index coercible
"""
row_ind, col_ind = self._iter_indices(X=X)[i]
if isinstance(col_ind, list):
col_ind = pd.Index(col_ind)
elif not isinstance(col_ind, pd.Index):
col_ind = [col_ind]
return row_ind, col_ind

def __iter__(self):
"""Iterate over all instances.
@@ -329,6 +236,10 @@ def __iter__(self):
)
)

def __getitem__(self, i: int):
"""Return the i-th element iterated over in vectorization."""
return next(itertools.islice(self, i, None))

def items(self, iterate_as=None, iterate_cols=None):
"""Iterate over (group name, column name, instance) tuples.
@@ -446,7 +357,7 @@ def reconstruct(
Parameters
----------
df_list : iterable of objects of same type and sequence as __getitem__ returns.
df_list : iterable of objects of same type and sequence as __iter__ returns.
can be self, but will in general be another object to be useful.
Example: [some_operation(df) for df in self] that leaves types the same
convert_back : bool, optional, default = False
@@ -615,7 +526,7 @@ def vectorize_est(
return_type : str, one of "pd.DataFrame" or "list"
the return will be of this type;
if `pd.DataFrame`, with row/col indices being `self.get_iter_indices()`
if `list`, entries in sequence corresponding to `self__getitem__`
if `list`, entries in sequence corresponding to `self__iter__`
rowname_default : str, optional, default="estimators"
used as index name of single row if no row vectorization is performed
colname_default : str, optional, default="estimators"
45 changes: 18 additions & 27 deletions sktime/performance_metrics/forecasting/_classes.py
@@ -7,7 +7,6 @@
Classes named as ``*Error`` or ``*Loss`` return a value to minimize:
the lower the better.
"""
from copy import deepcopy
from inspect import getfullargspec, isfunction, signature
from warnings import warn

@@ -265,32 +264,26 @@ def _evaluate_vectorized(self, y_true, y_pred, **kwargs):
Parameters
----------
y_true : pandas.DataFrame with MultiIndex, last level time-like
y_pred : pandas.DataFrame with MultiIndex, last level time-like
non-time-like instanceso of y_true, y_pred must be identical
y_true : VectorizedDF
y_pred : VectorizedDF
non-time-like instances of y_true, y_pred must be identical
"""
kwargsi = deepcopy(kwargs)
n_batches = len(y_true)
res = []
for i in range(n_batches):
if "y_train" in kwargs:
kwargsi["y_train"] = kwargs["y_train"][i]
if "y_pred_benchmark" in kwargs:
kwargsi["y_pred_benchmark"] = kwargs["y_pred_benchmark"][i]
resi = self._evaluate(y_true=y_true[i], y_pred=y_pred[i], **kwargsi)
if isinstance(resi, float):
resi = pd.Series(resi)
if self.multioutput == "raw_values":
assert isinstance(resi, np.ndarray)
df = pd.DataFrame(columns=y_true.X.columns)
df.loc[0] = resi
resi = df
res += [resi]
out_df = y_true.reconstruct(res)
if out_df.index.nlevels == y_true.X.index.nlevels:
out_df.index = out_df.index.droplevel(-1)
eval_result = y_true.vectorize_est(
estimator=self.clone(),
method="_evaluate",
varname_of_self="y_true",
args={**kwargs, "y_pred": y_pred},
colname_default=self.name,
)

return out_df
if self.multioutput == "raw_values":
return pd.DataFrame(
eval_result.iloc[:, 0].to_list(),
index=eval_result.index,
columns=y_true.X.columns,
)
else:
return eval_result

def evaluate_by_index(self, y_true, y_pred, **kwargs):
"""Return the metric evaluated at each time point.
@@ -578,7 +571,6 @@ def get_test_params(cls, parameter_set="default"):
"""

def custom_mape(y_true, y_pred) -> float:

eps = np.finfo(np.float64).eps

result = np.mean(np.abs(y_true - y_pred) / np.maximum(np.abs(y_true), eps))
@@ -836,7 +828,6 @@ def __init__(
multilevel="uniform_average",
sp=1,
):

self.sp = sp
super().__init__(multioutput=multioutput, multilevel=multilevel)


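For context, a hedged usage sketch of how the reworked vectorized metric path is typically reached from user code. The data below is hypothetical, and the shape of the returned result depends on the metric's `multioutput` and `multilevel` settings:

```python
import numpy as np
import pandas as pd

from sktime.performance_metrics.forecasting import MeanAbsolutePercentageError

# hierarchical ground truth and predictions: (instance, time) MultiIndex
idx = pd.MultiIndex.from_product(
    [["a", "b"], pd.period_range("2020-01", periods=4, freq="M")],
    names=["instance", "time"],
)
y_true = pd.DataFrame({"y": np.arange(1.0, 9.0)}, index=idx)
y_pred = y_true * 1.1

metric = MeanAbsolutePercentageError()
# multi-instance input is broadcast over the instances internally, which is
# where _evaluate_vectorized (and now vectorize_est) come into play
print(metric(y_true, y_pred))
```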