
Commit

[ENH] Improve vectorized metric calculation and deprecate VectorizedDF.__getitem__ and VectorizedDF.get_iloc_indexer (#4228)

Follow-up to #4195
Contributes to #4139

This PR implements `BaseForecastingErrorMetric._evaluate_vectorized` using `VectorizedDF.vectorize_est`.
Removes the last reference to `VectorizedDF.__getitem__`.
Random access is not needed, and developers should use `__iter__` for iteration instead (implemented in #4195).
Also, the unused method `get_iloc_indexer` is marked as deprecated and is scheduled for removal in a future version.
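
Below is a minimal, hedged sketch of the iteration pattern this change standardizes on. The data is hypothetical, and the `VectorizedDF` constructor call is an assumption based on its documented defaults; other versions may require different arguments:

```python
import pandas as pd

from sktime.datatypes._vectorize import VectorizedDF

# hierarchical data in sktime "pd-multiindex" format: (instance, time) row index
idx = pd.MultiIndex.from_product(
    [["a", "b"], pd.RangeIndex(3)], names=["instance", "time"]
)
X = pd.DataFrame({"value": range(6)}, index=idx)

# assumption: the frame is accepted directly; iterate_as="Series" (the default)
# yields one pandas.DataFrame per instance
vdf = VectorizedDF(X, iterate_as="Series")

# preferred access pattern (implemented in #4195): iteration via __iter__
sub_frames = list(vdf)

# deprecated pattern removed by this PR: random access via vdf[i] (__getitem__)
```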
hoesler committed Mar 7, 2023
1 parent 925f55e commit d85102f
Showing 2 changed files with 35 additions and 133 deletions.
123 changes: 17 additions & 106 deletions sktime/datatypes/_vectorize.py
@@ -9,10 +9,10 @@


import itertools
from itertools import product

import numpy as np
import pandas as pd
from deprecated.sphinx import deprecated

from sktime.datatypes._check import check_is_scitype, mtype
from sktime.datatypes._convert import convert_to
@@ -47,11 +47,11 @@ class VectorizedDF:
Methods
-------
self[i] or self.__getitem__(i)
Returns i-th Series/Panel (depending on iterate_as) in X
iter(self) or self.__iter__()
Iterates over each Series/Panel (depending on iterate_as) in X
as pandas.DataFrame with Index or MultiIndex (in sktime pandas format)
len(self) or self.__len__
returns number of Series/Panel in X
len(self) or self.__len__()
returns number of Series/Panels in X
get_iter_indices()
Returns pandas.(Multi)Index that are iterated over
reconstruct(self, df_list, convert_back=False)
@@ -189,6 +189,12 @@ def get_iter_indices(self):
"""
return self.iter_indices

# TODO: remove in v0.18.0
@deprecated(
version="0.16.2",
reason="get_iloc_indexer will be removed in v0.18.0",
category=FutureWarning,
)
def get_iloc_indexer(self, i: int):
"""Get iloc row/column indexer for i-th list element.
@@ -210,109 +216,10 @@ def get_iloc_indexer(self, i: int):
col_n = len(col_ix)
return (i // col_n, i % col_n)

def _iter_indices(self, X=None):
"""Get indices that are iterated over in vectorization.
Allows specifying `X` other than self, in which case indices are references
to row and column indices of `X`.
Parameters
----------
X : `None`, `VectorizedDF`, or pd.DataFrame; optional, default=self
must be in one of the `sktime` time series formats, with last column time
if not `self`, the highest levels of row or column index in `X`
must agree with those indices of `self` that are non-trivially vectorized
Returns
-------
list of pair of `pandas.Index` or `pandas.MultiIndex`
iterable with unique indices that are iterated over
use to reconstruct data frame after iteration
`i`-th element of list selects rows/columns in `i`-th iterate sub-DataFrame
first element of pair are rows, second element are columns selected
references are `loc` references, to rows and columns of `X` (default=self)
"""
if X is None:
X = self.X_multiindex
elif isinstance(X, VectorizedDF):
X = X.X_multiindex

row_ix, col_ix = self.get_iter_indices()

if row_ix is None and col_ix is None:
ret = [(X.index, X.columns)]
elif row_ix is None:
ret = product([X.index], col_ix)
elif col_ix is None:
ret = product(row_ix, [X.columns])
else: # if row_ix and col_ix are both not None
ret = product(row_ix, col_ix)
return list(ret)

def __len__(self):
"""Return number of indices to iterate over."""
return np.prod(self.shape)

def __getitem__(self, i: int):
"""Return the i-th element iterated over in vectorization."""
row_ind, col_ind = self._get_item_indexer(i=i)
return self._get_X_at_index(row_ind=row_ind, col_ind=col_ind)

def _get_X_at_index(self, row_ind=None, col_ind=None, X=None):
"""Return subset of self, at row_ind and col_ind.
Parameters
----------
row_ind : `None`, or `pd.Index` coercible; optional, default=None
col_ind : `None`, or `pd.Index` coercible; optional, default=None
X : `None`, `VectorizedDF`, or pd.DataFrame; optional, default=self
must be in one of the `sktime` time series formats, with last column time
Returns
-------
`pd.DataFrame`, loc-subset of `X` to `row_ind` at rows, and `col_ind` at cols
* if `row_ind` or `col_ind` are `None`, rows/cols are not subsetted
* if `X` is `VectorizedDF`, it is replaced by `X.X_multiindex` (`pandas` form)
* the `freq` attribute of the last index level is preserved in subsetting
"""
if X is None:
X = self.X_multiindex
elif isinstance(X, VectorizedDF):
X = X.X_multiindex

if col_ind is None and row_ind is None:
return X
elif col_ind is None:
res = X.loc[row_ind]
elif row_ind is None:
res = X[col_ind]
else:
res = X.loc[row_ind, col_ind]
res = _enforce_index_freq(res)
return res.copy()

def _get_item_indexer(self, i: int, X=None):
"""Get the i-th indexer from _iter_indices.
Parameters
----------
X : `None`, `VectorizedDF`, or pd.DataFrame; optional, default=self
must be in one of the `sktime` time series formats, with last column time
if not `self`, the highest levels of row or column index in `X`
must agree with those indices of `self` that are non-trivially vectorized
Returns
-------
self._iter_indices(X=X)[i], tuple elements coerced to pd.Index coercible
"""
row_ind, col_ind = self._iter_indices(X=X)[i]
if isinstance(col_ind, list):
col_ind = pd.Index(col_ind)
elif not isinstance(col_ind, pd.Index):
col_ind = [col_ind]
return row_ind, col_ind

def __iter__(self):
"""Iterate over all instances.
@@ -329,6 +236,10 @@ def __iter__(self):
)
)

def __getitem__(self, i: int):
"""Return the i-th element iterated over in vectorization."""
return next(itertools.islice(self, i, None))

def items(self, iterate_as=None, iterate_cols=None):
"""Iterate over (group name, column name, instance) tuples.
@@ -446,7 +357,7 @@ def reconstruct(
Parameters
----------
df_list : iterable of objects of same type and sequence as __getitem__ returns.
df_list : iterable of objects of same type and sequence as __iter__ returns.
can be self, but will in general be another object to be useful.
Example: [some_operation(df) for df in self] that leaves types the same
convert_back : bool, optional, default = False
@@ -615,7 +526,7 @@ def vectorize_est(
return_type : str, one of "pd.DataFrame" or "list"
the return will be of this type;
if `pd.DataFrame`, with row/col indices being `self.get_iter_indices()`
if `list`, entries in sequence corresponding to `self__getitem__`
if `list`, entries in sequence corresponding to `self__iter__`
rowname_default : str, optional, default="estimators"
used as index name of single row if no row vectorization is performed
colname_default : str, optional, default="estimators"
45 changes: 18 additions & 27 deletions sktime/performance_metrics/forecasting/_classes.py
@@ -7,7 +7,6 @@
Classes named as ``*Error`` or ``*Loss`` return a value to minimize:
the lower the better.
"""
from copy import deepcopy
from inspect import getfullargspec, isfunction, signature
from warnings import warn

@@ -265,32 +264,26 @@ def _evaluate_vectorized(self, y_true, y_pred, **kwargs):
Parameters
----------
y_true : pandas.DataFrame with MultiIndex, last level time-like
y_pred : pandas.DataFrame with MultiIndex, last level time-like
non-time-like instanceso of y_true, y_pred must be identical
y_true : VectorizedDF
y_pred : VectorizedDF
non-time-like instances of y_true, y_pred must be identical
"""
kwargsi = deepcopy(kwargs)
n_batches = len(y_true)
res = []
for i in range(n_batches):
if "y_train" in kwargs:
kwargsi["y_train"] = kwargs["y_train"][i]
if "y_pred_benchmark" in kwargs:
kwargsi["y_pred_benchmark"] = kwargs["y_pred_benchmark"][i]
resi = self._evaluate(y_true=y_true[i], y_pred=y_pred[i], **kwargsi)
if isinstance(resi, float):
resi = pd.Series(resi)
if self.multioutput == "raw_values":
assert isinstance(resi, np.ndarray)
df = pd.DataFrame(columns=y_true.X.columns)
df.loc[0] = resi
resi = df
res += [resi]
out_df = y_true.reconstruct(res)
if out_df.index.nlevels == y_true.X.index.nlevels:
out_df.index = out_df.index.droplevel(-1)
eval_result = y_true.vectorize_est(
estimator=self.clone(),
method="_evaluate",
varname_of_self="y_true",
args={**kwargs, "y_pred": y_pred},
colname_default=self.name,
)

return out_df
if self.multioutput == "raw_values":
return pd.DataFrame(
eval_result.iloc[:, 0].to_list(),
index=eval_result.index,
columns=y_true.X.columns,
)
else:
return eval_result

def evaluate_by_index(self, y_true, y_pred, **kwargs):
"""Return the metric evaluated at each time point.
@@ -578,7 +571,6 @@ def get_test_params(cls, parameter_set="default"):
"""

def custom_mape(y_true, y_pred) -> float:

eps = np.finfo(np.float64).eps

result = np.mean(np.abs(y_true - y_pred) / np.maximum(np.abs(y_true), eps))
@@ -836,7 +828,6 @@ def __init__(
multilevel="uniform_average",
sp=1,
):

self.sp = sp
super().__init__(multioutput=multioutput, multilevel=multilevel)


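For context, a hedged usage sketch of how the reworked vectorized metric path is typically reached from user code. The data below is hypothetical, and the shape of the returned result depends on the metric's `multioutput` and `multilevel` settings:

```python
import numpy as np
import pandas as pd

from sktime.performance_metrics.forecasting import MeanAbsolutePercentageError

# hierarchical ground truth and predictions: (instance, time) MultiIndex
idx = pd.MultiIndex.from_product(
    [["a", "b"], pd.period_range("2020-01", periods=4, freq="M")],
    names=["instance", "time"],
)
y_true = pd.DataFrame({"y": np.arange(1.0, 9.0)}, index=idx)
y_pred = y_true * 1.1

metric = MeanAbsolutePercentageError()
# multi-instance input is broadcast over the instances internally, which is
# where _evaluate_vectorized (and now vectorize_est) come into play
print(metric(y_true, y_pred))
```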