Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Improve vectorized metric calculation and deprecate VectorizedDF.__getitem__ and VectorizedDF.get_iloc_indexer #4228

Merged
merged 7 commits into from Mar 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
123 changes: 17 additions & 106 deletions sktime/datatypes/_vectorize.py
Expand Up @@ -5,10 +5,10 @@
Contains VectorizedDF class.
"""
import itertools
from itertools import product

import numpy as np
import pandas as pd
from deprecated.sphinx import deprecated

from sktime.datatypes._check import check_is_scitype, mtype
from sktime.datatypes._convert import convert_to
Expand Down Expand Up @@ -43,11 +43,11 @@ class VectorizedDF:

Methods
-------
self[i] or self.__getitem__(i)
Returns i-th Series/Panel (depending on iterate_as) in X
iter(self) or self.__iter__()
Iterates over each Series/Panel (depending on iterate_as) in X
as pandas.DataFrame with Index or MultiIndex (in sktime pandas format)
len(self) or self.__len__
returns number of Series/Panel in X
len(self) or self.__len__()
returns number of Series/Panels in X
get_iter_indices()
Returns pandas.(Multi)Index that are iterated over
reconstruct(self, df_list, convert_back=False)
Expand Down Expand Up @@ -185,6 +185,12 @@ def get_iter_indices(self):
"""
return self.iter_indices

# TODO: remove in v0.18.0
@deprecated(
version="0.16.2",
reason="get_iloc_indexer will be removed in v0.18.0",
category=FutureWarning,
)
def get_iloc_indexer(self, i: int):
"""Get iloc row/column indexer for i-th list element.

Expand All @@ -206,109 +212,10 @@ def get_iloc_indexer(self, i: int):
col_n = len(col_ix)
return (i // col_n, i % col_n)

def _iter_indices(self, X=None):
    """Enumerate the (rows, columns) index pairs visited during vectorization.

    The pairs are built from `self.get_iter_indices()`. A different frame `X`
    may be supplied, in which case the "all rows" / "all columns" fallbacks
    are taken from `X` instead of from `self`.

    Parameters
    ----------
    X : `None`, `VectorizedDF`, or pd.DataFrame; optional, default=self
        must be in one of the `sktime` time series formats, with last column time
        if not `self`, the highest levels of row or column index in `X`
        must agree with those indices of `self` that are non-trivially vectorized

    Returns
    -------
    list of pair of `pandas.Index` or `pandas.MultiIndex`
        `i`-th pair selects the rows (first entry) and columns (second entry)
        of the `i`-th iterate sub-DataFrame, as `loc` references into `X`
        pairs are ordered as the cartesian product rows x columns,
        with the column entry varying fastest
    """
    if X is None:
        X = self.X_multiindex
    elif isinstance(X, VectorizedDF):
        X = X.X_multiindex

    rows, cols = self.get_iter_indices()

    # an axis that is not vectorized contributes one entry: the full axis of X
    row_choices = [X.index] if rows is None else rows
    col_choices = [X.columns] if cols is None else cols

    return list(product(row_choices, col_choices))

def __len__(self):
    """Return the number of iterates, i.e., the product of entries of self.shape."""
    n_iterates = np.prod(self.shape)
    return n_iterates

def __getitem__(self, i: int):
    """Return the i-th sub-frame visited in vectorization.

    Looks up the i-th (rows, columns) indexer via `_get_item_indexer`
    and returns the matching subset obtained from `_get_X_at_index`.
    """
    indexer = self._get_item_indexer(i=i)
    rows, cols = indexer
    return self._get_X_at_index(row_ind=rows, col_ind=cols)

def _get_X_at_index(self, row_ind=None, col_ind=None, X=None):
    """Subset a data frame to the given row and column indexers.

    Parameters
    ----------
    row_ind : `None`, or `pd.Index` coercible; optional, default=None
        rows to select via `loc`; `None` means rows are not subsetted
    col_ind : `None`, or `pd.Index` coercible; optional, default=None
        columns to select; `None` means columns are not subsetted
    X : `None`, `VectorizedDF`, or pd.DataFrame; optional, default=self
        must be in one of the `sktime` time series formats, with last column time
        a `VectorizedDF` is replaced by its `X_multiindex` (`pandas` form)

    Returns
    -------
    `pd.DataFrame`, loc-subset of `X` to `row_ind` at rows, and `col_ind` at cols

    * if both `row_ind` and `col_ind` are `None`, `X` itself is returned (no copy)
    * otherwise the subset is passed through `_enforce_index_freq`
      and a copy of the result is returned
    """
    if X is None:
        X = self.X_multiindex
    elif isinstance(X, VectorizedDF):
        X = X.X_multiindex

    subset_rows = row_ind is not None
    subset_cols = col_ind is not None

    # nothing to select: hand back the frame as is, without copying
    if not (subset_rows or subset_cols):
        return X

    if subset_rows and subset_cols:
        selected = X.loc[row_ind, col_ind]
    elif subset_rows:
        selected = X.loc[row_ind]
    else:
        selected = X[col_ind]

    return _enforce_index_freq(selected).copy()

def _get_item_indexer(self, i: int, X=None):
    """Return the i-th (rows, columns) indexer, with the column part normalized.

    Parameters
    ----------
    i : int
        position in the list returned by `self._iter_indices(X=X)`
    X : `None`, `VectorizedDF`, or pd.DataFrame; optional, default=self
        must be in one of the `sktime` time series formats, with last column time
        if not `self`, the highest levels of row or column index in `X`
        must agree with those indices of `self` that are non-trivially vectorized

    Returns
    -------
    self._iter_indices(X=X)[i], tuple elements coerced to pd.Index coercible

    * a `pd.Index` column indexer is passed through unchanged
    * a `list` column indexer is converted to `pd.Index`
    * any other column indexer (e.g., a single label) is wrapped in a list
    """
    rows, cols = self._iter_indices(X=X)[i]

    if isinstance(cols, pd.Index):
        return rows, cols
    if isinstance(cols, list):
        return rows, pd.Index(cols)
    # single column label: wrap it in a one-element list
    return rows, [cols]

def __iter__(self):
"""Iterate over all instances.

Expand All @@ -323,6 +230,10 @@ def __iter__(self):
)
)

def __getitem__(self, i: int):
    """Return the i-th element iterated over in vectorization.

    The element is obtained by iterating over `self` and skipping the first
    `i` iterates, so the cost grows with `i`.

    Parameters
    ----------
    i : int
        zero-based position of the iterate to return, must be non-negative

    Returns
    -------
    the `i`-th object yielded by `iter(self)`

    Raises
    ------
    IndexError
        if iterating over `self` yields `i` or fewer elements
    ValueError
        if `i` is negative (raised by `itertools.islice`)
    """
    try:
        return next(itertools.islice(self, i, None))
    except StopIteration:
        # a leaked StopIteration would silently end enclosing generators and
        # breaks the sequence protocol, which expects IndexError out of range
        raise IndexError(f"index {i} is out of range") from None

def items(self, iterate_as=None, iterate_cols=None):
"""Iterate over (group name, column name, instance) tuples.

Expand Down Expand Up @@ -436,7 +347,7 @@ def reconstruct(

Parameters
----------
df_list : iterable of objects of same type and sequence as __getitem__ returns.
df_list : iterable of objects of same type and sequence as __iter__ returns.
can be self, but will in general be another object to be useful.
Example: [some_operation(df) for df in self] that leaves types the same
convert_back : bool, optional, default = False
Expand Down Expand Up @@ -605,7 +516,7 @@ def vectorize_est(
return_type : str, one of "pd.DataFrame" or "list"
the return will be of this type;
if `pd.DataFrame`, with row/col indices being `self.get_iter_indices()`
if `list`, entries in sequence corresponding to `self.__getitem__`
if `list`, entries in sequence corresponding to `self.__iter__`
rowname_default : str, optional, default="estimators"
used as index name of single row if no row vectorization is performed
colname_default : str, optional, default="estimators"
Expand Down
45 changes: 18 additions & 27 deletions sktime/performance_metrics/forecasting/_classes.py
Expand Up @@ -7,7 +7,6 @@
Classes named as ``*Error`` or ``*Loss`` return a value to minimize:
the lower the better.
"""
from copy import deepcopy
from inspect import getfullargspec, isfunction, signature
from warnings import warn

Expand Down Expand Up @@ -265,32 +264,26 @@ def _evaluate_vectorized(self, y_true, y_pred, **kwargs):

Parameters
----------
y_true : pandas.DataFrame with MultiIndex, last level time-like
y_pred : pandas.DataFrame with MultiIndex, last level time-like
non-time-like instances of y_true, y_pred must be identical
y_true : VectorizedDF
y_pred : VectorizedDF
non-time-like instances of y_true, y_pred must be identical
"""
kwargsi = deepcopy(kwargs)
n_batches = len(y_true)
res = []
for i in range(n_batches):
if "y_train" in kwargs:
kwargsi["y_train"] = kwargs["y_train"][i]
if "y_pred_benchmark" in kwargs:
kwargsi["y_pred_benchmark"] = kwargs["y_pred_benchmark"][i]
resi = self._evaluate(y_true=y_true[i], y_pred=y_pred[i], **kwargsi)
if isinstance(resi, float):
resi = pd.Series(resi)
if self.multioutput == "raw_values":
assert isinstance(resi, np.ndarray)
df = pd.DataFrame(columns=y_true.X.columns)
df.loc[0] = resi
resi = df
res += [resi]
out_df = y_true.reconstruct(res)
if out_df.index.nlevels == y_true.X.index.nlevels:
out_df.index = out_df.index.droplevel(-1)
eval_result = y_true.vectorize_est(
estimator=self.clone(),
method="_evaluate",
varname_of_self="y_true",
args={**kwargs, "y_pred": y_pred},
colname_default=self.name,
)

return out_df
if self.multioutput == "raw_values":
return pd.DataFrame(
eval_result.iloc[:, 0].to_list(),
index=eval_result.index,
columns=y_true.X.columns,
)
else:
return eval_result

def evaluate_by_index(self, y_true, y_pred, **kwargs):
"""Return the metric evaluated at each time point.
Expand Down Expand Up @@ -578,7 +571,6 @@ def get_test_params(cls, parameter_set="default"):
"""

def custom_mape(y_true, y_pred) -> float:

eps = np.finfo(np.float64).eps

result = np.mean(np.abs(y_true - y_pred) / np.maximum(np.abs(y_true), eps))
Expand Down Expand Up @@ -805,7 +797,6 @@ def __init__(
multilevel="uniform_average",
sp=1,
):

self.sp = sp
super().__init__(multioutput=multioutput, multilevel=multilevel)

Expand Down