
[BUG] ensure forecasting tuners do not vectorize over columns (variables) #5145

Merged
merged 12 commits on Sep 2, 2023
24 changes: 23 additions & 1 deletion sktime/forecasting/model_selection/_tune.py
@@ -69,7 +69,6 @@ def __init__(
"capability:pred_int",
"capability:pred_int:insample",
"capability:insample",
"scitype:y",
"ignores-exogeneous-X",
"handles-missing-data",
"y_inner_mtype",
@@ -86,11 +85,28 @@ def __init__(
# see further details in _DelegatedForecaster docstring
_delegate_name = "best_forecaster_"

# ensure informative exception is raised for forecasters_ attribute
@property
def forecasters_(self):
raise AttributeError(
f"{self.__class__.__name__}.forecasters_ property should not be called "
"as tuning wrappers do not broadcast over instances or variables, "
"this is left to the underlying forecaster. "
"A previous, erroneous instance of column broadcasting was removed "
"in 0.22.1, see issue #5143 for a discussion. "
"To tune per column/variable, wrap the tuner in ColumnEnsembleForecaster. "
"To tune per instance or hierarchy level, wrap the tuner in "
"ForecastByLevel. In both cases, individual tuned forecasters will be "
"present in the forecasters_ attribute."
)
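For readers of the error message above, a minimal sketch of the two recommended wrapping patterns; the imports, splitter, grid, and data below are illustrative assumptions and not part of this diff:

from sktime.datasets import load_longley
from sktime.forecasting.compose import ColumnEnsembleForecaster, ForecastByLevel
from sktime.forecasting.model_selection import ExpandingWindowSplitter, ForecastingGridSearchCV
from sktime.forecasting.naive import NaiveForecaster

# the inner tuner; after this PR it no longer broadcasts over columns itself
param_grid = {"strategy": ["last", "mean"]}
cv = ExpandingWindowSplitter(fh=[1, 2, 3])
gscv = ForecastingGridSearchCV(forecaster=NaiveForecaster(), cv=cv, param_grid=param_grid)

# tune per column/variable: one tuned clone of gscv is fitted per column of y
per_column = ColumnEnsembleForecaster(gscv)
# tune per instance or hierarchy level instead:
per_level = ForecastByLevel(gscv)

_, y = load_longley()  # the multivariate Longley frame, used here as endogenous data
per_column.fit(y, fh=[1, 2, 3])
per_column.forecasters_  # the individual tuned forecasters, one per column

Broadcasting thus happens in the wrapper rather than in the tuner, which is the contract that the forecasters_ property above enforces.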

def _extend_to_all_scitypes(self, tagname):
"""Ensure mtypes for all scitypes are in the tag with tagname.

Mutates self tag with name `tagname`.
If no mtypes of a time series scitype are present, adds a pandas-based one.
If only the univariate pandas mtype ("pd.Series") is present for the Series
scitype, also adds the multivariate one ("pd.DataFrame").

Parameters
----------
@@ -104,10 +120,16 @@ def _extend_to_all_scitypes(self, tagname):
if not isinstance(tagval, list):
tagval = [tagval]
scitypes = mtype_to_scitype(tagval, return_unique=True)
# if no Series mtypes are present, add pd.DataFrame
if "Series" not in scitypes:
tagval = tagval + ["pd.DataFrame"]
# ensure we have a Series mtype capable of multivariate
elif "pd.Series" in tagval and "pd.DataFrame" not in tagval:
tagval = ["pd.DataFrame"] + tagval
# if no Panel mtypes are present, add a pd.DataFrame-based one
if "Panel" not in scitypes:
tagval = tagval + ["pd-multiindex"]
# if no Hierarchical mtypes are present, add a pd.DataFrame-based one
if "Hierarchical" not in scitypes:
tagval = tagval + ["pd_multiindex_hier"]
self.set_tags(**{tagname: tagval})
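To illustrate the tag extension above, a sketch (not part of the diff) of what happens to a y_inner_mtype tag that starts with only the univariate "pd.Series" mtype:

# illustrative trace of _extend_to_all_scitypes("y_inner_mtype")
tagval = ["pd.Series"]  # scitypes present: {"Series"}
# Series scitype is present, but only the univariate mtype: prepend "pd.DataFrame"
tagval = ["pd.DataFrame"] + tagval
# no Panel mtype present: append the pandas-based "pd-multiindex"
tagval = tagval + ["pd-multiindex"]
# no Hierarchical mtype present: append "pd_multiindex_hier"
tagval = tagval + ["pd_multiindex_hier"]
# result: ["pd.DataFrame", "pd.Series", "pd-multiindex", "pd_multiindex_hier"]

With the extended tag, the tuner accepts multivariate, panel, and hierarchical input directly and leaves any broadcasting to the wrapped forecaster.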
23 changes: 18 additions & 5 deletions sktime/forecasting/model_selection/tests/test_tune.py
@@ -81,12 +81,13 @@ def _check_cv(forecaster, tuner, cv, param_grid, y, X, scoring):
assert param_grid[best_idx].items() <= fitted_params.items()


def _create_hierarchical_data():
def _create_hierarchical_data(n_columns=1):
y = _make_hierarchical(
random_state=TEST_RANDOM_SEEDS[0],
hierarchy_levels=(2, 2),
min_timepoints=15,
max_timepoints=15,
n_columns=n_columns,
)
X = _make_hierarchical(
random_state=TEST_RANDOM_SEEDS[1],
@@ -97,7 +98,10 @@ def _create_hierarchical_data():
return y, X


NAIVE = NaiveForecaster(strategy="mean")
# estimator fixtures used for tuning
# set_tags forces NaiveForecaster to be univariate, so the tuners have to broadcast;
# NaiveForecaster is univariate at present anyway, but a future multivariate
# NaiveForecaster would otherwise silently reduce the coverage of these tests
NAIVE = NaiveForecaster(strategy="mean").set_tags(**{"scitype:y": "univariate"})
NAIVE_GRID = {"window_length": TEST_WINDOW_LENGTHS_INT}
PIPE = TransformedTargetForecaster(
[
@@ -126,9 +130,15 @@ def _create_hierarchical_data():
@pytest.mark.parametrize("scoring", TEST_METRICS)
@pytest.mark.parametrize("cv", CVs)
@pytest.mark.parametrize("error_score", ERROR_SCORES)
def test_gscv(forecaster, param_grid, cv, scoring, error_score):
@pytest.mark.parametrize("multivariate", [True, False])
def test_gscv(forecaster, param_grid, cv, scoring, error_score, multivariate):
"""Test ForecastingGridSearchCV."""
y, X = load_longley()
if multivariate:
X, y = load_longley()
else:
y, X = load_longley()

gscv = ForecastingGridSearchCV(
forecaster,
param_grid=param_grid,
@@ -177,6 +187,7 @@ def test_rscv(forecaster, param_grid, cv, scoring, error_score, n_iter, random_s
ParameterSampler(param_grid, n_iter, random_state=random_state)
)
_check_cv(forecaster, rscv, cv, param_distributions, y, X, scoring)
_check_fitted_params_keys(rscv.get_fitted_params())


@pytest.mark.skipif(
@@ -189,9 +200,10 @@ def test_rscv(forecaster, param_grid, cv, scoring, error_score, n_iter, random_s
@pytest.mark.parametrize("scoring", TEST_METRICS)
@pytest.mark.parametrize("cv", CVs)
@pytest.mark.parametrize("error_score", ERROR_SCORES)
def test_gscv_hierarchical(forecaster, param_grid, cv, scoring, error_score):
@pytest.mark.parametrize("n_cols", [1, 2])
def test_gscv_hierarchical(forecaster, param_grid, cv, scoring, error_score, n_cols):
"""Test ForecastingGridSearchCV."""
y, X = _create_hierarchical_data()
y, X = _create_hierarchical_data(n_columns=n_cols)
gscv = ForecastingGridSearchCV(
forecaster,
param_grid=param_grid,
@@ -203,6 +215,7 @@ def test_gscv_hierarchical(forecaster, param_grid, cv, scoring, error_score):

param_grid = ParameterGrid(param_grid)
_check_cv(forecaster, gscv, cv, param_grid, y, X, scoring)
_check_fitted_params_keys(gscv.get_fitted_params())


@pytest.mark.skipif(