[ENH] rewrite test_probabilistic_metrics using proper pytest fixtures (#4946)

#### Reference Issues/PRs
Fixes #4907 

#### What does this implement/fix? Explain your changes.
Moved the sample data generation inside fixtures, so it does not run on
every import but only when the tests are executed.
Moreover, the tests were refactored so that the output tests for interval
predictions and quantile predictions are separated.

Reason: with a single parametrized test there were 96 parameter combinations,
which made the test very slow and CPU-heavy.
Also, some parameter combinations are unnecessary (quantile metrics apply
only to quantile forecasts, and interval metrics only to interval
forecasts) - please correct me if I'm wrong.
Separating the tests speeds up the testing process considerably and
makes debugging easier.
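
For illustration, here is a minimal, self-contained sketch of the pattern this PR adopts (toy code, not taken from sktime: the fixture name `series_data` and the pandas/NumPy data are hypothetical). An indirectly parametrized pytest fixture builds the data only when a test requests it, instead of at module import time:

```python
import numpy as np
import pandas as pd
import pytest


@pytest.fixture
def series_data(request):
    """Build toy data at test time, not at import time."""
    n_columns = request.param
    index = pd.period_range("2000-01", periods=20, freq="M")
    values = np.random.default_rng(0).normal(size=(20, n_columns))
    return pd.DataFrame(values, index=index)


# indirect=True routes each parameter into the fixture via request.param.
@pytest.mark.parametrize("series_data", [1, 3], indirect=True)
def test_has_expected_width(series_data):
    assert series_data.shape[1] in (1, 3)
```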


#### Did you add any tests for the change?

Added the test `test_sample_data()` to check that the sample-data-generating
fixture `sample_data()` works correctly.

#### Any other comments?
The output test function contains many nested if-else statements. In
my opinion, these should be separate tests with separately parametrized
inputs, instead of putting all inputs into one test and then
distinguishing the cases with if-else statements.
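
As a toy illustration of that suggestion (hypothetical test names, not code from this PR), the single branching test and the split, separately parametrized tests might look like this:

```python
import pytest


# One test that branches on the prediction type ...
@pytest.mark.parametrize("pred_type, value", [("interval", 0.9), ("quantile", 0.5)])
def test_mixed(pred_type, value):
    if pred_type == "interval":
        assert 0.0 < value < 1.0
    else:
        assert value == 0.5


# ... versus two focused tests, each parametrized only with the inputs it needs.
@pytest.mark.parametrize("coverage", [0.9])
def test_interval_only(coverage):
    assert 0.0 < coverage < 1.0


@pytest.mark.parametrize("alpha", [0.5])
def test_quantile_only(alpha):
    assert alpha == 0.5
```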
julia-kraus committed Jul 24, 2023
1 parent ed4efe9 commit f4815d7
Showing 2 changed files with 112 additions and 74 deletions.
1 change: 1 addition & 0 deletions CONTRIBUTORS.md
@@ -282,6 +282,7 @@ Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/d
<td align="center" valign="top" width="11.11%"><a href="https://github.com/vincent-nich12"><img src="https://avatars3.githubusercontent.com/u/36476633?v=4?s=100" width="100px;" alt="vincent-nich12"/><br /><sub><b>vincent-nich12</b></sub></a><br /><a href="https://github.com/sktime/sktime/commits?author=vincent-nich12" title="Code">💻</a></td>
<td align="center" valign="top" width="11.11%"><a href="https://github.com/vollmersj"><img src="https://avatars2.githubusercontent.com/u/12613127?v=4?s=100" width="100px;" alt="vollmersj"/><br /><sub><b>vollmersj</b></sub></a><br /><a href="https://github.com/sktime/sktime/commits?author=vollmersj" title="Documentation">📖</a></td>
<td align="center" valign="top" width="11.11%"><a href="https://github.com/xiaobenbenecho"><img src="https://avatars.githubusercontent.com/u/17461849?v=4?s=100" width="100px;" alt="xiaobenbenecho"/><br /><sub><b>xiaobenbenecho</b></sub></a><br /><a href="https://github.com/sktime/sktime/commits?author=xiaobenbenecho" title="Code">💻</a></td>
<td align="center" valign="top" width="11.11%"><a href="https://github.com/julia-kraus"><img src="https://avatars.githubusercontent.com/u/22000879?v=4" width="100px;" alt="julia-kraus"/><br /><sub><b>julia-kraus</b></sub></a><br /><a href="https://github.com/sktime/sktime/commits?author=julia-kraus" title="Code">💻</a></td>
</tr>
</tbody>
</table>
@@ -1,4 +1,4 @@
"""Tests for probabilistic quantiles."""
"""Tests for probabilistic performance metrics."""
import warnings

import numpy as np
@@ -25,78 +25,62 @@
    ConstraintViolation,
]

all_metrics = quantile_metrics + interval_metrics


y_uni = _make_series(n_columns=1)
y_train_uni, y_test_uni = temporal_train_test_split(y_uni)
fh_uni = np.arange(len(y_test_uni)) + 1
f_uni = NaiveVariance(NaiveForecaster())
f_uni.fit(y_train_uni)

y_multi = _make_series(n_columns=3)
y_train_multi, y_test_multi = temporal_train_test_split(y_multi)
fh_multi = np.arange(len(y_test_multi)) + 1
f_multi = NaiveVariance(NaiveForecaster())
f_multi.fit(y_train_multi)
"""Cases we need to test score average = TRUE/FALSE multivariable = TRUE/FALSE
multiscores = TRUE/FALSE.
Data types Univariate and single score Univariate and multi score Multivariate and
single score Multivariate and multiscor
For each of the data types we need to test with score average = T/F and multioutput with
"raw_values" and "uniform_average"
"""
quantile_pred_uni_s = f_uni.predict_quantiles(fh=fh_uni, alpha=[0.5])
interval_pred_uni_s = f_uni.predict_interval(fh=fh_uni, coverage=0.9)
quantile_pred_uni_m = f_uni.predict_quantiles(fh=fh_uni, alpha=[0.05, 0.5, 0.95])
interval_pred_uni_m = f_uni.predict_interval(fh=fh_uni, coverage=[0.7, 0.8, 0.9, 0.99])

quantile_pred_multi_s = f_multi.predict_quantiles(fh=fh_multi, alpha=[0.5])
interval_pred_multi_s = f_multi.predict_interval(fh=fh_multi, coverage=0.9)
quantile_pred_multi_m = f_multi.predict_quantiles(fh=fh_multi, alpha=[0.05, 0.5, 0.95])
interval_pred_multi_m = f_multi.predict_interval(
    fh=fh_multi, coverage=[0.7, 0.8, 0.9, 0.99]
)
all_metrics = interval_metrics + quantile_metrics

uni_data = [
    quantile_pred_uni_s,
    interval_pred_uni_s,
    quantile_pred_uni_m,
    interval_pred_uni_m,
]
alpha_s = [0.5]
alpha_m = [0.05, 0.5, 0.95]
coverage_s = 0.9
coverage_m = [0.7, 0.8, 0.9, 0.99]

multi_data = [
    quantile_pred_multi_s,
    interval_pred_multi_s,
    quantile_pred_multi_m,
    interval_pred_multi_m,
]

quantile_data = [
    quantile_pred_uni_s,
    quantile_pred_uni_m,
    quantile_pred_multi_s,
    quantile_pred_multi_m,
]
@pytest.fixture
def sample_data(request):
    n_columns, coverage_or_alpha, pred_type = request.param

interval_data = [
    interval_pred_uni_s,
    interval_pred_uni_m,
    interval_pred_multi_s,
    interval_pred_multi_m,
]
    y = _make_series(n_columns=n_columns)
    y_train, y_test = temporal_train_test_split(y)
    fh = np.arange(len(y_test)) + 1

    # fit model
    f = NaiveVariance(NaiveForecaster())
    f.fit(y_train)

    # predict model

    if pred_type == "interval":
        interval_pred = f.predict_interval(fh=fh, coverage=coverage_or_alpha)
        return y_test, interval_pred

    elif pred_type == "quantile":
        quantile_pred = f.predict_quantiles(fh=fh, alpha=coverage_or_alpha)
        return y_test, quantile_pred

    return


# Test the parametrized fixture
@pytest.mark.parametrize(
"y_true, y_pred",
list(zip([y_test_uni] * 4, uni_data)) + list(zip([y_test_multi] * 4, multi_data)),
"sample_data",
[
(1, alpha_s, "quantile"),
(3, alpha_s, "quantile"),
(1, alpha_m, "quantile"),
(3, alpha_m, "quantile"),
(1, coverage_s, "interval"),
(3, coverage_s, "interval"),
(1, coverage_m, "interval"),
(3, coverage_m, "interval"),
],
indirect=True,
)
@pytest.mark.parametrize("metric", all_metrics)
@pytest.mark.parametrize("multioutput", ["uniform_average", "raw_values"])
@pytest.mark.parametrize("score_average", [True, False])
def test_output(metric, score_average, multioutput, y_true, y_pred):
def test_sample_data(sample_data):
    y_true, y_pred = sample_data
    assert isinstance(y_true, (pd.Series, pd.DataFrame))
    assert isinstance(y_pred, pd.DataFrame)


def helper_check_output(metric, score_average, multioutput, sample_data):
    """Test output is correct class and shape for given data."""
    y_true, y_pred = sample_data
    loss = metric.create_test_instance()
    loss.set_params(score_average=score_average, multioutput=multioutput)
@@ -158,32 +142,85 @@ def test_output(metric, score_average, multioutput, y_true, y_pred):
        assert len(eval_loss) == no_vars


@pytest.mark.parametrize("Metric", quantile_metrics)
@pytest.mark.parametrize(
"y_pred, y_true", list(zip(quantile_data, [y_test_uni] * 2 + [y_test_multi] * 2))
"sample_data",
[
(1, alpha_s, "quantile"),
(3, alpha_s, "quantile"),
(1, alpha_m, "quantile"),
(3, alpha_m, "quantile"),
],
indirect=True,
)
def test_evaluate_alpha_positive(Metric, y_pred, y_true):
@pytest.mark.parametrize("metric", all_metrics)
@pytest.mark.parametrize("multioutput", ["uniform_average", "raw_values"])
@pytest.mark.parametrize("score_average", [True, False])
def test_output_quantiles(metric, score_average, multioutput, sample_data):
    helper_check_output(metric, score_average, multioutput, sample_data)


@pytest.mark.parametrize(
"sample_data",
[
(1, coverage_s, "interval"),
(3, coverage_s, "interval"),
(1, coverage_m, "interval"),
(3, coverage_m, "interval"),
],
indirect=True,
)
@pytest.mark.parametrize("metric", all_metrics)
@pytest.mark.parametrize("multioutput", ["uniform_average", "raw_values"])
@pytest.mark.parametrize("score_average", [True, False])
def test_output_intervals(metric, score_average, multioutput, sample_data):
    helper_check_output(metric, score_average, multioutput, sample_data)


@pytest.mark.parametrize("metric", quantile_metrics)
@pytest.mark.parametrize(
"sample_data",
[
(1, alpha_s, "quantile"),
(3, alpha_s, "quantile"),
(1, alpha_m, "quantile"),
(3, alpha_m, "quantile"),
],
indirect=True,
)
def test_evaluate_alpha_positive(metric, sample_data):
"""Tests output when required quantile is present."""
# 0.5 in test quantile data don't raise error.
Loss = Metric.create_test_instance().set_params(alpha=0.5, score_average=False)

y_true, y_pred = sample_data

Loss = metric.create_test_instance().set_params(alpha=0.5, score_average=False)
res = Loss(y_true=y_true, y_pred=y_pred)
assert len(res) == 1

if all(x in y_pred.columns.get_level_values(1) for x in [0.5, 0.95]):
Loss = Metric.create_test_instance().set_params(
Loss = metric.create_test_instance().set_params(
alpha=[0.5, 0.95], score_average=False
)
res = Loss(y_true=y_true, y_pred=y_pred)
assert len(res) == 2


@pytest.mark.parametrize("Metric", quantile_metrics)
# This test uses quantile prediction data.
@pytest.mark.parametrize(
    "y_pred, y_true", list(zip(quantile_data, [y_test_uni] * 2 + [y_test_multi] * 2))
    "sample_data",
    [
        (1, alpha_s, "quantile"),
        (3, alpha_s, "quantile"),
        (1, alpha_m, "quantile"),
        (3, alpha_m, "quantile"),
    ],
    indirect=True,
)
def test_evaluate_alpha_negative(Metric, y_pred, y_true):
@pytest.mark.parametrize("metric", quantile_metrics)
def test_evaluate_alpha_negative(metric, sample_data):
"""Tests whether correct error raised when required quantile not present."""
y_true, y_pred = sample_data
with pytest.raises(ValueError):
# 0.3 not in test quantile data so raise error.
Loss = Metric.create_test_instance().set_params(alpha=0.3)
Loss = metric.create_test_instance().set_params(alpha=0.3)
res = Loss(y_true=y_true, y_pred=y_pred) # noqa
