diff --git a/sktime/performance_metrics/forecasting/probabilistic/_classes.py b/sktime/performance_metrics/forecasting/probabilistic/_classes.py
index 5c3e6d9ae67..756d6c3b8f7 100644
--- a/sktime/performance_metrics/forecasting/probabilistic/_classes.py
+++ b/sktime/performance_metrics/forecasting/probabilistic/_classes.py
@@ -117,14 +117,21 @@ def evaluate(self, y_true, y_pred, multioutput=None, **kwargs):
         # pass to inner function
         out = self._evaluate(y_true_inner, y_pred_inner, multioutput, **kwargs)
 
-        if self.score_average and multioutput == "uniform_average":
-            out = float(out.mean(axis=1).iloc[0])  # average over all
-        if self.score_average and multioutput == "raw_values":
-            out = out.groupby(axis=1, level=0).mean()  # average over scores
-        if not self.score_average and multioutput == "uniform_average":
-            out = out.groupby(axis=1, level=1).mean()  # average over variables
-        if not self.score_average and multioutput == "raw_values":
-            out = out  # don't average
+        if isinstance(multioutput, str):
+            if self.score_average and multioutput == "uniform_average":
+                out = float(out.mean(axis=1).iloc[0])  # average over all
+            if self.score_average and multioutput == "raw_values":
+                out = out.groupby(axis=1, level=0).mean()  # average over scores
+            if not self.score_average and multioutput == "uniform_average":
+                out = out.groupby(axis=1, level=1).mean()  # average over variables
+            if not self.score_average and multioutput == "raw_values":
+                out = out  # don't average
+        else:  # is np.array with weights
+            if self.score_average:
+                out_raw = out.groupby(axis=1, level=0).mean()
+                out = out_raw.dot(multioutput)[0]
+            else:
+                out = _groupby_dot(out, multioutput)
 
         if isinstance(out, pd.DataFrame):
             out = out.squeeze(axis=0)
@@ -216,9 +223,9 @@ def evaluate_by_index(self, y_true, y_pred, multioutput=None, **kwargs):
         else:  # numpy array
             if self.score_average:
                 out_raw = out.groupby(axis=1, level=0).mean()
-                out = out_raw.groupby(axis=1, level=1).dot(multioutput)
+                out = out_raw.dot(multioutput)
             else:
-                out = out.groupby(axis=1, level=1).dot(multioutput)
+                out = _groupby_dot(out, multioutput)
 
         return out
 
@@ -382,6 +389,27 @@ def _handle_multioutput(self, loss, multioutput):
         return out
 
 
+def _groupby_dot(df, weights):
+    """Groupby dot product.
+
+    Groups df by axis 1, level 1, and applies dot product with weights.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        dataframe to groupby
+    weights : np.array
+        weights to apply to each group
+
+    Returns
+    -------
+    out : pd.DataFrame
+        dataframe with weighted groupby dot product
+    """
+    out = df.groupby(axis=1, level=1).apply(lambda x: x.dot(weights))
+    return out
+
+
 class PinballLoss(_BaseProbaForecastingErrorMetric):
     """Pinball loss aka quantile loss for quantile/interval predictions.
 
diff --git a/sktime/performance_metrics/forecasting/probabilistic/tests/test_probabilistic_metrics.py b/sktime/performance_metrics/forecasting/probabilistic/tests/test_probabilistic_metrics.py
index 1426803c603..de91e230610 100644
--- a/sktime/performance_metrics/forecasting/probabilistic/tests/test_probabilistic_metrics.py
+++ b/sktime/performance_metrics/forecasting/probabilistic/tests/test_probabilistic_metrics.py
@@ -224,3 +224,52 @@ def test_evaluate_alpha_negative(metric, sample_data):
     # 0.3 not in test quantile data so raise error.
     Loss = metric.create_test_instance().set_params(alpha=0.3)
     res = Loss(y_true=y_true, y_pred=y_pred)  # noqa
+
+
+@pytest.mark.parametrize("metric", all_metrics)
+@pytest.mark.parametrize("score_average", [True, False])
+def test_multioutput_weighted(metric, score_average):
+    """Test output contracts for multioutput weights."""
+    y_true = pd.DataFrame({"var1": [3, -0.5, 2, 7, 2], "var2": [4, 0.5, 3, 8, 3]})
+    y_pred = pd.DataFrame(
+        {
+            ("var1", 0.05): [1.5, -1, 1, 4, 0.65],
+            ("var1", 0.5): [2.5, 0, 2, 8, 1.25],
+            ("var1", 0.95): [3.5, 4, 3, 12, 1.85],
+            ("var2", 0.05): [2.5, 0, 2, 8, 1.25],
+            ("var2", 0.5): [5.0, 1, 4, 16, 2.5],
+            ("var2", 0.95): [7.5, 2, 6, 24, 3.75],
+        }
+    )
+
+    weights = np.array([0.3, 0.7])
+
+    loss = metric.create_test_instance()
+    loss.set_params(score_average=score_average, multioutput=weights)
+
+    eval_loss = loss(y_true, y_pred)
+
+    if loss.get_tag("scitype:y_pred") == "pred_interval":
+        # 1 full interval, lower = 0.05, upper = 0.95
+        expected_score_ix = [0.9]
+    else:
+        # 3 quantile scores, 0.05, 0.5, 0.95
+        expected_score_ix = [0.05, 0.5, 0.95]
+    no_expected_scores = len(expected_score_ix)
+    expected_timepoints = len(y_pred)
+
+    if score_average:
+        assert isinstance(eval_loss, float)
+    else:
+        assert isinstance(eval_loss, pd.Series)
+        assert len(eval_loss) == no_expected_scores
+
+    eval_loss_by_index = loss.evaluate_by_index(y_true, y_pred)
+    assert len(eval_loss_by_index) == expected_timepoints
+
+    if score_average:
+        assert isinstance(eval_loss_by_index, pd.Series)
+    else:
+        assert isinstance(eval_loss_by_index, pd.DataFrame)
+        assert eval_loss_by_index.shape == (expected_timepoints, no_expected_scores)
+        assert eval_loss_by_index.columns.to_list() == expected_score_ix
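
Usage sketch for reviewers (not part of the patch): the snippet below mirrors the new test and shows the intended output contract when multioutput is an array of per-variable weights rather than a string. It is a minimal sketch assuming the public PinballLoss constructor and import path; the test itself goes through create_test_instance() and set_params instead of an explicit constructor call.

    import numpy as np
    import pandas as pd

    from sktime.performance_metrics.forecasting.probabilistic import PinballLoss

    # two target variables observed at five time points
    y_true = pd.DataFrame({"var1": [3, -0.5, 2, 7, 2], "var2": [4, 0.5, 3, 8, 3]})

    # quantile predictions for both variables at quantile points 0.05, 0.5, 0.95
    y_pred = pd.DataFrame(
        {
            ("var1", 0.05): [1.5, -1, 1, 4, 0.65],
            ("var1", 0.5): [2.5, 0, 2, 8, 1.25],
            ("var1", 0.95): [3.5, 4, 3, 12, 1.85],
            ("var2", 0.05): [2.5, 0, 2, 8, 1.25],
            ("var2", 0.5): [5.0, 1, 4, 16, 2.5],
            ("var2", 0.95): [7.5, 2, 6, 24, 3.75],
        }
    )

    # per-variable weights, passed instead of "uniform_average" or "raw_values"
    weights = np.array([0.3, 0.7])

    # score_average=True: loss is averaged over quantile points per variable,
    # then the per-variable losses are combined with the weights -> single float
    loss = PinballLoss(score_average=True, multioutput=weights)
    print(loss(y_true, y_pred))

    # score_average=False: only the variable dimension is weighted away ->
    # pd.Series with one entry per quantile point (0.05, 0.5, 0.95)
    loss = PinballLoss(score_average=False, multioutput=weights)
    print(loss(y_true, y_pred))

    # evaluate_by_index keeps the time axis: pd.DataFrame of shape
    # (n_timepoints, n_quantile_points) for score_average=False
    print(loss.evaluate_by_index(y_true, y_pred))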