From d3cb87c5ac4cafb5c7df024d18cc6a3bacc5fe85 Mon Sep 17 00:00:00 2001 From: Vasudeva Kilaru <70791259+Vasudeva-bit@users.noreply.github.com> Date: Thu, 14 Dec 2023 03:38:40 +0530 Subject: [PATCH] [ENH] in `ARCH`, fix `str` coercion of `pd.Series` name (#5407) Related to #5349 (issue), #5384 (PR). > After debugging for a while, > > 1. Firstly, the function `_check_predict_proba` is only testing for `y_train` generated from `_make_data(n_columns=n_columns)`. If the forecaster is univariate, then the `y_train` generated would always have column name `None` of type `str`. The `_check_predict_proba` checks whether the `_predict_proba` is changing `None` to `0` internally. Since every forecaster's `_predict_proba` doing it, all forecasters passed. > 2. Actually `ARCH` should fail this test, because `_predict_proba` function is deleted in this PR. But, the `type(y_train.name)` is `str` (`'None'`), not `None`. Therefore, `None` case is skipped and checks whether `(pred_cols == y_train.name).all()` which is `None` of `str` equals `None` of `str` so, `ARCH` also passed. > > I think it's my bad, there is no bug in `_check_predict_proba` as it is not designed to check whether column names are preserved. Changes should only be in `ARCH`'s `_predict_proba` like below: > > 1. If the column name is `None` of type `None`/`str`, do `pred_dist.name = pd.Index([0])`. > 2. Otherwise, just return output of `Super()._predict_proba()` as it is, i.e., column names are preserved by default. I think there was some ambiguity in my explanation for #5349 (issue) in #5384 (PR), but what I meant was exactly the changes in this PR. --- sktime/forecasting/arch/_uarch.py | 16 +++++++++++----- sktime/forecasting/tests/test_all_forecasters.py | 2 +- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/sktime/forecasting/arch/_uarch.py b/sktime/forecasting/arch/_uarch.py index c637902150e..62098f2530f 100644 --- a/sktime/forecasting/arch/_uarch.py +++ b/sktime/forecasting/arch/_uarch.py @@ -251,6 +251,7 @@ def _fit(self, y, X=None, fh=None): if fh: self._horizon = fh + y_name = y.name self._forecaster = _ARCH( y=y, x=X, @@ -277,6 +278,7 @@ def _fit(self, y, X=None, fh=None): options=self.options, backcast=self.backcast, ) + y.name = y_name return self def _get_arch_result_object(self, fh=None, X=None): @@ -344,7 +346,7 @@ def _predict(self, fh, X=None): y_pred = pd.Series( ArchResultObject.mean.values[-1], index=full_range, - name=str(self._y.name), + name=self._y.name, ) y_pred = y_pred.loc[abs_idx.to_pandas()] y_pred.index = self._horizon.to_absolute_index(self.cutoff) @@ -398,11 +400,15 @@ def _predict_interval(self, fh, X, coverage): upper_int = mean_forecast + (z_critical * std_err) lower_df = pd.DataFrame( lower_int, - columns=[y_col_name + " " + str(alpha) + " " + "lower"], + columns=[ + y_col_name if y_col_name else "0" + " " + str(alpha) + " " + "lower" + ], ) upper_df = pd.DataFrame( upper_int, - columns=[y_col_name + " " + str(alpha) + " " + "upper"], + columns=[ + y_col_name if y_col_name else "0" + " " + str(alpha) + " " + "upper" + ], ) df_list.append(pd.concat((lower_df, upper_df), axis=1)) concat_df = pd.concat(df_list, axis=1) @@ -410,7 +416,7 @@ def _predict_interval(self, fh, X, coverage): OrderedDict.fromkeys( [ col_df - for col in y_col_name + for col in (y_col_name if y_col_name else "0") for col_df in concat_df.columns if col in col_df ] @@ -425,7 +431,7 @@ def _predict_interval(self, fh, X, coverage): final_columns = list( itertools.product( *[ - [y_col_name], + [y_col_name if y_col_name else 0], coverage, df.columns.get_level_values(2).unique(), ] diff --git a/sktime/forecasting/tests/test_all_forecasters.py b/sktime/forecasting/tests/test_all_forecasters.py index 3bff7ad35e4..07929a18e30 100644 --- a/sktime/forecasting/tests/test_all_forecasters.py +++ b/sktime/forecasting/tests/test_all_forecasters.py @@ -383,7 +383,7 @@ def get_expected_columns(): found = pred_ints.columns.to_flat_index() msg = ( - "columns of returned prediction interval DataFrame do not" + "columns of returned prediction interval DataFrame do not " f"match up with expected columns. Expected: {expected}," f"found: {found}" )