From d3cb87c5ac4cafb5c7df024d18cc6a3bacc5fe85 Mon Sep 17 00:00:00 2001
From: Vasudeva Kilaru <70791259+Vasudeva-bit@users.noreply.github.com>
Date: Thu, 14 Dec 2023 03:38:40 +0530
Subject: [PATCH] [ENH] in `ARCH`, fix `str` coercion of `pd.Series` name
 (#5407)

Related to #5349 (issue), #5384 (PR).


> After debugging for a while,
>
> 1. Firstly, the function `_check_predict_proba` is only testing for
`y_train` generated from `_make_data(n_columns=n_columns)`. If the
forecaster is univariate, then the `y_train` generated would always have
column name `None` of type `str`. The `_check_predict_proba` checks
whether the `_predict_proba` is changing `None` to `0` internally. Since
every forecaster's `_predict_proba` doing it, all forecasters passed.
> 2. Actually `ARCH` should fail this test, because `_predict_proba`
function is deleted in this PR. But, the `type(y_train.name)` is `str`
(`'None'`), not `None`. Therefore, `None` case is skipped and checks
whether `(pred_cols == y_train.name).all()` which is `None` of `str`
equals `None` of `str` so, `ARCH` also passed.
>
> I think it's my bad, there is no bug in `_check_predict_proba` as it
is not designed to check whether column names are preserved. Changes
should only be in `ARCH`'s `_predict_proba` like below:
>
> 1. If the column name is `None` of type `None`/`str`, do
`pred_dist.name = pd.Index([0])`.
> 2. Otherwise, just return output of `Super()._predict_proba()` as it
is, i.e., column names are preserved by default.

I think there was some ambiguity in my explanation for #5349 (issue) in
#5384 (PR), but what I meant was exactly the changes in this PR.
---
 sktime/forecasting/arch/_uarch.py                | 16 +++++++++++-----
 sktime/forecasting/tests/test_all_forecasters.py |  2 +-
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/sktime/forecasting/arch/_uarch.py b/sktime/forecasting/arch/_uarch.py
index c637902150e..62098f2530f 100644
--- a/sktime/forecasting/arch/_uarch.py
+++ b/sktime/forecasting/arch/_uarch.py
@@ -251,6 +251,7 @@ def _fit(self, y, X=None, fh=None):
         if fh:
             self._horizon = fh
 
+        y_name = y.name
         self._forecaster = _ARCH(
             y=y,
             x=X,
@@ -277,6 +278,7 @@ def _fit(self, y, X=None, fh=None):
             options=self.options,
             backcast=self.backcast,
         )
+        y.name = y_name
         return self
 
     def _get_arch_result_object(self, fh=None, X=None):
@@ -344,7 +346,7 @@ def _predict(self, fh, X=None):
         y_pred = pd.Series(
             ArchResultObject.mean.values[-1],
             index=full_range,
-            name=str(self._y.name),
+            name=self._y.name,
         )
         y_pred = y_pred.loc[abs_idx.to_pandas()]
         y_pred.index = self._horizon.to_absolute_index(self.cutoff)
@@ -398,11 +400,15 @@ def _predict_interval(self, fh, X, coverage):
             upper_int = mean_forecast + (z_critical * std_err)
             lower_df = pd.DataFrame(
                 lower_int,
-                columns=[y_col_name + " " + str(alpha) + " " + "lower"],
+                columns=[
+                    y_col_name if y_col_name else "0" + " " + str(alpha) + " " + "lower"
+                ],
             )
             upper_df = pd.DataFrame(
                 upper_int,
-                columns=[y_col_name + " " + str(alpha) + " " + "upper"],
+                columns=[
+                    y_col_name if y_col_name else "0" + " " + str(alpha) + " " + "upper"
+                ],
             )
             df_list.append(pd.concat((lower_df, upper_df), axis=1))
         concat_df = pd.concat(df_list, axis=1)
@@ -410,7 +416,7 @@ def _predict_interval(self, fh, X, coverage):
             OrderedDict.fromkeys(
                 [
                     col_df
-                    for col in y_col_name
+                    for col in (y_col_name if y_col_name else "0")
                     for col_df in concat_df.columns
                     if col in col_df
                 ]
@@ -425,7 +431,7 @@ def _predict_interval(self, fh, X, coverage):
         final_columns = list(
             itertools.product(
                 *[
-                    [y_col_name],
+                    [y_col_name if y_col_name else 0],
                     coverage,
                     df.columns.get_level_values(2).unique(),
                 ]
diff --git a/sktime/forecasting/tests/test_all_forecasters.py b/sktime/forecasting/tests/test_all_forecasters.py
index 3bff7ad35e4..07929a18e30 100644
--- a/sktime/forecasting/tests/test_all_forecasters.py
+++ b/sktime/forecasting/tests/test_all_forecasters.py
@@ -383,7 +383,7 @@ def get_expected_columns():
 
         found = pred_ints.columns.to_flat_index()
         msg = (
-            "columns of returned prediction interval DataFrame do not"
+            "columns of returned prediction interval DataFrame do not "
             f"match up with expected columns. Expected: {expected},"
             f"found: {found}"
         )