[MNT] pandas 2 compatibility fixes (#4427)

Fixes #4426.

This PR contains some tentative fixes for compatibility issues arising from the `pandas` 2 release on Apr 3:

* fixes to index handling in `ForecastingHorizon.to_absolute`
* fixes to many instances of using `ForecastingHorizon` as `pd.DataFrame` `index` in constructor or setter (`.index = ...`), which is no longer possible
* writing to a `loc` index that does not exist no longer creates it but raises an exception
  - this was replaced by ensuring the right rows and columns are present before the write
  - `AutoETS`, `UnobservedComponents`
* replace deprecated `is_integer` and `is_numeric` with recommended replacement calls to `pandas.api.types`
* fixed old `from_nested_to_multi_index` utility producing `object` valued multiindex (instead of integer typed etc)
* same for `_convert_tsf_to_hierarchical` utility in data loaders
* replace deprecated `pandas.util.testing` imports with `pandas.testing`
* #4435
* #4436
* replace in-place nested pandas operations with construction of new frame: `test_interpolate`
* #4438
* replace single-index temporal slicing at lower granularity with slice-based slicing

Relies on #4429 for testing. Does *not* change the dependency bounds.
sktime · Apr 10, 2023 · d0d8298 · d0d8298
1 parent 52a9cf7
commit d0d8298
Show file tree

Hide file tree

Showing 37 changed files with 162 additions and 142 deletions.
diff --git a/sktime/datasets/_data_io.py b/sktime/datasets/_data_io.py
@@ -2036,11 +2036,10 @@ def _convert_tsf_to_hierarchical(
     columns = [value_column_name, "timestamp"]
     index_columns = [c for c in list(df.columns) if c not in drop_columns + columns]
     result = pd.DataFrame({c: df[c].explode() for c in columns})
-    df = (
-        df.drop(columns=columns + drop_columns)
-        .join(result)
-        .set_index(index_columns + ["timestamp"])
-    )
+    df = df.drop(columns=columns + drop_columns).join(result)
+    if df["timestamp"].dtype == "object":
+        df = df.astype({"timestamp": "int64"})
+    df = df.set_index(index_columns + ["timestamp"])
     df = df.astype({value_column_name: "float"}, errors="ignore")
 
     return df

diff --git a/sktime/datatypes/_panel/_convert.py b/sktime/datatypes/_panel/_convert.py
@@ -810,8 +810,9 @@ def from_nested_to_multi_index(X, instance_index=None, time_index=None):
 
         # create the right MultiIndex and assign to X_mi
         idx_df = X[[c]].applymap(lambda x: x.index).explode(c)
-        idx_df = idx_df.set_index(c, append=True)
-        X_col.index = idx_df.index.set_names([instance_index, time_index])
+        index = pd.MultiIndex.from_arrays([idx_df.index, idx_df[c].values])
+        index = index.set_names([instance_index, time_index])
+        X_col.index = index
 
         X_mi[[c]] = X_col
 

diff --git a/sktime/datatypes/tests/test_utils.py b/sktime/datatypes/tests/test_utils.py
@@ -207,7 +207,7 @@ def test_get_cutoff_from_index(reverse_order):
     )
 
     assert isinstance(cutoff, pd.Index) and len(cutoff) == 1
-    assert cutoff.is_integer()
+    assert pd.api.types.is_integer_dtype(cutoff)
     assert idx == cutoff[0]
 
     if reverse_order:

diff --git a/sktime/forecasting/ardl.py b/sktime/forecasting/ardl.py
@@ -324,7 +324,7 @@ def _fit(self, y, X=None, fh=None):
 
         # statsmodels does not support the pd.Int64Index as required,
         # so we coerce them here to pd.RangeIndex
-        if isinstance(y, pd.Series) and y.index.is_integer():
+        if isinstance(y, pd.Series) and pd.api.types.is_integer_dtype(y.index):
             y, X = _coerce_int_to_range_index(y, X)
 
         # validity check of passed params

diff --git a/sktime/forecasting/base/_base.py b/sktime/forecasting/base/_base.py
@@ -1589,32 +1589,6 @@ def _update_y_X(self, y, X=None, enforce_index_type=None):
             else:
                 self._X = update_data(self._X, X)
 
-    def _get_y_pred(self, y_in_sample, y_out_sample):
-        """Combine in- & out-sample prediction, slices given fh.
-
-        Parameters
-        ----------
-        y_in_sample : pd.Series
-            In-sample prediction
-        y_out_sample : pd.Series
-            Out-sample prediction
-
-        Returns
-        -------
-        pd.Series
-            y_pred, sliced by fh
-        """
-        y_pred = pd.concat([y_in_sample, y_out_sample], ignore_index=True).rename(
-            "y_pred"
-        )
-        y_pred = pd.DataFrame(y_pred)
-        # Workaround for slicing with negative index
-        y_pred["idx"] = [x for x in range(-len(y_in_sample), len(y_out_sample))]
-        y_pred = y_pred.loc[y_pred["idx"].isin(self.fh.to_indexer(self.cutoff).values)]
-        y_pred.index = self.fh.to_absolute(self.cutoff)
-        y_pred = y_pred["y_pred"].rename(None)
-        return y_pred
-
     @property
     def cutoff(self):
         """Cut-off = "present time" state of forecaster.

diff --git a/sktime/forecasting/base/_fh.py b/sktime/forecasting/base/_fh.py
@@ -216,17 +216,16 @@ class ForecastingHorizon:
     >>> y_train, y_test = temporal_train_test_split(y, test_size=6)
 
         List as ForecastingHorizon
-    >>> ForecastingHorizon([1, 2, 3])
-    ForecastingHorizon([1, 2, 3], dtype='int64', is_relative=True)
+    >>> ForecastingHorizon([1, 2, 3])  # doctest: +SKIP
+    >>> # ForecastingHorizon([1, 2, 3], is_relative=True)
 
         Numpy as ForecastingHorizon
-    >>> ForecastingHorizon(np.arange(1, 7))
-    ForecastingHorizon([1, 2, 3, 4, 5, 6], dtype='int64', is_relative=True)
+    >>> ForecastingHorizon(np.arange(1, 7))  # doctest: +SKIP
+    >>> # ForecastingHorizon([1, 2, 3, 4, 5, 6], is_relative=True)
 
         Absolute ForecastingHorizon with a pandas Index
     >>> ForecastingHorizon(y_test.index, is_relative=False) # doctest: +SKIP
-    ForecastingHorizon(['1960-07', '1960-08', '1960-09', '1960-10',
-        '1960-11', '1960-12'], dtype='period[M]', name='Period', is_relative=False)
+    >>> # ForecastingHorizon(['1960-07', ..., '1960-12'], is_relative=False)
 
         Converting
     >>> # set cutoff (last time point of training data)
@@ -235,27 +234,26 @@ class ForecastingHorizon:
     Period('1960-06', 'M')
     >>> # to_relative
     >>> fh = ForecastingHorizon(y_test.index, is_relative=False)
-    >>> fh.to_relative(cutoff=cutoff)
-    ForecastingHorizon([1, 2, 3, 4, 5, 6], dtype='int64', is_relative=True)
+    >>> fh.to_relative(cutoff=cutoff)  # doctest: +SKIP
+    >>> # ForecastingHorizon([1, 2, 3, 4, 5, 6], is_relative=True)
 
     >>> # to_absolute
     >>> fh = ForecastingHorizon([1, 2, 3, 4, 5, 6], is_relative=True)
-    >>> fh.to_absolute(cutoff=cutoff) # doctest: +SKIP
-    ForecastingHorizon(['1960-07', '1960-08', '1960-09', '1960-10',
-        '1960-11', '1960-12'], dtype='period[M]', is_relative=False)
+    >>> fh = fh.to_absolute(cutoff=cutoff) # doctest: +SKIP
+    >>> # ForecastingHorizon(['1960-07', ..., '1960-12'], is_relative=False)
 
         Automatically casted ForecastingHorizon from list when calling predict()
     >>> forecaster = NaiveForecaster(strategy="drift")
     >>> forecaster.fit(y_train)
     NaiveForecaster(...)
     >>> y_pred = forecaster.predict(fh=[1,2,3])
-    >>> forecaster.fh
-    ForecastingHorizon([1, 2, 3], dtype='int64', is_relative=True)
+    >>> forecaster.fh  # doctest: +SKIP
+    >>> # ForecastingHorizon([1, 2, 3], dtype='int64', is_relative=True)
 
         This is identical to give an object of ForecastingHorizon
     >>> y_pred = forecaster.predict(fh=ForecastingHorizon([1,2,3]))
-    >>> forecaster.fh
-    ForecastingHorizon([1, 2, 3], dtype='int64', is_relative=True)
+    >>> forecaster.fh  # doctest: +SKIP
+    >>> # ForecastingHorizon([1, 2, 3], dtype='int64', is_relative=True)
     """
 
     def __new__(
@@ -756,6 +754,13 @@ def _to_absolute(fh: ForecastingHorizon, cutoff) -> ForecastingHorizon:
             # computations of time deltas
             cutoff = _coerce_to_period(cutoff, freq=fh.freq)
 
+        if _check_soft_dependencies("pandas>=2.0.0", severity="none"):
+            if is_timestamp or isinstance(cutoff, pd.Period):
+                cutoff = pd.PeriodIndex([cutoff])
+
+            if isinstance(cutoff, pd.Index):
+                cutoff = cutoff[[0] * len(relative)]
+
         absolute = cutoff + relative
 
         if is_timestamp:

diff --git a/sktime/forecasting/base/_sktime.py b/sktime/forecasting/base/_sktime.py
@@ -74,7 +74,7 @@ def _predict_fixed_cutoff(
         if isinstance(y_pred, pd.Series) or isinstance(y_pred, pd.DataFrame):
             return y_pred
         else:
-            index = fh.to_absolute(self.cutoff)
+            index = fh.to_absolute(self.cutoff).to_pandas()
             return pd.Series(y_pred, index=index)
 
     def _predict_in_sample(

diff --git a/sktime/forecasting/base/adapters/_fbprophet.py b/sktime/forecasting/base/adapters/_fbprophet.py
@@ -41,15 +41,15 @@ def _convert_input_to_date(self, y):
         elif type(y.index) is pd.PeriodIndex:
             y = y.copy()
             y.index = y.index.to_timestamp()
-        elif y.index.is_integer():
+        elif pd.api.types.is_integer_dtype(y.index):
             y = self._convert_int_to_date(y)
         # else y is pd.DatetimeIndex as prophet expects, and needs no conversion
         return y
 
     def _remember_y_input_index_type(self, y):
         """Remember input type of y by setting attributes, for use in _fit."""
         self.y_index_was_period_ = type(y.index) is pd.PeriodIndex
-        self.y_index_was_int_ = y.index.is_integer()
+        self.y_index_was_int_ = pd.api.types.is_integer_dtype(y.index)
 
     def _fit(self, y, X=None, fh=None):
         """Fit to training data.
@@ -144,7 +144,7 @@ def _convert_X_for_exog(self, X, fh):
             X = X.copy()
             X = X.loc[self.fh.to_absolute(self.cutoff).to_pandas()]
             X.index = X.index.to_timestamp()
-        elif X.index.is_integer():
+        elif pd.api.types.is_integer_dtype(X.index):
             X = X.copy()
             X = X.loc[self.fh.to_absolute(self.cutoff).to_numpy()]
             X.index = fh
@@ -202,7 +202,7 @@ def _predict(self, fh=None, X=None):
         y_pred.columns = self._y.columns
 
         if self.y_index_was_int_ or self.y_index_was_period_:
-            y_pred.index = self.fh.to_absolute(cutoff=self.cutoff)
+            y_pred.index = self.fh.to_absolute(cutoff=self.cutoff).to_pandas()
 
         return y_pred
 
@@ -276,7 +276,7 @@ def _predict_interval(self, fh, X=None, coverage=0.90):
             pred_int[("Coverage", c, "upper")] = out_prophet.max(axis=1)
 
         if self.y_index_was_int_ or self.y_index_was_period_:
-            pred_int.index = self.fh.to_absolute(cutoff=self.cutoff)
+            pred_int.index = self.fh.to_absolute(cutoff=self.cutoff).to_pandas()
 
         return pred_int
 

diff --git a/sktime/forecasting/base/adapters/_pmdarima.py b/sktime/forecasting/base/adapters/_pmdarima.py
@@ -165,7 +165,9 @@ def _predict_in_sample(
         if return_pred_int:
             pred_ints = []
             for a in alpha:
-                pred_int = pd.DataFrame(index=fh_abs, columns=["lower", "upper"])
+                pred_int = pd.DataFrame(
+                    index=fh_abs.to_pandas(), columns=["lower", "upper"]
+                )
                 result = self._forecaster.predict_in_sample(
                     start=start,
                     end=end,
@@ -222,13 +224,15 @@ def _predict_fixed_cutoff(
                 )
                 pred_int = result[1]
                 pred_int = pd.DataFrame(
-                    pred_int[fh_idx, :], index=fh_abs, columns=["lower", "upper"]
+                    pred_int[fh_idx, :],
+                    index=fh_abs.to_pandas(),
+                    columns=["lower", "upper"],
                 )
                 pred_ints.append(pred_int)
             return result[0], pred_ints
         else:
             result = pd.Series(result).iloc[fh_idx]
-            result.index = fh_abs
+            result.index = fh_abs.to_pandas()
             return result
 
     def _predict_interval(self, fh, X=None, coverage=0.90):

diff --git a/sktime/forecasting/base/adapters/_statsforecast.py b/sktime/forecasting/base/adapters/_statsforecast.py
@@ -145,7 +145,9 @@ def _predict_in_sample(
         if return_pred_int:
             pred_ints = []
             for a in alpha:
-                pred_int = pd.DataFrame(index=fh_abs, columns=["lower", "upper"])
+                pred_int = pd.DataFrame(
+                    index=fh_abs.to_pandas(), columns=["lower", "upper"]
+                )
                 result = self._forecaster.predict_in_sample(level=int(100 * a))
                 pred_int.loc[fh_abs] = result.drop("mean", axis=1).values[fh_idx, :]
                 pred_ints.append(pred_int)
@@ -178,7 +180,7 @@ def _predict_fixed_cutoff(
 
         fh_abs = fh.to_absolute(self.cutoff)
         fh_idx = fh.to_indexer(self.cutoff)
-        mean = pd.Series(result["mean"].values[fh_idx], index=fh_abs)
+        mean = pd.Series(result["mean"].values[fh_idx], index=fh_abs.to_pandas())
         if return_pred_int:
             pred_ints = []
             for a in alpha:
@@ -189,12 +191,14 @@ def _predict_fixed_cutoff(
                 )
                 pred_int = result.drop("mean", axis=1).values
                 pred_int = pd.DataFrame(
-                    pred_int[fh_idx, :], index=fh_abs, columns=["lower", "upper"]
+                    pred_int[fh_idx, :],
+                    index=fh_abs.to_pandas(),
+                    columns=["lower", "upper"],
                 )
                 pred_ints.append(pred_int)
             return mean, pred_ints
         else:
-            return pd.Series(mean, index=fh_abs)
+            return pd.Series(mean, index=fh_abs.to_pandas())
 
     def _predict_interval(self, fh, X=None, coverage=0.90):
         """Compute/return prediction quantiles for a forecast.

diff --git a/sktime/forecasting/base/adapters/_statsmodels.py b/sktime/forecasting/base/adapters/_statsmodels.py
@@ -50,7 +50,7 @@ def _fit(self, y, X=None, fh=None):
         """
         # statsmodels does not support the pd.Int64Index as required,
         # so we coerce them here to pd.RangeIndex
-        if isinstance(y, pd.Series) and y.index.is_integer():
+        if isinstance(y, pd.Series) and pd.api.types.is_integer_dtype(y.index):
             y, X = _coerce_int_to_range_index(y, X)
         self._fit_forecaster(y, X)
         return self

diff --git a/sktime/forecasting/base/adapters/_tbats.py b/sktime/forecasting/base/adapters/_tbats.py
@@ -151,6 +151,32 @@ def _predict(self, fh, X=None):
         """
         return self._tbats_forecast(fh)
 
+    def _get_y_pred(self, y_in_sample, y_out_sample):
+        """Combine in- & out-sample prediction, slices given fh.
+
+        Parameters
+        ----------
+        y_in_sample : pd.Series
+            In-sample prediction
+        y_out_sample : pd.Series
+            Out-sample prediction
+
+        Returns
+        -------
+        pd.Series
+            y_pred, sliced by fh
+        """
+        y_pred = pd.concat([y_in_sample, y_out_sample], ignore_index=True).rename(
+            "y_pred"
+        )
+        y_pred = pd.DataFrame(y_pred)
+        # Workaround for slicing with negative index
+        y_pred["idx"] = [x for x in range(-len(y_in_sample), len(y_out_sample))]
+        y_pred = y_pred.loc[y_pred["idx"].isin(self.fh.to_indexer(self.cutoff).values)]
+        y_pred.index = self.fh.to_absolute(self.cutoff).to_pandas()
+        y_pred = y_pred["y_pred"].rename(None)
+        return y_pred
+
     def _tbats_forecast(self, fh):
         """TBATS forecast without confidence interval.
 
@@ -218,7 +244,7 @@ def _tbats_forecast_with_interval(self, fh, conf_lev):
 
             if len(fh) != len(fh_out):
                 epred_int = pd.DataFrame({"lower": nans(len_fh), "upper": nans(len_fh)})
-                epred_int.index = fh.to_absolute(self.cutoff)
+                epred_int.index = fh.to_absolute(self.cutoff).to_pandas()
 
                 in_pred_int = epred_int.index.isin(pred_int.index)
                 epred_int[in_pred_int] = pred_int
@@ -227,7 +253,7 @@ def _tbats_forecast_with_interval(self, fh, conf_lev):
         else:
             y_out = nans(len_fh)
             pred_int = pd.DataFrame({"lower": nans(len_fh), "upper": nans(len_fh)})
-            pred_int.index = fh.to_absolute(self.cutoff)
+            pred_int.index = fh.to_absolute(self.cutoff).to_pandas()
 
         # y_pred
         y_in_sample = pd.Series(self._forecaster.y_hat)
@@ -279,7 +305,9 @@ def _predict_interval(self, fh, X, coverage):
         # accumulator of results
         var_names = ["Coverage"]
         int_idx = pd.MultiIndex.from_product([var_names, coverage, ["lower", "upper"]])
-        pred_int = pd.DataFrame(columns=int_idx, index=fh.to_absolute(cutoff))
+        pred_int = pd.DataFrame(
+            columns=int_idx, index=fh.to_absolute(cutoff).to_pandas()
+        )
 
         for c in coverage:
 
@@ -342,7 +370,7 @@ def _get_pred_int(self, lower, upper):
         pred_int = pred_int.loc[
             pred_int["idx"].isin(fh_out.to_indexer(self.cutoff).values)
         ]
-        pred_int.index = fh_out.to_absolute(self.cutoff)
+        pred_int.index = fh_out.to_absolute(self.cutoff).to_pandas()
         pred_int = pred_int.drop(columns=["idx"])
         return pred_int
 

diff --git a/sktime/forecasting/bats.py b/sktime/forecasting/bats.py
@@ -9,7 +9,7 @@
 Wrapping implementation in [1]_ of method proposed in [2]_.
 """
 
-__author__ = ["Martin Walter"]
+__author__ = ["aiwalter"]
 __all__ = ["BATS"]
 
 from sktime.forecasting.base.adapters import _TbatsAdapter

diff --git a/sktime/forecasting/compose/_reduce.py b/sktime/forecasting/compose/_reduce.py
@@ -1681,9 +1681,9 @@ def _get_expected_pred_idx(self, fh):
             CAVEAT: sorted by index level -1, since reduction is applied by fh
         """
         if isinstance(fh, ForecastingHorizon):
-            fh_idx = pd.Index(fh.to_absolute(self.cutoff))
+            fh_idx = pd.Index(fh.to_absolute(self.cutoff).to_pandas())
         else:
-            fh_idx = pd.Index(fh)
+            fh_idx = pd.Index(fh.to_pandas())
         y_index = self._y.index
 
         if isinstance(y_index, pd.MultiIndex):

diff --git a/sktime/forecasting/compose/_stack.py b/sktime/forecasting/compose/_stack.py
@@ -165,7 +165,7 @@ def _predict(self, fh=None, X=None):
         y_preds = np.column_stack(self._predict_forecasters(fh=fh, X=X))
         y_pred = self.regressor_.predict(y_preds)
         # index = y_preds.index
-        index = self.fh.to_absolute(self.cutoff)
+        index = self.fh.to_absolute(self.cutoff).to_pandas()
         return pd.Series(y_pred, index=index, name=self._y.name)
 
     @classmethod