Skip to content

Commit

Permalink
[MNT] pandas 2 compatibility fixes (#4427)
Browse files Browse the repository at this point in the history
Fixes #4426.
This PR contains some tentative fixes for compatibility issues arising from the `pandas` 2 release on Apr 3:

* fixes to index handling in `ForecastingHorizon.to_absolute`
* fixes to many instances of using a `ForecastingHorizon` directly as the `index` of a `pd.DataFrame`, either in the constructor or via the setter (`.index = ...`); this is no longer possible, so the horizon is now converted with `.to_pandas()` first
* writing via `loc` to an index label that does not exist no longer creates it but raises an exception; such writes were replaced by ensuring that the right rows and columns are present before the write (affects `AutoETS`, `UnobservedComponents`)
* replace deprecated `is_integer` and `is_numeric` with recommended replacement calls to `pandas.api.types`
* fixed the old `from_nested_to_multi_index` utility producing an `object`-valued `MultiIndex` (instead of a properly typed one, e.g., integer-typed)
* same for `_convert_tsf_to_hierarchical` utility in data loaders
* replace deprecated `pandas.util.testing` imports with `pandas.testing`
* #4435
* #4436
* replace in-place nested pandas operations with construction of new frame: `test_interpolate`
* #4438
* replace single-index temporal slicing at lower granularity with slice-based slicing

Relies on #4429 for testing.

Does *not* change the dependency bounds.
  • Loading branch information
fkiraly committed Apr 10, 2023
1 parent 52a9cf7 commit d0d8298
Show file tree
Hide file tree
Showing 37 changed files with 162 additions and 142 deletions.
9 changes: 4 additions & 5 deletions sktime/datasets/_data_io.py
Expand Up @@ -2036,11 +2036,10 @@ def _convert_tsf_to_hierarchical(
columns = [value_column_name, "timestamp"]
index_columns = [c for c in list(df.columns) if c not in drop_columns + columns]
result = pd.DataFrame({c: df[c].explode() for c in columns})
df = (
df.drop(columns=columns + drop_columns)
.join(result)
.set_index(index_columns + ["timestamp"])
)
df = df.drop(columns=columns + drop_columns).join(result)
if df["timestamp"].dtype == "object":
df = df.astype({"timestamp": "int64"})
df = df.set_index(index_columns + ["timestamp"])
df = df.astype({value_column_name: "float"}, errors="ignore")

return df
Expand Down
5 changes: 3 additions & 2 deletions sktime/datatypes/_panel/_convert.py
Expand Up @@ -810,8 +810,9 @@ def from_nested_to_multi_index(X, instance_index=None, time_index=None):

# create the right MultiIndex and assign to X_mi
idx_df = X[[c]].applymap(lambda x: x.index).explode(c)
idx_df = idx_df.set_index(c, append=True)
X_col.index = idx_df.index.set_names([instance_index, time_index])
index = pd.MultiIndex.from_arrays([idx_df.index, idx_df[c].values])
index = index.set_names([instance_index, time_index])
X_col.index = index

X_mi[[c]] = X_col

Expand Down
2 changes: 1 addition & 1 deletion sktime/datatypes/tests/test_utils.py
Expand Up @@ -207,7 +207,7 @@ def test_get_cutoff_from_index(reverse_order):
)

assert isinstance(cutoff, pd.Index) and len(cutoff) == 1
assert cutoff.is_integer()
assert pd.api.types.is_integer_dtype(cutoff)
assert idx == cutoff[0]

if reverse_order:
Expand Down
2 changes: 1 addition & 1 deletion sktime/forecasting/ardl.py
Expand Up @@ -324,7 +324,7 @@ def _fit(self, y, X=None, fh=None):

# statsmodels does not support the pd.Int64Index as required,
# so we coerce them here to pd.RangeIndex
if isinstance(y, pd.Series) and y.index.is_integer():
if isinstance(y, pd.Series) and pd.api.types.is_integer_dtype(y.index):
y, X = _coerce_int_to_range_index(y, X)

# validity check of passed params
Expand Down
26 changes: 0 additions & 26 deletions sktime/forecasting/base/_base.py
Expand Up @@ -1589,32 +1589,6 @@ def _update_y_X(self, y, X=None, enforce_index_type=None):
else:
self._X = update_data(self._X, X)

def _get_y_pred(self, y_in_sample, y_out_sample):
"""Combine in- & out-sample prediction, slices given fh.
Parameters
----------
y_in_sample : pd.Series
In-sample prediction
y_out_sample : pd.Series
Out-sample prediction
Returns
-------
pd.Series
y_pred, sliced by fh
"""
y_pred = pd.concat([y_in_sample, y_out_sample], ignore_index=True).rename(
"y_pred"
)
y_pred = pd.DataFrame(y_pred)
# Workaround for slicing with negative index
y_pred["idx"] = [x for x in range(-len(y_in_sample), len(y_out_sample))]
y_pred = y_pred.loc[y_pred["idx"].isin(self.fh.to_indexer(self.cutoff).values)]
y_pred.index = self.fh.to_absolute(self.cutoff)
y_pred = y_pred["y_pred"].rename(None)
return y_pred

@property
def cutoff(self):
"""Cut-off = "present time" state of forecaster.
Expand Down
35 changes: 20 additions & 15 deletions sktime/forecasting/base/_fh.py
Expand Up @@ -216,17 +216,16 @@ class ForecastingHorizon:
>>> y_train, y_test = temporal_train_test_split(y, test_size=6)
List as ForecastingHorizon
>>> ForecastingHorizon([1, 2, 3])
ForecastingHorizon([1, 2, 3], dtype='int64', is_relative=True)
>>> ForecastingHorizon([1, 2, 3]) # doctest: +SKIP
>>> # ForecastingHorizon([1, 2, 3], is_relative=True)
Numpy as ForecastingHorizon
>>> ForecastingHorizon(np.arange(1, 7))
ForecastingHorizon([1, 2, 3, 4, 5, 6], dtype='int64', is_relative=True)
>>> ForecastingHorizon(np.arange(1, 7)) # doctest: +SKIP
>>> # ForecastingHorizon([1, 2, 3, 4, 5, 6], is_relative=True)
Absolute ForecastingHorizon with a pandas Index
>>> ForecastingHorizon(y_test.index, is_relative=False) # doctest: +SKIP
ForecastingHorizon(['1960-07', '1960-08', '1960-09', '1960-10',
'1960-11', '1960-12'], dtype='period[M]', name='Period', is_relative=False)
>>> # ForecastingHorizon(['1960-07', ..., '1960-12'], is_relative=False)
Converting
>>> # set cutoff (last time point of training data)
Expand All @@ -235,27 +234,26 @@ class ForecastingHorizon:
Period('1960-06', 'M')
>>> # to_relative
>>> fh = ForecastingHorizon(y_test.index, is_relative=False)
>>> fh.to_relative(cutoff=cutoff)
ForecastingHorizon([1, 2, 3, 4, 5, 6], dtype='int64', is_relative=True)
>>> fh.to_relative(cutoff=cutoff) # doctest: +SKIP
>>> # ForecastingHorizon([1, 2, 3, 4, 5, 6], is_relative=True)
>>> # to_absolute
>>> fh = ForecastingHorizon([1, 2, 3, 4, 5, 6], is_relative=True)
>>> fh.to_absolute(cutoff=cutoff) # doctest: +SKIP
ForecastingHorizon(['1960-07', '1960-08', '1960-09', '1960-10',
'1960-11', '1960-12'], dtype='period[M]', is_relative=False)
>>> fh = fh.to_absolute(cutoff=cutoff) # doctest: +SKIP
>>> # ForecastingHorizon(['1960-07', ..., '1960-12'], is_relative=False)
Automatically casted ForecastingHorizon from list when calling predict()
>>> forecaster = NaiveForecaster(strategy="drift")
>>> forecaster.fit(y_train)
NaiveForecaster(...)
>>> y_pred = forecaster.predict(fh=[1,2,3])
>>> forecaster.fh
ForecastingHorizon([1, 2, 3], dtype='int64', is_relative=True)
>>> forecaster.fh # doctest: +SKIP
>>> # ForecastingHorizon([1, 2, 3], dtype='int64', is_relative=True)
This is identical to give an object of ForecastingHorizon
>>> y_pred = forecaster.predict(fh=ForecastingHorizon([1,2,3]))
>>> forecaster.fh
ForecastingHorizon([1, 2, 3], dtype='int64', is_relative=True)
>>> forecaster.fh # doctest: +SKIP
>>> # ForecastingHorizon([1, 2, 3], dtype='int64', is_relative=True)
"""

def __new__(
Expand Down Expand Up @@ -756,6 +754,13 @@ def _to_absolute(fh: ForecastingHorizon, cutoff) -> ForecastingHorizon:
# computations of time deltas
cutoff = _coerce_to_period(cutoff, freq=fh.freq)

if _check_soft_dependencies("pandas>=2.0.0", severity="none"):
if is_timestamp or isinstance(cutoff, pd.Period):
cutoff = pd.PeriodIndex([cutoff])

if isinstance(cutoff, pd.Index):
cutoff = cutoff[[0] * len(relative)]

absolute = cutoff + relative

if is_timestamp:
Expand Down
2 changes: 1 addition & 1 deletion sktime/forecasting/base/_sktime.py
Expand Up @@ -74,7 +74,7 @@ def _predict_fixed_cutoff(
if isinstance(y_pred, pd.Series) or isinstance(y_pred, pd.DataFrame):
return y_pred
else:
index = fh.to_absolute(self.cutoff)
index = fh.to_absolute(self.cutoff).to_pandas()
return pd.Series(y_pred, index=index)

def _predict_in_sample(
Expand Down
10 changes: 5 additions & 5 deletions sktime/forecasting/base/adapters/_fbprophet.py
Expand Up @@ -41,15 +41,15 @@ def _convert_input_to_date(self, y):
elif type(y.index) is pd.PeriodIndex:
y = y.copy()
y.index = y.index.to_timestamp()
elif y.index.is_integer():
elif pd.api.types.is_integer_dtype(y.index):
y = self._convert_int_to_date(y)
# else y is pd.DatetimeIndex as prophet expects, and needs no conversion
return y

def _remember_y_input_index_type(self, y):
"""Remember input type of y by setting attributes, for use in _fit."""
self.y_index_was_period_ = type(y.index) is pd.PeriodIndex
self.y_index_was_int_ = y.index.is_integer()
self.y_index_was_int_ = pd.api.types.is_integer_dtype(y.index)

def _fit(self, y, X=None, fh=None):
"""Fit to training data.
Expand Down Expand Up @@ -144,7 +144,7 @@ def _convert_X_for_exog(self, X, fh):
X = X.copy()
X = X.loc[self.fh.to_absolute(self.cutoff).to_pandas()]
X.index = X.index.to_timestamp()
elif X.index.is_integer():
elif pd.api.types.is_integer_dtype(X.index):
X = X.copy()
X = X.loc[self.fh.to_absolute(self.cutoff).to_numpy()]
X.index = fh
Expand Down Expand Up @@ -202,7 +202,7 @@ def _predict(self, fh=None, X=None):
y_pred.columns = self._y.columns

if self.y_index_was_int_ or self.y_index_was_period_:
y_pred.index = self.fh.to_absolute(cutoff=self.cutoff)
y_pred.index = self.fh.to_absolute(cutoff=self.cutoff).to_pandas()

return y_pred

Expand Down Expand Up @@ -276,7 +276,7 @@ def _predict_interval(self, fh, X=None, coverage=0.90):
pred_int[("Coverage", c, "upper")] = out_prophet.max(axis=1)

if self.y_index_was_int_ or self.y_index_was_period_:
pred_int.index = self.fh.to_absolute(cutoff=self.cutoff)
pred_int.index = self.fh.to_absolute(cutoff=self.cutoff).to_pandas()

return pred_int

Expand Down
10 changes: 7 additions & 3 deletions sktime/forecasting/base/adapters/_pmdarima.py
Expand Up @@ -165,7 +165,9 @@ def _predict_in_sample(
if return_pred_int:
pred_ints = []
for a in alpha:
pred_int = pd.DataFrame(index=fh_abs, columns=["lower", "upper"])
pred_int = pd.DataFrame(
index=fh_abs.to_pandas(), columns=["lower", "upper"]
)
result = self._forecaster.predict_in_sample(
start=start,
end=end,
Expand Down Expand Up @@ -222,13 +224,15 @@ def _predict_fixed_cutoff(
)
pred_int = result[1]
pred_int = pd.DataFrame(
pred_int[fh_idx, :], index=fh_abs, columns=["lower", "upper"]
pred_int[fh_idx, :],
index=fh_abs.to_pandas(),
columns=["lower", "upper"],
)
pred_ints.append(pred_int)
return result[0], pred_ints
else:
result = pd.Series(result).iloc[fh_idx]
result.index = fh_abs
result.index = fh_abs.to_pandas()
return result

def _predict_interval(self, fh, X=None, coverage=0.90):
Expand Down
12 changes: 8 additions & 4 deletions sktime/forecasting/base/adapters/_statsforecast.py
Expand Up @@ -145,7 +145,9 @@ def _predict_in_sample(
if return_pred_int:
pred_ints = []
for a in alpha:
pred_int = pd.DataFrame(index=fh_abs, columns=["lower", "upper"])
pred_int = pd.DataFrame(
index=fh_abs.to_pandas(), columns=["lower", "upper"]
)
result = self._forecaster.predict_in_sample(level=int(100 * a))
pred_int.loc[fh_abs] = result.drop("mean", axis=1).values[fh_idx, :]
pred_ints.append(pred_int)
Expand Down Expand Up @@ -178,7 +180,7 @@ def _predict_fixed_cutoff(

fh_abs = fh.to_absolute(self.cutoff)
fh_idx = fh.to_indexer(self.cutoff)
mean = pd.Series(result["mean"].values[fh_idx], index=fh_abs)
mean = pd.Series(result["mean"].values[fh_idx], index=fh_abs.to_pandas())
if return_pred_int:
pred_ints = []
for a in alpha:
Expand All @@ -189,12 +191,14 @@ def _predict_fixed_cutoff(
)
pred_int = result.drop("mean", axis=1).values
pred_int = pd.DataFrame(
pred_int[fh_idx, :], index=fh_abs, columns=["lower", "upper"]
pred_int[fh_idx, :],
index=fh_abs.to_pandas(),
columns=["lower", "upper"],
)
pred_ints.append(pred_int)
return mean, pred_ints
else:
return pd.Series(mean, index=fh_abs)
return pd.Series(mean, index=fh_abs.to_pandas())

def _predict_interval(self, fh, X=None, coverage=0.90):
"""Compute/return prediction quantiles for a forecast.
Expand Down
2 changes: 1 addition & 1 deletion sktime/forecasting/base/adapters/_statsmodels.py
Expand Up @@ -50,7 +50,7 @@ def _fit(self, y, X=None, fh=None):
"""
# statsmodels does not support the pd.Int64Index as required,
# so we coerce them here to pd.RangeIndex
if isinstance(y, pd.Series) and y.index.is_integer():
if isinstance(y, pd.Series) and pd.api.types.is_integer_dtype(y.index):
y, X = _coerce_int_to_range_index(y, X)
self._fit_forecaster(y, X)
return self
Expand Down
36 changes: 32 additions & 4 deletions sktime/forecasting/base/adapters/_tbats.py
Expand Up @@ -151,6 +151,32 @@ def _predict(self, fh, X=None):
"""
return self._tbats_forecast(fh)

def _get_y_pred(self, y_in_sample, y_out_sample):
"""Combine in- & out-sample prediction, slices given fh.
Parameters
----------
y_in_sample : pd.Series
In-sample prediction
y_out_sample : pd.Series
Out-sample prediction
Returns
-------
pd.Series
y_pred, sliced by fh
"""
y_pred = pd.concat([y_in_sample, y_out_sample], ignore_index=True).rename(
"y_pred"
)
y_pred = pd.DataFrame(y_pred)
# Workaround for slicing with negative index
y_pred["idx"] = [x for x in range(-len(y_in_sample), len(y_out_sample))]
y_pred = y_pred.loc[y_pred["idx"].isin(self.fh.to_indexer(self.cutoff).values)]
y_pred.index = self.fh.to_absolute(self.cutoff).to_pandas()
y_pred = y_pred["y_pred"].rename(None)
return y_pred

def _tbats_forecast(self, fh):
"""TBATS forecast without confidence interval.
Expand Down Expand Up @@ -218,7 +244,7 @@ def _tbats_forecast_with_interval(self, fh, conf_lev):

if len(fh) != len(fh_out):
epred_int = pd.DataFrame({"lower": nans(len_fh), "upper": nans(len_fh)})
epred_int.index = fh.to_absolute(self.cutoff)
epred_int.index = fh.to_absolute(self.cutoff).to_pandas()

in_pred_int = epred_int.index.isin(pred_int.index)
epred_int[in_pred_int] = pred_int
Expand All @@ -227,7 +253,7 @@ def _tbats_forecast_with_interval(self, fh, conf_lev):
else:
y_out = nans(len_fh)
pred_int = pd.DataFrame({"lower": nans(len_fh), "upper": nans(len_fh)})
pred_int.index = fh.to_absolute(self.cutoff)
pred_int.index = fh.to_absolute(self.cutoff).to_pandas()

# y_pred
y_in_sample = pd.Series(self._forecaster.y_hat)
Expand Down Expand Up @@ -279,7 +305,9 @@ def _predict_interval(self, fh, X, coverage):
# accumulator of results
var_names = ["Coverage"]
int_idx = pd.MultiIndex.from_product([var_names, coverage, ["lower", "upper"]])
pred_int = pd.DataFrame(columns=int_idx, index=fh.to_absolute(cutoff))
pred_int = pd.DataFrame(
columns=int_idx, index=fh.to_absolute(cutoff).to_pandas()
)

for c in coverage:

Expand Down Expand Up @@ -342,7 +370,7 @@ def _get_pred_int(self, lower, upper):
pred_int = pred_int.loc[
pred_int["idx"].isin(fh_out.to_indexer(self.cutoff).values)
]
pred_int.index = fh_out.to_absolute(self.cutoff)
pred_int.index = fh_out.to_absolute(self.cutoff).to_pandas()
pred_int = pred_int.drop(columns=["idx"])
return pred_int

Expand Down
2 changes: 1 addition & 1 deletion sktime/forecasting/bats.py
Expand Up @@ -9,7 +9,7 @@
Wrapping implementation in [1]_ of method proposed in [2]_.
"""

__author__ = ["Martin Walter"]
__author__ = ["aiwalter"]
__all__ = ["BATS"]

from sktime.forecasting.base.adapters import _TbatsAdapter
Expand Down
4 changes: 2 additions & 2 deletions sktime/forecasting/compose/_reduce.py
Expand Up @@ -1681,9 +1681,9 @@ def _get_expected_pred_idx(self, fh):
CAVEAT: sorted by index level -1, since reduction is applied by fh
"""
if isinstance(fh, ForecastingHorizon):
fh_idx = pd.Index(fh.to_absolute(self.cutoff))
fh_idx = pd.Index(fh.to_absolute(self.cutoff).to_pandas())
else:
fh_idx = pd.Index(fh)
fh_idx = pd.Index(fh.to_pandas())
y_index = self._y.index

if isinstance(y_index, pd.MultiIndex):
Expand Down
2 changes: 1 addition & 1 deletion sktime/forecasting/compose/_stack.py
Expand Up @@ -165,7 +165,7 @@ def _predict(self, fh=None, X=None):
y_preds = np.column_stack(self._predict_forecasters(fh=fh, X=X))
y_pred = self.regressor_.predict(y_preds)
# index = y_preds.index
index = self.fh.to_absolute(self.cutoff)
index = self.fh.to_absolute(self.cutoff).to_pandas()
return pd.Series(y_pred, index=index, name=self._y.name)

@classmethod
Expand Down

0 comments on commit d0d8298

Please sign in to comment.