Skip to content

Commit

Permalink
[BUG] Imputer bugfix for issue #6224 (#6253)
Browse files Browse the repository at this point in the history
Fixes #6224

#### What does this implement/fix? Explain your changes.

For the Imputer:
- Add to docstring that after every method ffill then bfill will be used
- In case of pd-multiindex also always ffill then bfill to keep
behaviour consistent with single index case
- In case of pd-multiindex never impute on non-grouped data

For the Imputer tests:
- Add test for bug #6224
  • Loading branch information
Ram0nB committed Apr 10, 2024
1 parent 1e8b335 commit a63537b
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 11 deletions.
4 changes: 3 additions & 1 deletion .all-contributorsrc
Expand Up @@ -2343,7 +2343,9 @@
"profile": "https://github.com/Ram0nB",
"contributions": [
"doc",
"code"
"code",
"bug",
"test"
]
},
{
Expand Down
26 changes: 16 additions & 10 deletions sktime/transformations/series/impute.py
Expand Up @@ -23,7 +23,9 @@ class Imputer(BaseTransformer):
Parameters
----------
method : str, default="drift"
Method to fill the missing values.
Method to fill the missing values. Not all methods can extrapolate, so after
``method`` is applied the remaining missing values are filled with ``ffill``
then ``bfill``.
* "drift" : drift/trend values by sktime.PolynomialTrendForecaster(degree=1)
first, X in transform() is filled with ffill then bfill
Expand Down Expand Up @@ -231,22 +233,26 @@ def _transform(self, X, y=None):
elif self.method == "constant":
return X.fillna(value=self.value)
elif isinstance(index, pd.MultiIndex):
X_grouped = X.groupby(level=list(range(index.nlevels - 1)))
X_group_levels = list(range(index.nlevels - 1))

if self.method in ["backfill", "bfill"]:
X = X_grouped.bfill()
# fill trailing NAs of panel instances with reverse method
return X.ffill()
X = X.groupby(level=X_group_levels).bfill()
elif self.method in ["pad", "ffill"]:
X = X_grouped.ffill()
# fill leading NAs of panel instances with reverse method
return X.bfill()
X = X.groupby(level=X_group_levels).ffill()
elif self.method == "mean":
return X_grouped.fillna(value=self._mean)
X = X.groupby(level=X_group_levels).fillna(value=self._mean)
elif self.method == "median":
return X_grouped.fillna(value=self._median)
X = X.groupby(level=X_group_levels).fillna(value=self._median)
else:
raise AssertionError("Code should not be reached")

# fill first/last elements of series,
# as some methods can't impute those
X = X.groupby(level=X_group_levels).ffill()
X = X.groupby(level=X_group_levels).bfill()

return X

else:
if self.method in ["backfill", "bfill"]:
X = X.bfill()
Expand Down
40 changes: 40 additions & 0 deletions sktime/transformations/series/tests/test_imputer.py
Expand Up @@ -8,7 +8,9 @@
import numpy as np
import pytest

from sktime.datatypes import get_examples
from sktime.forecasting.naive import NaiveForecaster
from sktime.transformations.compose import TransformByLevel
from sktime.transformations.series.impute import Imputer
from sktime.utils._testing.forecasting import make_forecasting_problem
from sktime.utils._testing.hierarchical import _make_hierarchical
Expand Down Expand Up @@ -58,6 +60,44 @@ def test_imputer(method, Z, value, forecaster):
assert not y_hat.isnull().to_numpy().any()


@pytest.mark.parametrize(
"method",
[
"linear",
"nearest",
"mean",
"median",
"backfill",
"pad",
],
)
def test_impute_multiindex(method):
"""Test for data leakage in case of pd-multiindex data.
Failure case in bug #6224
"""

df = get_examples(mtype="pd-multiindex")[0].copy()
df.iloc[:3, :] = np.nan # instance 0 entirely missing
df.iloc[3:4, :] = np.nan # instance 1 first timepoint missing
df.iloc[8:, :] = np.nan # instance 2 last timepoint missing

imp = Imputer(method=method)
df_imp = imp.fit_transform(df)

# instance 0 entirely missing, so it should remain missing
assert np.array_equal(df.iloc[:3, :], df_imp.iloc[:3, :], equal_nan=True)

# instance 1 and 2 should not have any missing values
assert not df_imp.iloc[3:, :].isna().any().any()

# test consistency between applying the imputer to every instance separately,
# vs applying them to the panel
imp_tbl = TransformByLevel(Imputer(method=method))
df_imp_tbl = imp_tbl.fit_transform(df)
assert np.array_equal(df_imp, df_imp_tbl, equal_nan=True)


def test_imputer_forecaster_y():
"""Test that forecaster imputer works with y.
Expand Down

0 comments on commit a63537b

Please sign in to comment.