[BUG] Imputer bugfix for issue #6224 (#6253)

Fixes #6224 #### What does this implement/fix? Explain your changes. For the Imputer: - Add to docstring that after every method ffill then bfill will be used - In case of pd-multiindex also always ffill then bfill to keep behaviour consistent with single index case - In case of pd-multiindex never impute on non-grouped data For the Imputer tests: - Add test for bug #6224
sktime · Apr 10, 2024 · a63537b · a63537b
1 parent 1e8b335
commit a63537b
Show file tree

Hide file tree

Showing 3 changed files with 59 additions and 11 deletions.
diff --git a/.all-contributorsrc b/.all-contributorsrc
@@ -2343,7 +2343,9 @@
       "profile": "https://github.com/Ram0nB",
       "contributions": [
         "doc",
-        "code"
+        "code",
+        "bug",
+        "test"
       ]
     },
     {

diff --git a/sktime/transformations/series/impute.py b/sktime/transformations/series/impute.py
@@ -23,7 +23,9 @@ class Imputer(BaseTransformer):
     Parameters
     ----------
     method : str, default="drift"
-        Method to fill the missing values.
+        Method to fill the missing values. Not all methods can extrapolate, so after
+        ``method`` is applied the remaining missing values are filled with ``ffill``
+        then ``bfill``.
 
         * "drift" : drift/trend values by sktime.PolynomialTrendForecaster(degree=1)
           first, X in transform() is filled with ffill then bfill
@@ -231,22 +233,26 @@ def _transform(self, X, y=None):
         elif self.method == "constant":
             return X.fillna(value=self.value)
         elif isinstance(index, pd.MultiIndex):
-            X_grouped = X.groupby(level=list(range(index.nlevels - 1)))
+            X_group_levels = list(range(index.nlevels - 1))
 
             if self.method in ["backfill", "bfill"]:
-                X = X_grouped.bfill()
-                # fill trailing NAs of panel instances with reverse method
-                return X.ffill()
+                X = X.groupby(level=X_group_levels).bfill()
             elif self.method in ["pad", "ffill"]:
-                X = X_grouped.ffill()
-                # fill leading NAs of panel instances with reverse method
-                return X.bfill()
+                X = X.groupby(level=X_group_levels).ffill()
             elif self.method == "mean":
-                return X_grouped.fillna(value=self._mean)
+                X = X.groupby(level=X_group_levels).fillna(value=self._mean)
             elif self.method == "median":
-                return X_grouped.fillna(value=self._median)
+                X = X.groupby(level=X_group_levels).fillna(value=self._median)
             else:
                 raise AssertionError("Code should not be reached")
+
+            # fill first/last elements of series,
+            # as some methods can't impute those
+            X = X.groupby(level=X_group_levels).ffill()
+            X = X.groupby(level=X_group_levels).bfill()
+
+            return X
+
         else:
             if self.method in ["backfill", "bfill"]:
                 X = X.bfill()

diff --git a/sktime/transformations/series/tests/test_imputer.py b/sktime/transformations/series/tests/test_imputer.py
@@ -8,7 +8,9 @@
 import numpy as np
 import pytest
 
+from sktime.datatypes import get_examples
 from sktime.forecasting.naive import NaiveForecaster
+from sktime.transformations.compose import TransformByLevel
 from sktime.transformations.series.impute import Imputer
 from sktime.utils._testing.forecasting import make_forecasting_problem
 from sktime.utils._testing.hierarchical import _make_hierarchical
@@ -58,6 +60,44 @@ def test_imputer(method, Z, value, forecaster):
     assert not y_hat.isnull().to_numpy().any()
 
 
+@pytest.mark.parametrize(
+    "method",
+    [
+        "linear",
+        "nearest",
+        "mean",
+        "median",
+        "backfill",
+        "pad",
+    ],
+)
+def test_impute_multiindex(method):
+    """Test for data leakage in case of pd-multiindex data.
+
+    Failure case in bug #6224
+    """
+
+    df = get_examples(mtype="pd-multiindex")[0].copy()
+    df.iloc[:3, :] = np.nan  # instance 0 entirely missing
+    df.iloc[3:4, :] = np.nan  # instance 1 first timepoint missing
+    df.iloc[8:, :] = np.nan  # instance 2 last timepoint missing
+
+    imp = Imputer(method=method)
+    df_imp = imp.fit_transform(df)
+
+    # instance 0 entirely missing, so it should remain missing
+    assert np.array_equal(df.iloc[:3, :], df_imp.iloc[:3, :], equal_nan=True)
+
+    # instance 1 and 2 should not have any missing values
+    assert not df_imp.iloc[3:, :].isna().any().any()
+
+    # test consistency between applying the imputer to every instance separately,
+    # vs applying them to the panel
+    imp_tbl = TransformByLevel(Imputer(method=method))
+    df_imp_tbl = imp_tbl.fit_transform(df)
+    assert np.array_equal(df_imp, df_imp_tbl, equal_nan=True)
+
+
 def test_imputer_forecaster_y():
     """Test that forecaster imputer works with y.