DOC improved documentation of MissingIndicator (#12424)

scikit-learn · Nov 20, 2018 · 7922ec4 · 7922ec4
1 parent 705101b
commit 7922ec4
Show file tree

Hide file tree

Showing 2 changed files with 50 additions and 2 deletions.
diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst
@@ -120,3 +120,44 @@ whether or not they contain missing values::
          [False,  True, False, False]])
   >>> indicator.features_
   array([0, 1, 2, 3])
+
+When using the :class:`MissingIndicator` in a :class:`Pipeline`, be sure to use
+the :class:`FeatureUnion` or :class:`ColumnTransformer` to add the indicator
+features to the regular features. First we obtain the `iris` dataset, and add
+some missing values to it.
+
+  >>> from sklearn.datasets import load_iris
+  >>> from sklearn.impute import SimpleImputer, MissingIndicator
+  >>> from sklearn.model_selection import train_test_split
+  >>> from sklearn.pipeline import FeatureUnion, make_pipeline
+  >>> from sklearn.tree import DecisionTreeClassifier
+  >>> X, y = load_iris(return_X_y=True)
+  >>> mask = np.random.randint(0, 2, size=X.shape).astype(np.bool)
+  >>> X[mask] = np.nan
+  >>> X_train, X_test, y_train, _ = train_test_split(X, y, test_size=100,
+  ...                                                random_state=0)
+
+Now we create a :class:`FeatureUnion`. All features will be imputed using
+:class:`SimpleImputer`, in order to enable classifiers to work with this data.
+Additionally, it adds the indicator variables from
+:class:`MissingIndicator`.
+
+  >>> transformer = FeatureUnion(
+  ...     transformer_list=[
+  ...         ('features', SimpleImputer(strategy='mean')),
+  ...         ('indicators', MissingIndicator())])
+  >>> transformer = transformer.fit(X_train, y_train)
+  >>> results = transformer.transform(X_test)
+  >>> results.shape
+  (100, 8)
+
+Of course, we cannot use the transformer to make any predictions. We should
+wrap this in a :class:`Pipeline` with a classifier (e.g., a
+:class:`DecisionTreeClassifier`) to be able to make predictions.
+
+  >>> clf = make_pipeline(transformer, DecisionTreeClassifier())
+  >>> clf = clf.fit(X_train, y_train)
+  >>> results = clf.predict(X_test)
+  >>> results.shape
+  (100,)
+
diff --git a/sklearn/impute.py b/sklearn/impute.py
@@ -412,11 +412,18 @@ def transform(self, X):
 class MissingIndicator(BaseEstimator, TransformerMixin):
     """Binary indicators for missing values.
 
+    Note that this component typically should not be used in a vanilla
+    :class:`Pipeline` consisting of transformers and a classifier, but rather
+    could be added using a :class:`FeatureUnion` or :class:`ColumnTransformer`.
+
+    Read more in the :ref:`User Guide <impute>`.
+
     Parameters
     ----------
     missing_values : number, string, np.nan (default) or None
         The placeholder for the missing values. All occurrences of
-        `missing_values` will be imputed.
+        `missing_values` will be indicated (True in the output array); the
+        other values will be marked as False.
 
     features : str, optional
         Whether the imputer mask should represent all or a subset of
@@ -437,7 +444,7 @@ class MissingIndicator(BaseEstimator, TransformerMixin):
     error_on_new : boolean, optional
         If True (default), transform will raise an error when there are
         features with missing values in transform that have no missing values
-        in fit This is applicable only when ``features="missing-only"``.
+        in fit. This is applicable only when ``features="missing-only"``.
 
     Attributes
     ----------