FIX add support for non-numeric values in MissingIndicator (#13046)
glemaitre authored and jnothman committed Feb 19, 2019
1 parent 03df72f commit 2594d91
Showing 4 changed files with 75 additions and 21 deletions.
9 changes: 9 additions & 0 deletions doc/whats_new/v0.20.rst
@@ -36,6 +36,15 @@ Changelog
threaded when `n_jobs > 1` or `n_jobs = -1`.
:issue:`13005` by :user:`Prabakaran Kumaresshan <nixphix>`.

:mod:`sklearn.impute`
.....................

- |Fix| add support for non-numeric data in
  :class:`sklearn.impute.MissingIndicator`, which previously only handled
  numeric input, even though :class:`sklearn.impute.SimpleImputer` already
  supports non-numeric data for some imputation strategies.
  :issue:`13046` by :user:`Guillaume Lemaitre <glemaitre>`.

:mod:`sklearn.linear_model`
...........................

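As a minimal sketch of the behavior described by the entry above (it mirrors the `test_missing_indicator_string` test added in this commit): `MissingIndicator` can now flag a placeholder string in an object-dtype array, whereas previously the numeric conversion performed during input validation rejected such data.

import numpy as np
from sklearn.impute import MissingIndicator

# Object-dtype array in which the string 'a' plays the role of a missing value.
X = np.array([['a', 'b', 'c'],
              ['b', 'c', 'a']], dtype=object)

indicator = MissingIndicator(missing_values='a', features='all')
mask = indicator.fit_transform(X)
print(mask)
# [[ True False False]
#  [False False  True]]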
35 changes: 19 additions & 16 deletions sklearn/impute.py
@@ -537,6 +537,23 @@ def _get_missing_features_info(self, X):

return imputer_mask, features_with_missing

def _validate_input(self, X):
if not is_scalar_nan(self.missing_values):
force_all_finite = True
else:
force_all_finite = "allow-nan"
X = check_array(X, accept_sparse=('csc', 'csr'), dtype=None,
force_all_finite=force_all_finite)
_check_inputs_dtype(X, self.missing_values)
if X.dtype.kind not in ("i", "u", "f", "O"):
raise ValueError("MissingIndicator does not support data with "
"dtype {0}. Please provide either a numeric array"
" (with a floating point or integer dtype) or "
"categorical data represented either as an array "
"with integer dtype or an array of string values "
"with an object dtype.".format(X.dtype))
return X

def fit(self, X, y=None):
"""Fit the transformer on X.
@@ -551,14 +568,7 @@ def fit(self, X, y=None):
self : object
Returns self.
"""
if not is_scalar_nan(self.missing_values):
force_all_finite = True
else:
force_all_finite = "allow-nan"
X = check_array(X, accept_sparse=('csc', 'csr'),
force_all_finite=force_all_finite)
_check_inputs_dtype(X, self.missing_values)

X = self._validate_input(X)
self._n_features = X.shape[1]

if self.features not in ('missing-only', 'all'):
@@ -592,14 +602,7 @@ def transform(self, X):
"""
check_is_fitted(self, "features_")

if not is_scalar_nan(self.missing_values):
force_all_finite = True
else:
force_all_finite = "allow-nan"
X = check_array(X, accept_sparse=('csc', 'csr'),
force_all_finite=force_all_finite)
_check_inputs_dtype(X, self.missing_values)
X = self._validate_input(X)

if X.shape[1] != self._n_features:
raise ValueError("X has a different number of features "
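To summarize what the new `_validate_input` helper accepts: integer, unsigned integer, floating-point and object dtypes pass the `X.dtype.kind` check, while any other dtype kind, such as a fixed-width NumPy string dtype, triggers the `ValueError` above. A small sketch (the printed message is abbreviated):

import numpy as np
from sklearn.impute import MissingIndicator

# dtype kind 'O' (object) is accepted: elements are compared to missing_values.
X_obj = np.array([['a', 'b'], ['c', 'a']], dtype=object)
MissingIndicator(missing_values='a').fit_transform(X_obj)

# dtype kind 'U' (fixed-width unicode) is not in ("i", "u", "f", "O"), so the
# dtype check rejects it.
X_str = np.array([['a', 'b'], ['c', 'a']], dtype=str)
try:
    MissingIndicator(missing_values='a').fit(X_str)
except ValueError as exc:
    print(exc)  # MissingIndicator does not support data with dtype <U1. ...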
37 changes: 36 additions & 1 deletion sklearn/tests/test_impute.py
@@ -14,6 +14,7 @@
from sklearn.impute import MissingIndicator
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_union
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn.random_projection import sparse_random_matrix
@@ -510,7 +511,10 @@ def test_imputation_copy():
"'features' has to be either 'missing-only' or 'all'"),
(np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, 2]]),
{'features': 'all', 'sparse': 'random'},
"'sparse' has to be a boolean or 'auto'")]
"'sparse' has to be a boolean or 'auto'"),
(np.array([['a', 'b'], ['c', 'a']], dtype=str),
np.array([['a', 'b'], ['c', 'a']], dtype=str),
{}, "MissingIndicator does not support data with dtype")]
)
def test_missing_indicator_error(X_fit, X_trans, params, msg_err):
indicator = MissingIndicator(missing_values=-1)
@@ -615,6 +619,37 @@ def test_missing_indicator_sparse_param(arr_type, missing_values,
assert isinstance(X_trans_mask, np.ndarray)


def test_missing_indicator_string():
X = np.array([['a', 'b', 'c'], ['b', 'c', 'a']], dtype=object)
indicator = MissingIndicator(missing_values='a', features='all')
X_trans = indicator.fit_transform(X)
assert_array_equal(X_trans, np.array([[True, False, False],
[False, False, True]]))


@pytest.mark.parametrize(
"X, missing_values, X_trans_exp",
[(np.array([['a', 'b'], ['b', 'a']], dtype=object), 'a',
np.array([['b', 'b', True, False], ['b', 'b', False, True]],
dtype=object)),
(np.array([[np.nan, 1.], [1., np.nan]]), np.nan,
np.array([[1., 1., True, False], [1., 1., False, True]])),
(np.array([[np.nan, 'b'], ['b', np.nan]], dtype=object), np.nan,
np.array([['b', 'b', True, False], ['b', 'b', False, True]],
dtype=object)),
(np.array([[None, 'b'], ['b', None]], dtype=object), None,
np.array([['b', 'b', True, False], ['b', 'b', False, True]],
dtype=object))]
)
def test_missing_indicator_with_imputer(X, missing_values, X_trans_exp):
trans = make_union(
SimpleImputer(missing_values=missing_values, strategy='most_frequent'),
MissingIndicator(missing_values=missing_values)
)
X_trans = trans.fit_transform(X)
assert_array_equal(X_trans, X_trans_exp)


@pytest.mark.parametrize("imputer_constructor",
[SimpleImputer])
@pytest.mark.parametrize(
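The new `test_missing_indicator_with_imputer` test above exercises the usual way these two estimators are combined: impute the values while keeping, side by side, boolean columns that record where values were missing. A minimal sketch of that pattern, reusing the data of one of the parametrized cases:

import numpy as np
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.pipeline import make_union

X = np.array([[np.nan, 1.],
              [1., np.nan]])

# The FeatureUnion stacks the imputed columns and the missingness mask
# horizontally.
trans = make_union(
    SimpleImputer(missing_values=np.nan, strategy='most_frequent'),
    MissingIndicator(missing_values=np.nan),
)
print(trans.fit_transform(X))
# [[1. 1. 1. 0.]
#  [1. 1. 0. 1.]]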
15 changes: 11 additions & 4 deletions sklearn/utils/estimator_checks.py
@@ -77,10 +77,10 @@
'OrthogonalMatchingPursuit', 'PLSCanonical', 'PLSRegression',
'RANSACRegressor', 'RadiusNeighborsRegressor',
'RandomForestRegressor', 'Ridge', 'RidgeCV']

ALLOW_NAN = ['Imputer', 'SimpleImputer', 'MissingIndicator',
'MaxAbsScaler', 'MinMaxScaler', 'RobustScaler', 'StandardScaler',
'PowerTransformer', 'QuantileTransformer']
SUPPORT_STRING = ['SimpleImputer', 'MissingIndicator']


def _yield_non_meta_checks(name, estimator):
@@ -628,9 +628,16 @@ def check_dtype_object(name, estimator_orig):
if "Unknown label type" not in str(e):
raise

X[0, 0] = {'foo': 'bar'}
msg = "argument must be a string or a number"
assert_raises_regex(TypeError, msg, estimator.fit, X, y)
if name not in SUPPORT_STRING:
X[0, 0] = {'foo': 'bar'}
msg = "argument must be a string or a number"
assert_raises_regex(TypeError, msg, estimator.fit, X, y)
else:
# Estimators supporting string will not call np.asarray to convert the
# data to numeric and therefore, the error will not be raised.
# Checking for each element dtype in the input array will be costly.
# Refer to #11401 for full discussion.
estimator.fit(X, y)


def check_complex_data(name, estimator_orig):
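The branch added to `check_dtype_object` reflects where the expected `TypeError` actually comes from: converting an object element such as a dict to a number, a conversion the estimators listed in `SUPPORT_STRING` never perform. A rough sketch of the two behaviors, with illustrative data:

import numpy as np
from sklearn.impute import MissingIndicator

# Estimators without string support convert object data to a numeric dtype,
# and converting a dict element is what raises the expected TypeError.
try:
    float({'foo': 'bar'})
except TypeError as exc:
    print(exc)  # float() argument must be a string or a number, not 'dict'

# MissingIndicator keeps the object dtype and only compares elements against
# missing_values, so fitting an object array containing such an element
# is expected to succeed.
X = np.array([['a', 'b'],
              [{'foo': 'bar'}, 'a']], dtype=object)
MissingIndicator(missing_values='a').fit(X)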
