scikit-learn · ogrisel · Jun 20, 2018 · Jun 6, 2018 · Jun 6, 2018 · Jun 6, 2018
diff --git a/sklearn/impute.py b/sklearn/impute.py
@@ -7,6 +7,7 @@
 
 import warnings
 from time import time
+import numbers
 
 import numpy as np
 import numpy.ma as ma
@@ -36,11 +37,20 @@
     'MICEImputer',
 ]
 
+def _is_scalar_nan(x):
+    """Return True iff x is a real-number NaN (np.isnan rejects str/None)."""
+    return bool(isinstance(x, numbers.Real) and np.isnan(x))
+
 
 def _get_mask(X, value_to_mask):
     """Compute the boolean mask X == missing_values."""
-    if value_to_mask == "NaN" or np.isnan(value_to_mask):
-        return np.isnan(X)
+    if value_to_mask == "NaN" or _is_scalar_nan(value_to_mask):
+        if X.dtype.kind == "O":
+            # np.isnan does not work for dtype objects. We use the trick that
+            # nan values are never equal to themselves.
+            return np.logical_not(X == X)
+        else:
+            return np.isnan(X)
     else:
         return X == value_to_mask
 
@@ -94,6 +104,13 @@ class SimpleImputer(BaseEstimator, TransformerMixin):
           each column.
         - If "most_frequent", then replace missing using the most frequent
           value along each column.
+        - If "constant", then replace missing values with fill_value.
+
+    fill_value : string or numerical value, optional (default=None)
+        When strategy == "constant", fill_value is used to replace all
+        occurrences of missing_values.
+        If left to the default, fill_value will be 0 when imputing numerical
+        data and "missing_value" for strings or object data types.
 
     verbose : integer, optional (default=0)
         Controls the verbosity of the imputer.
@@ -115,16 +132,41 @@ class SimpleImputer(BaseEstimator, TransformerMixin):
     Notes
     -----
     Columns which only contained missing values at `fit` are discarded upon
-    `transform`.
+    `transform` if strategy is not "constant".
 
     """
     def __init__(self, missing_values="NaN", strategy="mean",
-                 verbose=0, copy=True):
+                 fill_value=None, verbose=0, copy=True):
         self.missing_values = missing_values
         self.strategy = strategy
+        self.fill_value = fill_value
         self.verbose = verbose
         self.copy = copy
 
+    def _validate_input(self, X):
+        """Validate strategy and X, copying X when self.copy is set."""
+        allowed_strategies = ["mean", "median", "most_frequent", "constant"]
+        if self.strategy not in allowed_strategies:
+            raise ValueError("Can only use these strategies: {0} "
+                             " got strategy={1}".format(allowed_strategies,
+                                                        self.strategy))
+
+        if self.strategy in ("most_frequent", "constant"):
+            dtype = None
+        else:
+            dtype = FLOAT_DTYPES
+
+        # np.isnan raises TypeError on non-numeric markers such as "missing".
+        missing = self.missing_values
+        if (missing is None or missing == "NaN" or
+                (isinstance(missing, numbers.Real) and np.isnan(missing))):
+            force_all_finite = "allow-nan"
+        else:
+            force_all_finite = True
+
+        return check_array(X, accept_sparse='csc', dtype=dtype,
+                           force_all_finite=force_all_finite, copy=self.copy)
+
     def fit(self, X, y=None):
         """Fit the imputer on X.
 
@@ -138,30 +180,37 @@ def fit(self, X, y=None):
         -------
         self : SimpleImputer
         """
-        # Check parameters
-        allowed_strategies = ["mean", "median", "most_frequent"]
-        if self.strategy not in allowed_strategies:
-            raise ValueError("Can only use these strategies: {0} "
-                             " got strategy={1}".format(allowed_strategies,
-                                                        self.strategy))
-
-        X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES,
-                        force_all_finite='allow-nan'
-                        if self.missing_values == 'NaN'
-                        or np.isnan(self.missing_values) else True)
+        X = self._validate_input(X)
+
+        # Resolve the default first: fill_value=None must mean 0 (numerical
+        # data) or "missing_value" (otherwise), not fail the check below.
+        if self.fill_value is not None:
+            fill_value = self.fill_value
+        elif X.dtype.kind in ("i", "u", "f"):
+            fill_value = 0
+        else:
+            fill_value = "missing_value"
+
+        if (self.strategy == "constant" and X.dtype.kind in ("i", "u", "f")
+                and not isinstance(fill_value, numbers.Real)):
+            raise ValueError(
+                "fill_value={0} is invalid. Expected a numerical value "
+                "when imputing numerical data".format(fill_value))
 
         if sparse.issparse(X):
             self.statistics_ = self._sparse_fit(X,
                                                 self.strategy,
-                                                self.missing_values)
+                                                self.missing_values,
+                                                fill_value)
         else:
             self.statistics_ = self._dense_fit(X,
                                                self.strategy,
-                                               self.missing_values)
+                                               self.missing_values,
+                                               fill_value)
 
         return self
 
-    def _sparse_fit(self, X, strategy, missing_values):
+    def _sparse_fit(self, X, strategy, missing_values, fill_value):
         """Fit the transformer on sparse data."""
         # Count the zeros
         if missing_values == 0:
@@ -233,12 +282,14 @@ def _sparse_fit(self, X, strategy, missing_values):
                                                       n_zeros_axis[i])
 
                 return most_frequent
+
+            # Constant
+            elif strategy == "constant":
+                # One statistic per feature: transform checks against shape[1].
+                return np.full(X.shape[1], fill_value)
 
-    def _dense_fit(self, X, strategy, missing_values):
+    def _dense_fit(self, X, strategy, missing_values, fill_value):
         """Fit the transformer on dense data."""
-        X = check_array(X, force_all_finite='allow-nan'
-                        if self.missing_values == 'NaN'
-                        or np.isnan(self.missing_values) else True)
         mask = _get_mask(X, missing_values)
         masked_X = ma.masked_array(X, mask=mask)
 
@@ -280,6 +331,16 @@ def _dense_fit(self, X, strategy, missing_values):
 
             return most_frequent
 
+        # Constant
+        elif strategy == "constant":
+            # Strings need an object array; numbers let numpy infer the dtype.
+            dtype = None if isinstance(fill_value, numbers.Real) else object
+
+            # One statistic per feature (column): transform compares
+            # X.shape[1] against statistics.shape[0].
+            return np.full(X.shape[1], fill_value, dtype=dtype)
+
+
     def transform(self, X):
         """Impute all missing values in X.
 
@@ -289,27 +350,29 @@ def transform(self, X):
             The input data to complete.
         """
         check_is_fitted(self, 'statistics_')
-        X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES,
-                        force_all_finite='allow-nan'
-                        if self.missing_values == 'NaN'
-                        or np.isnan(self.missing_values) else True,
-                        copy=self.copy)
+
+        X = self._validate_input(X)
+
         statistics = self.statistics_
         if X.shape[1] != statistics.shape[0]:
             raise ValueError("X has %d features per sample, expected %d"
                              % (X.shape[1], self.statistics_.shape[0]))
 
-        # Delete the invalid columns
-        invalid_mask = np.isnan(statistics)
-        valid_mask = np.logical_not(invalid_mask)
-        valid_statistics = statistics[valid_mask]
-        valid_statistics_indexes = np.flatnonzero(valid_mask)
-        missing = np.arange(X.shape[1])[invalid_mask]
-
-        if invalid_mask.any():
-            if self.verbose:
-                warnings.warn("Deleting features without "
-                              "observed values: %s" % missing)
+        # Delete the invalid columns if strategy is not constant
+        if self.strategy == "constant":
+            valid_statistics = statistics
+        else:
+            invalid_mask = np.isnan(statistics)
+            valid_mask = np.logical_not(invalid_mask)
+
+            if invalid_mask.any():
+                missing = np.arange(X.shape[1])[invalid_mask]
+                if self.verbose:
+                    warnings.warn("Deleting features without "
+                                "observed values: %s" % missing)
+
+            valid_statistics = statistics[valid_mask]
+            valid_statistics_indexes = np.flatnonzero(valid_mask)
             X = X[:, valid_statistics_indexes]
 
         # Do actual imputation

diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py
@@ -24,16 +24,14 @@ def _check_statistics(X, X_true,
                       strategy, statistics, missing_values):
     """Utility function for testing imputation for a given strategy.
 
-    Test:
-        - along the two axes
-        - with dense and sparse arrays
+    Test with dense and sparse arrays
 
     Check that:
         - the statistics (mean, median, mode) are correct
         - the missing values are imputed correctly"""
 
     err_msg = "Parameters: strategy = %s, missing_values = %s, " \
-              "axis = {0}, sparse = {1}" % (strategy, missing_values)
+              "sparse = {0}" % (strategy, missing_values)
 
     assert_ae = assert_array_equal
     if X.dtype.kind == 'f' or X_true.dtype.kind == 'f':
@@ -43,8 +41,8 @@ def _check_statistics(X, X_true,
     imputer = SimpleImputer(missing_values, strategy=strategy)
     X_trans = imputer.fit(X).transform(X.copy())
     assert_ae(imputer.statistics_, statistics,
-              err_msg=err_msg.format(0, False))
-    assert_ae(X_trans, X_true, err_msg=err_msg.format(0, False))
+              err_msg=err_msg.format(False))
+    assert_ae(X_trans, X_true, err_msg=err_msg.format(False))
 
     # Sparse matrix
     imputer = SimpleImputer(missing_values, strategy=strategy)
@@ -55,8 +53,8 @@ def _check_statistics(X, X_true,
         X_trans = X_trans.toarray()
 
     assert_ae(imputer.statistics_, statistics,
-              err_msg=err_msg.format(0, True))
-    assert_ae(X_trans, X_true, err_msg=err_msg.format(0, True))
+              err_msg=err_msg.format(True))
+    assert_ae(X_trans, X_true, err_msg=err_msg.format(True))
 
 
 def test_imputation_shape():
@@ -210,6 +208,134 @@ def test_imputation_most_frequent():
     _check_statistics(X, X_true, "most_frequent", [np.nan, 2, 3, 3], -1)
 
 
+def test_imputation_constant_integer():
+    # Test imputation using the constant strategy
+    # on integers
+    X = np.array([
+        [-1, 2, 3, -1],
+        [4, -1, 5, -1],
+        [6, 7, -1, -1],
+        [8, 9, 0, -1]
+    ])
+
+    X_true = np.array([
+        [0, 2, 3, 0],
+        [4, 0, 5, 0],
+        [6, 7, 0, 0],
+        [8, 9, 0, 0]
+    ])
+
+    imputer = SimpleImputer(missing_values=-1, strategy="constant", 
+                            fill_value=0)
+    X_trans = imputer.fit(X).transform(X)
+
+    assert_array_equal(X_trans, X_true)
+
+
+def test_imputation_constant_float():
+    # Test imputation using the constant strategy
+    # on floats
+    for format in ["csr", "array"]:
+        X = np.array([
+            [np.nan, 1.1, 2.2, np.nan],
+            [3.3, np.nan, 4.4, np.nan],
+            [5.5, 6.6, np.nan, np.nan],
+            [7.7, 8.8, 9.9, np.nan]
+        ])
+
+        X = sparse.csr_matrix(X) if format == "csr" else X
+
+        X_true = np.array([
+            [0, 1.1, 2.2, 0],
+            [3.3, 0, 4.4, 0],
+            [5.5, 6.6, 0, 0],
+            [7.7, 8.8, 9.9, 0]
+        ])
+
+        X_true = sparse.csr_matrix(X_true) if format == "csr" else X_true
+
+        imputer = SimpleImputer(strategy="constant", fill_value=0)
+        X_trans = imputer.fit(X).transform(X)
+
+        if format == "csr":
+            assert_allclose(X_trans.toarray(), X_true.toarray())
+        else:
+            assert_allclose(X_trans, X_true)
+
+
+def test_imputation_constant_object():
+    # Test imputation using the constant strategy
+    # on objects
+    X = np.array([
+        [None, "a", "b", None],
+        ["c", None, "d", None],
+        ["e", "f", None, None],
+        ["g", "h", "i", None]
+    ], dtype=object)
+
+    X_true = np.array([
+        ["missing", "a", "b", "missing"],
+        ["c", "missing", "d", "missing"],
+        ["e", "f", "missing", "missing"],
+        ["g", "h", "i", "missing"]
+    ], dtype=object)
+
+    imputer = SimpleImputer(missing_values=None, strategy="constant",
+                            fill_value="missing")
+    X_trans = imputer.fit(X).transform(X)
+
+    assert_array_equal(X_trans, X_true)
+
+
+def test_imputation_constant_object_nan():
+    # Test imputation using the constant strategy
+    # on objects
+    X = np.array([
+        [np.nan, "a", "b", np.nan],
+        ["c", np.nan, "d", np.nan],
+        ["e", "f", np.nan, np.nan],
+        ["g", "h", "i", np.nan]
+    ], dtype=object)
+
+    X_true = np.array([
+        ["missing_value", "a", "b", "missing_value"],
+        ["c", "missing_value", "d", "missing_value"],
+        ["e", "f", "missing_value", "missing_value"],
+        ["g", "h", "i", "missing_value"]
+    ], dtype=object)
+
+    imputer = SimpleImputer(strategy="constant")
+    X_trans = imputer.fit(X).transform(X)
+
+    assert_array_equal(X_trans, X_true)
+
+
+def test_imputation_constant_pandas():
+    # Test imputation using the constant strategy
+    # on pandas df
+    pd = pytest.importorskip("pandas")
+
+    for dtype in [object, "category"]:
+        df = pd.DataFrame([
+            [np.nan, "a", "b", np.nan],
+            ["c", np.nan, "d", np.nan],
+            ["e", "f", np.nan, np.nan],
+            ["g", "h", "i", np.nan]
+        ], dtype=dtype)
+
+        X_true = np.array([
+            ["missing", "a", "b", "missing"],
+            ["c", "missing", "d", "missing"],
+            ["e", "f", "missing", "missing"],
+            ["g", "h", "i", "missing"]
+        ], dtype=object)
+
+        imputer = SimpleImputer(strategy="constant", fill_value="missing")
+        X_trans = imputer.fit(df).transform(df) 
+
+        assert_array_equal(X_trans, X_true)
+
+
 def test_imputation_pipeline_grid_search():
     # Test imputation within a pipeline + gridsearch.
     pipeline = Pipeline([('imputer', SimpleImputer(missing_values=0)),