-
-
Notifications
You must be signed in to change notification settings - Fork 25.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[MRG+1] SimpleImputer(strategy="constant") #11211
Changes from 5 commits
74384c6
6dd6a5e
2b101fb
6e13e68
f300fe2
35e30ac
ea4a929
10f165b
a6c33b1
9c2a407
df8608b
4517687
1f1c6a0
2f6d0b1
64fa1bc
3884d4e
72eb6b5
cc9aa6f
d4e5226
6efd122
fbaaa38
724a4a1
e5f4a1b
d69f855
c3a730d
94d7964
20456f4
e2ae626
7d3d1b5
972668b
f1da7b8
fb1a4e9
c8246f2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,7 @@ | |
|
||
import warnings | ||
from time import time | ||
import numbers | ||
|
||
import numpy as np | ||
import numpy.ma as ma | ||
|
@@ -36,11 +37,20 @@ | |
'MICEImputer', | ||
] | ||
|
||
def _is_scalar_nan(x): | ||
"""Work around limitations of numpy ufuncs""" | ||
return False if x is None else np.isnan(x) | ||
|
||
|
||
def _get_mask(X, value_to_mask): | ||
"""Compute the boolean mask X == missing_values.""" | ||
if value_to_mask == "NaN" or np.isnan(value_to_mask): | ||
return np.isnan(X) | ||
if value_to_mask == "NaN" or _is_scalar_nan(value_to_mask): | ||
if X.dtype.kind == "O": | ||
# np.isnan does not work for dtype objects. We use the trick that | ||
# nan values are never equal to themselves. | ||
return np.logical_not(X == X) | ||
else: | ||
return np.isnan(X) | ||
else: | ||
return X == value_to_mask | ||
|
||
|
@@ -94,6 +104,13 @@ class SimpleImputer(BaseEstimator, TransformerMixin): | |
each column. | ||
- If "most_frequent", then replace missing using the most frequent | ||
value along each column. | ||
- If "constant", then replace missing values with fill_value | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please note here (or in a Notes section) that most_frequent and constant work with strings or numeric data, while the others only work with numeric data. |
||
fill_value : string or numerical value, optional (default=None) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a bit confusing to state |
||
When strategy == "constant", fill_value is used to replace all | ||
occurrences of missing_values. | ||
If left to the default, fill_value will be 0 when imputing numerical | ||
data and "missing_value" for strings or object data types. | ||
|
||
verbose : integer, optional (default=0) | ||
Controls the verbosity of the imputer. | ||
|
@@ -115,16 +132,41 @@ class SimpleImputer(BaseEstimator, TransformerMixin): | |
Notes | ||
----- | ||
Columns which only contained missing values at `fit` are discarded upon | ||
`transform`. | ||
`transform` is strategy is not "constant" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is -> if |
||
|
||
""" | ||
def __init__(self, missing_values="NaN", strategy="mean", | ||
verbose=0, copy=True): | ||
fill_value=None, verbose=0, copy=True): | ||
self.missing_values = missing_values | ||
self.strategy = strategy | ||
self.fill_value = fill_value | ||
self.verbose = verbose | ||
self.copy = copy | ||
|
||
def _validate_input(self, X): | ||
allowed_strategies = ["mean", "median", "most_frequent", "constant"] | ||
if self.strategy not in allowed_strategies: | ||
raise ValueError("Can only use these strategies: {0} " | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No test coverage |
||
" got strategy={1}".format(allowed_strategies, | ||
self.strategy)) | ||
|
||
if self.strategy in ("most_frequent", "constant"): | ||
dtype = None | ||
else: | ||
dtype = FLOAT_DTYPES | ||
|
||
if self.missing_values is None: | ||
force_all_finite = "allow-nan" | ||
else: | ||
if self.missing_values == "NaN" or np.isnan(self.missing_values): | ||
force_all_finite = "allow-nan" | ||
else: | ||
force_all_finite = True | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually this should be: if isinstance(self.missing_value, numbers.Real) and np.isnan(self.missing_values):
force_all_finite = "allow-nan"
else:
force_all_finite = True or even: force_all_finite = True if self.missing_values is not np.nan else 'allow-nan' |
||
|
||
return check_array(X, accept_sparse='csc', dtype=dtype, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we really be converting a mixed-type dataframe to an array if we're imputing with strings? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is the alternative? (I maybe don't fully get your comment) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To me the imputer would be applied to an homegeneous subset of columns of a mixed-typed dataframe thanks to the use of |
||
force_all_finite=force_all_finite) | ||
|
||
|
||
def fit(self, X, y=None): | ||
"""Fit the imputer on X. | ||
|
||
|
@@ -138,30 +180,37 @@ def fit(self, X, y=None): | |
------- | ||
self : SimpleImputer | ||
""" | ||
# Check parameters | ||
allowed_strategies = ["mean", "median", "most_frequent"] | ||
if self.strategy not in allowed_strategies: | ||
raise ValueError("Can only use these strategies: {0} " | ||
" got strategy={1}".format(allowed_strategies, | ||
self.strategy)) | ||
|
||
X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES, | ||
force_all_finite='allow-nan' | ||
if self.missing_values == 'NaN' | ||
or np.isnan(self.missing_values) else True) | ||
X = self._validate_input(X) | ||
|
||
if self.strategy == "constant": | ||
if (X.dtype.kind in ("i", "f") | ||
and not isinstance(self.fill_value, numbers.Real)): | ||
raise ValueError( | ||
"fill_value={0} is invalid. Expected a numerical value " | ||
"to numerical data".format(self.fill_value)) | ||
|
||
if self.fill_value is None: | ||
if X.dtype.kind in ("i", "f"): | ||
fill_value = 0 | ||
else: | ||
fill_value = "missing_value" | ||
else: | ||
fill_value = self.fill_value | ||
|
||
if sparse.issparse(X): | ||
self.statistics_ = self._sparse_fit(X, | ||
self.strategy, | ||
self.missing_values) | ||
self.missing_values, | ||
fill_value) | ||
else: | ||
self.statistics_ = self._dense_fit(X, | ||
self.strategy, | ||
self.missing_values) | ||
self.missing_values, | ||
fill_value) | ||
|
||
return self | ||
|
||
def _sparse_fit(self, X, strategy, missing_values): | ||
def _sparse_fit(self, X, strategy, missing_values, fill_value): | ||
"""Fit the transformer on sparse data.""" | ||
# Count the zeros | ||
if missing_values == 0: | ||
|
@@ -233,12 +282,14 @@ def _sparse_fit(self, X, strategy, missing_values): | |
n_zeros_axis[i]) | ||
|
||
return most_frequent | ||
|
||
# Constant | ||
elif strategy == "constant": | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. style consistency: no need for a blank line here. |
||
return np.full(X.shape[0], fill_value) | ||
|
||
def _dense_fit(self, X, strategy, missing_values): | ||
def _dense_fit(self, X, strategy, missing_values, fill_value): | ||
"""Fit the transformer on dense data.""" | ||
X = check_array(X, force_all_finite='allow-nan' | ||
if self.missing_values == 'NaN' | ||
or np.isnan(self.missing_values) else True) | ||
mask = _get_mask(X, missing_values) | ||
masked_X = ma.masked_array(X, mask=mask) | ||
|
||
|
@@ -280,6 +331,16 @@ def _dense_fit(self, X, strategy, missing_values): | |
|
||
return most_frequent | ||
|
||
# Constant | ||
elif strategy == "constant": | ||
if isinstance(fill_value, numbers.Real): | ||
dtype = None | ||
else: | ||
dtype = object | ||
|
||
return np.full(X.shape[0], fill_value, dtype=dtype) | ||
|
||
|
||
def transform(self, X): | ||
"""Impute all missing values in X. | ||
|
||
|
@@ -289,27 +350,29 @@ def transform(self, X): | |
The input data to complete. | ||
""" | ||
check_is_fitted(self, 'statistics_') | ||
X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES, | ||
force_all_finite='allow-nan' | ||
if self.missing_values == 'NaN' | ||
or np.isnan(self.missing_values) else True, | ||
copy=self.copy) | ||
|
||
X = self._validate_input(X) | ||
|
||
statistics = self.statistics_ | ||
if X.shape[1] != statistics.shape[0]: | ||
raise ValueError("X has %d features per sample, expected %d" | ||
% (X.shape[1], self.statistics_.shape[0])) | ||
|
||
# Delete the invalid columns | ||
invalid_mask = np.isnan(statistics) | ||
valid_mask = np.logical_not(invalid_mask) | ||
valid_statistics = statistics[valid_mask] | ||
valid_statistics_indexes = np.flatnonzero(valid_mask) | ||
missing = np.arange(X.shape[1])[invalid_mask] | ||
|
||
if invalid_mask.any(): | ||
if self.verbose: | ||
warnings.warn("Deleting features without " | ||
"observed values: %s" % missing) | ||
# Delete the invalid columns if strategy is not constant | ||
if self.strategy == "constant": | ||
valid_statistics = statistics | ||
else: | ||
invalid_mask = np.isnan(statistics) | ||
valid_mask = np.logical_not(invalid_mask) | ||
|
||
if invalid_mask.any(): | ||
missing = np.arange(X.shape[1])[invalid_mask] | ||
if self.verbose: | ||
warnings.warn("Deleting features without " | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not tested |
||
"observed values: %s" % missing) | ||
|
||
valid_statistics = statistics[valid_mask] | ||
valid_statistics_indexes = np.flatnonzero(valid_mask) | ||
X = X[:, valid_statistics_indexes] | ||
|
||
# Do actual imputation | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,16 +24,14 @@ def _check_statistics(X, X_true, | |
strategy, statistics, missing_values): | ||
"""Utility function for testing imputation for a given strategy. | ||
|
||
Test: | ||
- along the two axes | ||
- with dense and sparse arrays | ||
Test with dense and sparse arrays | ||
|
||
Check that: | ||
- the statistics (mean, median, mode) are correct | ||
- the missing values are imputed correctly""" | ||
|
||
err_msg = "Parameters: strategy = %s, missing_values = %s, " \ | ||
"axis = {0}, sparse = {1}" % (strategy, missing_values) | ||
"sparse = {0}" % (strategy, missing_values) | ||
|
||
assert_ae = assert_array_equal | ||
if X.dtype.kind == 'f' or X_true.dtype.kind == 'f': | ||
|
@@ -43,8 +41,8 @@ def _check_statistics(X, X_true, | |
imputer = SimpleImputer(missing_values, strategy=strategy) | ||
X_trans = imputer.fit(X).transform(X.copy()) | ||
assert_ae(imputer.statistics_, statistics, | ||
err_msg=err_msg.format(0, False)) | ||
assert_ae(X_trans, X_true, err_msg=err_msg.format(0, False)) | ||
err_msg=err_msg.format(False)) | ||
assert_ae(X_trans, X_true, err_msg=err_msg.format(False)) | ||
|
||
# Sparse matrix | ||
imputer = SimpleImputer(missing_values, strategy=strategy) | ||
|
@@ -55,8 +53,8 @@ def _check_statistics(X, X_true, | |
X_trans = X_trans.toarray() | ||
|
||
assert_ae(imputer.statistics_, statistics, | ||
err_msg=err_msg.format(0, True)) | ||
assert_ae(X_trans, X_true, err_msg=err_msg.format(0, True)) | ||
err_msg=err_msg.format(True)) | ||
assert_ae(X_trans, X_true, err_msg=err_msg.format(True)) | ||
|
||
|
||
def test_imputation_shape(): | ||
|
@@ -210,6 +208,134 @@ def test_imputation_most_frequent(): | |
_check_statistics(X, X_true, "most_frequent", [np.nan, 2, 3, 3], -1) | ||
|
||
|
||
def test_imputation_constant_integer(): | ||
# Test imputation using the constant strategy | ||
# on integers | ||
X = np.array([ | ||
[-1, 2, 3, -1], | ||
[4, -1, 5, -1], | ||
[6, 7, -1, -1], | ||
[8, 9, 0, -1] | ||
]) | ||
|
||
X_true = np.array([ | ||
[0, 2, 3, 0], | ||
[4, 0, 5, 0], | ||
[6, 7, 0, 0], | ||
[8, 9, 0, 0] | ||
]) | ||
|
||
imputer = SimpleImputer(missing_values=-1, strategy="constant", | ||
fill_value=0) | ||
X_trans = imputer.fit(X).transform(X) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fit_transform |
||
|
||
assert_array_equal(X_trans, X_true) | ||
|
||
|
||
def test_imputation_constant_float(): | ||
# Test imputation using the constant strategy | ||
# on floats | ||
for format in ["csr", "array"]: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. parametrize this part |
||
X = np.array([ | ||
[np.nan, 1.1, 2.2, np.nan], | ||
[3.3, np.nan, 4.4, np.nan], | ||
[5.5, 6.6, np.nan, np.nan], | ||
[7.7, 8.8, 9.9, np.nan] | ||
]) | ||
|
||
X = sparse.csr_matrix(X) if format == "csr" else X | ||
|
||
X_true = np.array([ | ||
[0, 1.1, 2.2, 0], | ||
[3.3, 0, 4.4, 0], | ||
[5.5, 6.6, 0, 0], | ||
[7.7, 8.8, 9.9, 0] | ||
]) | ||
|
||
X_true = sparse.csr_matrix(X_true) if format == "csr" else X_true | ||
|
||
imputer = SimpleImputer(strategy="constant", fill_value=0) | ||
X_trans = imputer.fit(X).transform(X) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fit_transform |
||
|
||
if format == "csr": | ||
assert_allclose(X_trans.toarray(), X_true.toarray()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you have a function |
||
else: | ||
assert_allclose(X_trans, X_true) | ||
|
||
|
||
def test_imputation_constant_object(): | ||
# Test imputation using the constant strategy | ||
# on objects | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nitpick: comment fits on one line. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This comment has been copied several times. |
||
X = np.array([ | ||
[None, "a", "b", None], | ||
["c", None, "d", None], | ||
["e", "f", None, None], | ||
["g", "h", "i", None] | ||
], dtype=object) | ||
|
||
X_true = np.array([ | ||
["missing", "a", "b", "missing"], | ||
["c", "missing", "d", "missing"], | ||
["e", "f", "missing", "missing"], | ||
["g", "h", "i", "missing"] | ||
], dtype=object) | ||
|
||
imputer = SimpleImputer(missing_values=None, strategy="constant", | ||
fill_value="missing") | ||
X_trans = imputer.fit(X).transform(X) | ||
|
||
assert_array_equal(X_trans, X_true) | ||
|
||
|
||
def test_imputation_constant_object_nan(): | ||
# Test imputation using the constant strategy | ||
# on objects | ||
X = np.array([ | ||
[np.nan, "a", "b", np.nan], | ||
["c", np.nan, "d", np.nan], | ||
["e", "f", np.nan, np.nan], | ||
["g", "h", "i", np.nan] | ||
], dtype=object) | ||
|
||
X_true = np.array([ | ||
["missing_value", "a", "b", "missing_value"], | ||
["c", "missing_value", "d", "missing_value"], | ||
["e", "f", "missing_value", "missing_value"], | ||
["g", "h", "i", "missing_value"] | ||
], dtype=object) | ||
|
||
imputer = SimpleImputer(strategy="constant") | ||
X_trans = imputer.fit(X).transform(X) | ||
|
||
assert_array_equal(X_trans, X_true) | ||
|
||
|
||
def test_imputation_constant_pandas(): | ||
# Test imputation using the constant strategy | ||
# on pandas df | ||
pd = pytest.importorskip("pandas") | ||
|
||
for dtype in [object, "category"]: | ||
df = pd.DataFrame([ | ||
[np.nan, "a", "b", np.nan], | ||
["c", np.nan, "d", np.nan], | ||
["e", "f", np.nan, np.nan], | ||
["g", "h", "i", np.nan] | ||
], dtype=dtype) | ||
|
||
X_true = np.array([ | ||
["missing", "a", "b", "missing"], | ||
["c", "missing", "d", "missing"], | ||
["e", "f", "missing", "missing"], | ||
["g", "h", "i", "missing"] | ||
], dtype=object) | ||
|
||
imputer = SimpleImputer(strategy="constant", fill_value="missing") | ||
X_trans = imputer.fit(df).transform(df) | ||
|
||
assert_array_equal(X_trans, X_true) | ||
|
||
|
||
def test_imputation_pipeline_grid_search(): | ||
# Test imputation within a pipeline + gridsearch. | ||
pipeline = Pipeline([('imputer', SimpleImputer(missing_values=0)), | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
style: end with "."