FIX add support for non-numeric values in MissingIndicator (#13046)
glemaitre authored and jnothman committed Feb 19, 2019
1 parent 03df72f commit 2594d91
Showing 4 changed files with 75 additions and 21 deletions.
9 changes: 9 additions & 0 deletions doc/whats_new/v0.20.rst
@@ -36,6 +36,15 @@ Changelog
threaded when `n_jobs > 1` or `n_jobs = -1`.
:issue:`13005` by :user:`Prabakaran Kumaresshan <nixphix>`.

:mod:`sklearn.impute`
.....................

- |Fix| add support for non-numeric data in
  :class:`sklearn.impute.MissingIndicator`, which previously only handled
  numeric input, even though :class:`sklearn.impute.SimpleImputer` already
  supports non-numeric data for some imputation strategies.
  :issue:`13046` by :user:`Guillaume Lemaitre <glemaitre>`.

:mod:`sklearn.linear_model`
...........................

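As a minimal sketch of the behavior described by the entry above (it mirrors the `test_missing_indicator_string` test added in this commit): `MissingIndicator` can now flag a placeholder string in an object-dtype array, whereas previously the numeric conversion performed during input validation rejected such data.

import numpy as np
from sklearn.impute import MissingIndicator

# Object-dtype array in which the string 'a' plays the role of a missing value.
X = np.array([['a', 'b', 'c'],
              ['b', 'c', 'a']], dtype=object)

indicator = MissingIndicator(missing_values='a', features='all')
mask = indicator.fit_transform(X)
print(mask)
# [[ True False False]
#  [False False  True]]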
35 changes: 19 additions & 16 deletions sklearn/impute.py
@@ -537,6 +537,23 @@ def _get_missing_features_info(self, X):

return imputer_mask, features_with_missing

def _validate_input(self, X):
if not is_scalar_nan(self.missing_values):
force_all_finite = True
else:
force_all_finite = "allow-nan"
X = check_array(X, accept_sparse=('csc', 'csr'), dtype=None,
force_all_finite=force_all_finite)
_check_inputs_dtype(X, self.missing_values)
if X.dtype.kind not in ("i", "u", "f", "O"):
raise ValueError("MissingIndicator does not support data with "
"dtype {0}. Please provide either a numeric array"
" (with a floating point or integer dtype) or "
"categorical data represented either as an array "
"with integer dtype or an array of string values "
"with an object dtype.".format(X.dtype))
return X

def fit(self, X, y=None):
"""Fit the transformer on X.
@@ -551,14 +568,7 @@ def fit(self, X, y=None):
self : object
Returns self.
"""
if not is_scalar_nan(self.missing_values):
force_all_finite = True
else:
force_all_finite = "allow-nan"
X = check_array(X, accept_sparse=('csc', 'csr'),
force_all_finite=force_all_finite)
_check_inputs_dtype(X, self.missing_values)

X = self._validate_input(X)
self._n_features = X.shape[1]

if self.features not in ('missing-only', 'all'):
@@ -592,14 +602,7 @@ def transform(self, X):
"""
check_is_fitted(self, "features_")

if not is_scalar_nan(self.missing_values):
force_all_finite = True
else:
force_all_finite = "allow-nan"
X = check_array(X, accept_sparse=('csc', 'csr'),
force_all_finite=force_all_finite)
_check_inputs_dtype(X, self.missing_values)
X = self._validate_input(X)

if X.shape[1] != self._n_features:
raise ValueError("X has a different number of features "
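To summarize what the new `_validate_input` helper accepts: integer, unsigned integer, floating-point and object dtypes pass the `X.dtype.kind` check, while any other dtype kind, such as a fixed-width NumPy string dtype, triggers the `ValueError` above. A small sketch (the printed message is abbreviated):

import numpy as np
from sklearn.impute import MissingIndicator

# dtype kind 'O' (object) is accepted: elements are compared to missing_values.
X_obj = np.array([['a', 'b'], ['c', 'a']], dtype=object)
MissingIndicator(missing_values='a').fit_transform(X_obj)

# dtype kind 'U' (fixed-width unicode) is not in ("i", "u", "f", "O"), so the
# dtype check rejects it.
X_str = np.array([['a', 'b'], ['c', 'a']], dtype=str)
try:
    MissingIndicator(missing_values='a').fit(X_str)
except ValueError as exc:
    print(exc)  # MissingIndicator does not support data with dtype <U1. ...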
37 changes: 36 additions & 1 deletion sklearn/tests/test_impute.py
@@ -14,6 +14,7 @@
from sklearn.impute import MissingIndicator
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_union
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn.random_projection import sparse_random_matrix
@@ -510,7 +511,10 @@ def test_imputation_copy():
"'features' has to be either 'missing-only' or 'all'"),
(np.array([[-1, 1], [1, 2]]), np.array([[-1, 1], [1, 2]]),
{'features': 'all', 'sparse': 'random'},
"'sparse' has to be a boolean or 'auto'")]
"'sparse' has to be a boolean or 'auto'"),
(np.array([['a', 'b'], ['c', 'a']], dtype=str),
np.array([['a', 'b'], ['c', 'a']], dtype=str),
{}, "MissingIndicator does not support data with dtype")]
)
def test_missing_indicator_error(X_fit, X_trans, params, msg_err):
indicator = MissingIndicator(missing_values=-1)
@@ -615,6 +619,37 @@ def test_missing_indicator_sparse_param(arr_type, missing_values,
assert isinstance(X_trans_mask, np.ndarray)


def test_missing_indicator_string():
X = np.array([['a', 'b', 'c'], ['b', 'c', 'a']], dtype=object)
indicator = MissingIndicator(missing_values='a', features='all')
X_trans = indicator.fit_transform(X)
assert_array_equal(X_trans, np.array([[True, False, False],
[False, False, True]]))


@pytest.mark.parametrize(
"X, missing_values, X_trans_exp",
[(np.array([['a', 'b'], ['b', 'a']], dtype=object), 'a',
np.array([['b', 'b', True, False], ['b', 'b', False, True]],
dtype=object)),
(np.array([[np.nan, 1.], [1., np.nan]]), np.nan,
np.array([[1., 1., True, False], [1., 1., False, True]])),
(np.array([[np.nan, 'b'], ['b', np.nan]], dtype=object), np.nan,
np.array([['b', 'b', True, False], ['b', 'b', False, True]],
dtype=object)),
(np.array([[None, 'b'], ['b', None]], dtype=object), None,
np.array([['b', 'b', True, False], ['b', 'b', False, True]],
dtype=object))]
)
def test_missing_indicator_with_imputer(X, missing_values, X_trans_exp):
trans = make_union(
SimpleImputer(missing_values=missing_values, strategy='most_frequent'),
MissingIndicator(missing_values=missing_values)
)
X_trans = trans.fit_transform(X)
assert_array_equal(X_trans, X_trans_exp)


@pytest.mark.parametrize("imputer_constructor",
[SimpleImputer])
@pytest.mark.parametrize(
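The new `test_missing_indicator_with_imputer` test above exercises the usual way these two estimators are combined: impute the values while keeping, side by side, boolean columns that record where values were missing. A minimal sketch of that pattern, reusing the data of one of the parametrized cases:

import numpy as np
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.pipeline import make_union

X = np.array([[np.nan, 1.],
              [1., np.nan]])

# The FeatureUnion stacks the imputed columns and the missingness mask
# horizontally.
trans = make_union(
    SimpleImputer(missing_values=np.nan, strategy='most_frequent'),
    MissingIndicator(missing_values=np.nan),
)
print(trans.fit_transform(X))
# [[1. 1. 1. 0.]
#  [1. 1. 0. 1.]]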
15 changes: 11 additions & 4 deletions sklearn/utils/estimator_checks.py
@@ -77,10 +77,10 @@
'OrthogonalMatchingPursuit', 'PLSCanonical', 'PLSRegression',
'RANSACRegressor', 'RadiusNeighborsRegressor',
'RandomForestRegressor', 'Ridge', 'RidgeCV']

ALLOW_NAN = ['Imputer', 'SimpleImputer', 'MissingIndicator',
'MaxAbsScaler', 'MinMaxScaler', 'RobustScaler', 'StandardScaler',
'PowerTransformer', 'QuantileTransformer']
SUPPORT_STRING = ['SimpleImputer', 'MissingIndicator']


def _yield_non_meta_checks(name, estimator):
@@ -628,9 +628,16 @@ def check_dtype_object(name, estimator_orig):
if "Unknown label type" not in str(e):
raise

X[0, 0] = {'foo': 'bar'}
msg = "argument must be a string or a number"
assert_raises_regex(TypeError, msg, estimator.fit, X, y)
if name not in SUPPORT_STRING:
X[0, 0] = {'foo': 'bar'}
msg = "argument must be a string or a number"
assert_raises_regex(TypeError, msg, estimator.fit, X, y)
else:
# Estimators supporting string will not call np.asarray to convert the
# data to numeric and therefore, the error will not be raised.
# Checking for each element dtype in the input array will be costly.
# Refer to #11401 for full discussion.
estimator.fit(X, y)


def check_complex_data(name, estimator_orig):
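The branch added to `check_dtype_object` reflects where the expected `TypeError` actually comes from: converting an object element such as a dict to a number, a conversion the estimators listed in `SUPPORT_STRING` never perform. A rough sketch of the two behaviors, with illustrative data:

import numpy as np
from sklearn.impute import MissingIndicator

# Estimators without string support convert object data to a numeric dtype,
# and converting a dict element is what raises the expected TypeError.
try:
    float({'foo': 'bar'})
except TypeError as exc:
    print(exc)  # float() argument must be a string or a number, not 'dict'

# MissingIndicator keeps the object dtype and only compares elements against
# missing_values, so fitting an object array containing such an element
# is expected to succeed.
X = np.array([['a', 'b'],
              [{'foo': 'bar'}, 'a']], dtype=object)
MissingIndicator(missing_values='a').fit(X)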
