diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index e1a39d3c70868..dde014edc05d9 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -485,7 +485,7 @@ estimator that supports imputation. See :ref:`sphx_glr_auto_examples_missing_val Transformer indicating missing values ===================================== -MissingIndicator transformer is useful to transform a dataset into corresponding +The :class:`MissingIndicator` transformer is useful to transform a dataset into a corresponding binary matrix indicating the presence of missing values in the dataset. The knowledge of which features were imputed can be exploited by a downstream estimator by adding features that indicate which elements have been imputed. @@ -508,9 +508,9 @@ estimator by adding features that indicate which elements have been imputed. MissingIndicator(features='train', missing_values=-1, sparse='auto') >>> X2_tr = MI.transform(X2) >>> X2_tr - array([[False, False, True], - [ True, True, False], - [False, False, False]], dtype=bool) + array([[0, 0, 1], + [1, 1, 0], + [0, 0, 0]], dtype=int32) .. _polynomial_features: diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index f614fa7660f0f..775c154103337 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -425,9 +425,9 @@ class MissingIndicator(BaseEstimator, TransformerMixin): MissingIndicator(features='train', missing_values=-1, sparse='auto') >>> X2_tr = MI.transform(X2) >>> X2_tr - array([[False, True], - [ True, False], - [False, False]], dtype=bool) + array([[0, 1], + [1, 0], + [0, 0]], dtype=int32) """ @@ -438,11 +438,13 @@ def __init__(self, missing_values="NaN", features="train", sparse="auto"): def fit(self, X): """Fit the transformer on X. 
+ Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) Input data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features. + Returns ------- self : object @@ -470,15 +472,17 @@ def fit(self, X): return self def transform(self, X): - """Impute all missing values in X. + """Generate missing values indicator for X. + Parameters ---------- X : {array-like, sparse matrix}, shape = [n_samples, n_features] The input data to complete. + Returns ------- - X : {array-like, sparse matrix}, shape = [n_samples, n_features] - The transformerwith missing indicator. + Xt : {array-like, sparse matrix}, shape = [n_samples, n_features] + The missing indicator for input data """ if self.features == "train": @@ -486,16 +490,15 @@ def transform(self, X): X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64, force_all_finite=False) - imputer_mask, feat_with_missing = self._get_missing_features_info(X) if self.features == "train": features = np.setdiff1d(feat_with_missing, self.feat_with_missing_) if features.size: - warnings.warn("The features %s have missing " - "values in transform but have no missing values" - " in fit " % features, RuntimeWarning, + warnings.warn("The features %s have missing values " + "in transform but have no missing values " + "in fit " % features, RuntimeWarning, stacklevel=1) imputer_mask = imputer_mask[:, self.feat_with_missing_] @@ -522,6 +525,7 @@ def _get_missing_features_info(self, X): if sparse.issparse(X): X = X.toarray() imputer_mask = _get_mask(X, self.missing_values) + imputer_mask = imputer_mask.astype(np.int32, copy=False) feat_with_missing = np.where(np.any(imputer_mask, axis=0))[0] if self.sparse is True: @@ -531,5 +535,8 @@ def _get_missing_features_info(self, X): imputer_mask = sparse.csc_matrix(imputer_mask) elif self.sparse is False and sparse.issparse(imputer_mask): imputer_mask = imputer_mask.toarray() + elif self.sparse == 'auto' and self.missing_values != 0: + 
if sparse.issparse(imputer_mask): + imputer_mask = imputer_mask.tocsc() return imputer_mask, feat_with_missing diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index 95cc8388641bd..9954ddcdc2208 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -380,15 +380,17 @@ def test_missing_indicator(): [11, -1, 1, 1] ]) - def assert_type(actual, expect, sp, missing_values): - if sp is True and missing_values != 0: + def assert_type(actual, is_sparse, sp, missing_values): + if sp is True : assert_equal(actual, sparse.csc_matrix) - elif (sp is True and missing_values == 0) or \ - sp is False: + elif (sp is "auto" and missing_values == 0 ) \ + or sp is False: assert_equal(actual, np.ndarray) else: - print type(retype(X2)), sp, missing_values, type(X2_tr) - assert_equal(actual, expect) + if is_sparse: + assert_equal(actual, sparse.csc_matrix) + else: + assert_equal(actual, np.ndarray) def assert_mask(actual, expected, features): if hasattr(actual, 'toarray'): @@ -396,39 +398,43 @@ def assert_mask(actual, expected, features): else: assert_array_equal(actual, expected[:, features]) - for X1, X2, missing_values in [(X1_orig, X2_orig, -1), - (X1_orig + 1, X2_orig + 1, 0)]: + def _check_missing_indicator(X1, X2, retype, sp, missing_values): mask = X2 == missing_values expect_feat_missing = np.where(np.any(X1 == missing_values, axis=0))[0] - for retype in [np.array, sparse.csr_matrix, + + X1_in = retype(X1) + X2_in = retype(X2) + # features = "train": + MI = MissingIndicator(missing_values=missing_values, + sparse = sp) + + MI.fit(X1_in) + X2_tr = MI.transform(X2_in) + features = MI.feat_with_missing_ + assert_array_equal(expect_feat_missing, features) + assert_type(type(X2_tr),sparse.issparse(X2_in), sp, missing_values) + assert_mask(X2_tr, mask, features) + + # features = "all" + MI = clone(MI).set_params(features="all") + MI.fit(X1_in) + X2_tr = 
MI.transform(X2_in) + features = np.arange(X2.shape[1]) + assert_mask(X2_tr, mask, features) + + # features = [1, 2] + features = [1, 2] + MI = clone(MI).set_params(features=features) + MI.fit(X1_in) + X2_tr = MI.transform(X2_in) + assert_mask(X2_tr, mask, features) + + for X1, X2, missing_values in [(X1_orig, X2_orig, -1), + (X1_orig + 1, X2_orig + 1, 0)]: + for retype in [lambda x: x.tolist(), np.array, sparse.csr_matrix, sparse.csc_matrix, sparse.lil_matrix]: for sp in [True, False, 'auto']: - X1_ft = retype(X1) - X2_t = retype(X2) - # features = "train": - MI = MissingIndicator(missing_values=missing_values, - sparse = sp) - - MI.fit(X1_ft) - X2_tr = MI.transform(X2_t) - features = MI.feat_with_missing_ - assert_array_equal(expect_feat_missing, features) - assert_type(type(X2_tr), type(X2_t), sp, missing_values) - assert_mask(X2_tr, mask, features) - - # features = "all" - MI = clone(MI).set_params(features="all") - MI.fit(X1_ft) - X2_tr = MI.transform(retype(X2)) - features = np.arange(X2.shape[1]) - assert_mask(X2_tr, mask, features) - - # features = [1, 2] - features = [1, 2] - MI = clone(MI).set_params(features=features) - MI.fit(X1_ft) - X2_tr = MI.transform(X2_t) - assert_mask(X2_tr, mask, features) + _check_missing_indicator(X1, X2, retype, sp, missing_values) def test_missing_indicator_warning():