Skip to content

Commit

Permalink
Add comprehensive tests
Browse files Browse the repository at this point in the history
  • Loading branch information
maniteja123 committed Feb 2, 2017
1 parent cf7ad5d commit 5357a1b
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 49 deletions.
8 changes: 4 additions & 4 deletions doc/modules/preprocessing.rst
Expand Up @@ -485,7 +485,7 @@ estimator that supports imputation. See :ref:`sphx_glr_auto_examples_missing_val
Transformer indicating missing values
=====================================

MissingIndicator transformer is useful to transform a dataset into corresponding
The :class:`MissingIndicator` transformer is useful to transform a dataset into the corresponding
binary matrix indicating the presence of missing values in the dataset.
The knowledge of which features were imputed can be exploited by a downstream
estimator by adding features that indicate which elements have been imputed.
Expand All @@ -508,9 +508,9 @@ estimator by adding features that indicate which elements have been imputed.
MissingIndicator(features='train', missing_values=-1, sparse='auto')
>>> X2_tr = MI.transform(X2)
>>> X2_tr
array([[False, False, True],
[ True, True, False],
[False, False, False]], dtype=bool)
array([[0, 0, 1],
[1, 1, 0],
[0, 0, 0]], dtype=int32)


.. _polynomial_features:
Expand Down
27 changes: 17 additions & 10 deletions sklearn/preprocessing/imputation.py
Expand Up @@ -425,9 +425,9 @@ class MissingIndicator(BaseEstimator, TransformerMixin):
MissingIndicator(features='train', missing_values=-1, sparse='auto')
>>> X2_tr = MI.transform(X2)
>>> X2_tr
array([[False, True],
[ True, False],
[False, False]], dtype=bool)
array([[0, 1],
[1, 0],
[0, 0]], dtype=int32)
"""

Expand All @@ -438,11 +438,13 @@ def __init__(self, missing_values="NaN", features="train", sparse="auto"):

def fit(self, X):
"""Fit the transformer on X.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Input data, where ``n_samples`` is the number of samples and
``n_features`` is the number of features.
Returns
-------
self : object
Expand Down Expand Up @@ -470,32 +472,33 @@ def fit(self, X):
return self

def transform(self, X):
"""Impute all missing values in X.
"""Generate missing values indicator for X.
Parameters
----------
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
The input data to complete.
Returns
-------
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
The transformer with missing indicator.
Xt : {array-like, sparse matrix}, shape = [n_samples, n_features]
The missing indicator for input data
"""
if self.features == "train":
check_is_fitted(self, "feat_with_missing_")

X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64,
force_all_finite=False)

imputer_mask, feat_with_missing = self._get_missing_features_info(X)

if self.features == "train":
features = np.setdiff1d(feat_with_missing,
self.feat_with_missing_)
if features.size:
warnings.warn("The features %s have missing "
"values in transform but have no missing values"
" in fit " % features, RuntimeWarning,
warnings.warn("The features %s have missing values "
"in transform but have no missing values "
"in fit " % features, RuntimeWarning,
stacklevel=1)
imputer_mask = imputer_mask[:, self.feat_with_missing_]

Expand All @@ -522,6 +525,7 @@ def _get_missing_features_info(self, X):
if sparse.issparse(X):
X = X.toarray()
imputer_mask = _get_mask(X, self.missing_values)
imputer_mask = imputer_mask.astype(np.int32, copy=False)
feat_with_missing = np.where(np.any(imputer_mask, axis=0))[0]

if self.sparse is True:
Expand All @@ -531,5 +535,8 @@ def _get_missing_features_info(self, X):
imputer_mask = sparse.csc_matrix(imputer_mask)
elif self.sparse is False and sparse.issparse(imputer_mask):
imputer_mask = imputer_mask.toarray()
elif self.sparse == 'auto' and self.missing_values != 0:
if sparse.issparse(imputer_mask):
imputer_mask = imputer_mask.tocsc()

return imputer_mask, feat_with_missing
76 changes: 41 additions & 35 deletions sklearn/preprocessing/tests/test_imputation.py
Expand Up @@ -380,55 +380,61 @@ def test_missing_indicator():
[11, -1, 1, 1]
])

def assert_type(actual, expect, sp, missing_values):
if sp is True and missing_values != 0:
def assert_type(actual, is_sparse, sp, missing_values):
if sp is True :
assert_equal(actual, sparse.csc_matrix)
elif (sp is True and missing_values == 0) or \
sp is False:
elif (sp is "auto" and missing_values == 0 ) \
or sp is False:
assert_equal(actual, np.ndarray)
else:
print type(retype(X2)), sp, missing_values, type(X2_tr)
assert_equal(actual, expect)
if is_sparse:
assert_equal(actual, sparse.csc_matrix)
else:
assert_equal(actual, np.ndarray)

def assert_mask(actual, expected, features):
if hasattr(actual, 'toarray'):
assert_array_equal(actual.toarray(), expected[:, features])
else:
assert_array_equal(actual, expected[:, features])

for X1, X2, missing_values in [(X1_orig, X2_orig, -1),
(X1_orig + 1, X2_orig + 1, 0)]:
def _check_missing_indicator(X1, X2, retype, sp, missing_values):
mask = X2 == missing_values
expect_feat_missing = np.where(np.any(X1 == missing_values, axis=0))[0]
for retype in [np.array, sparse.csr_matrix,

X1_in = retype(X1)
X2_in = retype(X2)
# features = "train":
MI = MissingIndicator(missing_values=missing_values,
sparse = sp)

MI.fit(X1_in)
X2_tr = MI.transform(X2_in)
features = MI.feat_with_missing_
assert_array_equal(expect_feat_missing, features)
assert_type(type(X2_tr),sparse.issparse(X2_in), sp, missing_values)
assert_mask(X2_tr, mask, features)

# features = "all"
MI = clone(MI).set_params(features="all")
MI.fit(X1_in)
X2_tr = MI.transform(X2_in)
features = np.arange(X2.shape[1])
assert_mask(X2_tr, mask, features)

# features = [1, 2]
features = [1, 2]
MI = clone(MI).set_params(features=features)
MI.fit(X1_in)
X2_tr = MI.transform(X2_in)
assert_mask(X2_tr, mask, features)

for X1, X2, missing_values in [(X1_orig, X2_orig, -1),
(X1_orig + 1, X2_orig + 1, 0)]:
for retype in [lambda x: x.tolist(), np.array, sparse.csr_matrix,
sparse.csc_matrix, sparse.lil_matrix]:
for sp in [True, False, 'auto']:
X1_ft = retype(X1)
X2_t = retype(X2)
# features = "train":
MI = MissingIndicator(missing_values=missing_values,
sparse = sp)

MI.fit(X1_ft)
X2_tr = MI.transform(X2_t)
features = MI.feat_with_missing_
assert_array_equal(expect_feat_missing, features)
assert_type(type(X2_tr), type(X2_t), sp, missing_values)
assert_mask(X2_tr, mask, features)

# features = "all"
MI = clone(MI).set_params(features="all")
MI.fit(X1_ft)
X2_tr = MI.transform(retype(X2))
features = np.arange(X2.shape[1])
assert_mask(X2_tr, mask, features)

# features = [1, 2]
features = [1, 2]
MI = clone(MI).set_params(features=features)
MI.fit(X1_ft)
X2_tr = MI.transform(X2_t)
assert_mask(X2_tr, mask, features)
_check_missing_indicator(X1, X2, retype, sp, missing_values)


def test_missing_indicator_warning():
Expand Down

0 comments on commit 5357a1b

Please sign in to comment.