
[MRG] FIX change boolean array-likes indexing in old NumPy version #14510

Merged · 8 commits · Aug 2, 2019 · Changes from 7 commits
8 changes: 8 additions & 0 deletions doc/whats_new/v0.22.rst
@@ -61,6 +61,14 @@ Changelog
  `sample_weights` are not supported by the wrapped estimator). :pr:`13575`
  by :user:`William de Vazelhes <wdevazelhes>`.

:mod:`sklearn.compose`
......................

- |Fix| Fixed a bug in :class:`compose.ColumnTransformer` which failed to
  select the proper columns when using a boolean list with NumPy older
  than 1.13.
  :pr:`14510` by :user:`Guillaume Lemaitre <glemaitre>`.

:mod:`sklearn.datasets`
.......................

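The changelog entry above comes down to a NumPy behaviour change: before 1.13, a plain boolean list used for indexing was interpreted as integer indices rather than as a mask. A minimal sketch of the difference (illustrative only, not part of the diff; the pre-1.13 comment describes the legacy behaviour this PR works around):

import numpy as np

X = np.arange(12).reshape(3, 4)
mask = [False, True, False, True]   # a boolean *list*, not an ndarray

# On NumPy >= 1.13 the list acts as a boolean mask and selects 2 columns;
# on older versions it was read as the integer indices [0, 1, 0, 1],
# which is how ColumnTransformer ended up selecting the wrong columns.
print(X[:, mask].shape)

# Converting the mask to an ndarray gives boolean semantics on all versions.
print(X[:, np.asarray(mask)].shape)   # (3, 2)
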
15 changes: 15 additions & 0 deletions sklearn/compose/tests/test_column_transformer.py
@@ -16,6 +16,7 @@
from sklearn.base import BaseEstimator
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.exceptions import NotFittedError
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler, Normalizer, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer

@@ -1108,3 +1109,17 @@ def test_column_transformer_reordered_column_names_remainder(explicit_colname):
    err_msg = 'Specifying the columns'
    with pytest.raises(ValueError, match=err_msg):
        tf.transform(X_array)


@pytest.mark.parametrize("array_type", [np.asarray, sparse.csr_matrix])
def test_column_transformer_mask_indexing(array_type):
# Regression test for #14510
# Boolean array-like does not behave as boolean array with NumPy < 1.13
# and sparse matrices as well
X = np.transpose([[1, 2, 3], [4, 5, 6], [5, 6, 7], [8, 9, 10]])
X = array_type(X)
column_transformer = ColumnTransformer(
[('identity', FunctionTransformer(), [False, True, False, True])]
)
X_trans = column_transformer.fit_transform(X)
assert X_trans.shape == (3, 2)
22 changes: 19 additions & 3 deletions sklearn/utils/__init__.py
@@ -18,6 +18,7 @@
from . import _joblib
from ..exceptions import DataConversionWarning
from .deprecation import deprecated
from .fixes import np_version
from .validation import (as_float_array,
assert_all_finite,
check_random_state, column_or_1d, check_array,
@@ -225,6 +226,21 @@ def safe_indexing(X, indices, axis=0):
)


def _array_indexing(array, key, axis=0):
    """Index an array consistently across NumPy versions."""
    if axis not in (0, 1):
        raise ValueError(
            "'axis' should be either 0 (to index rows) or 1 (to index "
            "columns). Got {} instead.".format(axis)
        )
    if np_version < (1, 13) or issparse(array):
        # check if we have a boolean array-like to make the proper indexing
        key_array = np.asarray(key)
        if np.issubdtype(key_array.dtype, np.bool_):
            key = key_array
    return array[key] if axis == 0 else array[:, key]
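
A short usage sketch of the helper above (illustrative only, not part of the diff), assuming _array_indexing is importable from sklearn.utils as in the tests added below: a boolean list and a boolean ndarray now select the same columns for dense and sparse inputs alike.

import numpy as np
from scipy import sparse
from sklearn.utils import _array_indexing   # private helper introduced here

X_dense = np.array([[1, 2, 3], [4, 5, 6]])
X_sparse = sparse.csr_matrix(X_dense)
mask = [True, False, True]                  # boolean array-like

# The mask is converted to an ndarray before indexing when needed
# (NumPy < 1.13 or sparse input), so both calls select the same columns.
cols_dense = _array_indexing(X_dense, mask, axis=1)
cols_sparse = _array_indexing(X_sparse, mask, axis=1)
assert cols_dense.shape == cols_sparse.shape == (2, 2)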


def _safe_indexing_row(X, indices):
"""Return items or rows from X using indices.

@@ -266,7 +282,7 @@ def _safe_indexing_row(X, indices):
            # This is often substantially faster than X[indices]
            return X.take(indices, axis=0)
        else:
            return X[indices]
            return _array_indexing(X, indices, axis=0)
    else:
        return [X[idx] for idx in indices]

@@ -356,7 +372,7 @@ def _safe_indexing_column(X, key):
        return X.iloc[:, key]
    else:
        # numpy arrays, sparse arrays
        return X[:, key]
        return _array_indexing(X, key, axis=1)


def _get_column_indices(X, key):
@@ -371,7 +387,7 @@ def _get_column_indices(X, key):
            or hasattr(key, 'dtype') and np.issubdtype(key.dtype, np.bool_)):
        # Convert key into positive indexes
        try:
            idx = np.arange(n_columns)[key]
            idx = safe_indexing(np.arange(n_columns), key)
        except IndexError as e:
            raise ValueError(
                'all features must be in [0, %d]' % (n_columns - 1)
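
For the _get_column_indices hunk above, a sketch (illustrative only, not part of the diff) of what the changed line computes when the key is a boolean mask: applying the mask to np.arange(n_columns) yields the positional column indices, and going through safe_indexing makes a plain boolean list behave like an ndarray mask on old NumPy as well.

import numpy as np

n_columns = 4
key = [False, True, False, True]   # boolean array-like column selector

# Equivalent of `idx = safe_indexing(np.arange(n_columns), key)`:
# the mask picks positions out of [0, 1, 2, 3], i.e. the positive
# column indices that the transformer should operate on.
idx = np.arange(n_columns)[np.asarray(key)]
print(idx)   # [1 3]
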
30 changes: 30 additions & 0 deletions sklearn/utils/tests/test_utils.py
@@ -10,8 +10,10 @@

from sklearn.utils.testing import (assert_raises,
assert_array_equal,
assert_allclose_dense_sparse,
assert_raises_regex,
assert_warns_message, assert_no_warnings)
from sklearn.utils import _array_indexing
from sklearn.utils import check_random_state
from sklearn.utils import _check_key_type
from sklearn.utils import deprecated
@@ -365,6 +367,34 @@ def test_safe_indexing_mock_pandas(asarray):
    assert_array_equal(np.array(X_df_indexed), X_indexed)


@pytest.mark.parametrize("array_type", ['array', 'sparse', 'dataframe'])
def test_safe_indexing_mask_axis_1(array_type):
    # regression test for #14510
    # check that boolean array-like and boolean array lead to the same indexing
    # even in NumPy < 1.13
    if array_type == 'array':
        array_constructor = np.asarray
    elif array_type == 'sparse':
        array_constructor = sp.csr_matrix
    elif array_type == 'dataframe':
        pd = pytest.importorskip('pandas')
        array_constructor = pd.DataFrame

    X = array_constructor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    mask = [True, False, True]
    mask_array = np.array(mask)
    X_masked = safe_indexing(X, mask, axis=1)
    X_masked_array = safe_indexing(X, mask_array, axis=1)
    assert_allclose_dense_sparse(X_masked, X_masked_array)


def test_array_indexing_array_error():
    X = np.array([[0, 1], [2, 3]])
    mask = [True, False]
    with pytest.raises(ValueError, match="'axis' should be either 0"):
        _array_indexing(X, mask, axis=3)


def test_shuffle_on_ndim_equals_three():
    def to_tuple(A):  # to make the inner arrays hashable
        return tuple(tuple(tuple(C) for C in B) for B in A)