
[MRG] FIX change boolean array-likes indexing in old NumPy version #14510

Merged · 8 commits · Aug 2, 2019 · Changes from 7 commits
8 changes: 8 additions & 0 deletions doc/whats_new/v0.22.rst
@@ -61,6 +61,14 @@ Changelog
  `sample_weights` are not supported by the wrapped estimator). :pr:`13575`
  by :user:`William de Vazelhes <wdevazelhes>`.

:mod:`sklearn.compose`
......................

- |Fix| Fixed a bug in :class:`compose.ColumnTransformer` which failed to
  select the proper columns when using a boolean list with NumPy older
  than 1.13.
  :pr:`14510` by :user:`Guillaume Lemaitre <glemaitre>`.

:mod:`sklearn.datasets`
.......................

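The changelog entry above comes down to a NumPy behaviour change: before 1.13, a plain boolean list used for indexing was interpreted as integer indices rather than as a mask. A minimal sketch of the difference (illustrative only, not part of the diff; the pre-1.13 comment describes the legacy behaviour this PR works around):

import numpy as np

X = np.arange(12).reshape(3, 4)
mask = [False, True, False, True]   # a boolean *list*, not an ndarray

# On NumPy >= 1.13 the list acts as a boolean mask and selects 2 columns;
# on older versions it was read as the integer indices [0, 1, 0, 1],
# which is how ColumnTransformer ended up selecting the wrong columns.
print(X[:, mask].shape)

# Converting the mask to an ndarray gives boolean semantics on all versions.
print(X[:, np.asarray(mask)].shape)   # (3, 2)
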
15 changes: 15 additions & 0 deletions sklearn/compose/tests/test_column_transformer.py
@@ -16,6 +16,7 @@
from sklearn.base import BaseEstimator
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.exceptions import NotFittedError
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler, Normalizer, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer

@@ -1108,3 +1109,17 @@ def test_column_transformer_reordered_column_names_remainder(explicit_colname):
    err_msg = 'Specifying the columns'
    with pytest.raises(ValueError, match=err_msg):
        tf.transform(X_array)


@pytest.mark.parametrize("array_type", [np.asarray, sparse.csr_matrix])
def test_column_transformer_mask_indexing(array_type):
# Regression test for #14510
# Boolean array-like does not behave as boolean array with NumPy < 1.13
# and sparse matrices as well
X = np.transpose([[1, 2, 3], [4, 5, 6], [5, 6, 7], [8, 9, 10]])
X = array_type(X)
column_transformer = ColumnTransformer(
[('identity', FunctionTransformer(), [False, True, False, True])]
)
X_trans = column_transformer.fit_transform(X)
assert X_trans.shape == (3, 2)
22 changes: 19 additions & 3 deletions sklearn/utils/__init__.py
@@ -18,6 +18,7 @@
from . import _joblib
from ..exceptions import DataConversionWarning
from .deprecation import deprecated
from .fixes import np_version
from .validation import (as_float_array,
assert_all_finite,
check_random_state, column_or_1d, check_array,
@@ -225,6 +226,21 @@ def safe_indexing(X, indices, axis=0):
)


def _array_indexing(array, key, axis=0):
    """Index an array consistently across NumPy versions."""
    if axis not in (0, 1):
        raise ValueError(
            "'axis' should be either 0 (to index rows) or 1 (to index "
            "columns). Got {} instead.".format(axis)
        )
    if np_version < (1, 13) or issparse(array):
        # check if we have a boolean array-like to make the proper indexing
        key_array = np.asarray(key)
        if np.issubdtype(key_array.dtype, np.bool_):
            key = key_array
    return array[key] if axis == 0 else array[:, key]
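
A short usage sketch of the helper above (illustrative only, not part of the diff), assuming _array_indexing is importable from sklearn.utils as in the tests added below: a boolean list and a boolean ndarray now select the same columns for dense and sparse inputs alike.

import numpy as np
from scipy import sparse
from sklearn.utils import _array_indexing   # private helper introduced here

X_dense = np.array([[1, 2, 3], [4, 5, 6]])
X_sparse = sparse.csr_matrix(X_dense)
mask = [True, False, True]                  # boolean array-like

# The mask is converted to an ndarray before indexing when needed
# (NumPy < 1.13 or sparse input), so both calls select the same columns.
cols_dense = _array_indexing(X_dense, mask, axis=1)
cols_sparse = _array_indexing(X_sparse, mask, axis=1)
assert cols_dense.shape == cols_sparse.shape == (2, 2)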


def _safe_indexing_row(X, indices):
"""Return items or rows from X using indices.

@@ -266,7 +282,7 @@ def _safe_indexing_row(X, indices):
            # This is often substantially faster than X[indices]
            return X.take(indices, axis=0)
        else:
            return X[indices]
            return _array_indexing(X, indices, axis=0)
    else:
        return [X[idx] for idx in indices]

@@ -356,7 +372,7 @@ def _safe_indexing_column(X, key):
        return X.iloc[:, key]
    else:
        # numpy arrays, sparse arrays
        return X[:, key]
        return _array_indexing(X, key, axis=1)


def _get_column_indices(X, key):
@@ -371,7 +387,7 @@ def _get_column_indices(X, key):
            or hasattr(key, 'dtype') and np.issubdtype(key.dtype, np.bool_)):
        # Convert key into positive indexes
        try:
            idx = np.arange(n_columns)[key]
            idx = safe_indexing(np.arange(n_columns), key)
        except IndexError as e:
            raise ValueError(
                'all features must be in [0, %d]' % (n_columns - 1)
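
For the _get_column_indices hunk above, a sketch (illustrative only, not part of the diff) of what the changed line computes when the key is a boolean mask: applying the mask to np.arange(n_columns) yields the positional column indices, and going through safe_indexing makes a plain boolean list behave like an ndarray mask on old NumPy as well.

import numpy as np

n_columns = 4
key = [False, True, False, True]   # boolean array-like column selector

# Equivalent of `idx = safe_indexing(np.arange(n_columns), key)`:
# the mask picks positions out of [0, 1, 2, 3], i.e. the positive
# column indices that the transformer should operate on.
idx = np.arange(n_columns)[np.asarray(key)]
print(idx)   # [1 3]
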
30 changes: 30 additions & 0 deletions sklearn/utils/tests/test_utils.py
@@ -10,8 +10,10 @@

from sklearn.utils.testing import (assert_raises,
assert_array_equal,
assert_allclose_dense_sparse,
assert_raises_regex,
assert_warns_message, assert_no_warnings)
from sklearn.utils import _array_indexing
from sklearn.utils import check_random_state
from sklearn.utils import _check_key_type
from sklearn.utils import deprecated
@@ -365,6 +367,34 @@ def test_safe_indexing_mock_pandas(asarray):
    assert_array_equal(np.array(X_df_indexed), X_indexed)


@pytest.mark.parametrize("array_type", ['array', 'sparse', 'dataframe'])
def test_safe_indexing_mask_axis_1(array_type):
    # regression test for #14510
    # check that boolean array-like and boolean array lead to the same indexing
    # even in NumPy < 1.13
    if array_type == 'array':
        array_constructor = np.asarray
    elif array_type == 'sparse':
        array_constructor = sp.csr_matrix
    elif array_type == 'dataframe':
        pd = pytest.importorskip('pandas')
        array_constructor = pd.DataFrame

    X = array_constructor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    mask = [True, False, True]
    mask_array = np.array(mask)
    X_masked = safe_indexing(X, mask, axis=1)
    X_masked_array = safe_indexing(X, mask_array, axis=1)
    assert_allclose_dense_sparse(X_masked, X_masked_array)


def test_array_indexing_array_error():
    X = np.array([[0, 1], [2, 3]])
    mask = [True, False]
    with pytest.raises(ValueError, match="'axis' should be either 0"):
        _array_indexing(X, mask, axis=3)


def test_shuffle_on_ndim_equals_three():
    def to_tuple(A):  # to make the inner arrays hashable
        return tuple(tuple(tuple(C) for C in B) for B in A)