From dbdd3490d5a863baa0914003e3a742ec0a99cdea Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Sat, 22 Feb 2020 09:37:37 -0600 Subject: [PATCH] ENH: Enable indexing with nullable Boolean (#31591) --- doc/source/user_guide/boolean.rst | 10 +++---- doc/source/user_guide/indexing.rst | 12 ++++++-- doc/source/whatsnew/v1.0.2.rst | 29 +++++++++++++++++-- pandas/core/arrays/datetimelike.py | 4 ++- pandas/core/common.py | 7 +---- pandas/core/indexers.py | 14 ++++----- pandas/core/indexing.py | 7 +++-- .../tests/arrays/categorical/test_indexing.py | 9 ++++-- pandas/tests/extension/base/getitem.py | 20 +++++++------ pandas/tests/extension/base/setitem.py | 18 +++++------- pandas/tests/extension/test_numpy.py | 4 --- pandas/tests/indexing/test_check_indexer.py | 12 ++++---- pandas/tests/indexing/test_na_indexing.py | 27 ++++++++++++----- pandas/tests/series/indexing/test_boolean.py | 2 +- 14 files changed, 109 insertions(+), 66 deletions(-) diff --git a/doc/source/user_guide/boolean.rst b/doc/source/user_guide/boolean.rst index 4f0ad0e8ceaeb5..6370a523b9a0d9 100644 --- a/doc/source/user_guide/boolean.rst +++ b/doc/source/user_guide/boolean.rst @@ -20,8 +20,9 @@ Nullable Boolean data type Indexing with NA values ----------------------- -pandas does not allow indexing with NA values. Attempting to do so -will raise a ``ValueError``. +pandas allows indexing with ``NA`` values in a boolean array, which are treated as ``False``. + +.. versionchanged:: 1.0.2 .. ipython:: python :okexcept: @@ -30,12 +31,11 @@ will raise a ``ValueError``. mask = pd.array([True, False, pd.NA], dtype="boolean") s[mask] -The missing values will need to be explicitly filled with True or False prior -to using the array as a mask. +If you would prefer to keep the ``NA`` values you can manually fill them with ``fillna(True)``. .. ipython:: python - s[mask.fillna(False)] + s[mask.fillna(True)] .. 
_boolean.kleene: diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index a8cdf4a61073d5..2bd3ff626f2e10 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -59,7 +59,7 @@ of multi-axis indexing. slices, **both** the start and the stop are included, when present in the index! See :ref:`Slicing with labels ` and :ref:`Endpoints are inclusive `.) - * A boolean array + * A boolean array (any ``NA`` values will be treated as ``False``). * A ``callable`` function with one argument (the calling Series or DataFrame) and that returns valid output for indexing (one of the above). @@ -75,7 +75,7 @@ of multi-axis indexing. * An integer e.g. ``5``. * A list or array of integers ``[4, 3, 0]``. * A slice object with ints ``1:7``. - * A boolean array. + * A boolean array (any ``NA`` values will be treated as ``False``). * A ``callable`` function with one argument (the calling Series or DataFrame) and that returns valid output for indexing (one of the above). @@ -374,6 +374,14 @@ For getting values with a boolean array: df1.loc['a'] > 0 df1.loc[:, df1.loc['a'] > 0] +NA values in a boolean array propagate as ``False``: + +.. versionchanged:: 1.0.2 + + mask = pd.array([True, False, True, False, pd.NA, False], dtype="boolean") + mask + df1[mask] + For getting a value explicitly: .. ipython:: python diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst index 07afe60c9c22a9..affe019d0ac865 100644 --- a/doc/source/whatsnew/v1.0.2.rst +++ b/doc/source/whatsnew/v1.0.2.rst @@ -26,6 +26,33 @@ Fixed regressions .. --------------------------------------------------------------------------- +Indexing with Nullable Boolean Arrays +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Previously indexing with a nullable Boolean array containing ``NA`` would raise a ``ValueError``, however this is now permitted with ``NA`` being treated as ``False``. (:issue:`31503`) + +..
ipython:: python + + s = pd.Series([1, 2, 3, 4]) + mask = pd.array([True, True, False, None], dtype="boolean") + s + mask + +*pandas 1.0.0-1.0.1* + +.. code-block:: python + + >>> s[mask] + Traceback (most recent call last): + ... + ValueError: cannot mask with array containing NA / NaN values + +*pandas 1.0.2* + +.. ipython:: python + + s[mask] + .. _whatsnew_102.bug_fixes: Bug fixes @@ -45,8 +72,6 @@ Bug fixes - Using ``pd.NA`` with :meth:`DataFrame.to_json` now correctly outputs a null value instead of an empty object (:issue:`31615`) - Fixed bug in parquet roundtrip with nullable unsigned integer dtypes (:issue:`31896`). - - **Experimental dtypes** - Fix bug in :meth:`DataFrame.convert_dtypes` for columns that were already using the ``"string"`` dtype (:issue:`31731`). diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index e39d1dc03adf5e..854075eaa8d09b 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -520,7 +520,9 @@ def __getitem__(self, key): if com.is_bool_indexer(key): # first convert to boolean, because check_array_indexer doesn't # allow object dtype - key = np.asarray(key, dtype=bool) + if is_object_dtype(key): + key = np.asarray(key, dtype=bool) + key = check_array_indexer(self, key) if key.all(): key = slice(0, None, None) diff --git a/pandas/core/common.py b/pandas/core/common.py index 550ce74de53571..705c618fc49dc7 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -118,7 +118,6 @@ def is_bool_indexer(key: Any) -> bool: check_array_indexer : Check that `key` is a valid array to index, and convert to an ndarray. 
""" - na_msg = "cannot mask with array containing NA / NaN values" if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or ( is_array_like(key) and is_extension_array_dtype(key.dtype) ): @@ -126,16 +125,12 @@ def is_bool_indexer(key: Any) -> bool: key = np.asarray(values_from_object(key)) if not lib.is_bool_array(key): + na_msg = "Cannot mask with non-boolean array containing NA / NaN values" if isna(key).any(): raise ValueError(na_msg) return False return True elif is_bool_dtype(key.dtype): - # an ndarray with bool-dtype by definition has no missing values. - # So we only need to check for NAs in ExtensionArrays - if is_extension_array_dtype(key.dtype): - if np.any(key.isna()): - raise ValueError(na_msg) return True elif isinstance(key, list): try: diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index cb48d4be75c4df..5e53b061dd1c8d 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -10,6 +10,7 @@ from pandas.core.dtypes.common import ( is_array_like, is_bool_dtype, + is_extension_array_dtype, is_integer_dtype, is_list_like, ) @@ -366,14 +367,11 @@ def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any: ... IndexError: Boolean index has wrong length: 3 instead of 2. - A ValueError is raised when the mask cannot be converted to - a bool-dtype ndarray. + NA values in a boolean array are treated as False. >>> mask = pd.array([True, pd.NA]) >>> pd.api.indexers.check_array_indexer(arr, mask) - Traceback (most recent call last): - ... 
- ValueError: Cannot mask with a boolean indexer containing NA values + array([ True, False]) A numpy boolean mask will get passed through (if the length is correct): @@ -425,10 +423,10 @@ def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any: dtype = indexer.dtype if is_bool_dtype(dtype): - try: + if is_extension_array_dtype(dtype): + indexer = indexer.to_numpy(dtype=bool, na_value=False) + else: indexer = np.asarray(indexer, dtype=bool) - except ValueError: - raise ValueError("Cannot mask with a boolean indexer containing NA values") # GH26658 if len(indexer) != len(array): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 081f87078d9c9d..5ae237eb7dc323 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -13,6 +13,7 @@ is_iterator, is_list_like, is_numeric_dtype, + is_object_dtype, is_scalar, is_sequence, ) @@ -2189,10 +2190,12 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: "the indexed object do not match)." ) result = result.astype(bool)._values - else: - # key might be sparse / object-dtype bool, check_array_indexer needs bool array + elif is_object_dtype(key): + # key might be object-dtype bool, check_array_indexer needs bool array result = np.asarray(result, dtype=bool) result = check_array_indexer(index, result) + else: + result = check_array_indexer(index, result) return result diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 85d5a6a3dc3ac7..3d9469c2529143 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -240,14 +240,17 @@ def test_mask_with_boolean(index): @pytest.mark.parametrize("index", [True, False]) -def test_mask_with_boolean_raises(index): +def test_mask_with_boolean_na_treated_as_false(index): + # https://github.com/pandas-dev/pandas/issues/31503 s = Series(range(3)) idx = Categorical([True, False, None]) if index: idx = CategoricalIndex(idx) - 
with pytest.raises(ValueError, match="NA / NaN"): - s[idx] + result = s[idx] + expected = s[idx.fillna(False)] + + tm.assert_series_equal(result, expected) @pytest.fixture diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 8615a8df22dcc6..b08a64cc076b64 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -158,21 +158,23 @@ def test_getitem_boolean_array_mask(self, data): result = pd.Series(data)[mask] self.assert_series_equal(result, expected) - def test_getitem_boolean_array_mask_raises(self, data): + def test_getitem_boolean_na_treated_as_false(self, data): + # https://github.com/pandas-dev/pandas/issues/31503 mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean") mask[:2] = pd.NA + mask[2:4] = True - msg = ( - "Cannot mask with a boolean indexer containing NA values|" - "cannot mask with array containing NA / NaN values" - ) - with pytest.raises(ValueError, match=msg): - data[mask] + result = data[mask] + expected = data[mask.fillna(False)] + + self.assert_extension_array_equal(result, expected) s = pd.Series(data) - with pytest.raises(ValueError): - s[mask] + result = s[mask] + expected = s[mask.fillna(False)] + + self.assert_series_equal(result, expected) @pytest.mark.parametrize( "idx", diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index af70799c0236ee..a4fe89df158fab 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -98,8 +98,9 @@ def test_setitem_iloc_scalar_multiple_homogoneous(self, data): [ np.array([True, True, True, False, False]), pd.array([True, True, True, False, False], dtype="boolean"), + pd.array([True, True, True, pd.NA, pd.NA], dtype="boolean"), ], - ids=["numpy-array", "boolean-array"], + ids=["numpy-array", "boolean-array", "boolean-array-na"], ) def test_setitem_mask(self, data, mask, box_in_series): arr = data[:5].copy() @@ -124,20 
+125,17 @@ def test_setitem_mask_raises(self, data, box_in_series): with pytest.raises(IndexError, match="wrong length"): data[mask] = data[0] - def test_setitem_mask_boolean_array_raises(self, data, box_in_series): - # missing values in mask + def test_setitem_mask_boolean_array_with_na(self, data, box_in_series): mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean") - mask[:2] = pd.NA + mask[:3] = True + mask[3:5] = pd.NA if box_in_series: data = pd.Series(data) - msg = ( - "Cannot mask with a boolean indexer containing NA values|" - "cannot mask with array containing NA / NaN values" - ) - with pytest.raises(ValueError, match=msg): - data[mask] = data[0] + data[mask] = data[0] + + assert (data[:3] == data[0]).all() @pytest.mark.parametrize( "idx", diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 80a093530a8cd2..61c5925383f88f 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -415,10 +415,6 @@ def test_setitem_mask(self, data, mask, box_in_series): def test_setitem_mask_raises(self, data, box_in_series): super().test_setitem_mask_raises(data, box_in_series) - @skip_nested - def test_setitem_mask_boolean_array_raises(self, data, box_in_series): - super().test_setitem_mask_boolean_array_raises(data, box_in_series) - @skip_nested @pytest.mark.parametrize( "idx", diff --git a/pandas/tests/indexing/test_check_indexer.py b/pandas/tests/indexing/test_check_indexer.py index 82f8c12229824e..69d4065234d934 100644 --- a/pandas/tests/indexing/test_check_indexer.py +++ b/pandas/tests/indexing/test_check_indexer.py @@ -34,12 +34,14 @@ def test_valid_input(indexer, expected): @pytest.mark.parametrize( "indexer", [[True, False, None], pd.array([True, False, None], dtype="boolean")], ) -def test_bool_raise_missing_values(indexer): - array = np.array([1, 2, 3]) +def test_boolean_na_returns_indexer(indexer): + # https://github.com/pandas-dev/pandas/issues/31503 + arr = 
np.array([1, 2, 3]) - msg = "Cannot mask with a boolean indexer containing NA values" - with pytest.raises(ValueError, match=msg): - check_array_indexer(array, indexer) + result = check_array_indexer(arr, indexer) + expected = np.array([True, False, False], dtype=bool) + + tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/indexing/test_na_indexing.py b/pandas/tests/indexing/test_na_indexing.py index befe4fee8ecf88..345ca30ec77ebc 100644 --- a/pandas/tests/indexing/test_na_indexing.py +++ b/pandas/tests/indexing/test_na_indexing.py @@ -62,18 +62,29 @@ def test_series_mask_boolean(values, dtype, mask, box_mask, frame): @pytest.mark.parametrize("frame", [True, False]) -def test_indexing_with_na_raises(frame): +def test_na_treated_as_false(frame): + # https://github.com/pandas-dev/pandas/issues/31503 s = pd.Series([1, 2, 3], name="name") if frame: s = s.to_frame() + mask = pd.array([True, False, None], dtype="boolean") - match = "cannot mask with array containing NA / NaN values" - with pytest.raises(ValueError, match=match): - s[mask] - with pytest.raises(ValueError, match=match): - s.loc[mask] + result = s[mask] + expected = s[mask.fillna(False)] + + result_loc = s.loc[mask] + expected_loc = s.loc[mask.fillna(False)] - with pytest.raises(ValueError, match=match): - s.iloc[mask] + result_iloc = s.iloc[mask] + expected_iloc = s.iloc[mask.fillna(False)] + + if frame: + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result_loc, expected_loc) + tm.assert_frame_equal(result_iloc, expected_iloc) + else: + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result_loc, expected_loc) + tm.assert_series_equal(result_iloc, expected_iloc) diff --git a/pandas/tests/series/indexing/test_boolean.py b/pandas/tests/series/indexing/test_boolean.py index 28f3c0f7429f8e..8878a4a6526af6 100644 --- a/pandas/tests/series/indexing/test_boolean.py +++ b/pandas/tests/series/indexing/test_boolean.py @@ -72,7 +72,7 
@@ def test_getitem_boolean_object(string_series): # nans raise exception omask[5:10] = np.nan - msg = "cannot mask with array containing NA / NaN values" + msg = "Cannot mask with non-boolean array containing NA / NaN values" with pytest.raises(ValueError, match=msg): s[omask] with pytest.raises(ValueError, match=msg):