Skip to content

Commit

Permalink
Remove deprecated boolean masks for DataFrame.__getitem__ (#4342)
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego committed Aug 9, 2022
1 parent 7af0082 commit c03fbd3
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 53 deletions.
36 changes: 4 additions & 32 deletions py-polars/polars/internals/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
from polars import internals as pli
from polars._html import NotebookFormatter
from polars.datatypes import (
Boolean,
ColumnsType,
DataType,
Int8,
Expand Down Expand Up @@ -53,7 +52,6 @@
_process_null_values,
format_path,
handle_projection_columns,
is_bool_sequence,
is_int_sequence,
is_str_sequence,
range_to_slice,
Expand Down Expand Up @@ -117,10 +115,8 @@
# MultiRowSelector indexes into the vertical axis and
# MultiColSelector indexes into the horizontal axis
# NOTE: wrapping these as strings is necessary for Python <3.10
MultiRowSelector: TypeAlias = "slice | range | list[int] | list[bool] | pli.Series"
MultiColSelector: TypeAlias = (
"slice | range | list[int] | list[bool] | list[str] | pli.Series"
)
MultiRowSelector: TypeAlias = "slice | range | list[int] | pli.Series"
MultiColSelector: TypeAlias = "slice | range | list[int] | list[str] | pli.Series"


def wrap_df(df: PyDataFrame) -> DataFrame:
Expand Down Expand Up @@ -1714,21 +1710,6 @@ def __getitem__(
df = self.__getitem__(self.columns[col_selection])
return df[row_selection]

# slice and boolean mask
# df[:2, [True, False, True]]
if isinstance(col_selection, (Sequence, pli.Series)):
if (
isinstance(col_selection[0], bool)
or isinstance(col_selection, pli.Series)
and col_selection.dtype() == Boolean
):
df = self.__getitem__(row_selection)
select = []
for col, valid in zip(df.columns, col_selection):
if valid:
select.append(col)
return df.select(select)

# single slice
# df[:, unknown]
series = self.__getitem__(col_selection)
Expand Down Expand Up @@ -1795,27 +1776,18 @@ def __getitem__(
)
if isinstance(item[0], str):
return self._from_pydf(self._df.select(item))
if item.dtype == bool:
warnings.warn(
"index notation '[]' is deprecated for boolean masks. Consider"
" using 'filter'.",
DeprecationWarning,
)
return self._from_pydf(self._df.filter(pli.Series("", item).inner()))

if is_str_sequence(item, allow_str=False):
# select multiple columns
# df[["foo", "bar"]]
return self._from_pydf(self._df.select(item))
elif is_bool_sequence(item) or is_int_sequence(item):
elif is_int_sequence(item):
item = pli.Series("", item) # fall through to next if isinstance

if isinstance(item, pli.Series):
dtype = item.dtype
if dtype == Utf8:
return self._from_pydf(self._df.select(item))
if dtype == Boolean:
return self._from_pydf(self._df.filter(item.inner()))
if dtype == UInt32:
return self._from_pydf(self._df.take_with_series(item.inner()))
if dtype in {UInt8, UInt16, UInt64, Int8, Int16, Int32, Int64}:
Expand Down Expand Up @@ -2668,7 +2640,7 @@ def drop_nulls(self: DF, subset: str | list[str] | None = None) -> DF:
Drop a column if all values are null:
>>> df[:, [not (s.null_count() == df.height) for s in df]]
>>> df[[s.name for s in df if not (s.null_count() == df.height)]]
shape: (4, 2)
┌──────┬──────┐
│ b ┆ c │
Expand Down
30 changes: 9 additions & 21 deletions py-polars/tests/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,8 @@ def test_selection() -> None:
assert df.get_column("a").to_list() == [1, 2, 3]

# select columns by slice or by name (boolean masks are no longer supported)
assert df[:2, [True, False, False]].shape == (2, 1)
assert df[:2, pl.Series([True, False, False])].shape == (2, 1)
assert df[:2, :1].shape == (2, 1)
assert df[:2, "a"].shape == (2, 1)

# column selection by string(s) in first dimension
assert df["a"].to_list() == [1, 2, 3]
Expand All @@ -111,9 +111,6 @@ def test_selection() -> None:
pl.DataFrame({"a": [3, 2], "b": [3.0, 2.0], "c": ["c", "b"]})
)

assert df[[True, False, True]].frame_equal(
pl.DataFrame({"a": [1, 3], "b": [1.0, 3.0], "c": ["a", "c"]})
)
assert df[["a", "b"]].columns == ["a", "b"]
assert df[[1, 2], [1, 2]].frame_equal(
pl.DataFrame({"b": [2.0, 3.0], "c": ["b", "c"]})
Expand All @@ -122,9 +119,6 @@ def test_selection() -> None:
assert df[1, 1] == 2.0
assert df[2, 0] == 3

assert df[[True, False, True], "b"].shape == (2, 1)
assert df[[True, False, False], ["a", "b"]].shape == (1, 2)

assert df[[0, 1], "b"].shape == (2, 1)
assert df[[2], ["a", "b"]].shape == (1, 2)
assert df.to_series(0).name == "a"
Expand Down Expand Up @@ -1644,12 +1638,6 @@ def test_get_item() -> None:
with pytest.raises(ValueError):
_ = df[np.array([1.0])]

# using boolean masks with numpy is deprecated
with pytest.deprecated_call():
assert df[np.array([True, False, False, True])].frame_equal(
pl.DataFrame({"a": [1.0, 4.0], "b": [3, 6]})
)

# sequences (lists or tuples; tuple only if length != 2)
# if strings or list of expressions, assumed to be column names
# bools are no longer accepted as a row mask; use `filter` instead
Expand All @@ -1659,9 +1647,6 @@ def test_get_item() -> None:
assert df[[1, -4, -1, 2, 1]].frame_equal(
pl.DataFrame({"a": [2.0, 1.0, 4.0, 3.0, 2.0], "b": [4, 3, 6, 5, 4]})
)
assert df[[False, True, True, False]].frame_equal(
pl.DataFrame({"a": [2.0, 3.0], "b": [4, 5]})
)

# pl.Series: strings for column selections.
assert df[pl.Series("", ["a", "b"])].frame_equal(df)
Expand All @@ -1687,10 +1672,13 @@ def test_get_item() -> None:
pl.DataFrame({"a": [4.0, 1.0, 2.0, 3.0, 4.0, 1.0], "b": [6, 3, 4, 5, 6, 3]})
)

# pl.Series: boolean masks for row selection.
assert df[pl.Series("", [False, True, True, False])].frame_equal(
pl.DataFrame({"a": [2.0, 3.0], "b": [4, 5]})
)
# Boolean masks not supported
with pytest.raises(ValueError):
df[np.array([True, False, True])]
with pytest.raises(ValueError):
df[[True, False, True], [False, True]] # type: ignore[index]
with pytest.raises(ValueError):
df[pl.Series([True, False, True]), "b"]


@pytest.mark.parametrize("as_series,inner_dtype", [(True, pl.Series), (False, list)])
Expand Down

0 comments on commit c03fbd3

Please sign in to comment.