Skip to content

Commit

Permalink
fix[python]: prevent invalid use of Series, DataFrame, and `LazyF…
Browse files Browse the repository at this point in the history
…rame` in logical boolean context (#4928)
  • Loading branch information
alexander-beedie committed Sep 22, 2022
1 parent 23a2309 commit 17e7cd6
Show file tree
Hide file tree
Showing 14 changed files with 164 additions and 86 deletions.
7 changes: 7 additions & 0 deletions py-polars/polars/internals/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
Callable,
Iterator,
Mapping,
NoReturn,
Sequence,
TextIO,
TypeVar,
Expand Down Expand Up @@ -1052,6 +1053,12 @@ def _compare_to_non_df(
else:
raise ValueError(f"got unexpected comparison operator: {op}")

def __bool__(self) -> NoReturn:
raise ValueError(
"The truth value of a DataFrame is ambiguous. "
"Hint: to check if a DataFrame contains any values, use 'is_empty()'"
)

def __eq__(self, other: Any) -> DataFrame: # type: ignore[override]
return self._comp(other, "eq")

Expand Down
3 changes: 2 additions & 1 deletion py-polars/polars/internals/expr/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,8 @@ def truncate(
2001-01-01 23:00:00
2001-01-02 00:00:00
]
>>> assert s.dt.truncate("1h") == s.dt.truncate(timedelta(hours=1))
>>> s.dt.truncate("1h").series_equal(s.dt.truncate(timedelta(hours=1)))
True
"""
if offset is None:
Expand Down
4 changes: 2 additions & 2 deletions py-polars/polars/internals/expr/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import math
import random
from datetime import date, datetime, time, timedelta
from typing import TYPE_CHECKING, Any, Callable, Sequence
from typing import TYPE_CHECKING, Any, Callable, NoReturn, Sequence
from warnings import warn

from polars import internals as pli
Expand Down Expand Up @@ -140,7 +140,7 @@ def _repr_html_(self) -> str:
def __str__(self) -> str:
return self._pyexpr.to_str()

def __bool__(self) -> Expr:
def __bool__(self) -> NoReturn:
raise ValueError(
"Since Expr are lazy, the truthiness of an Expr is ambiguous. "
"Hint: use '&' or '|' to chain Expr together, not and/or."
Expand Down
8 changes: 7 additions & 1 deletion py-polars/polars/internals/lazyframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import typing
from io import BytesIO, IOBase, StringIO
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Sequence, TypeVar, overload
from typing import TYPE_CHECKING, Any, Callable, NoReturn, Sequence, TypeVar, overload
from warnings import warn

from polars import internals as pli
Expand Down Expand Up @@ -390,6 +390,12 @@ def schema(self) -> Schema:
""" # noqa: E501
return self._ldf.schema()

def __bool__(self) -> NoReturn:
raise ValueError(
"The truth value of a LazyFrame is ambiguous; consequently it "
"cannot be used in boolean context with and/or/not operators. "
)

def __contains__(self: LDF, key: str) -> bool:
return key in self.columns

Expand Down
3 changes: 2 additions & 1 deletion py-polars/polars/internals/series/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,7 @@ def truncate(
2001-01-01 23:00:00
2001-01-02 00:00:00
]
>>> assert s.dt.truncate("1h") == s.dt.truncate(timedelta(hours=1))
>>> s.dt.truncate("1h").series_equal(s.dt.truncate(timedelta(hours=1)))
True
"""
9 changes: 8 additions & 1 deletion py-polars/polars/internals/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import math
from datetime import date, datetime, time, timedelta
from typing import TYPE_CHECKING, Any, Callable, Sequence, Union
from typing import TYPE_CHECKING, Any, Callable, NoReturn, Sequence, Union
from warnings import warn

from polars import internals as pli
Expand Down Expand Up @@ -322,6 +322,13 @@ def time_unit(self) -> TimeUnit | None:
"""Get the time unit of underlying Datetime Series as {"ns", "us", "ms"}."""
return self._s.time_unit()

def __bool__(self) -> NoReturn:
raise ValueError(
"The truth value of a Series is ambiguous. Hint: use '&' or '|' to chain "
"Series boolean results together, not and/or; to check if a Series "
"contains any values, use 'is_empty()'"
)

def __getstate__(self) -> Any:
return self._s.__getstate__()

Expand Down
15 changes: 8 additions & 7 deletions py-polars/tests/unit/io/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import polars as pl
from polars import DataType
from polars.internals.type_aliases import TimeUnit
from polars.testing import assert_frame_equal_local_categoricals
from polars.testing import assert_frame_equal_local_categoricals, assert_series_equal


def test_quoted_date() -> None:
Expand Down Expand Up @@ -230,12 +230,13 @@ def test_read_csv_encoding() -> None:
for use_pyarrow in (False, True):
for file in (file_path, file_str, bts, bytesio):
print(type(file))
assert pl.read_csv(
file, # type: ignore[arg-type]
encoding="big5",
use_pyarrow=use_pyarrow,
).get_column("Region") == pl.Series(
"Region", ["台北", "台中", "新竹", "高雄", "美國"]
assert_series_equal(
pl.read_csv(
file, # type: ignore[arg-type]
encoding="big5",
use_pyarrow=use_pyarrow,
).get_column("Region"),
pl.Series("Region", ["台北", "台中", "新竹", "高雄", "美國"]),
)


Expand Down
8 changes: 4 additions & 4 deletions py-polars/tests/unit/test_datelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import polars as pl
from polars.datatypes import DTYPE_TEMPORAL_UNITS, TemporalDataType
from polars.testing import verify_series_and_expr_api
from polars.testing import assert_series_equal, verify_series_and_expr_api

if TYPE_CHECKING:
from polars.internals.type_aliases import TimeUnit
Expand Down Expand Up @@ -83,7 +83,7 @@ def test_series_add_datetime() -> None:
out = pl.Series(
[datetime(2027, 5, 19), datetime(2054, 10, 4), datetime(2082, 2, 19)]
)
assert (deltas + pl.Series([datetime(2000, 1, 1)])) == out
assert_series_equal(deltas + pl.Series([datetime(2000, 1, 1)]), out)


def test_diff_datetime() -> None:
Expand All @@ -102,7 +102,7 @@ def test_diff_datetime() -> None:
]
).with_columns([pl.col("timestamp").diff().list().over("char")])
)["timestamp"]
assert out[0] == out[1]
assert (out[0] == out[1]).all()


def test_from_pydatetime() -> None:
Expand Down Expand Up @@ -379,7 +379,7 @@ def test_date_range() -> None:
assert result.name == "drange"

result = pl.date_range(date(2022, 1, 1), date(2022, 1, 2), "1h30m")
assert result == [
assert list(result) == [
datetime(2022, 1, 1, 0, 0),
datetime(2022, 1, 1, 1, 30),
datetime(2022, 1, 1, 3, 0),
Expand Down
49 changes: 33 additions & 16 deletions py-polars/tests/unit/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@ def test_init_empty() -> None:
for empty in (None, (), [], {}, pa.Table.from_arrays([])):
df = pl.DataFrame(empty)
assert df.shape == (0, 0)
assert df.is_empty()

# note: cannot use df (empty or otherwise) in boolean context
empty_df = pl.DataFrame()
with pytest.raises(ValueError, match="ambiguous"):
not empty_df


def test_special_char_colname_init() -> None:
Expand Down Expand Up @@ -104,9 +110,9 @@ def test_selection() -> None:
assert df[-1].frame_equal(pl.DataFrame({"a": [3], "b": [3.0], "c": ["c"]}))

# row, column selection when using two dimensions
assert df[:, 0] == [1, 2, 3]
assert df[:, 1] == [1.0, 2.0, 3.0]
assert df[:2, 2] == ["a", "b"]
assert df[:, 0].to_list() == [1, 2, 3]
assert df[:, 1].to_list() == [1.0, 2.0, 3.0]
assert df[:2, 2].to_list() == ["a", "b"]

assert df[[1, 2]].frame_equal(
pl.DataFrame({"a": [2, 3], "b": [2.0, 3.0], "c": ["b", "c"]})
Expand Down Expand Up @@ -938,7 +944,7 @@ def test_column_names() -> None:
def test_lazy_functions() -> None:
df = pl.DataFrame({"a": ["foo", "bar", "2"], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})
out = df.select([pl.count("a")])
assert out["a"] == 3
assert list(out["a"]) == [3]
assert pl.count(df["a"]) == 3
out = df.select(
[
Expand Down Expand Up @@ -989,19 +995,17 @@ def test_lazy_functions() -> None:
def test_multiple_column_sort() -> None:
df = pl.DataFrame({"a": ["foo", "bar", "2"], "b": [2, 2, 3], "c": [1.0, 2.0, 3.0]})
out = df.sort([pl.col("b"), pl.col("c").reverse()])
assert out["c"] == [2, 3, 1]
assert out["b"] == [2, 2, 3]
assert list(out["c"]) == [2.0, 1.0, 3.0]
assert list(out["b"]) == [2, 2, 3]

df = pl.DataFrame({"a": np.arange(1, 4), "b": ["a", "a", "b"]})

df.sort("a", reverse=True).frame_equal(
pl.DataFrame({"a": [3, 2, 1], "b": ["b", "a", "a"]})
)

df.sort("b", reverse=True).frame_equal(
pl.DataFrame({"a": [3, 1, 2], "b": ["b", "a", "a"]})
)

df.sort(["b", "a"], reverse=[False, True]).frame_equal(
pl.DataFrame({"a": [2, 1, 3], "b": ["a", "a", "b"]})
)
Expand Down Expand Up @@ -1062,7 +1066,7 @@ def test_assign() -> None:
df = pl.DataFrame({"a": [1, 2, 3]})
# test if we can assign in case of single column
df = df.with_column(pl.col("a") * 2)
assert df["a"] == [2, 4, 6]
assert list(df["a"]) == [2, 4, 6]


def test_to_numpy() -> None:
Expand All @@ -1071,11 +1075,24 @@ def test_to_numpy() -> None:


def test_argsort_by(df: pl.DataFrame) -> None:
idx_df = df.select(pl.argsort_by(["int_nulls", "floats"], reverse=[False, True]))
assert idx_df["int_nulls"] == [1, 0, 3]
idx_df = df.select(
pl.argsort_by(["int_nulls", "floats"], reverse=[False, True]).alias("idx")
)
assert (idx_df["idx"] == [1, 0, 2]).all()

idx_df = df.select(pl.argsort_by(["int_nulls", "floats"], reverse=False))
assert idx_df["int_nulls"] == [1, 0, 2]
idx_df = df.select(
pl.argsort_by(["int_nulls", "floats"], reverse=False).alias("idx")
)
assert (idx_df["idx"] == [1, 0, 2]).all()

df = pl.DataFrame({"x": [0, 0, 0, 1, 1, 2], "y": [9, 9, 8, 7, 6, 6]})
for expr, expected in (
(pl.argsort_by(["x", "y"]), [2, 0, 1, 4, 3, 5]),
(pl.argsort_by(["x", "y"], reverse=[True, True]), [5, 3, 4, 0, 1, 2]),
(pl.argsort_by(["x", "y"], reverse=[True, False]), [5, 4, 3, 2, 0, 1]),
(pl.argsort_by(["x", "y"], reverse=[False, True]), [0, 1, 2, 3, 4, 5]),
):
assert (df.select(expr.alias("idx"))["idx"] == expected).all()


def test_literal_series() -> None:
Expand Down Expand Up @@ -1176,8 +1193,8 @@ def test_repeat_by() -> None:

out = df.select(pl.col("n").repeat_by("n"))
s = out["n"]
assert s[0] == [2, 2]
assert s[1] == [3, 3, 3]
assert s[0].to_list() == [2, 2]
assert s[1].to_list() == [3, 3, 3]


def test_join_dates() -> None:
Expand Down Expand Up @@ -1561,7 +1578,7 @@ def test_partitioned_groupby_order() -> None:
# we only have 30 groups, so this triggers a partitioned group by
df = pl.DataFrame({"x": [chr(v) for v in range(33, 63)], "y": range(30)})
out = df.groupby("x", maintain_order=True).agg(pl.all().list())
assert out["x"] == df["x"]
assert_series_equal(out["x"], df["x"])


def test_schema() -> None:
Expand Down
10 changes: 10 additions & 0 deletions py-polars/tests/unit/test_exprs.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,3 +412,13 @@ def test_abs_expr() -> None:
out = df.select(abs(pl.col("x")))

assert out["x"].to_list() == [1, 0, 1]


def test_logical_boolean() -> None:
# note, cannot use expressions in logical
# boolean context (eg: and/or/not operators)
with pytest.raises(ValueError, match="ambiguous"):
pl.col("colx") and pl.col("coly")

with pytest.raises(ValueError, match="ambiguous"):
pl.col("colx") or pl.col("coly")
5 changes: 2 additions & 3 deletions py-polars/tests/unit/test_interop.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,13 +229,12 @@ def test_from_dicts() -> None:
assert df.shape == (3, 2)


@pytest.mark.skip("not implemented yet")
def test_from_dicts_struct() -> None:
data = [{"a": {"b": 1, "c": 2}, "d": 5}, {"a": {"b": 3, "c": 4}, "d": 6}]
df = pl.from_dicts(data)
assert df.shape == (2, 2)
assert df["a"][0] == (1, 2)
assert df["a"][1] == (3, 4)
assert df["a"][0] == {"b": 1, "c": 2}
assert df["a"][1] == {"b": 3, "c": 4}


def test_from_records() -> None:
Expand Down

0 comments on commit 17e7cd6

Please sign in to comment.