fix[python]: prevent invalid use of Series, DataFrame, and `LazyF…

…rame` in logical boolean context (#4928)
pola-rs · Sep 22, 2022 · 17e7cd6 · 17e7cd6
1 parent 23a2309
commit 17e7cd6
Show file tree

Hide file tree

Showing 14 changed files with 164 additions and 86 deletions.
diff --git a/py-polars/polars/internals/dataframe/frame.py b/py-polars/polars/internals/dataframe/frame.py
@@ -13,6 +13,7 @@
     Callable,
     Iterator,
     Mapping,
+    NoReturn,
     Sequence,
     TextIO,
     TypeVar,
@@ -1052,6 +1053,12 @@ def _compare_to_non_df(
         else:
             raise ValueError(f"got unexpected comparison operator: {op}")
 
+    def __bool__(self) -> NoReturn:
+        raise ValueError(
+            "The truth value of a DataFrame is ambiguous. "
+            "Hint: to check if a DataFrame contains any values, use 'is_empty()'"
+        )
+
     def __eq__(self, other: Any) -> DataFrame:  # type: ignore[override]
         return self._comp(other, "eq")
 

diff --git a/py-polars/polars/internals/expr/datetime.py b/py-polars/polars/internals/expr/datetime.py
@@ -120,7 +120,8 @@ def truncate(
             2001-01-01 23:00:00
             2001-01-02 00:00:00
         ]
-        >>> assert s.dt.truncate("1h") == s.dt.truncate(timedelta(hours=1))
+        >>> s.dt.truncate("1h").series_equal(s.dt.truncate(timedelta(hours=1)))
+        True
 
         """
         if offset is None:

diff --git a/py-polars/polars/internals/expr/expr.py b/py-polars/polars/internals/expr/expr.py
@@ -3,7 +3,7 @@
 import math
 import random
 from datetime import date, datetime, time, timedelta
-from typing import TYPE_CHECKING, Any, Callable, Sequence
+from typing import TYPE_CHECKING, Any, Callable, NoReturn, Sequence
 from warnings import warn
 
 from polars import internals as pli
@@ -140,7 +140,7 @@ def _repr_html_(self) -> str:
     def __str__(self) -> str:
         return self._pyexpr.to_str()
 
-    def __bool__(self) -> Expr:
+    def __bool__(self) -> NoReturn:
         raise ValueError(
             "Since Expr are lazy, the truthiness of an Expr is ambiguous. "
             "Hint: use '&' or '|' to chain Expr together, not and/or."

diff --git a/py-polars/polars/internals/lazyframe/frame.py b/py-polars/polars/internals/lazyframe/frame.py
@@ -7,7 +7,7 @@
 import typing
 from io import BytesIO, IOBase, StringIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Sequence, TypeVar, overload
+from typing import TYPE_CHECKING, Any, Callable, NoReturn, Sequence, TypeVar, overload
 from warnings import warn
 
 from polars import internals as pli
@@ -390,6 +390,12 @@ def schema(self) -> Schema:
         """  # noqa: E501
         return self._ldf.schema()
 
+    def __bool__(self) -> NoReturn:
+        raise ValueError(
+            "The truth value of a LazyFrame is ambiguous; consequently it "
+            "cannot be used in boolean context with and/or/not operators. "
+        )
+
     def __contains__(self: LDF, key: str) -> bool:
         return key in self.columns
 

diff --git a/py-polars/polars/internals/series/datetime.py b/py-polars/polars/internals/series/datetime.py
@@ -500,6 +500,7 @@ def truncate(
             2001-01-01 23:00:00
             2001-01-02 00:00:00
         ]
-        >>> assert s.dt.truncate("1h") == s.dt.truncate(timedelta(hours=1))
+        >>> s.dt.truncate("1h").series_equal(s.dt.truncate(timedelta(hours=1)))
+        True
 
         """
diff --git a/py-polars/polars/internals/series/series.py b/py-polars/polars/internals/series/series.py
@@ -2,7 +2,7 @@
 
 import math
 from datetime import date, datetime, time, timedelta
-from typing import TYPE_CHECKING, Any, Callable, Sequence, Union
+from typing import TYPE_CHECKING, Any, Callable, NoReturn, Sequence, Union
 from warnings import warn
 
 from polars import internals as pli
@@ -322,6 +322,13 @@ def time_unit(self) -> TimeUnit | None:
         """Get the time unit of underlying Datetime Series as {"ns", "us", "ms"}."""
         return self._s.time_unit()
 
+    def __bool__(self) -> NoReturn:
+        raise ValueError(
+            "The truth value of a Series is ambiguous. Hint: use '&' or '|' to chain "
+            "Series boolean results together, not and/or; to check if a Series "
+            "contains any values, use 'is_empty()'"
+        )
+
     def __getstate__(self) -> Any:
         return self._s.__getstate__()
 

diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py
@@ -13,7 +13,7 @@
 import polars as pl
 from polars import DataType
 from polars.internals.type_aliases import TimeUnit
-from polars.testing import assert_frame_equal_local_categoricals
+from polars.testing import assert_frame_equal_local_categoricals, assert_series_equal
 
 
 def test_quoted_date() -> None:
@@ -230,12 +230,13 @@ def test_read_csv_encoding() -> None:
     for use_pyarrow in (False, True):
         for file in (file_path, file_str, bts, bytesio):
             print(type(file))
-            assert pl.read_csv(
-                file,  # type: ignore[arg-type]
-                encoding="big5",
-                use_pyarrow=use_pyarrow,
-            ).get_column("Region") == pl.Series(
-                "Region", ["台北", "台中", "新竹", "高雄", "美國"]
+            assert_series_equal(
+                pl.read_csv(
+                    file,  # type: ignore[arg-type]
+                    encoding="big5",
+                    use_pyarrow=use_pyarrow,
+                ).get_column("Region"),
+                pl.Series("Region", ["台北", "台中", "新竹", "高雄", "美國"]),
             )
 
 

diff --git a/py-polars/tests/unit/test_datelike.py b/py-polars/tests/unit/test_datelike.py
@@ -17,7 +17,7 @@
 
 import polars as pl
 from polars.datatypes import DTYPE_TEMPORAL_UNITS, TemporalDataType
-from polars.testing import verify_series_and_expr_api
+from polars.testing import assert_series_equal, verify_series_and_expr_api
 
 if TYPE_CHECKING:
     from polars.internals.type_aliases import TimeUnit
@@ -83,7 +83,7 @@ def test_series_add_datetime() -> None:
     out = pl.Series(
         [datetime(2027, 5, 19), datetime(2054, 10, 4), datetime(2082, 2, 19)]
     )
-    assert (deltas + pl.Series([datetime(2000, 1, 1)])) == out
+    assert_series_equal(deltas + pl.Series([datetime(2000, 1, 1)]), out)
 
 
 def test_diff_datetime() -> None:
@@ -102,7 +102,7 @@ def test_diff_datetime() -> None:
             ]
         ).with_columns([pl.col("timestamp").diff().list().over("char")])
     )["timestamp"]
-    assert out[0] == out[1]
+    assert (out[0] == out[1]).all()
 
 
 def test_from_pydatetime() -> None:
@@ -379,7 +379,7 @@ def test_date_range() -> None:
     assert result.name == "drange"
 
     result = pl.date_range(date(2022, 1, 1), date(2022, 1, 2), "1h30m")
-    assert result == [
+    assert list(result) == [
         datetime(2022, 1, 1, 0, 0),
         datetime(2022, 1, 1, 1, 30),
         datetime(2022, 1, 1, 3, 0),

diff --git a/py-polars/tests/unit/test_df.py b/py-polars/tests/unit/test_df.py
@@ -33,6 +33,12 @@ def test_init_empty() -> None:
     for empty in (None, (), [], {}, pa.Table.from_arrays([])):
         df = pl.DataFrame(empty)
         assert df.shape == (0, 0)
+        assert df.is_empty()
+
+    # note: cannot use df (empty or otherwise) in boolean context
+    empty_df = pl.DataFrame()
+    with pytest.raises(ValueError, match="ambiguous"):
+        not empty_df
 
 
 def test_special_char_colname_init() -> None:
@@ -104,9 +110,9 @@ def test_selection() -> None:
     assert df[-1].frame_equal(pl.DataFrame({"a": [3], "b": [3.0], "c": ["c"]}))
 
     # row, column selection when using two dimensions
-    assert df[:, 0] == [1, 2, 3]
-    assert df[:, 1] == [1.0, 2.0, 3.0]
-    assert df[:2, 2] == ["a", "b"]
+    assert df[:, 0].to_list() == [1, 2, 3]
+    assert df[:, 1].to_list() == [1.0, 2.0, 3.0]
+    assert df[:2, 2].to_list() == ["a", "b"]
 
     assert df[[1, 2]].frame_equal(
         pl.DataFrame({"a": [2, 3], "b": [2.0, 3.0], "c": ["b", "c"]})
@@ -938,7 +944,7 @@ def test_column_names() -> None:
 def test_lazy_functions() -> None:
     df = pl.DataFrame({"a": ["foo", "bar", "2"], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})
     out = df.select([pl.count("a")])
-    assert out["a"] == 3
+    assert list(out["a"]) == [3]
     assert pl.count(df["a"]) == 3
     out = df.select(
         [
@@ -989,19 +995,17 @@ def test_lazy_functions() -> None:
 def test_multiple_column_sort() -> None:
     df = pl.DataFrame({"a": ["foo", "bar", "2"], "b": [2, 2, 3], "c": [1.0, 2.0, 3.0]})
     out = df.sort([pl.col("b"), pl.col("c").reverse()])
-    assert out["c"] == [2, 3, 1]
-    assert out["b"] == [2, 2, 3]
+    assert list(out["c"]) == [2.0, 1.0, 3.0]
+    assert list(out["b"]) == [2, 2, 3]
 
     df = pl.DataFrame({"a": np.arange(1, 4), "b": ["a", "a", "b"]})
 
     df.sort("a", reverse=True).frame_equal(
         pl.DataFrame({"a": [3, 2, 1], "b": ["b", "a", "a"]})
     )
-
     df.sort("b", reverse=True).frame_equal(
         pl.DataFrame({"a": [3, 1, 2], "b": ["b", "a", "a"]})
     )
-
     df.sort(["b", "a"], reverse=[False, True]).frame_equal(
         pl.DataFrame({"a": [2, 1, 3], "b": ["a", "a", "b"]})
     )
@@ -1062,7 +1066,7 @@ def test_assign() -> None:
     df = pl.DataFrame({"a": [1, 2, 3]})
     # test if we can assign in case of single column
     df = df.with_column(pl.col("a") * 2)
-    assert df["a"] == [2, 4, 6]
+    assert list(df["a"]) == [2, 4, 6]
 
 
 def test_to_numpy() -> None:
@@ -1071,11 +1075,24 @@ def test_to_numpy() -> None:
 
 
 def test_argsort_by(df: pl.DataFrame) -> None:
-    idx_df = df.select(pl.argsort_by(["int_nulls", "floats"], reverse=[False, True]))
-    assert idx_df["int_nulls"] == [1, 0, 3]
+    idx_df = df.select(
+        pl.argsort_by(["int_nulls", "floats"], reverse=[False, True]).alias("idx")
+    )
+    assert (idx_df["idx"] == [1, 0, 2]).all()
 
-    idx_df = df.select(pl.argsort_by(["int_nulls", "floats"], reverse=False))
-    assert idx_df["int_nulls"] == [1, 0, 2]
+    idx_df = df.select(
+        pl.argsort_by(["int_nulls", "floats"], reverse=False).alias("idx")
+    )
+    assert (idx_df["idx"] == [1, 0, 2]).all()
+
+    df = pl.DataFrame({"x": [0, 0, 0, 1, 1, 2], "y": [9, 9, 8, 7, 6, 6]})
+    for expr, expected in (
+        (pl.argsort_by(["x", "y"]), [2, 0, 1, 4, 3, 5]),
+        (pl.argsort_by(["x", "y"], reverse=[True, True]), [5, 3, 4, 0, 1, 2]),
+        (pl.argsort_by(["x", "y"], reverse=[True, False]), [5, 4, 3, 2, 0, 1]),
+        (pl.argsort_by(["x", "y"], reverse=[False, True]), [0, 1, 2, 3, 4, 5]),
+    ):
+        assert (df.select(expr.alias("idx"))["idx"] == expected).all()
 
 
 def test_literal_series() -> None:
@@ -1176,8 +1193,8 @@ def test_repeat_by() -> None:
 
     out = df.select(pl.col("n").repeat_by("n"))
     s = out["n"]
-    assert s[0] == [2, 2]
-    assert s[1] == [3, 3, 3]
+    assert s[0].to_list() == [2, 2]
+    assert s[1].to_list() == [3, 3, 3]
 
 
 def test_join_dates() -> None:
@@ -1561,7 +1578,7 @@ def test_partitioned_groupby_order() -> None:
     # we only have 30 groups, so this triggers a partitioned group by
     df = pl.DataFrame({"x": [chr(v) for v in range(33, 63)], "y": range(30)})
     out = df.groupby("x", maintain_order=True).agg(pl.all().list())
-    assert out["x"] == df["x"]
+    assert_series_equal(out["x"], df["x"])
 
 
 def test_schema() -> None:

diff --git a/py-polars/tests/unit/test_exprs.py b/py-polars/tests/unit/test_exprs.py
@@ -412,3 +412,13 @@ def test_abs_expr() -> None:
     out = df.select(abs(pl.col("x")))
 
     assert out["x"].to_list() == [1, 0, 1]
+
+
+def test_logical_boolean() -> None:
+    # note, cannot use expressions in logical
+    # boolean context (eg: and/or/not operators)
+    with pytest.raises(ValueError, match="ambiguous"):
+        pl.col("colx") and pl.col("coly")
+
+    with pytest.raises(ValueError, match="ambiguous"):
+        pl.col("colx") or pl.col("coly")
diff --git a/py-polars/tests/unit/test_interop.py b/py-polars/tests/unit/test_interop.py
@@ -229,13 +229,12 @@ def test_from_dicts() -> None:
     assert df.shape == (3, 2)
 
 
-@pytest.mark.skip("not implemented yet")
 def test_from_dicts_struct() -> None:
     data = [{"a": {"b": 1, "c": 2}, "d": 5}, {"a": {"b": 3, "c": 4}, "d": 6}]
     df = pl.from_dicts(data)
     assert df.shape == (2, 2)
-    assert df["a"][0] == (1, 2)
-    assert df["a"][1] == (3, 4)
+    assert df["a"][0] == {"b": 1, "c": 2}
+    assert df["a"][1] == {"b": 3, "c": 4}
 
 
 def test_from_records() -> None: