test(python): Mark some tests as slow (#11350)

pola-rs · Sep 27, 2023 · 15108c1 · 15108c1
1 parent 66f0a6d
commit 15108c1
Show file tree

Hide file tree

Showing 12 changed files with 269 additions and 286 deletions.
diff --git a/py-polars/polars/convert.py b/py-polars/polars/convert.py
@@ -412,7 +412,7 @@ def from_repr(tbl: str) -> DataFrame | Series:
 
     From Series repr:
 
-    >>> srs = pl.from_repr(
+    >>> s = pl.from_repr(
     ...     '''
     ...     shape: (3,)
     ...     Series: 's' [bool]
@@ -423,7 +423,7 @@ def from_repr(tbl: str) -> DataFrame | Series:
     ...     ]
     ...     '''
     ... )
-    >>> srs.to_list()
+    >>> s.to_list()
     [True, False, True]
 
     """

diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py
@@ -295,187 +295,6 @@ def test_from_arrow(monkeypatch: Any) -> None:
         pl.from_arrow(data=(x for x in (1, 2, 3)))
 
 
-def test_from_dict_with_column_order() -> None:
-    # expect schema/columns order to take precedence
-    schema = {"a": pl.UInt8, "b": pl.UInt32}
-    data = {"b": [3, 4], "a": [1, 2]}
-    for df in (
-        pl.DataFrame(data, schema=schema),
-        pl.DataFrame(data, schema=["a", "b"], schema_overrides=schema),
-    ):
-        # ┌─────┬─────┐
-        # │ a   ┆ b   │
-        # │ --- ┆ --- │
-        # │ u8  ┆ u32 │
-        # ╞═════╪═════╡
-        # │ 1   ┆ 3   │
-        # │ 2   ┆ 4   │
-        # └─────┴─────┘
-        assert df.columns == ["a", "b"]
-        assert df.schema == {"a": pl.UInt8, "b": pl.UInt32}
-        assert df.rows() == [(1, 3), (2, 4)]
-
-        # expect an error
-        mismatched_schema = {"x": pl.UInt8, "b": pl.UInt32}
-        with pytest.raises(ValueError):
-            pl.DataFrame({"b": [3, 4], "a": [1, 2]}, schema=mismatched_schema)
-
-
-def test_from_dict_with_scalars() -> None:
-    import polars as pl
-
-    # one or more valid arrays, with some scalars (inc. None)
-    df1 = pl.DataFrame(
-        {"key": ["aa", "bb", "cc"], "misc": "xyz", "other": None, "value": 0}
-    )
-    assert df1.to_dict(False) == {
-        "key": ["aa", "bb", "cc"],
-        "misc": ["xyz", "xyz", "xyz"],
-        "other": [None, None, None],
-        "value": [0, 0, 0],
-    }
-
-    # edge-case: all scalars
-    df2 = pl.DataFrame({"key": "aa", "misc": "xyz", "other": None, "value": 0})
-    assert df2.to_dict(False) == {
-        "key": ["aa"],
-        "misc": ["xyz"],
-        "other": [None],
-        "value": [0],
-    }
-
-    # edge-case: single unsized generator
-    df3 = pl.DataFrame({"vals": map(float, [1, 2, 3])})
-    assert df3.to_dict(False) == {"vals": [1.0, 2.0, 3.0]}
-
-    # ensure we don't accidentally consume or expand map/range/generator
-    # cols, and can properly apply schema dtype/ordering directives
-    df4 = pl.DataFrame(
-        {
-            "key": range(1, 4),
-            "misc": (x for x in [4, 5, 6]),
-            "other": map(float, [7, 8, 9]),
-            "value": {0: "x", 1: "y", 2: "z"}.values(),
-        },
-        schema={
-            "value": pl.Utf8,
-            "other": pl.Float32,
-            "misc": pl.Int32,
-            "key": pl.Int8,
-        },
-    )
-    assert df4.columns == ["value", "other", "misc", "key"]
-    assert df4.to_dict(False) == {
-        "value": ["x", "y", "z"],
-        "other": [7.0, 8.0, 9.0],
-        "misc": [4, 5, 6],
-        "key": [1, 2, 3],
-    }
-    assert df4.schema == {
-        "value": pl.Utf8,
-        "other": pl.Float32,
-        "misc": pl.Int32,
-        "key": pl.Int8,
-    }
-
-    # mixed with struct cols
-    for df5 in (
-        pl.from_dict(
-            {"x": {"b": [1, 3], "c": [2, 4]}, "y": [5, 6], "z": "x"},
-            schema_overrides={"y": pl.Int8},
-        ),
-        pl.from_dict(
-            {"x": {"b": [1, 3], "c": [2, 4]}, "y": [5, 6], "z": "x"},
-            schema=["x", ("y", pl.Int8), "z"],
-        ),
-    ):
-        assert df5.rows() == [({"b": 1, "c": 2}, 5, "x"), ({"b": 3, "c": 4}, 6, "x")]
-        assert df5.schema == {
-            "x": pl.Struct([pl.Field("b", pl.Int64), pl.Field("c", pl.Int64)]),
-            "y": pl.Int8,
-            "z": pl.Utf8,
-        }
-
-    # mixed with numpy cols...
-    df6 = pl.DataFrame(
-        {"x": np.ones(3), "y": np.zeros(3), "z": 1.0},
-    )
-    assert df6.rows() == [(1.0, 0.0, 1.0), (1.0, 0.0, 1.0), (1.0, 0.0, 1.0)]
-
-    # ...and trigger multithreaded load codepath
-    df7 = pl.DataFrame(
-        {
-            "w": np.zeros(1001, dtype=np.uint8),
-            "x": np.ones(1001, dtype=np.uint8),
-            "y": np.zeros(1001, dtype=np.uint8),
-            "z": 1,
-        },
-        schema_overrides={"z": pl.UInt8},
-    )
-    assert df7[999:].rows() == [(0, 1, 0, 1), (0, 1, 0, 1)]
-    assert df7.schema == {
-        "w": pl.UInt8,
-        "x": pl.UInt8,
-        "y": pl.UInt8,
-        "z": pl.UInt8,
-    }
-
-    # a bit of everything
-    mixed_dtype_data: dict[str, Any] = {
-        "a": 0,
-        "b": 8,
-        "c": 9.5,
-        "d": None,
-        "e": True,
-        "f": False,
-        "g": time(0, 1, 2),
-        "h": date(2023, 3, 14),
-        "i": timedelta(seconds=3601),
-        "j": datetime(2111, 11, 11, 11, 11, 11, 11),
-        "k": "「趣味でヒーローをやっている者だ」",
-    }
-    # note: deliberately set this value large; if all dtypes are
-    # on the fast-path it'll only take ~0.03secs. if it becomes
-    # even remotely noticeable that will indicate a regression.
-    n_range = 1_000_000
-    index_and_data: dict[str, Any] = {"idx": range(n_range)}
-    index_and_data.update(mixed_dtype_data.items())
-    df8 = pl.DataFrame(
-        data=index_and_data,
-        schema={
-            "idx": pl.Int32,
-            "a": pl.UInt16,
-            "b": pl.UInt32,
-            "c": pl.Float64,
-            "d": pl.Float32,
-            "e": pl.Boolean,
-            "f": pl.Boolean,
-            "g": pl.Time,
-            "h": pl.Date,
-            "i": pl.Duration,
-            "j": pl.Datetime,
-            "k": pl.Utf8,
-        },
-    )
-    dfx = df8.select(pl.exclude("idx"))
-
-    assert len(df8) == n_range
-    assert dfx[:5].rows() == dfx[5:10].rows()
-    assert dfx[-10:-5].rows() == dfx[-5:].rows()
-    assert dfx.row(n_range // 2, named=True) == mixed_dtype_data
-
-    # misc generators/iterables
-    df9 = pl.DataFrame(
-        {
-            "a": iter([0, 1, 2]),
-            "b": (2, 1, 0).__iter__(),
-            "c": (v for v in (0, 0, 0)),
-            "d": "x",
-        }
-    )
-    assert df9.rows() == [(0, 2, 0, "x"), (1, 1, 0, "x"), (2, 0, 0, "x")]
-
-
 def test_dataframe_membership_operator() -> None:
     # cf. issue #4032
     df = pl.DataFrame({"name": ["Jane", "John"], "age": [20, 30]})

diff --git a/py-polars/tests/unit/dataframe/test_from_dict.py b/py-polars/tests/unit/dataframe/test_from_dict.py
@@ -0,0 +1,192 @@
+from __future__ import annotations
+
+from datetime import date, datetime, time, timedelta
+from typing import Any
+
+import numpy as np
+import pytest
+
+import polars as pl
+
+
+def test_from_dict_with_column_order() -> None:
+    # expect schema/columns order to take precedence
+    schema = {"a": pl.UInt8, "b": pl.UInt32}
+    data = {"b": [3, 4], "a": [1, 2]}
+    for df in (
+        pl.DataFrame(data, schema=schema),
+        pl.DataFrame(data, schema=["a", "b"], schema_overrides=schema),
+    ):
+        # ┌─────┬─────┐
+        # │ a   ┆ b   │
+        # │ --- ┆ --- │
+        # │ u8  ┆ u32 │
+        # ╞═════╪═════╡
+        # │ 1   ┆ 3   │
+        # │ 2   ┆ 4   │
+        # └─────┴─────┘
+        assert df.columns == ["a", "b"]
+        assert df.schema == {"a": pl.UInt8, "b": pl.UInt32}
+        assert df.rows() == [(1, 3), (2, 4)]
+
+        # expect an error
+        mismatched_schema = {"x": pl.UInt8, "b": pl.UInt32}
+        with pytest.raises(ValueError):
+            pl.DataFrame({"b": [3, 4], "a": [1, 2]}, schema=mismatched_schema)
+
+
+def test_from_dict_with_scalars() -> None:
+    # one or more valid arrays, with some scalars (inc. None)
+    df1 = pl.DataFrame(
+        {"key": ["aa", "bb", "cc"], "misc": "xyz", "other": None, "value": 0}
+    )
+    assert df1.to_dict(False) == {
+        "key": ["aa", "bb", "cc"],
+        "misc": ["xyz", "xyz", "xyz"],
+        "other": [None, None, None],
+        "value": [0, 0, 0],
+    }
+
+    # edge-case: all scalars
+    df2 = pl.DataFrame({"key": "aa", "misc": "xyz", "other": None, "value": 0})
+    assert df2.to_dict(False) == {
+        "key": ["aa"],
+        "misc": ["xyz"],
+        "other": [None],
+        "value": [0],
+    }
+
+    # edge-case: single unsized generator
+    df3 = pl.DataFrame({"vals": map(float, [1, 2, 3])})
+    assert df3.to_dict(False) == {"vals": [1.0, 2.0, 3.0]}
+
+    # ensure we don't accidentally consume or expand map/range/generator
+    # cols, and can properly apply schema dtype/ordering directives
+    df4 = pl.DataFrame(
+        {
+            "key": range(1, 4),
+            "misc": (x for x in [4, 5, 6]),
+            "other": map(float, [7, 8, 9]),
+            "value": {0: "x", 1: "y", 2: "z"}.values(),
+        },
+        schema={
+            "value": pl.Utf8,
+            "other": pl.Float32,
+            "misc": pl.Int32,
+            "key": pl.Int8,
+        },
+    )
+    assert df4.columns == ["value", "other", "misc", "key"]
+    assert df4.to_dict(False) == {
+        "value": ["x", "y", "z"],
+        "other": [7.0, 8.0, 9.0],
+        "misc": [4, 5, 6],
+        "key": [1, 2, 3],
+    }
+    assert df4.schema == {
+        "value": pl.Utf8,
+        "other": pl.Float32,
+        "misc": pl.Int32,
+        "key": pl.Int8,
+    }
+
+    # mixed with struct cols
+    for df5 in (
+        pl.from_dict(
+            {"x": {"b": [1, 3], "c": [2, 4]}, "y": [5, 6], "z": "x"},
+            schema_overrides={"y": pl.Int8},
+        ),
+        pl.from_dict(
+            {"x": {"b": [1, 3], "c": [2, 4]}, "y": [5, 6], "z": "x"},
+            schema=["x", ("y", pl.Int8), "z"],
+        ),
+    ):
+        assert df5.rows() == [({"b": 1, "c": 2}, 5, "x"), ({"b": 3, "c": 4}, 6, "x")]
+        assert df5.schema == {
+            "x": pl.Struct([pl.Field("b", pl.Int64), pl.Field("c", pl.Int64)]),
+            "y": pl.Int8,
+            "z": pl.Utf8,
+        }
+
+    # mixed with numpy cols...
+    df6 = pl.DataFrame(
+        {"x": np.ones(3), "y": np.zeros(3), "z": 1.0},
+    )
+    assert df6.rows() == [(1.0, 0.0, 1.0), (1.0, 0.0, 1.0), (1.0, 0.0, 1.0)]
+
+    # ...and trigger multithreaded load codepath
+    df7 = pl.DataFrame(
+        {
+            "w": np.zeros(1001, dtype=np.uint8),
+            "x": np.ones(1001, dtype=np.uint8),
+            "y": np.zeros(1001, dtype=np.uint8),
+            "z": 1,
+        },
+        schema_overrides={"z": pl.UInt8},
+    )
+    assert df7[999:].rows() == [(0, 1, 0, 1), (0, 1, 0, 1)]
+    assert df7.schema == {
+        "w": pl.UInt8,
+        "x": pl.UInt8,
+        "y": pl.UInt8,
+        "z": pl.UInt8,
+    }
+
+    # misc generators/iterables
+    df9 = pl.DataFrame(
+        {
+            "a": iter([0, 1, 2]),
+            "b": (2, 1, 0).__iter__(),
+            "c": (v for v in (0, 0, 0)),
+            "d": "x",
+        }
+    )
+    assert df9.rows() == [(0, 2, 0, "x"), (1, 1, 0, "x"), (2, 0, 0, "x")]
+
+
+@pytest.mark.slow()
+def test_from_dict_with_scalars_mixed() -> None:
+    # a bit of everything
+    mixed_dtype_data: dict[str, Any] = {
+        "a": 0,
+        "b": 8,
+        "c": 9.5,
+        "d": None,
+        "e": True,
+        "f": False,
+        "g": time(0, 1, 2),
+        "h": date(2023, 3, 14),
+        "i": timedelta(seconds=3601),
+        "j": datetime(2111, 11, 11, 11, 11, 11, 11),
+        "k": "「趣味でヒーローをやっている者だ」",
+    }
+    # note: deliberately set this value large; if all dtypes are
+    # on the fast-path it'll only take ~0.03secs. if it becomes
+    # even remotely noticeable that will indicate a regression.
+    # TODO: This is now slow (~0.15 seconds). Needs to be looked into.
+    n_range = 1_000_000
+    index_and_data: dict[str, Any] = {"idx": range(n_range)}
+    index_and_data.update(mixed_dtype_data.items())
+    df8 = pl.DataFrame(
+        data=index_and_data,
+        schema={
+            "idx": pl.Int32,
+            "a": pl.UInt16,
+            "b": pl.UInt32,
+            "c": pl.Float64,
+            "d": pl.Float32,
+            "e": pl.Boolean,
+            "f": pl.Boolean,
+            "g": pl.Time,
+            "h": pl.Date,
+            "i": pl.Duration,
+            "j": pl.Datetime,
+            "k": pl.Utf8,
+        },
+    )
+    dfx = df8.select(pl.exclude("idx"))
+
+    assert len(df8) == n_range
+    assert dfx[:5].rows() == dfx[5:10].rows()
+    assert dfx[-10:-5].rows() == dfx[-5:].rows()
+    assert dfx.row(n_range // 2, named=True) == mixed_dtype_data