Skip to content

Commit

Permalink
test(python): Mark some tests as slow (#11350)
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego committed Sep 27, 2023
1 parent 66f0a6d commit 15108c1
Show file tree
Hide file tree
Showing 12 changed files with 269 additions and 286 deletions.
4 changes: 2 additions & 2 deletions py-polars/polars/convert.py
Expand Up @@ -412,7 +412,7 @@ def from_repr(tbl: str) -> DataFrame | Series:
From Series repr:
>>> srs = pl.from_repr(
>>> s = pl.from_repr(
... '''
... shape: (3,)
... Series: 's' [bool]
Expand All @@ -423,7 +423,7 @@ def from_repr(tbl: str) -> DataFrame | Series:
... ]
... '''
... )
>>> srs.to_list()
>>> s.to_list()
[True, False, True]
"""
Expand Down
181 changes: 0 additions & 181 deletions py-polars/tests/unit/dataframe/test_df.py
Expand Up @@ -295,187 +295,6 @@ def test_from_arrow(monkeypatch: Any) -> None:
pl.from_arrow(data=(x for x in (1, 2, 3)))


def test_from_dict_with_column_order() -> None:
    """The requested schema, not the dict's key order, decides column order."""
    wanted_schema = {"a": pl.UInt8, "b": pl.UInt32}
    unordered = {"b": [3, 4], "a": [1, 2]}

    # two ways of asking for the same column names and dtypes
    via_schema = pl.DataFrame(unordered, schema=wanted_schema)
    via_overrides = pl.DataFrame(
        unordered, schema=["a", "b"], schema_overrides=wanted_schema
    )

    # both frames are expected to look like this:
    # ┌─────┬─────┐
    # │ a   ┆ b   │
    # │ --- ┆ --- │
    # │ u8  ┆ u32 │
    # ╞═════╪═════╡
    # │ 1   ┆ 3   │
    # │ 2   ┆ 4   │
    # └─────┴─────┘
    for frame in (via_schema, via_overrides):
        assert frame.columns == ["a", "b"]
        assert frame.schema == {"a": pl.UInt8, "b": pl.UInt32}
        assert frame.rows() == [(1, 3), (2, 4)]

    # a schema naming a column ("x") that the data does not have must raise
    with pytest.raises(ValueError):
        pl.DataFrame(
            {"b": [3, 4], "a": [1, 2]},
            schema={"x": pl.UInt8, "b": pl.UInt32},
        )


def test_from_dict_with_scalars() -> None:
    """Scalar dict values are repeated to match the length of the other columns.

    Exercises plain scalars (including ``None``), all-scalar input, unsized
    iterables (generators, ``map``, ``range``, dict views), struct columns,
    numpy columns, and a one-million-row frame mixing many dtypes.

    Relies on the module-level ``pl`` import, as the other tests in this
    file do, rather than re-importing polars inside the function.
    """
    # one or more valid arrays, with some scalars (inc. None)
    df1 = pl.DataFrame(
        {"key": ["aa", "bb", "cc"], "misc": "xyz", "other": None, "value": 0}
    )
    assert df1.to_dict(as_series=False) == {
        "key": ["aa", "bb", "cc"],
        "misc": ["xyz", "xyz", "xyz"],
        "other": [None, None, None],
        "value": [0, 0, 0],
    }

    # edge-case: all scalars
    df2 = pl.DataFrame({"key": "aa", "misc": "xyz", "other": None, "value": 0})
    assert df2.to_dict(as_series=False) == {
        "key": ["aa"],
        "misc": ["xyz"],
        "other": [None],
        "value": [0],
    }

    # edge-case: single unsized generator
    df3 = pl.DataFrame({"vals": map(float, [1, 2, 3])})
    assert df3.to_dict(as_series=False) == {"vals": [1.0, 2.0, 3.0]}

    # ensure we don't accidentally consume or expand map/range/generator
    # cols, and can properly apply schema dtype/ordering directives
    df4 = pl.DataFrame(
        {
            "key": range(1, 4),
            "misc": (x for x in [4, 5, 6]),
            "other": map(float, [7, 8, 9]),
            "value": {0: "x", 1: "y", 2: "z"}.values(),
        },
        schema={
            "value": pl.Utf8,
            "other": pl.Float32,
            "misc": pl.Int32,
            "key": pl.Int8,
        },
    )
    assert df4.columns == ["value", "other", "misc", "key"]
    assert df4.to_dict(as_series=False) == {
        "value": ["x", "y", "z"],
        "other": [7.0, 8.0, 9.0],
        "misc": [4, 5, 6],
        "key": [1, 2, 3],
    }
    assert df4.schema == {
        "value": pl.Utf8,
        "other": pl.Float32,
        "misc": pl.Int32,
        "key": pl.Int8,
    }

    # mixed with struct cols
    for df5 in (
        pl.from_dict(
            {"x": {"b": [1, 3], "c": [2, 4]}, "y": [5, 6], "z": "x"},
            schema_overrides={"y": pl.Int8},
        ),
        pl.from_dict(
            {"x": {"b": [1, 3], "c": [2, 4]}, "y": [5, 6], "z": "x"},
            schema=["x", ("y", pl.Int8), "z"],
        ),
    ):
        assert df5.rows() == [({"b": 1, "c": 2}, 5, "x"), ({"b": 3, "c": 4}, 6, "x")]
        assert df5.schema == {
            "x": pl.Struct([pl.Field("b", pl.Int64), pl.Field("c", pl.Int64)]),
            "y": pl.Int8,
            "z": pl.Utf8,
        }

    # mixed with numpy cols...
    df6 = pl.DataFrame(
        {"x": np.ones(3), "y": np.zeros(3), "z": 1.0},
    )
    assert df6.rows() == [(1.0, 0.0, 1.0), (1.0, 0.0, 1.0), (1.0, 0.0, 1.0)]

    # ...and trigger multithreaded load codepath
    df7 = pl.DataFrame(
        {
            "w": np.zeros(1001, dtype=np.uint8),
            "x": np.ones(1001, dtype=np.uint8),
            "y": np.zeros(1001, dtype=np.uint8),
            "z": 1,
        },
        schema_overrides={"z": pl.UInt8},
    )
    # 1001 rows, so the slice from 999 leaves exactly the last two
    assert df7[999:].rows() == [(0, 1, 0, 1), (0, 1, 0, 1)]
    assert df7.schema == {
        "w": pl.UInt8,
        "x": pl.UInt8,
        "y": pl.UInt8,
        "z": pl.UInt8,
    }

    # a bit of everything
    mixed_dtype_data: dict[str, Any] = {
        "a": 0,
        "b": 8,
        "c": 9.5,
        "d": None,
        "e": True,
        "f": False,
        "g": time(0, 1, 2),
        "h": date(2023, 3, 14),
        "i": timedelta(seconds=3601),
        "j": datetime(2111, 11, 11, 11, 11, 11, 11),
        "k": "「趣味でヒーローをやっている者だ」",
    }
    # note: deliberately set this value large; if all dtypes are
    # on the fast-path it'll only take ~0.03secs. if it becomes
    # even remotely noticeable that will indicate a regression.
    n_range = 1_000_000
    index_and_data: dict[str, Any] = {"idx": range(n_range)}
    index_and_data.update(mixed_dtype_data.items())
    df8 = pl.DataFrame(
        data=index_and_data,
        schema={
            "idx": pl.Int32,
            "a": pl.UInt16,
            "b": pl.UInt32,
            "c": pl.Float64,
            "d": pl.Float32,
            "e": pl.Boolean,
            "f": pl.Boolean,
            "g": pl.Time,
            "h": pl.Date,
            "i": pl.Duration,
            "j": pl.Datetime,
            "k": pl.Utf8,
        },
    )
    # drop the index so every remaining row should be the same scalar tuple
    dfx = df8.select(pl.exclude("idx"))

    assert len(df8) == n_range
    assert dfx[:5].rows() == dfx[5:10].rows()
    assert dfx[-10:-5].rows() == dfx[-5:].rows()
    assert dfx.row(n_range // 2, named=True) == mixed_dtype_data

    # misc generators/iterables
    df9 = pl.DataFrame(
        {
            "a": iter([0, 1, 2]),
            "b": (2, 1, 0).__iter__(),
            "c": (v for v in (0, 0, 0)),
            "d": "x",
        }
    )
    assert df9.rows() == [(0, 2, 0, "x"), (1, 1, 0, "x"), (2, 0, 0, "x")]


def test_dataframe_membership_operator() -> None:
# cf. issue #4032
df = pl.DataFrame({"name": ["Jane", "John"], "age": [20, 30]})
Expand Down
192 changes: 192 additions & 0 deletions py-polars/tests/unit/dataframe/test_from_dict.py
@@ -0,0 +1,192 @@
from __future__ import annotations

from datetime import date, datetime, time, timedelta
from typing import Any

import numpy as np
import pytest

import polars as pl


def test_from_dict_with_column_order() -> None:
    """The requested schema, not the dict's key order, decides column order."""
    wanted_schema = {"a": pl.UInt8, "b": pl.UInt32}
    unordered = {"b": [3, 4], "a": [1, 2]}

    # two ways of asking for the same column names and dtypes
    via_schema = pl.DataFrame(unordered, schema=wanted_schema)
    via_overrides = pl.DataFrame(
        unordered, schema=["a", "b"], schema_overrides=wanted_schema
    )

    # both frames are expected to look like this:
    # ┌─────┬─────┐
    # │ a   ┆ b   │
    # │ --- ┆ --- │
    # │ u8  ┆ u32 │
    # ╞═════╪═════╡
    # │ 1   ┆ 3   │
    # │ 2   ┆ 4   │
    # └─────┴─────┘
    for frame in (via_schema, via_overrides):
        assert frame.columns == ["a", "b"]
        assert frame.schema == {"a": pl.UInt8, "b": pl.UInt32}
        assert frame.rows() == [(1, 3), (2, 4)]

    # a schema naming a column ("x") that the data does not have must raise
    with pytest.raises(ValueError):
        pl.DataFrame(
            {"b": [3, 4], "a": [1, 2]},
            schema={"x": pl.UInt8, "b": pl.UInt32},
        )


def test_from_dict_with_scalars() -> None:
    """Scalar dict values are repeated to match the length of the other columns.

    Exercises plain scalars (including ``None``), all-scalar input, unsized
    iterables (generators, ``map``, ``range``, dict views), struct columns
    and numpy columns.
    """
    # at least one sized column, remaining values are scalars (inc. None)
    some_scalars = pl.DataFrame(
        {"key": ["aa", "bb", "cc"], "misc": "xyz", "other": None, "value": 0}
    )
    assert some_scalars.to_dict(as_series=False) == {
        "key": ["aa", "bb", "cc"],
        "misc": ["xyz", "xyz", "xyz"],
        "other": [None, None, None],
        "value": [0, 0, 0],
    }

    # edge-case: nothing but scalars gives a single row
    only_scalars = pl.DataFrame(
        {"key": "aa", "misc": "xyz", "other": None, "value": 0}
    )
    assert only_scalars.to_dict(as_series=False) == {
        "key": ["aa"],
        "misc": ["xyz"],
        "other": [None],
        "value": [0],
    }

    # edge-case: a lone unsized iterable
    lone_map = pl.DataFrame({"vals": map(float, [1, 2, 3])})
    assert lone_map.to_dict(as_series=False) == {"vals": [1.0, 2.0, 3.0]}

    # ensure we don't accidentally consume or expand map/range/generator
    # cols, and can properly apply schema dtype/ordering directives
    ordered_schema = {
        "value": pl.Utf8,
        "other": pl.Float32,
        "misc": pl.Int32,
        "key": pl.Int8,
    }
    lazy_columns = {
        "key": range(1, 4),
        "misc": (x for x in [4, 5, 6]),
        "other": map(float, [7, 8, 9]),
        "value": {0: "x", 1: "y", 2: "z"}.values(),
    }
    from_iterables = pl.DataFrame(lazy_columns, schema=ordered_schema)
    assert from_iterables.columns == ["value", "other", "misc", "key"]
    assert from_iterables.to_dict(as_series=False) == {
        "value": ["x", "y", "z"],
        "other": [7.0, 8.0, 9.0],
        "misc": [4, 5, 6],
        "key": [1, 2, 3],
    }
    assert from_iterables.schema == {
        "value": pl.Utf8,
        "other": pl.Float32,
        "misc": pl.Int32,
        "key": pl.Int8,
    }

    # mixed with struct cols: the dtype of "y" is set either through
    # schema_overrides or through a (name, dtype) pair in the schema
    struct_rows = [({"b": 1, "c": 2}, 5, "x"), ({"b": 3, "c": 4}, 6, "x")]
    struct_schema = {
        "x": pl.Struct([pl.Field("b", pl.Int64), pl.Field("c", pl.Int64)]),
        "y": pl.Int8,
        "z": pl.Utf8,
    }
    with_overrides = pl.from_dict(
        {"x": {"b": [1, 3], "c": [2, 4]}, "y": [5, 6], "z": "x"},
        schema_overrides={"y": pl.Int8},
    )
    with_schema = pl.from_dict(
        {"x": {"b": [1, 3], "c": [2, 4]}, "y": [5, 6], "z": "x"},
        schema=["x", ("y", pl.Int8), "z"],
    )
    for struct_frame in (with_overrides, with_schema):
        assert struct_frame.rows() == struct_rows
        assert struct_frame.schema == struct_schema

    # mixed with numpy cols...
    numpy_small = pl.DataFrame({"x": np.ones(3), "y": np.zeros(3), "z": 1.0})
    assert numpy_small.rows() == [(1.0, 0.0, 1.0)] * 3

    # ...and trigger multithreaded load codepath
    numpy_large = pl.DataFrame(
        {
            "w": np.zeros(1001, dtype=np.uint8),
            "x": np.ones(1001, dtype=np.uint8),
            "y": np.zeros(1001, dtype=np.uint8),
            "z": 1,
        },
        schema_overrides={"z": pl.UInt8},
    )
    # 1001 rows, so the slice from 999 leaves exactly the last two
    assert numpy_large[999:].rows() == [(0, 1, 0, 1)] * 2
    assert numpy_large.schema == dict.fromkeys("wxyz", pl.UInt8)

    # misc generators/iterables
    from_iterators = pl.DataFrame(
        {
            "a": iter([0, 1, 2]),
            "b": (2, 1, 0).__iter__(),
            "c": (v for v in (0, 0, 0)),
            "d": "x",
        }
    )
    assert from_iterators.rows() == [
        (0, 2, 0, "x"),
        (1, 1, 0, "x"),
        (2, 0, 0, "x"),
    ]


@pytest.mark.slow()
def test_from_dict_with_scalars_mixed() -> None:
    """Repeat one scalar per dtype alongside a one-million-row index column."""
    # a bit of everything
    scalars: dict[str, Any] = {
        "a": 0,
        "b": 8,
        "c": 9.5,
        "d": None,
        "e": True,
        "f": False,
        "g": time(0, 1, 2),
        "h": date(2023, 3, 14),
        "i": timedelta(seconds=3601),
        "j": datetime(2111, 11, 11, 11, 11, 11, 11),
        "k": "「趣味でヒーローをやっている者だ」",
    }
    target_schema = {
        "idx": pl.Int32,
        "a": pl.UInt16,
        "b": pl.UInt32,
        "c": pl.Float64,
        "d": pl.Float32,
        "e": pl.Boolean,
        "f": pl.Boolean,
        "g": pl.Time,
        "h": pl.Date,
        "i": pl.Duration,
        "j": pl.Datetime,
        "k": pl.Utf8,
    }

    # note: deliberately set this value large; if all dtypes are
    # on the fast-path it'll only take ~0.03secs. if it becomes
    # even remotely noticeable that will indicate a regression.
    # TODO: This is now slow (~0.15 seconds). Needs to be looked into.
    n_rows = 1_000_000

    # the index column comes first, followed by the scalars in insertion order
    payload: dict[str, Any] = {"idx": range(n_rows), **scalars}
    frame = pl.DataFrame(data=payload, schema=target_schema)

    # drop the index so every remaining row should be the same scalar tuple
    repeated = frame.select(pl.exclude("idx"))

    assert len(frame) == n_rows
    assert repeated[:5].rows() == repeated[5:10].rows()
    assert repeated[-10:-5].rows() == repeated[-5:].rows()
    assert repeated.row(n_rows // 2, named=True) == scalars

0 comments on commit 15108c1

Please sign in to comment.