Skip to content

Commit

Permalink
feat[python]: parametric tests can now generate example dataframes an…
Browse files Browse the repository at this point in the history
…d series with/without chunked data (#4981)
  • Loading branch information
alexander-beedie committed Sep 26, 2022
1 parent 6ab6e26 commit 64aafdd
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 11 deletions.
40 changes: 30 additions & 10 deletions py-polars/polars/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -548,6 +548,7 @@ def series(
strategy: SearchStrategy[object] | None = None,
null_probability: float = 0.0,
unique: bool = False,
chunked: bool | None = None,
allowed_dtypes: Sequence[PolarsDataType] | None = None,
excluded_dtypes: Sequence[PolarsDataType] | None = None,
) -> SearchStrategy[pli.Series]:
Expand All @@ -571,6 +572,7 @@ def series(
if not passing an exact size, can set a maximum value here (defaults to
MAX_DATA_SIZE).
no-op if `size` is set.
strategy : strategy, optional
supports overriding the default strategy for the given dtype.
null_probability : float, optional
Expand All @@ -579,6 +581,9 @@ def series(
underlying strategy.
unique : bool, optional
indicate whether Series values should all be distinct.
chunked : bool, optional
ensure that Series with more than one element have ``n_chunks`` > 1.
if omitted, chunking is applied at random.
allowed_dtypes : {list,set}, optional
when automatically generating Series data, allow only these dtypes.
excluded_dtypes : {list,set}, optional
Expand Down Expand Up @@ -643,12 +648,10 @@ def draw_series(draw: DrawFn) -> pli.Series:
if size is None
else size
)

# assign series name
series_name = name if isinstance(name, str) or name is None else draw(name)

# create series using dtype-specific strategy to generate values

if series_size == 0:
series_values = []
else:
Expand All @@ -674,6 +677,10 @@ def draw_series(draw: DrawFn) -> pli.Series:
)
if is_categorical_dtype(dtype):
s = s.cast(Categorical)
if series_size:
if chunked or (chunked is None and draw(booleans())):
split_at = series_size // 2
s = s[:split_at].append(s[split_at:], append_chunks=True)
return s

return draw_series()
Expand All @@ -688,6 +695,7 @@ def dataframes(
size: int | None = None,
min_size: int | None = 0,
max_size: int | None = MAX_DATA_SIZE,
chunked: bool | None = None,
include_cols: Sequence[column] | None = None,
null_probability: float | dict[str, float] = 0.0,
allowed_dtypes: Sequence[PolarsDataType] | None = None,
Expand Down Expand Up @@ -717,6 +725,9 @@ def dataframes(
max_size : int, optional
if not passing an exact size, set the maximum number of rows in the
DataFrame.
chunked : bool, optional
ensure that DataFrames with more than one row have ``n_chunks`` > 1. if
omitted, chunking will be randomised at the level of individual Series.
include_cols : [column], optional
a list of `column` objects to include in the generated DataFrame. note that
explicitly provided columns are appended onto the list of existing columns
Expand All @@ -725,8 +736,8 @@ def dataframes(
percentage chance (expressed between 0.0 => 1.0) that a generated value is
None. this is applied independently of any None values generated by the
underlying strategy, and can be applied either on a per-column basis (if
given as a {col:pct} dict), or globally. if null_probability is defined on a
column, it takes precedence over the global value.
given as a ``{col:pct}`` dict), or globally. if null_probability is defined
on a column, it takes precedence over the global value.
allowed_dtypes : {list,set}, optional
when automatically generating data, allow only these dtypes.
excluded_dtypes : {list,set}, optional
Expand All @@ -735,10 +746,10 @@ def dataframes(
Notes
-----
In actual usage this is deployed as a unit test decorator, providing a strategy
that generates DataFrames or LazyFrames with the given schema/size
characteristics for the unit test. While developing a strategy/test, it can also
be useful to call `.example()` directly on a given strategy to see concrete
instances of the generated data.
that generates DataFrames or LazyFrames with the given characteristics for
the unit test. While developing a strategy/test, it can also be useful to
call `.example()` directly on a given strategy to see concrete instances of
the generated data.
Examples
--------
Expand Down Expand Up @@ -784,6 +795,9 @@ def dataframes(
""" # noqa: 501
if isinstance(cols, int):
cols = columns(cols)
if isinstance(min_size, int):
if min_cols in (0, None):
min_cols = 1

selectable_dtypes = [
dtype
Expand All @@ -808,7 +822,7 @@ def draw_frames(draw: DrawFn) -> pli.DataFrame | pli.LazyFrame:
# append any explicitly provided cols
coldefs.extend(include_cols or ())

# if not given, assign dataframe/series size
# assign dataframe/series size
series_size = (
between(
draw, int, min_=(min_size or 0), max_=(max_size or MAX_DATA_SIZE)
Expand Down Expand Up @@ -840,13 +854,19 @@ def draw_frames(draw: DrawFn) -> pli.DataFrame | pli.LazyFrame:
null_probability=(c.null_probability or 0.0),
strategy=c.strategy,
unique=c.unique,
chunked=(chunked is None and draw(booleans())),
)
)
for c in coldefs
},
columns=frame_columns, # type: ignore[arg-type]
)
# if indicated, make lazy
# optionally generate frames with n_chunks > 1
if series_size > 1 and chunked is True:
split_at = series_size // 2
df = df[:split_at].vstack(df[split_at:])

# optionally make lazy
return df.lazy() if lazy else df

return draw_frames()
Expand Down
24 changes: 23 additions & 1 deletion py-polars/tests/parametric/test_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@


@given(df=dataframes(), lf=dataframes(lazy=True), srs=series())
@settings(max_examples=10)
@settings(max_examples=5)
def test_strategy_classes(df: pl.DataFrame, lf: pl.LazyFrame, srs: pl.Series) -> None:
assert isinstance(df, pl.DataFrame)
assert isinstance(lf, pl.LazyFrame)
Expand Down Expand Up @@ -139,3 +139,25 @@ def test_strategy_null_probability(
nulls_col0, nulls_colx = df3.null_count().rows()[0]
assert nulls_col0 > nulls_colx
assert nulls_col0 == 50


@given(
    df1=dataframes(chunked=False, min_size=1),
    df2=dataframes(chunked=True, min_size=1),
    s1=series(chunked=False, min_size=1),
    s2=series(chunked=True, min_size=1),
)
@settings(max_examples=10)
def test_chunking(
    df1: pl.DataFrame,
    df2: pl.DataFrame,
    s1: pl.Series,
    s2: pl.Series,
) -> None:
    """Verify that the ``chunked`` flag of the frame/series strategies is honoured.

    ``df1``/``s1`` are drawn with ``chunked=False`` and must consist of a single
    chunk; ``df2``/``s2`` are drawn with ``chunked=True`` and must be split into
    multiple chunks whenever they hold more than one row/element.
    """
    # frames: unchunked data stays in one piece
    assert df1.n_chunks() == 1

    # frames: with more than one row, every column should report two chunks
    if len(df2) > 1:
        per_column_chunks = df2.n_chunks("all")
        assert per_column_chunks == [2] * len(df2.columns)

    # series: unchunked data stays in one piece
    assert s1.n_chunks() == 1

    # series: a single-element series is exempt from the chunking check
    if len(s2) > 1:
        assert s2.n_chunks() > 1

0 comments on commit 64aafdd

Please sign in to comment.