feat[python]: parametric tests can now generate example dataframes an…

…d series with/without chunked data (#4981)
pola-rs · Sep 26, 2022 · 64aafdd · 64aafdd
1 parent 6ab6e26
commit 64aafdd
Show file tree

Hide file tree

Showing 2 changed files with 53 additions and 11 deletions.
diff --git a/py-polars/polars/testing.py b/py-polars/polars/testing.py
@@ -548,6 +548,7 @@ def series(
         strategy: SearchStrategy[object] | None = None,
         null_probability: float = 0.0,
         unique: bool = False,
+        chunked: bool | None = None,
         allowed_dtypes: Sequence[PolarsDataType] | None = None,
         excluded_dtypes: Sequence[PolarsDataType] | None = None,
     ) -> SearchStrategy[pli.Series]:
@@ -571,6 +572,7 @@ def series(
             if not passing an exact size, can set a maximum value here (defaults to
             MAX_DATA_SIZE).
             no-op if `size` is set.
+
         strategy : strategy, optional
             supports overriding the default strategy for the given dtype.
         null_probability : float, optional
@@ -579,6 +581,9 @@ def series(
             underlying strategy.
         unique : bool, optional
             indicate whether Series values should all be distinct.
+        chunked : bool, optional
+            ensure that Series with more than one element have ``n_chunks`` > 1.
+            if omitted, chunking is applied at random.
         allowed_dtypes : {list,set}, optional
             when automatically generating Series data, allow only these dtypes.
         excluded_dtypes : {list,set}, optional
@@ -643,12 +648,10 @@ def draw_series(draw: DrawFn) -> pli.Series:
                 if size is None
                 else size
             )
-
             # assign series name
             series_name = name if isinstance(name, str) or name is None else draw(name)
 
             # create series using dtype-specific strategy to generate values
-
             if series_size == 0:
                 series_values = []
             else:
@@ -674,6 +677,10 @@ def draw_series(draw: DrawFn) -> pli.Series:
             )
             if is_categorical_dtype(dtype):
                 s = s.cast(Categorical)
+            if series_size:
+                if chunked or (chunked is None and draw(booleans())):
+                    split_at = series_size // 2
+                    s = s[:split_at].append(s[split_at:], append_chunks=True)
             return s
 
         return draw_series()
@@ -688,6 +695,7 @@ def dataframes(
         size: int | None = None,
         min_size: int | None = 0,
         max_size: int | None = MAX_DATA_SIZE,
+        chunked: bool | None = None,
         include_cols: Sequence[column] | None = None,
         null_probability: float | dict[str, float] = 0.0,
         allowed_dtypes: Sequence[PolarsDataType] | None = None,
@@ -717,6 +725,9 @@ def dataframes(
         max_size : int, optional
             if not passing an exact size, set the maximum number of rows in the
             DataFrame.
+        chunked : bool, optional
+            ensure that DataFrames with more than one row have ``n_chunks`` > 1. if
+            omitted, chunking will be randomised at the level of individual Series.
         include_cols : [column], optional
             a list of `column` objects to include in the generated DataFrame. note that
             explicitly provided columns are appended onto the list of existing columns
@@ -725,8 +736,8 @@ def dataframes(
             percentage chance (expressed between 0.0 => 1.0) that a generated value is
             None. this is applied independently of any None values generated by the
             underlying strategy, and can be applied either on a per-column basis (if
-            given as a {col:pct} dict), or globally. if null_probability is defined on a
-            column, it takes precedence over the global value.
+            given as a ``{col:pct}`` dict), or globally. if null_probability is defined
+            on a column, it takes precedence over the global value.
         allowed_dtypes : {list,set}, optional
             when automatically generating data, allow only these dtypes.
         excluded_dtypes : {list,set}, optional
@@ -735,10 +746,10 @@ def dataframes(
         Notes
         -----
         In actual usage this is deployed as a unit test decorator, providing a strategy
-        that generates DataFrames or LazyFrames with the given schema/size
-        characteristics for the unit test. While developing a strategy/test, it can also
-        be useful to call `.example()` directly on a given strategy to see concrete
-        instances of the generated data.
+        that generates DataFrames or LazyFrames with the given characteristics for
+        the unit test. While developing a strategy/test, it can also be useful to
+        call `.example()` directly on a given strategy to see concrete instances of
+        the generated data.
 
         Examples
         --------
@@ -784,6 +795,9 @@ def dataframes(
         """  # noqa: 501
         if isinstance(cols, int):
             cols = columns(cols)
+        if isinstance(min_size, int):
+            if min_cols in (0, None):
+                min_cols = 1
 
         selectable_dtypes = [
             dtype
@@ -808,7 +822,7 @@ def draw_frames(draw: DrawFn) -> pli.DataFrame | pli.LazyFrame:
             # append any explicitly provided cols
             coldefs.extend(include_cols or ())
 
-            # if not given, assign dataframe/series size
+            # assign dataframe/series size
             series_size = (
                 between(
                     draw, int, min_=(min_size or 0), max_=(max_size or MAX_DATA_SIZE)
@@ -840,13 +854,19 @@ def draw_frames(draw: DrawFn) -> pli.DataFrame | pli.LazyFrame:
                             null_probability=(c.null_probability or 0.0),
                             strategy=c.strategy,
                             unique=c.unique,
+                            chunked=(chunked is None and draw(booleans())),
                         )
                     )
                     for c in coldefs
                 },
                 columns=frame_columns,  # type: ignore[arg-type]
             )
-            # if indicated, make lazy
+            # optionally generate frames with n_chunks > 1
+            if series_size > 1 and chunked is True:
+                split_at = series_size // 2
+                df = df[:split_at].vstack(df[split_at:])
+
+            # optionally make lazy
             return df.lazy() if lazy else df
 
         return draw_frames()

diff --git a/py-polars/tests/parametric/test_testing.py b/py-polars/tests/parametric/test_testing.py
@@ -14,7 +14,7 @@
 
 
 @given(df=dataframes(), lf=dataframes(lazy=True), srs=series())
-@settings(max_examples=10)
+@settings(max_examples=5)
 def test_strategy_classes(df: pl.DataFrame, lf: pl.LazyFrame, srs: pl.Series) -> None:
     assert isinstance(df, pl.DataFrame)
     assert isinstance(lf, pl.LazyFrame)
@@ -139,3 +139,25 @@ def test_strategy_null_probability(
     nulls_col0, nulls_colx = df3.null_count().rows()[0]
     assert nulls_col0 > nulls_colx
     assert nulls_col0 == 50
+
+
+@given(
+    df1=dataframes(chunked=False, min_size=1),
+    df2=dataframes(chunked=True, min_size=1),
+    s1=series(chunked=False, min_size=1),
+    s2=series(chunked=True, min_size=1),
+)
+@settings(max_examples=10)
+def test_chunking(
+    df1: pl.DataFrame,
+    df2: pl.DataFrame,
+    s1: pl.Series,
+    s2: pl.Series,
+) -> None:
+    assert df1.n_chunks() == 1  # chunked=False: frame stays in one chunk
+    if len(df2) > 1:  # a single-row frame is never split
+        assert df2.n_chunks("all") == [2] * len(df2.columns)  # two chunks per column
+
+    assert s1.n_chunks() == 1  # chunked=False: series stays in one chunk
+    if len(s2) > 1:  # only multi-element series are checked for chunking
+        assert s2.n_chunks() > 1