Skip to content

Commit

Permalink
null_probability functionality for dataframes/series test strategies. (#3860)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie committed Jul 3, 2022
1 parent 9e1ab0d commit 284e308
Show file tree
Hide file tree
Showing 4 changed files with 97 additions and 28 deletions.
2 changes: 1 addition & 1 deletion py-polars/polars/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ def __hash__(self) -> int:
}
for tu in DTYPE_TEMPORAL_UNITS:
_DTYPE_TO_FFINAME[Datetime(tu)] = "datetime"
_DTYPE_TO_FFINAME[Datetime(tu)] = "duration"
_DTYPE_TO_FFINAME[Duration(tu)] = "duration"

_DTYPE_TO_CTYPE: Dict[PolarsDataType, Any] = {
UInt8: ctypes.c_uint8,
Expand Down
4 changes: 2 additions & 2 deletions py-polars/polars/internals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -497,8 +497,8 @@ def __getitem__(self, item: Union[int, "Series", range, slice]) -> Any:
def __setitem__(
self, key: Union[int, "Series", np.ndarray, List, Tuple], value: Any
) -> None:
if isinstance(value, list):
raise ValueError("cannot set with a list as value, use a primitive value")
if isinstance(value, Sequence):
raise ValueError("cannot set with list/tuple as value; use a scalar value")
if isinstance(key, Series):
if key.dtype == Boolean:
self._s = self.set(key, value)._s
Expand Down
85 changes: 60 additions & 25 deletions py-polars/polars/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,14 +290,6 @@ def is_categorical_dtype(data_type: Any) -> bool:


if HYPOTHESIS_INSTALLED:

def between(draw: Callable, type_: type, min_: Any, max_: Any) -> Any:
"""
Draw a value in a given range from a type-inferred strategy.
"""
strategy_init = getattr(from_type(type_), "function")
return draw(strategy_init(min_, max_))

# =====================================================================
# Polars-specific 'hypothesis' strategies and helper functions
# See: https://hypothesis.readthedocs.io/
Expand Down Expand Up @@ -339,6 +331,13 @@ def between(draw: Callable, type_: type, min_: Any, max_: Any) -> Any:

strategy_dtypes = list(dtype_strategy_mapping)

def between(draw: Callable, type_: type, min_: Any, max_: Any) -> Any:
    """Draw one value of ``type_`` bounded by ``min_`` and ``max_``.

    Uses hypothesis' type-inferred strategy for ``type_``; the strategy's
    underlying constructor is re-invoked with the given bounds so the drawn
    value falls inside the requested range.
    """
    # NOTE(review): "function" is the deferred-strategy attribute holding the
    # strategy constructor; presumably it accepts (min, max) positionally.
    make_bounded = getattr(from_type(type_), "function")
    return draw(make_bounded(min_, max_))

@dataclass
class column:
"""
Expand All @@ -352,6 +351,10 @@ class column:
a recognised polars dtype.
strategy : strategy, optional
supports overriding the default strategy for the given dtype.
null_probability : float, optional
percentage chance (expressed between 0.0 => 1.0) that a generated value
is None. this is applied in addition to any None values output by the
given/inferred strategy for the column.
unique : bool, optional
flag indicating that all values generated for the column should be unique.
Expand All @@ -367,9 +370,16 @@ class column:
name: str
dtype: Optional[PolarsDataType] = None
strategy: Optional["SearchStrategy"] = None
null_probability: Optional[float] = None
unique: bool = False

def __post_init__(self) -> None:
if (self.null_probability is not None) and (
self.null_probability < 0 or self.null_probability > 1
):
raise InvalidArgument(
f"null_probability should be between 0.0 and 1.0 or None; found {self.null_probability}"
)
if self.dtype is None and not self.strategy:
self.dtype = random.choice(strategy_dtypes)
elif self.dtype not in dtype_strategy_mapping:
Expand Down Expand Up @@ -483,7 +493,7 @@ def series(
min_size: Optional[int] = 0,
max_size: Optional[int] = MAX_DATA_SIZE,
strategy: Optional["SearchStrategy"] = None,
null_probability: float = 0.01,
null_probability: float = 0.0,
unique: bool = False,
allowed_dtypes: Optional[Sequence[PolarsDataType]] = None,
excluded_dtypes: Optional[Sequence[PolarsDataType]] = None,
Expand All @@ -508,7 +518,8 @@ def series(
strategy : strategy, optional
supports overriding the default strategy for the given dtype.
null_probability : float, optional
percentage chance (expressed between 0.0 => 1.0) that a generated value is None; default = 0.01 (1%).
percentage chance (expressed between 0.0 => 1.0) that a generated value is None. this
is applied independently of any None values generated by the underlying strategy.
unique : bool, optional
indicate whether Series values should all be distinct.
allowed_dtypes : {list,set}, optional
Expand Down Expand Up @@ -546,16 +557,16 @@ def series(
]
>>>
"""
# TODO: finish 'null_probability' integration - currently a no-op ;p
if null_probability and null_probability < 0 or null_probability > 1:
raise InvalidArgument(
f"null_probability should be between 0.0 and 1.0; found {null_probability}"
)
selectable_dtypes = [
dtype
for dtype in (allowed_dtypes or strategy_dtypes)
if dtype not in (excluded_dtypes or ())
]
if null_probability and (null_probability < 0 or null_probability > 1):
raise InvalidArgument(
f"null_probability should be between 0.0 and 1.0; found {null_probability}"
)
null_probability = float(null_probability or 0.0)

@composite
def draw_series(draw: Callable) -> Series:
Expand All @@ -573,12 +584,15 @@ def draw_series(draw: Callable) -> Series:
if size is None
else size
)
# create series using dtype-specific strategy to generate values

# assign series name
series_name = name if isinstance(name, (str, type(None))) else draw(name)
s = Series(
name=series_name,
dtype=series_dtype,
values=(

# create series using dtype-specific strategy to generate values
series_values = (
[None] * series_size
if null_probability == 1
else (
draw(
lists(
dtype_strategy,
Expand All @@ -589,7 +603,19 @@ def draw_series(draw: Callable) -> Series:
)
if (series_size > 0)
else []
),
)
)
# optionally apply null values (custom frequency)
if 0.0 < null_probability < 1.0:
for idx in range(series_size):
if random.random() < null_probability:
series_values[idx] = None

# init series with strategy-generated data
s = Series(
name=series_name,
dtype=series_dtype,
values=series_values,
)
if is_categorical_dtype(dtype):
s = s.cast(Categorical)
Expand All @@ -608,7 +634,7 @@ def dataframes(
min_size: Optional[int] = 0,
max_size: Optional[int] = MAX_DATA_SIZE,
include_cols: Optional[Sequence[column]] = None,
null_probability: float = 0.0,
null_probability: Union[float, dict] = 0.0,
allowed_dtypes: Optional[Sequence[PolarsDataType]] = None,
excluded_dtypes: Optional[Sequence[PolarsDataType]] = None,
) -> "SearchStrategy[Union[DataFrame, LazyFrame]]":
Expand All @@ -635,8 +661,11 @@ def dataframes(
include_cols : [column], optional
a list of `column` objects to include in the generated DataFrame. note that explicitly
provided columns are appended onto the list of existing columns (if any present).
null_probability : float, optional
chance (expressed as a float between 0.0 => 1.0) that a generated value is None.
null_probability : {float, dict[str,float]}, optional
percentage chance (expressed between 0.0 => 1.0) that a generated value is None. this is
applied independently of any None values generated by the underlying strategy, and can
be applied either on a per-column basis (if given as a {col:pct} dict), or globally. if
null_probability is defined on a column, it takes precedence over the global value.
allowed_dtypes : {list,set}, optional
when automatically generating data, allow only these dtypes.
excluded_dtypes : {list,set}, optional
Expand Down Expand Up @@ -730,6 +759,12 @@ def draw_frames(draw: Callable) -> Union[DataFrame, LazyFrame]:
for idx, c in enumerate(coldefs):
if c.name is None:
c.name = f"col{idx}"
if c.null_probability is None:
if isinstance(null_probability, dict):
c.null_probability = null_probability.get(c.name, 0.0)
else:
c.null_probability = null_probability

frame_columns = [
c.name if (c.dtype is None) else (c.name, c.dtype) for c in coldefs
]
Expand All @@ -740,7 +775,7 @@ def draw_frames(draw: Callable) -> Union[DataFrame, LazyFrame]:
name=c.name,
dtype=c.dtype,
size=series_size,
null_probability=null_probability,
null_probability=(c.null_probability or 0.0),
strategy=c.strategy,
unique=c.unique,
)
Expand Down
34 changes: 34 additions & 0 deletions py-polars/tests/test_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,3 +223,37 @@ def test_strategy_dtypes(
assert s2.dtype == pl.Boolean
assert s3.dtype in TEMPORAL_DTYPES
assert s4.dtype not in TEMPORAL_DTYPES


@given(
    # set global, per-column, and overridden null-probabilities
    # (0.10 for the bare series, 0.30 / 0.70 per-dict / 1.0-with-0.20-override
    # for the three frames, giving a strictly increasing expected null count)
    s=series(size=50, null_probability=0.10),
    df1=dataframes(cols=1, size=50, null_probability=0.30),
    df2=dataframes(cols=2, size=50, null_probability={"col0": 0.70}),
    df3=dataframes(
        cols=1,
        size=50,
        null_probability=1.0,
        include_cols=[column(name="colx", null_probability=0.20)],
    ),
)
def test_strategy_null_probability(
    s: pl.Series,
    df1: pl.DataFrame,
    df2: pl.DataFrame,
    df3: pl.DataFrame,
) -> None:
    """Check that null_probability scales null counts as configured.

    Verifies the global setting, the per-column dict form, and the
    per-column override of a global value, by comparing realised null
    counts between objects generated with increasing probabilities.
    NOTE(review): these comparisons are statistical — with 50 rows the
    strict inequalities hold with very high (but not certain) probability.
    """
    # every strategy above was asked for exactly 50 rows
    for obj in (s, df1, df2, df3):
        assert len(obj) == 50  # type: ignore[arg-type]

    # higher configured probability => more nulls overall
    assert s.null_count() < df1.null_count().fold(sum).sum()
    assert df1.null_count().fold(sum).sum() < df2.null_count().fold(sum).sum()
    assert df2.null_count().fold(sum).sum() < df3.null_count().fold(sum).sum()

    # df2: dict form set col0=0.70; col1 fell back to the 0.0 default
    nulls_col0, nulls_col1 = df2.null_count().rows()[0]
    assert nulls_col0 > nulls_col1
    assert nulls_col0 < 50

    # df3: global 1.0 makes col0 all-null; colx's own 0.20 takes precedence
    nulls_col0, nulls_colx = df3.null_count().rows()[0]
    assert nulls_col0 > nulls_colx
    assert nulls_col0 == 50

0 comments on commit 284e308

Please sign in to comment.