Skip to content

Commit

Permalink
null_probability functionality for dataframes/series test strategies. (#3860)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie committed Jul 3, 2022
1 parent 9e1ab0d commit 284e308
Show file tree
Hide file tree
Showing 4 changed files with 97 additions and 28 deletions.
2 changes: 1 addition & 1 deletion py-polars/polars/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ def __hash__(self) -> int:
}
for tu in DTYPE_TEMPORAL_UNITS:
_DTYPE_TO_FFINAME[Datetime(tu)] = "datetime"
_DTYPE_TO_FFINAME[Datetime(tu)] = "duration"
_DTYPE_TO_FFINAME[Duration(tu)] = "duration"

_DTYPE_TO_CTYPE: Dict[PolarsDataType, Any] = {
UInt8: ctypes.c_uint8,
Expand Down
4 changes: 2 additions & 2 deletions py-polars/polars/internals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -497,8 +497,8 @@ def __getitem__(self, item: Union[int, "Series", range, slice]) -> Any:
def __setitem__(
self, key: Union[int, "Series", np.ndarray, List, Tuple], value: Any
) -> None:
if isinstance(value, list):
raise ValueError("cannot set with a list as value, use a primitive value")
if isinstance(value, Sequence):
raise ValueError("cannot set with list/tuple as value; use a scalar value")
if isinstance(key, Series):
if key.dtype == Boolean:
self._s = self.set(key, value)._s
Expand Down
85 changes: 60 additions & 25 deletions py-polars/polars/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,14 +290,6 @@ def is_categorical_dtype(data_type: Any) -> bool:


if HYPOTHESIS_INSTALLED:

def between(draw: Callable, type_: type, min_: Any, max_: Any) -> Any:
"""
Draw a value in a given range from a type-inferred strategy.
"""
strategy_init = getattr(from_type(type_), "function")
return draw(strategy_init(min_, max_))

# =====================================================================
# Polars-specific 'hypothesis' strategies and helper functions
# See: https://hypothesis.readthedocs.io/
Expand Down Expand Up @@ -339,6 +331,13 @@ def between(draw: Callable, type_: type, min_: Any, max_: Any) -> Any:

strategy_dtypes = list(dtype_strategy_mapping)

def between(draw: Callable, type_: type, min_: Any, max_: Any) -> Any:
    """Draw one value of ``type_`` bounded by ``min_`` and ``max_``.

    Uses hypothesis' type-inferred strategy for ``type_``; the strategy's
    underlying constructor is re-invoked with the given bounds so the drawn
    value falls inside the requested range.
    """
    # NOTE(review): "function" is the deferred-strategy attribute holding the
    # strategy constructor; presumably it accepts (min, max) positionally.
    make_bounded = getattr(from_type(type_), "function")
    return draw(make_bounded(min_, max_))

@dataclass
class column:
"""
Expand All @@ -352,6 +351,10 @@ class column:
a recognised polars dtype.
strategy : strategy, optional
supports overriding the default strategy for the given dtype.
null_probability : float, optional
percentage chance (expressed between 0.0 => 1.0) that a generated value
is None. this is applied in addition to any None values output by the
given/inferred strategy for the column.
unique : bool, optional
flag indicating that all values generated for the column should be unique.
Expand All @@ -367,9 +370,16 @@ class column:
name: str
dtype: Optional[PolarsDataType] = None
strategy: Optional["SearchStrategy"] = None
null_probability: Optional[float] = None
unique: bool = False

def __post_init__(self) -> None:
if (self.null_probability is not None) and (
self.null_probability < 0 or self.null_probability > 1
):
raise InvalidArgument(
f"null_probability should be between 0.0 and 1.0 or None; found {self.null_probability}"
)
if self.dtype is None and not self.strategy:
self.dtype = random.choice(strategy_dtypes)
elif self.dtype not in dtype_strategy_mapping:
Expand Down Expand Up @@ -483,7 +493,7 @@ def series(
min_size: Optional[int] = 0,
max_size: Optional[int] = MAX_DATA_SIZE,
strategy: Optional["SearchStrategy"] = None,
null_probability: float = 0.01,
null_probability: float = 0.0,
unique: bool = False,
allowed_dtypes: Optional[Sequence[PolarsDataType]] = None,
excluded_dtypes: Optional[Sequence[PolarsDataType]] = None,
Expand All @@ -508,7 +518,8 @@ def series(
strategy : strategy, optional
supports overriding the default strategy for the given dtype.
null_probability : float, optional
percentage chance (expressed between 0.0 => 1.0) that a generated value is None; default = 0.01 (1%).
percentage chance (expressed between 0.0 => 1.0) that a generated value is None. this
is applied independently of any None values generated by the underlying strategy.
unique : bool, optional
indicate whether Series values should all be distinct.
allowed_dtypes : {list,set}, optional
Expand Down Expand Up @@ -546,16 +557,16 @@ def series(
]
>>>
"""
# TODO: finish 'null_probability' integration - currently a no-op ;p
if null_probability and null_probability < 0 or null_probability > 1:
raise InvalidArgument(
f"null_probability should be between 0.0 and 1.0; found {null_probability}"
)
selectable_dtypes = [
dtype
for dtype in (allowed_dtypes or strategy_dtypes)
if dtype not in (excluded_dtypes or ())
]
if null_probability and (null_probability < 0 or null_probability > 1):
raise InvalidArgument(
f"null_probability should be between 0.0 and 1.0; found {null_probability}"
)
null_probability = float(null_probability or 0.0)

@composite
def draw_series(draw: Callable) -> Series:
Expand All @@ -573,12 +584,15 @@ def draw_series(draw: Callable) -> Series:
if size is None
else size
)
# create series using dtype-specific strategy to generate values

# assign series name
series_name = name if isinstance(name, (str, type(None))) else draw(name)
s = Series(
name=series_name,
dtype=series_dtype,
values=(

# create series using dtype-specific strategy to generate values
series_values = (
[None] * series_size
if null_probability == 1
else (
draw(
lists(
dtype_strategy,
Expand All @@ -589,7 +603,19 @@ def draw_series(draw: Callable) -> Series:
)
if (series_size > 0)
else []
),
)
)
# optionally apply null values (custom frequency)
if 0.0 < null_probability < 1.0:
for idx in range(series_size):
if random.random() < null_probability:
series_values[idx] = None

# init series with strategy-generated data
s = Series(
name=series_name,
dtype=series_dtype,
values=series_values,
)
if is_categorical_dtype(dtype):
s = s.cast(Categorical)
Expand All @@ -608,7 +634,7 @@ def dataframes(
min_size: Optional[int] = 0,
max_size: Optional[int] = MAX_DATA_SIZE,
include_cols: Optional[Sequence[column]] = None,
null_probability: float = 0.0,
null_probability: Union[float, dict] = 0.0,
allowed_dtypes: Optional[Sequence[PolarsDataType]] = None,
excluded_dtypes: Optional[Sequence[PolarsDataType]] = None,
) -> "SearchStrategy[Union[DataFrame, LazyFrame]]":
Expand All @@ -635,8 +661,11 @@ def dataframes(
include_cols : [column], optional
a list of `column` objects to include in the generated DataFrame. note that explicitly
provided columns are appended onto the list of existing columns (if any present).
null_probability : float, optional
chance (expressed as a float between 0.0 => 1.0) that a generated value is None.
null_probability : {float, dict[str,float]}, optional
percentage chance (expressed between 0.0 => 1.0) that a generated value is None. this is
applied independently of any None values generated by the underlying strategy, and can
be applied either on a per-column basis (if given as a {col:pct} dict), or globally. if
null_probability is defined on a column, it takes precedence over the global value.
allowed_dtypes : {list,set}, optional
when automatically generating data, allow only these dtypes.
excluded_dtypes : {list,set}, optional
Expand Down Expand Up @@ -730,6 +759,12 @@ def draw_frames(draw: Callable) -> Union[DataFrame, LazyFrame]:
for idx, c in enumerate(coldefs):
if c.name is None:
c.name = f"col{idx}"
if c.null_probability is None:
if isinstance(null_probability, dict):
c.null_probability = null_probability.get(c.name, 0.0)
else:
c.null_probability = null_probability

frame_columns = [
c.name if (c.dtype is None) else (c.name, c.dtype) for c in coldefs
]
Expand All @@ -740,7 +775,7 @@ def draw_frames(draw: Callable) -> Union[DataFrame, LazyFrame]:
name=c.name,
dtype=c.dtype,
size=series_size,
null_probability=null_probability,
null_probability=(c.null_probability or 0.0),
strategy=c.strategy,
unique=c.unique,
)
Expand Down
34 changes: 34 additions & 0 deletions py-polars/tests/test_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,3 +223,37 @@ def test_strategy_dtypes(
assert s2.dtype == pl.Boolean
assert s3.dtype in TEMPORAL_DTYPES
assert s4.dtype not in TEMPORAL_DTYPES


@given(
    # set global, per-column, and overridden null-probabilities
    # (0.10 for the bare series, 0.30 / 0.70 per-dict / 1.0-with-0.20-override
    # for the three frames, giving a strictly increasing expected null count)
    s=series(size=50, null_probability=0.10),
    df1=dataframes(cols=1, size=50, null_probability=0.30),
    df2=dataframes(cols=2, size=50, null_probability={"col0": 0.70}),
    df3=dataframes(
        cols=1,
        size=50,
        null_probability=1.0,
        include_cols=[column(name="colx", null_probability=0.20)],
    ),
)
def test_strategy_null_probability(
    s: pl.Series,
    df1: pl.DataFrame,
    df2: pl.DataFrame,
    df3: pl.DataFrame,
) -> None:
    """Check that null_probability scales null counts as configured.

    Verifies the global setting, the per-column dict form, and the
    per-column override of a global value, by comparing realised null
    counts between objects generated with increasing probabilities.
    NOTE(review): these comparisons are statistical — with 50 rows the
    strict inequalities hold with very high (but not certain) probability.
    """
    # every strategy above was asked for exactly 50 rows
    for obj in (s, df1, df2, df3):
        assert len(obj) == 50  # type: ignore[arg-type]

    # higher configured probability => more nulls overall
    assert s.null_count() < df1.null_count().fold(sum).sum()
    assert df1.null_count().fold(sum).sum() < df2.null_count().fold(sum).sum()
    assert df2.null_count().fold(sum).sum() < df3.null_count().fold(sum).sum()

    # df2: dict form set col0=0.70; col1 fell back to the 0.0 default
    nulls_col0, nulls_col1 = df2.null_count().rows()[0]
    assert nulls_col0 > nulls_col1
    assert nulls_col0 < 50

    # df3: global 1.0 makes col0 all-null; colx's own 0.20 takes precedence
    nulls_col0, nulls_colx = df3.null_count().rows()[0]
    assert nulls_col0 > nulls_colx
    assert nulls_col0 == 50

0 comments on commit 284e308

Please sign in to comment.