Skip to content

Commit

Permalink
add sample expression (#2668)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Feb 16, 2022
1 parent 0bb9ec6 commit 87e0ddf
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 2 deletions.
9 changes: 9 additions & 0 deletions polars/polars-lazy/src/dsl/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1982,6 +1982,15 @@ impl Expr {
.with_fmt("shuffle")
}

/// Sample a fraction of the values produced by this expression.
///
/// The three arguments are forwarded unchanged to the `sample_frac` method of
/// the value this expression is applied to:
///
/// * `frac` - fraction of the input to sample.
/// * `with_replacement` - whether a value may be drawn more than once.
/// * `seed` - seed for the random sampling.
///
/// The output is declared to have the same type as the input
/// (`GetOutput::same_type()`).
#[cfg(feature = "random")]
pub fn sample_frac(self, frac: f64, with_replacement: bool, seed: u64) -> Self {
    self.apply(
        move |s| s.sample_frac(frac, with_replacement, seed),
        GetOutput::same_type(),
    )
    // Give the node its own label so it is not reported as a "shuffle".
    .with_fmt("sample_frac")
}

#[cfg(feature = "ewma")]
pub fn ewm_mean(self, options: EWMOptions) -> Self {
use DataType::*;
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/expression.rst
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ Manipulation/ selection
Expr.reshape
Expr.to_physical
Expr.shuffle
Expr.sample
Expr.extend_constant

Column names
Expand Down
28 changes: 26 additions & 2 deletions py-polars/polars/internals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -2185,17 +2185,41 @@ def reshape(self, dims: Tuple[int, ...]) -> "Expr":
"""
return wrap_expr(self._pyexpr.reshape(dims))

def shuffle(self, seed: int = 0) -> "Expr":
def shuffle(self, seed: Optional[int] = None) -> "Expr":
"""
Shuffle the contents of this expr.
Parameters
----------
seed
Seed initialization
Seed initialization. If None given numpy is used.
"""
if seed is None:
seed = int(np.random.randint(0, 10000))
return wrap_expr(self._pyexpr.shuffle(seed))

def sample(
    self,
    fraction: float = 1.0,
    with_replacement: bool = True,
    seed: Optional[int] = None,
) -> "Expr":
    """
    Sample a fraction of the values of this expression.

    Parameters
    ----------
    fraction
        Fraction 0.0 <= value <= 1.0
    with_replacement
        Allow values to be sampled more than once.
    seed
        Seed initialization. If None given numpy is used.
    """
    if seed is None:
        # No seed supplied: draw one from numpy's global RNG so that
        # unseeded calls are not all identical.
        seed = int(np.random.randint(0, 10000))
    return wrap_expr(self._pyexpr.sample_frac(fraction, with_replacement, seed))

def ewm_mean(
self,
com: Optional[float] = None,
Expand Down
7 changes: 7 additions & 0 deletions py-polars/src/lazy/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1168,6 +1168,13 @@ impl PyExpr {
self.inner.clone().shuffle(seed).into()
}

/// Return a new `PyExpr` wrapping `sample_frac` applied to a clone of the
/// inner expression; all arguments are passed through unchanged.
pub fn sample_frac(&self, frac: f64, with_replacement: bool, seed: u64) -> Self {
    let expr = self.inner.clone();
    expr.sample_frac(frac, with_replacement, seed).into()
}

pub fn ewm_mean(&self, alpha: f64, adjust: bool, min_periods: usize) -> Self {
let options = EWMOptions {
alpha,
Expand Down
8 changes: 8 additions & 0 deletions py-polars/tests/test_exprs.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,11 @@ def test_count_expr() -> None:
out = df.groupby("b", maintain_order=True).agg(pl.count())
assert out["b"].to_list() == ["a", "b"]
assert out["count"].to_list() == [4, 1]


def test_sample() -> None:
    """Sampling half of 20 values without replacement gives 10 distinct, unsorted values."""
    source = pl.Series("a", range(0, 20))
    sampled = pl.select(pl.lit(source).sample(0.5, False, 1)).to_series()

    # Half of the 20 input values are kept.
    assert sampled.shape == (10,)

    # The sampled order differs from the sorted order.
    sampled_values = sampled.to_list()
    sorted_values = sampled.sort().to_list()
    assert sampled_values != sorted_values

    # Without replacement, no value is drawn twice.
    assert sampled.unique().shape == (10,)

0 comments on commit 87e0ddf

Please sign in to comment.