Skip to content

Commit

Permalink
fix unique and drop (#2908)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Mar 16, 2022
1 parent da96822 commit 4c841eb
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 17 deletions.
6 changes: 3 additions & 3 deletions polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -211,13 +211,13 @@ bench = [
]

[dependencies]
polars-core = { version = "0.20.0", path = "./polars-core", features = ["docs", "private"], default-features = false }
polars-io = { version = "0.20.0", path = "./polars-io", features = ["private"], default-features = false, optional = true }
polars-lazy = { version = "0.20.0", path = "./polars-lazy", features = ["private"], default-features = false, optional = true }
polars-time = { version = "0.20.0", path = "./polars-time", default-features = false, optional = true }
# Version pinned to an exact release because of a compilation error.
# TODO: remove this pin.
crossbeam-epoch = "=0.9.7"

[dev-dependencies]
ahash = "0.7"
Expand Down
27 changes: 26 additions & 1 deletion polars/polars-lazy/src/dsl/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ use polars_core::export::arrow::{array::BooleanArray, bitmap::MutableBitmap};
use polars_core::prelude::*;

use std::fmt::{Debug, Formatter};
use std::ops::Deref;
use std::ops::{Deref, Not};
use std::{
fmt,
ops::{Add, Div, Mul, Rem, Sub},
Expand Down Expand Up @@ -637,6 +637,31 @@ impl Expr {
Expr::IsNotNull(Box::new(self))
}

/// Remove all null values from the output of this expression.
///
/// The output type is declared to be the same as the input type.
pub fn drop_nulls(self) -> Self {
    self.map(
        |series| {
            let without_nulls = series.drop_nulls();
            Ok(without_nulls)
        },
        GetOutput::same_type(),
    )
}

/// Remove all floating point NaN values from the output of this expression.
///
/// Only `Float32` and `Float64` inputs are filtered; a series of any other
/// dtype is returned untouched. The output type is declared to be the same
/// as the input type.
pub fn drop_nans(self) -> Self {
    self.map(
        |series| {
            let without_nans = match series.dtype() {
                DataType::Float32 => {
                    let floats = series.f32()?;
                    // Keep every position that is *not* NaN.
                    let keep = floats.is_nan().not();
                    floats.filter(&keep)?.into_series()
                }
                DataType::Float64 => {
                    let floats = series.f64()?;
                    // Keep every position that is *not* NaN.
                    let keep = floats.is_nan().not();
                    floats.filter(&keep)?.into_series()
                }
                // Non-float dtypes are passed through unchanged.
                _ => series,
            };
            Ok(without_nans)
        },
        GetOutput::same_type(),
    )
}

/// Reduce groups to minimal value.
pub fn min(self) -> Self {
AggExpr::Min(Box::new(self)).into()
Expand Down
14 changes: 4 additions & 10 deletions py-polars/polars/internals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -762,21 +762,15 @@ def slice(self, offset: Union[int, "Expr"], length: Union[int, "Expr"]) -> "Expr

def drop_nulls(self) -> "Expr":
    """
    Drop null values.

    Calls ``drop_nulls`` on the underlying ``_pyexpr`` rather than building
    ``self.filter(self.is_not_null())``, which refers to this expression
    twice.
    """
    return wrap_expr(self._pyexpr.drop_nulls())

def drop_nans(self) -> "Expr":
    """
    Drop floating point NaN values.

    Calls ``drop_nans`` on the underlying ``_pyexpr`` rather than building
    ``self.filter(self.is_not_nan())``, which refers to this expression
    twice.
    """
    return wrap_expr(self._pyexpr.drop_nans())

def cumsum(self, reverse: bool = False) -> "Expr":
"""
Expand Down
14 changes: 11 additions & 3 deletions py-polars/src/lazy/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -236,15 +236,23 @@ impl PyExpr {

/// Fill null values using the strategy named by `strategy`.
///
/// The name is converted with `parse_strategy`, and the resulting strategy
/// is passed to `fill_null` on each series the expression is applied to.
/// The output type is declared to be the same as the input type, and the
/// expression is given the format name `"fill_null"`.
pub fn fill_null_with_strategy(&self, strategy: &str) -> PyExpr {
    let strat = parse_strategy(strategy);
    // Clone only the wrapped expression; the builder methods consume it.
    self.inner
        .clone()
        .apply(move |s| s.fill_null(strat), GetOutput::same_type())
        .with_fmt("fill_null")
        .into()
}

/// Forward to `fill_nan` on the wrapped expression.
///
/// `expr` is unwrapped to its inner expression and passed as the argument
/// (presumably the replacement for NaN values — confirm against
/// `Expr::fill_nan`).
pub fn fill_nan(&self, expr: PyExpr) -> PyExpr {
    self.inner.clone().fill_nan(expr.inner).into()
}

/// Drop null values by forwarding to `drop_nulls` on the wrapped expression.
pub fn drop_nulls(&self) -> PyExpr {
    // The wrapped expression is cloned because `drop_nulls` is chained on an owned value.
    let expr = self.inner.clone();
    expr.drop_nulls().into()
}

/// Drop NaN values by forwarding to `drop_nans` on the wrapped expression.
pub fn drop_nans(&self) -> PyExpr {
    // The wrapped expression is cloned because `drop_nans` is chained on an owned value.
    let expr = self.inner.clone();
    expr.drop_nans().into()
}

pub fn filter(&self, predicate: PyExpr) -> PyExpr {
Expand Down
10 changes: 10 additions & 0 deletions py-polars/tests/test_exprs.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,13 @@ def test_split_exact() -> None:
assert out.frame_equal(expected)
assert df["x"].str.split_exact("_", 1).dtype == pl.Struct
assert df["x"].str.split_exact("_", 1, inclusive=False).dtype == pl.Struct


def test_unique_and_drop_stability() -> None:
    # Regression test for issue 2898.
    # The drop used to be expressed as
    #   expr_a = a.unique()
    #   expr_a.filter(a.unique().is_not_null())
    # so `a.unique()` was executed twice, and unique is an unstable algorithm.
    frame = pl.DataFrame({"a": [1, None, 1, None]})
    result = frame.select(pl.col("a").unique().drop_nulls())
    first_value = result.to_series()[0]
    assert first_value == 1

0 comments on commit 4c841eb

Please sign in to comment.