list eval expression (#3185)
ritchie46 committed Apr 19, 2022
1 parent 9a4a4be commit 69dc5ba
Showing 14 changed files with 212 additions and 7 deletions.
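The headline change is a new `arr.eval` expression that runs an arbitrary polars expression over the elements of each list in a `List` column. A minimal Python sketch of the new API, mirroring the docstring and test examples further down (exact dtypes and printing may vary by version):

```python
import polars as pl

df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2]})

# Rank the two values inside each row's list; the result is a list column.
out = df.with_column(
    pl.concat_list(["a", "b"]).arr.eval(pl.first().rank()).alias("rank")
)
# Per the docstring example below: rank == [[1.0, 2.0], [2.0, 1.0], [2.0, 1.0]]
```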
1 change: 0 additions & 1 deletion polars-sql/Cargo.toml
@@ -61,7 +61,6 @@ features = [
"string_encoding",
"product",
"ndarray",
"series_from_anyvalue",
"avro",
"parquet",
"ipc",
3 changes: 1 addition & 2 deletions polars/Cargo.toml
@@ -106,8 +106,7 @@ unique_counts = ["polars-core/unique_counts", "polars-lazy/unique_counts"]
log = ["polars-core/log", "polars-lazy/log"]
partition_by = ["polars-core/partition_by"]
semi_anti_join = ["polars-core/semi_anti_join"]

series_from_anyvalue = ["polars-core/series_from_anyvalue"]
list_eval = ["polars-lazy/list_eval"]

test = [
"lazy",
3 changes: 1 addition & 2 deletions polars/polars-core/Cargo.toml
@@ -17,7 +17,6 @@ temporal = ["regex", "chrono"]
random = ["rand", "rand_distr"]
default = ["docs", "temporal", "performant", "private"]
lazy = ["sort_multiple"]
series_from_anyvalue = []

# ~40% faster collect, needed until trustedlength iter stabilizes
performant = []
@@ -35,7 +34,7 @@ fmt = ["comfy-table"]
sort_multiple = []
# create from row values
# and include pivot operation
rows = ["series_from_anyvalue"]
rows = []
# dont use this
private = []

1 change: 0 additions & 1 deletion polars/polars-core/src/series/mod.rs
@@ -6,7 +6,6 @@ use polars_arrow::prelude::QuantileInterpolOptions;
#[cfg(any(feature = "dtype-struct", feature = "object"))]
use std::any::Any;

#[cfg(feature = "series_from_anyvalue")]
mod any_value;
pub(crate) mod arithmetic;
mod comparison;
5 changes: 5 additions & 0 deletions polars/polars-core/src/series/ops/null.rs
@@ -2,6 +2,11 @@ use crate::prelude::*;

impl Series {
pub fn full_null(name: &str, size: usize, dtype: &DataType) -> Self {
if let DataType::List(dtype) = dtype {
let val = Series::full_null("", 0, dtype);
let avs = [AnyValue::List(val)];
return Series::new(name, avs.as_ref());
}
if dtype == &dtype.to_physical() {
macro_rules! primitive {
($type:ty) => {{
1 change: 1 addition & 0 deletions polars/polars-lazy/Cargo.toml
@@ -61,6 +61,7 @@ ewma = ["polars-core/ewma"]
dot_diagram = []
unique_counts = ["polars-core/unique_counts"]
log = ["polars-core/log"]
list_eval = []

# no guarantees whatsoever
private = ["polars-time/private"]
104 changes: 104 additions & 0 deletions polars/polars-lazy/src/dsl/list.rs
@@ -1,6 +1,10 @@
use crate::physical_plan::state::ExecutionState;
use crate::prelude::*;
use parking_lot::Mutex;
use polars_arrow::utils::CustomIterTools;
use polars_core::prelude::*;
use polars_core::series::ops::NullBehavior;
use rayon::prelude::*;

/// Specialized expressions for [`Series`] of [`DataType::List`].
pub struct ListNameSpace(pub(crate) Expr);
@@ -203,4 +207,104 @@ impl ListNameSpace {
pub fn tail(self, n: usize) -> Expr {
self.slice(-(n as i64), n)
}

/// Run any [`Expr`] on these lists' elements.
#[cfg(feature = "list_eval")]
pub fn eval(self, mut expr: Expr, parallel: bool) -> Expr {
expr.mutate().apply(|e| match e {
Expr::Column(name) => {
*name = Arc::from("");
true
}
Expr::Nth(_) => {
*e = Expr::Column(Arc::from(""));
true
}
_ => true,
});

let expr2 = expr.clone();
let func = move |s: Series| {
let expr = expr.clone();
let mut arena = Arena::with_capacity(10);
let aexpr = to_aexpr(expr, &mut arena);
let planner = DefaultPlanner::default();
let phys_expr = planner.create_physical_expr(aexpr, Context::Default, &mut arena)?;

let state = ExecutionState::new();

let lst = s.list()?;
let mut err = None;
let mut ca: ListChunked = if parallel {
let m_err = Mutex::new(None);
let ca: ListChunked = lst
.par_iter()
.map(|opt_s| {
opt_s.and_then(|s| {
let df = DataFrame::new_no_checks(vec![s]);
let out = phys_expr.evaluate(&df, &state);
match out {
Ok(s) => Some(s),
Err(e) => {
*m_err.lock() = Some(e);
None
}
}
})
})
.collect();
err = m_err.lock().take();
ca
} else {
let mut df_container = DataFrame::new_no_checks(vec![]);

lst.into_iter()
.map(|s| {
s.and_then(|s| {
df_container.get_columns_mut().push(s);
let out = phys_expr.evaluate(&df_container, &state);
df_container.get_columns_mut().clear();
match out {
Ok(s) => Some(s),
Err(e) => {
err = Some(e);
None
}
}
})
})
.collect_trusted()
};

ca.rename(s.name());

match err {
None => Ok(ca.into_series()),
Some(e) => Err(e),
}
};

self.0
.map(
func,
GetOutput::map_field(move |f| {
// dummy df to determine output dtype
let dtype = f
.data_type()
.inner_dtype()
.cloned()
.unwrap_or_else(|| f.data_type().clone());

let df = Series::new_empty(f.name(), &dtype).into_frame();
match df.lazy().select([expr2.clone()]).collect() {
Ok(out) => {
let dtype = out.get_columns()[0].dtype();
Field::new(f.name(), dtype.clone())
}
Err(_) => Field::new(f.name(), DataType::Null),
}
}),
)
.with_fmt("eval")
}
}
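Note the rewrite at the top of `eval`: every `Expr::Column` name (and `Expr::Nth`) is renamed to the empty string, which is the name of the single anonymous column holding each list's elements. In practice, any column reference inside `eval` therefore addresses the element series. A hedged Python sketch of that equivalence (the column name "anything" is arbitrary and not taken from the commit):

```python
import polars as pl

df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2]})
lst = pl.concat_list(["a", "b"])

# Both expressions resolve to the anonymous element column inside eval,
# so they are expected to produce the same per-list ranks.
out1 = df.select(lst.arr.eval(pl.first().rank()))
out2 = df.select(lst.arr.eval(pl.col("anything").rank()))
```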
2 changes: 1 addition & 1 deletion py-polars/Cargo.toml
@@ -93,12 +93,12 @@ features = [
"string_encoding",
"product",
"ndarray",
"series_from_anyvalue",
"unique_counts",
"log",
"serde-lazy",
"partition_by",
"semi_anti_join",
"list_eval",
]

# [patch.crates-io]
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/expression.rst
@@ -341,6 +341,7 @@ The following methods are available under the `expr.arr` attribute.
ExprListNameSpace.slice
ExprListNameSpace.head
ExprListNameSpace.tail
ExprListNameSpace.eval

Categories
----------
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/series.rst
@@ -300,6 +300,7 @@ The following methods are available under the `Series.arr` attribute.
ListNameSpace.slice
ListNameSpace.head
ListNameSpace.tail
ListNameSpace.eval

Categories
----------
37 changes: 37 additions & 0 deletions py-polars/polars/internals/expr.py
@@ -3395,6 +3395,43 @@ def tail(self, n: int = 5) -> "Expr":
"""
return self.slice(-n, n)

def eval(self, expr: "Expr", parallel: bool = False) -> "Expr":
"""
Run any polars expression against the lists' elements.

Parameters
----------
expr
Expression to run. Note that you can select an element with `pl.first()` or `pl.col()`.
parallel
Run all expressions in parallel. Don't activate this blindly.
Parallelism is worth it if there is enough work to do per thread.
This likely should not be used in the groupby context, because we already run in parallel per group.

Examples
--------
>>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2]})
>>> df.with_column(
... pl.concat_list(["a", "b"]).arr.eval(pl.first().rank()).alias("rank")
... )
shape: (3, 3)
┌─────┬─────┬────────────┐
│ a ┆ b ┆ rank │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ list [f32] │
╞═════╪═════╪════════════╡
│ 1 ┆ 4 ┆ [1.0, 2.0] │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 8 ┆ 5 ┆ [2.0, 1.0] │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 2 ┆ [2.0, 1.0] │
└─────┴─────┴────────────┘
"""
return wrap_expr(self._pyexpr.lst_eval(expr._pyexpr, parallel))


class ExprStringNameSpace:
"""
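Before moving on to the Series counterpart: because the Expr-level `eval` above compiles the given expression into a physical expression and evaluates it against a one-column frame built from each list's elements, expressions other than `rank` should work the same way. A hedged sketch that is not taken from the commit's tests:

```python
import polars as pl

df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2]})

# Double every element of each concatenated list; the output stays a list column.
doubled = df.select(pl.concat_list(["a", "b"]).arr.eval(pl.first() * 2))
```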
37 changes: 37 additions & 0 deletions py-polars/polars/internals/series.py
@@ -4350,6 +4350,43 @@ def tail(self, n: int = 5) -> "Series":
"""
return self.slice(-n, n)

def eval(self, expr: "pli.Expr", parallel: bool = False) -> "Series":
"""
Run any polars expression against the lists' elements.

Parameters
----------
expr
Expression to run. Note that you can select an element with `pl.first()` or `pl.col()`.
parallel
Run all expressions in parallel. Don't activate this blindly.
Parallelism is worth it if there is enough work to do per thread.
This likely should not be used in the groupby context, because we already run in parallel per group.

Examples
--------
>>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2]})
>>> df.with_column(
... pl.concat_list(["a", "b"]).arr.eval(pl.first().rank()).alias("rank")
... )
shape: (3, 3)
┌─────┬─────┬────────────┐
│ a ┆ b ┆ rank │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ list [f32] │
╞═════╪═════╪════════════╡
│ 1 ┆ 4 ┆ [1.0, 2.0] │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 8 ┆ 5 ┆ [2.0, 1.0] │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 2 ┆ [2.0, 1.0] │
└─────┴─────┴────────────┘
"""
return pli.select(pli.lit(wrap_s(self._s)).arr.eval(expr, parallel)).to_series()


class DateTimeNameSpace:
"""
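The `Series.arr.eval` implementation above simply wraps the series in a `lit` expression, reuses the expression code path, and converts the result back to a `Series`. A hedged sketch of direct Series usage (the data here is illustrative, not from the commit):

```python
import polars as pl

s = pl.Series("a", [[1, 8, 3], [4, 5, 2]])

# Rank the elements of each list; returns a Series of lists.
ranked = s.arr.eval(pl.first().rank())
# Expected along the lines of: [[1.0, 3.0, 2.0], [2.0, 3.0, 1.0]]
```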
4 changes: 4 additions & 0 deletions py-polars/src/lazy/dsl.rs
@@ -1166,6 +1166,10 @@ impl PyExpr {
self.inner.clone().arr().slice(offset, length).into()
}

fn lst_eval(&self, expr: PyExpr, parallel: bool) -> Self {
self.inner.clone().arr().eval(expr.inner, parallel).into()
}

fn rank(&self, method: &str, reverse: bool) -> Self {
let method = str_to_rankmethod(method).unwrap();
let options = RankOptions {
19 changes: 19 additions & 0 deletions py-polars/tests/test_exprs.py
@@ -195,3 +195,22 @@ def test_dot_in_groupby() -> None:
.agg(pl.col("x").dot("y").alias("dot"))
.frame_equal(pl.DataFrame({"group": ["a", "b"], "dot": [6, 15]}))
)


def test_list_eval_expression() -> None:
df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2]})

for parallel in [True, False]:
assert df.with_column(
pl.concat_list(["a", "b"])
.arr.eval(pl.first().rank(), parallel=parallel)
.alias("rank")
).to_dict(False) == {
"a": [1, 8, 3],
"b": [4, 5, 2],
"rank": [[1.0, 2.0], [2.0, 1.0], [2.0, 1.0]],
}

assert df["a"].reshape((1, -1)).arr.eval(
pl.first(), parallel=parallel
).to_list() == [[1, 8, 3]]
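The test exercises both code paths via the `parallel` flag. Per the docstring, `parallel=True` evaluates the expression per list on the thread pool and only pays off when each list carries enough work; it is best avoided inside a groupby, which already parallelizes per group. A hedged sketch:

```python
import polars as pl

df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2]})

# Same result as the sequential path, but each list is evaluated on the thread pool.
out = df.with_column(
    pl.concat_list(["a", "b"])
    .arr.eval(pl.first().rank(), parallel=True)
    .alias("rank")
)
```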
