Skip to content

Commit

Permalink
feat(rust, python): Impl any/all for array type (#13250)
Browse files Browse the repository at this point in the history
  • Loading branch information
reswqa committed Dec 27, 2023
1 parent a34a4f0 commit 675e506
Show file tree
Hide file tree
Showing 17 changed files with 281 additions and 9 deletions.
1 change: 1 addition & 0 deletions crates/polars-lazy/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ serde = [
fused = ["polars-plan/fused", "polars-ops/fused"]
list_sets = ["polars-plan/list_sets", "polars-ops/list_sets"]
list_any_all = ["polars-ops/list_any_all", "polars-plan/list_any_all"]
array_any_all = ["polars-ops/array_any_all", "polars-plan/array_any_all", "dtype-array"]
list_drop_nulls = ["polars-ops/list_drop_nulls", "polars-plan/list_drop_nulls"]
list_sample = ["polars-ops/list_sample", "polars-plan/list_sample"]
cutqcut = ["polars-plan/cutqcut", "polars-ops/cutqcut"]
Expand Down
1 change: 1 addition & 0 deletions crates/polars-ops/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ cross_join = []
chunked_ids = ["polars-core/chunked_ids"]
asof_join = ["polars-core/asof_join"]
semi_anti_join = []
array_any_all = ["dtype-array"]
list_gather = []
list_sets = []
list_any_all = []
Expand Down
54 changes: 54 additions & 0 deletions crates/polars-ops/src/chunked_array/array/any_all.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
use arrow::array::{BooleanArray, FixedSizeListArray};
use arrow::bitmap::MutableBitmap;
use arrow::legacy::utils::CustomIterTools;

use super::*;

/// Builds a boolean array of `len` rows that all hold `value`, carrying `validity`.
fn constant_boolean_array(
    len: usize,
    value: bool,
    validity: Option<arrow::bitmap::Bitmap>,
) -> BooleanArray {
    let mut bits = MutableBitmap::with_capacity(len);
    bits.extend_constant(len, value);
    BooleanArray::from_data_default(bits.into(), None).with_validity(validity)
}

/// Reduces every sub-array of a boolean `FixedSizeListArray` to a single boolean.
///
/// `op` is evaluated on the slice of inner values that backs each sub-array and
/// its result becomes the output value for that row. The outer validity of `arr`
/// is copied onto the result, so a null sub-array yields a null output row.
///
/// `is_all` tells the function that `op` is an "all" reduction, which enables a
/// shortcut when `all` already holds for the complete inner values array.
///
/// # Errors
///
/// Returns a `ComputeError` when the inner values are not of boolean type.
fn array_all_any<F>(arr: &FixedSizeListArray, op: F, is_all: bool) -> PolarsResult<BooleanArray>
where
    F: Fn(&BooleanArray) -> bool,
{
    let values = arr.values();

    polars_ensure!(values.data_type() == &ArrowDataType::Boolean, ComputeError: "expected boolean elements in array");

    // The data type was verified just above, so the downcast cannot fail.
    let values = values.as_any().downcast_ref::<BooleanArray>().unwrap();
    let validity = arr.validity().cloned();

    // Fast path: if `all` holds for the whole inner array it holds for every
    // sub-array as well, so no per-row slicing is needed.
    if is_all && arrow::compute::boolean::all(values) {
        return Ok(constant_boolean_array(arr.len(), true, validity));
    }

    let width = arr.size();
    if width == 0 {
        // `step_by(0)` panics, so zero-width sub-arrays are handled up front.
        // Every sub-array is empty, hence `op` is evaluated once on an empty
        // slice and that answer is broadcast to all rows.
        let fill = op(&values.clone().sliced(0, 0));
        return Ok(constant_boolean_array(arr.len(), fill, validity));
    }

    let iter = (0..values.len()).step_by(width).map(|start| {
        // SAFETY: `start + width` is in bounds, guarded by the invariant of
        // FixedSizeListArray.
        let val = unsafe { values.clone().sliced_unchecked(start, width) };
        op(&val)
    });

    Ok(BooleanArray::from_trusted_len_values_iter(
        // SAFETY: `op` is evaluated once per sub-array, so the iterator yields
        // exactly `arr.len()` items.
        unsafe { iter.trust_my_length(arr.len()) },
    )
    .with_validity(validity))
}

/// Computes, per sub-array, whether all boolean values are true.
///
/// Each chunk of `ca` is reduced with `array_all_any` and the resulting boolean
/// chunks are assembled into a series that keeps the name of `ca`.
pub(super) fn array_all(ca: &ArrayChunked) -> PolarsResult<Series> {
    let reduced = ca
        .downcast_iter()
        .map(|chunk| array_all_any(chunk, arrow::compute::boolean::all, true));
    let out = BooleanChunked::try_from_chunk_iter(ca.name(), reduced)?;
    Ok(out.into_series())
}

/// Computes, per sub-array, whether any boolean value is true.
///
/// Each chunk of `ca` is reduced with `array_all_any` and the resulting boolean
/// chunks are assembled into a series that keeps the name of `ca`.
pub(super) fn array_any(ca: &ArrayChunked) -> PolarsResult<Series> {
    let reduced = ca
        .downcast_iter()
        .map(|chunk| array_all_any(chunk, arrow::compute::boolean::any, false));
    let out = BooleanChunked::try_from_chunk_iter(ca.name(), reduced)?;
    Ok(out.into_series())
}
2 changes: 2 additions & 0 deletions crates/polars-ops/src/chunked_array/array/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#[cfg(feature = "array_any_all")]
mod any_all;
mod min_max;
mod namespace;
mod sum_mean;
Expand Down
14 changes: 14 additions & 0 deletions crates/polars-ops/src/chunked_array/array/namespace.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
use super::min_max::AggType;
use super::*;
use crate::chunked_array::array::sum_mean::sum_with_nulls;
#[cfg(feature = "array_any_all")]
use crate::prelude::array::any_all::{array_all, array_any};
use crate::prelude::array::sum_mean::sum_array_numerical;

pub fn has_inner_nulls(ca: &ArrayChunked) -> bool {
Expand Down Expand Up @@ -51,6 +53,18 @@ pub trait ArrayNameSpace: AsArray {
let ca = self.as_array();
ca.try_apply_amortized(|s| s.as_ref().unique_stable())
}

/// Evaluates whether any boolean value is true for every sub-array.
///
/// Forwards the underlying array chunked array to the `any_all::array_any` kernel.
#[cfg(feature = "array_any_all")]
fn array_any(&self) -> PolarsResult<Series> {
    array_any(self.as_array())
}

/// Evaluates whether all boolean values are true for every sub-array.
///
/// Forwards the underlying array chunked array to the `any_all::array_all` kernel.
#[cfg(feature = "array_any_all")]
fn array_all(&self) -> PolarsResult<Series> {
    array_all(self.as_array())
}
}

impl ArrayNameSpace for ArrayChunked {}
15 changes: 7 additions & 8 deletions crates/polars-ops/src/chunked_array/list/any_all.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,19 @@ where
let validity = arr.validity().cloned();

// Fast path where all values set (all is free).
let all_set = arrow::compute::boolean::all(values);
if all_set && is_all {
let mut bits = MutableBitmap::with_capacity(arr.len());
bits.extend_constant(arr.len(), true);
return Ok(BooleanArray::from_data_default(bits.into(), None).with_validity(validity));
if is_all {
let all_set = arrow::compute::boolean::all(values);
if all_set {
let mut bits = MutableBitmap::with_capacity(arr.len());
bits.extend_constant(arr.len(), true);
return Ok(BooleanArray::from_data_default(bits.into(), None).with_validity(validity));
}
}

let mut start = offsets[0] as usize;
let iter = offsets[1..].iter().map(|&end| {
let end = end as usize;
let len = end - start;
// TODO!
// We can speed this up if the boolean array doesn't have nulls
// Then we can work directly on the byte slice.
let val = unsafe { values.clone().sliced_unchecked(start, len) };
start = end;
op(&val)
Expand Down
1 change: 1 addition & 0 deletions crates/polars-plan/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ cse = []
propagate_nans = ["polars-ops/propagate_nans"]
coalesce = []
fused = ["polars-ops/fused"]
array_any_all = ["polars-ops/array_any_all", "dtype-array"]
list_sets = ["polars-ops/list_sets"]
list_any_all = ["polars-ops/list_any_all"]
list_drop_nulls = ["polars-ops/list_drop_nulls"]
Expand Down
14 changes: 14 additions & 0 deletions crates/polars-plan/src/dsl/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,18 @@ impl ArrayNameSpace {
self.0
.map_private(FunctionExpr::ArrayExpr(ArrayFunction::ToList))
}

/// Evaluate whether all boolean values are true for every subarray.
#[cfg(feature = "array_any_all")]
pub fn all(self) -> Expr {
    let function = FunctionExpr::ArrayExpr(ArrayFunction::All);
    self.0.map_private(function)
}

/// Evaluate whether any boolean value is true for every subarray.
#[cfg(feature = "array_any_all")]
pub fn any(self) -> Expr {
    let function = FunctionExpr::ArrayExpr(ArrayFunction::Any);
    self.0.map_private(function)
}
}
24 changes: 24 additions & 0 deletions crates/polars-plan/src/dsl/function_expr/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ pub enum ArrayFunction {
Sum,
ToList,
Unique(bool),
#[cfg(feature = "array_any_all")]
Any,
#[cfg(feature = "array_any_all")]
All,
}

impl ArrayFunction {
Expand All @@ -21,6 +25,8 @@ impl ArrayFunction {
Sum => mapper.nested_sum_type(),
ToList => mapper.try_map_dtype(map_array_dtype_to_list_dtype),
Unique(_) => mapper.try_map_dtype(map_array_dtype_to_list_dtype),
#[cfg(feature = "array_any_all")]
Any | All => mapper.with_dtype(DataType::Boolean),
}
}
}
Expand All @@ -42,6 +48,10 @@ impl Display for ArrayFunction {
Sum => "sum",
ToList => "to_list",
Unique(_) => "unique",
#[cfg(feature = "array_any_all")]
Any => "any",
#[cfg(feature = "array_any_all")]
All => "all",
};
write!(f, "arr.{name}")
}
Expand All @@ -56,6 +66,10 @@ impl From<ArrayFunction> for SpecialEq<Arc<dyn SeriesUdf>> {
Sum => map!(sum),
ToList => map!(to_list),
Unique(stable) => map!(unique, stable),
#[cfg(feature = "array_any_all")]
Any => map!(any),
#[cfg(feature = "array_any_all")]
All => map!(all),
}
}
}
Expand Down Expand Up @@ -86,3 +100,13 @@ pub(super) fn to_list(s: &Series) -> PolarsResult<Series> {
let list_dtype = map_array_dtype_to_list_dtype(s.dtype())?;
s.cast(&list_dtype)
}

/// Series-level entry point for `arr.any`.
///
/// Fails if `s` cannot be viewed as an array series; otherwise delegates to
/// `array_any` on the array chunked array.
#[cfg(feature = "array_any_all")]
pub(super) fn any(s: &Series) -> PolarsResult<Series> {
    let ca = s.array()?;
    ca.array_any()
}

/// Series-level entry point for `arr.all`.
///
/// Fails if `s` cannot be viewed as an array series; otherwise delegates to
/// `array_all` on the array chunked array.
#[cfg(feature = "array_any_all")]
pub(super) fn all(s: &Series) -> PolarsResult<Series> {
    let ca = s.array()?;
    ca.array_all()
}
1 change: 1 addition & 0 deletions crates/polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ streaming = ["polars-lazy?/streaming"]
fused = ["polars-ops/fused", "polars-lazy?/fused"]
list_sets = ["polars-lazy?/list_sets"]
list_any_all = ["polars-lazy?/list_any_all"]
array_any_all = ["polars-lazy?/array_any_all", "dtype-array"]
list_drop_nulls = ["polars-lazy?/list_drop_nulls"]
list_sample = ["polars-lazy?/list_sample"]
cutqcut = ["polars-lazy?/cutqcut"]
Expand Down
2 changes: 2 additions & 0 deletions py-polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ list_count = ["polars/list_count"]
binary_encoding = ["polars/binary_encoding"]
list_sets = ["polars-lazy/list_sets"]
list_any_all = ["polars/list_any_all"]
array_any_all = ["polars/array_any_all", "polars/dtype-array"]
list_drop_nulls = ["polars/list_drop_nulls"]
list_sample = ["polars/list_sample"]
cutqcut = ["polars/cutqcut"]
Expand All @@ -161,6 +162,7 @@ dtypes = [
]

operations = [
"array_any_all",
"is_in",
"repeat_by",
"trigonometry",
Expand Down
2 changes: 2 additions & 0 deletions py-polars/docs/source/reference/expressions/array.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,5 @@ The following methods are available under the `expr.arr` attribute.
Expr.arr.sum
Expr.arr.to_list
Expr.arr.unique
Expr.arr.all
Expr.arr.any
2 changes: 2 additions & 0 deletions py-polars/docs/source/reference/series/array.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,5 @@ The following methods are available under the `Series.arr` attribute.
Series.arr.sum
Series.arr.to_list
Series.arr.unique
Series.arr.all
Series.arr.any
70 changes: 70 additions & 0 deletions py-polars/polars/expr/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,73 @@ def to_list(self) -> Expr:
"""
return wrap_expr(self._pyexpr.arr_to_list())

def any(self) -> Expr:
    """
    Evaluate whether any boolean value is true for every subarray.

    Returns
    -------
    Expr
        Expression of data type :class:`Boolean`.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     data={
    ...         "a": [
    ...             [True, True],
    ...             [False, True],
    ...             [False, False],
    ...             [None, None],
    ...             None,
    ...         ]
    ...     },
    ...     schema={"a": pl.Array(pl.Boolean, 2)},
    ... )
    >>> df.with_columns(any=pl.col("a").arr.any())
    shape: (5, 2)
    ┌────────────────┬───────┐
    │ a              ┆ any   │
    │ ---            ┆ ---   │
    │ array[bool, 2] ┆ bool  │
    ╞════════════════╪═══════╡
    │ [true, true]   ┆ true  │
    │ [false, true]  ┆ true  │
    │ [false, false] ┆ false │
    │ [null, null]   ┆ false │
    │ null           ┆ null  │
    └────────────────┴───────┘
    """
    return wrap_expr(self._pyexpr.arr_any())

def all(self) -> Expr:
    """
    Evaluate whether all boolean values are true for every subarray.

    Returns
    -------
    Expr
        Expression of data type :class:`Boolean`.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     data={
    ...         "a": [
    ...             [True, True],
    ...             [False, True],
    ...             [False, False],
    ...             [None, None],
    ...             None,
    ...         ]
    ...     },
    ...     schema={"a": pl.Array(pl.Boolean, 2)},
    ... )
    >>> df.with_columns(all=pl.col("a").arr.all())
    shape: (5, 2)
    ┌────────────────┬───────┐
    │ a              ┆ all   │
    │ ---            ┆ ---   │
    │ array[bool, 2] ┆ bool  │
    ╞════════════════╪═══════╡
    │ [true, true]   ┆ true  │
    │ [false, true]  ┆ false │
    │ [false, false] ┆ false │
    │ [null, null]   ┆ true  │
    │ null           ┆ null  │
    └────────────────┴───────┘
    """
    return wrap_expr(self._pyexpr.arr_all())
56 changes: 56 additions & 0 deletions py-polars/polars/series/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,59 @@ def to_list(self) -> Series:
]
"""

def any(self) -> Series:
    """
    Evaluate whether any boolean value is true for every subarray.

    Returns
    -------
    Series
        Series of data type :class:`Boolean`.

    Examples
    --------
    >>> s = pl.Series(
    ...     [[True, True], [False, True], [False, False], [None, None], None],
    ...     dtype=pl.Array(pl.Boolean, 2),
    ... )
    >>> s.arr.any()
    shape: (5,)
    Series: '' [bool]
    [
            true
            true
            false
            false
            null
    ]
    """

def all(self) -> Series:
    """
    Evaluate whether all boolean values are true for every subarray.

    Returns
    -------
    Series
        Series of data type :class:`Boolean`.

    Examples
    --------
    >>> s = pl.Series(
    ...     [[True, True], [False, True], [False, False], [None, None], None],
    ...     dtype=pl.Array(pl.Boolean, 2),
    ... )
    >>> s.arr.all()
    shape: (5,)
    Series: '' [bool]
    [
            true
            false
            false
            true
            null
    ]
    """
8 changes: 8 additions & 0 deletions py-polars/src/expr/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,12 @@ impl PyExpr {
fn arr_to_list(&self) -> Self {
self.inner.clone().arr().to_list().into()
}

/// Builds the `arr.all()` expression from a clone of the wrapped expression.
fn arr_all(&self) -> Self {
    let expr = self.inner.clone();
    expr.arr().all().into()
}

/// Builds the `arr.any()` expression from a clone of the wrapped expression.
fn arr_any(&self) -> Self {
    let expr = self.inner.clone();
    expr.arr().any().into()
}
}
Loading

0 comments on commit 675e506

Please sign in to comment.