-
-
Notifications
You must be signed in to change notification settings - Fork 1.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(rust, python): Impl any/all for array type #13250
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
use arrow::array::{BooleanArray, FixedSizeListArray}; | ||
use arrow::bitmap::MutableBitmap; | ||
use arrow::legacy::utils::CustomIterTools; | ||
|
||
use super::*; | ||
|
||
fn array_all_any<F>(arr: &FixedSizeListArray, op: F, is_all: bool) -> PolarsResult<BooleanArray> | ||
where | ||
F: Fn(&BooleanArray) -> bool, | ||
{ | ||
let values = arr.values(); | ||
|
||
polars_ensure!(values.data_type() == &ArrowDataType::Boolean, ComputeError: "expected boolean elements in array"); | ||
|
||
let values = values.as_any().downcast_ref::<BooleanArray>().unwrap(); | ||
let validity = arr.validity().cloned(); | ||
|
||
// Fast path where all values set (all is free). | ||
if is_all { | ||
let all_set = arrow::compute::boolean::all(values); | ||
if all_set { | ||
let mut bits = MutableBitmap::with_capacity(arr.len()); | ||
bits.extend_constant(arr.len(), true); | ||
return Ok(BooleanArray::from_data_default(bits.into(), None).with_validity(validity)); | ||
} | ||
} | ||
|
||
let len = arr.size(); | ||
let iter = (0..values.len()).step_by(len).map(|start| { | ||
// SAFETY: start + len is in bound guarded by invariant of FixedSizeListArray | ||
let val = unsafe { values.clone().sliced_unchecked(start, len) }; | ||
op(&val) | ||
}); | ||
|
||
Ok(BooleanArray::from_trusted_len_values_iter( | ||
// SAFETY: we evaluate for every sub-array, the length is equals to arr.len(). | ||
unsafe { iter.trust_my_length(arr.len()) }, | ||
) | ||
.with_validity(validity)) | ||
} | ||
|
||
pub(super) fn array_all(ca: &ArrayChunked) -> PolarsResult<Series> { | ||
let chunks = ca | ||
.downcast_iter() | ||
.map(|arr| array_all_any(arr, arrow::compute::boolean::all, true)); | ||
Ok(BooleanChunked::try_from_chunk_iter(ca.name(), chunks)?.into_series()) | ||
} | ||
|
||
pub(super) fn array_any(ca: &ArrayChunked) -> PolarsResult<Series> { | ||
let chunks = ca | ||
.downcast_iter() | ||
.map(|arr| array_all_any(arr, arrow::compute::boolean::any, false)); | ||
Ok(BooleanChunked::try_from_chunk_iter(ca.name(), chunks)?.into_series()) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,5 @@ | ||
#[cfg(feature = "array_any_all")] | ||
mod any_all; | ||
mod min_max; | ||
mod namespace; | ||
mod sum_mean; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,20 +16,19 @@ where | |
let validity = arr.validity().cloned(); | ||
|
||
// Fast path where all values set (all is free). | ||
let all_set = arrow::compute::boolean::all(values); | ||
if all_set && is_all { | ||
let mut bits = MutableBitmap::with_capacity(arr.len()); | ||
bits.extend_constant(arr.len(), true); | ||
return Ok(BooleanArray::from_data_default(bits.into(), None).with_validity(validity)); | ||
if is_all { | ||
let all_set = arrow::compute::boolean::all(values); | ||
if all_set { | ||
let mut bits = MutableBitmap::with_capacity(arr.len()); | ||
bits.extend_constant(arr.len(), true); | ||
return Ok(BooleanArray::from_data_default(bits.into(), None).with_validity(validity)); | ||
} | ||
} | ||
|
||
let mut start = offsets[0] as usize; | ||
let iter = offsets[1..].iter().map(|&end| { | ||
let end = end as usize; | ||
let len = end - start; | ||
// TODO! | ||
// We can speed this upp if the boolean array doesn't have nulls | ||
// Then we can work directly on the byte slice. | ||
Comment on lines -30 to -32
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IIUC, this But looks at pub fn any(array: &BooleanArray) -> bool {
if array.is_empty() {
false
} else if array.null_count() > 0 {
array.into_iter().any(|v| v == Some(true))
} else {
let vals = array.values();
vals.unset_bits() != vals.len()
}
} If I missed something else, feel free to point it out. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Slicing an Arrow Array involves […]. This is something we can leave as a TODO, and maybe do in another PR. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for the detailed explanation, makes sense! Will find the time to do this optimization. :) |
||
let val = unsafe { values.clone().sliced_unchecked(start, len) }; | ||
start = end; | ||
op(&val) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In the case of `any`, this calculation should be avoided.