From a8c97380f99a990058f77b2306cef8ff9ce30ecf Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 8 Apr 2024 18:20:19 +0200 Subject: [PATCH] feat: Support list group-by of non numeric lists (#15540) --- .../src/legacy/kernels/list_bytes_iter.rs | 56 ---------------- crates/polars-arrow/src/legacy/kernels/mod.rs | 1 - .../src/frame/group_by/into_groups.rs | 66 +++---------------- .../polars-core/src/hashing/vector_hasher.rs | 54 --------------- .../src/series/implementations/list.rs | 19 ------ crates/polars-ops/Cargo.toml | 1 - .../src/series/ops/is_first_distinct.rs | 2 - .../src/series/ops/is_last_distinct.rs | 2 - crates/polars/Cargo.toml | 1 - crates/polars/src/lib.rs | 1 - docs/user-guide/installation.md | 1 - py-polars/Cargo.toml | 1 - py-polars/tests/unit/operations/test_join.py | 20 ++++++ 13 files changed, 28 insertions(+), 197 deletions(-) delete mode 100644 crates/polars-arrow/src/legacy/kernels/list_bytes_iter.rs diff --git a/crates/polars-arrow/src/legacy/kernels/list_bytes_iter.rs b/crates/polars-arrow/src/legacy/kernels/list_bytes_iter.rs deleted file mode 100644 index a477457ee881..000000000000 --- a/crates/polars-arrow/src/legacy/kernels/list_bytes_iter.rs +++ /dev/null @@ -1,56 +0,0 @@ -use polars_error::{polars_bail, polars_ensure, PolarsResult}; - -use crate::array::{ListArray, PrimitiveArray}; -use crate::bitmap::Bitmap; -use crate::datatypes::PhysicalType::Primitive; -use crate::types::NativeType; -use crate::with_match_primitive_type; - -unsafe fn bytes_iter<'a, T: NativeType>( - values: &'a [T], - offsets: &'a [i64], - validity: Option<&'a Bitmap>, -) -> impl ExactSizeIterator> { - let mut start = offsets[0] as usize; - offsets[1..].iter().enumerate().map(move |(i, end)| { - let end = *end as usize; - let out = values.get_unchecked(start..end); - start = end; - - let data = out.as_ptr() as *const u8; - let out = std::slice::from_raw_parts(data, std::mem::size_of_val(out)); - match validity { - None => Some(out), - Some(validity) => { - if validity.get_bit_unchecked(i) { - Some(out) - } else { - None - } - }, - } - }) -} - -pub fn numeric_list_bytes_iter( - arr: &ListArray, -) -> PolarsResult> + '_>> { - let values = arr.values(); - polars_ensure!( - values.null_count() == 0, - ComputeError: "only allowed for child arrays without nulls" - ); - let offsets = arr.offsets().as_slice(); - let validity = arr.validity(); - - if let Primitive(primitive) = values.data_type().to_physical_type() { - with_match_primitive_type!(primitive, |$T| { - let arr: &PrimitiveArray<$T> = values.as_any().downcast_ref().unwrap(); - let values = arr.values(); - let iter = unsafe { bytes_iter(values.as_slice(), offsets, validity) }; - Ok(Box::new(iter)) - }) - } else { - polars_bail!(ComputeError: "only allowed for numeric child arrays"); - } -} diff --git a/crates/polars-arrow/src/legacy/kernels/mod.rs b/crates/polars-arrow/src/legacy/kernels/mod.rs index 22754fb830fc..cdb7cb9c2057 100644 --- a/crates/polars-arrow/src/legacy/kernels/mod.rs +++ b/crates/polars-arrow/src/legacy/kernels/mod.rs @@ -10,7 +10,6 @@ pub mod fixed_size_list; pub mod float; #[cfg(feature = "compute_take")] pub mod list; -pub mod list_bytes_iter; pub mod pow; pub mod rolling; pub mod set; diff --git a/crates/polars-core/src/frame/group_by/into_groups.rs b/crates/polars-core/src/frame/group_by/into_groups.rs index f12779c52819..4c074c3b9cff 100644 --- a/crates/polars-core/src/frame/group_by/into_groups.rs +++ b/crates/polars-core/src/frame/group_by/into_groups.rs @@ -1,10 +1,9 @@ -#[cfg(feature = "group_by_list")] -use arrow::legacy::kernels::list_bytes_iter::numeric_list_bytes_iter; use arrow::legacy::kernels::sort_partition::{create_clean_partitions, partition_to_groups}; use polars_utils::total_ord::{ToTotalOrd, TotalHash}; use super::*; use crate::config::verbose; +use crate::prelude::sort::arg_sort_multiple::_get_rows_encoded_ca_unordered; use crate::utils::_split_offsets; use crate::utils::flatten::flatten_par; @@ -368,63 +367,14 @@ impl IntoGroupsProxy for ListChunked { #[allow(clippy::needless_lifetimes)] #[allow(unused_variables)] fn group_tuples<'a>(&'a self, multithreaded: bool, sorted: bool) -> PolarsResult { - #[cfg(feature = "group_by_list")] - { - polars_ensure!( - self.inner_dtype().to_physical().is_numeric(), - ComputeError: "grouping on list type is only allowed if the inner type is numeric" - ); - - let hb = RandomState::default(); - let null_h = get_null_hash_value(&hb); - - let arr_to_hashes = |ca: &ListChunked| { - let mut out = Vec::with_capacity(ca.len()); - - for arr in ca.downcast_iter() { - out.extend(numeric_list_bytes_iter(arr)?.map(|opt_bytes| { - let hash = match opt_bytes { - Some(s) => hb.hash_one(s), - None => null_h, - }; - - // SAFETY: - // the underlying data is tied to self - unsafe { - std::mem::transmute::, BytesHash<'a>>(BytesHash::new( - opt_bytes, hash, - )) - } - })) - } - Ok(out) - }; + let by = &[self.clone().into_series()]; + let ca = if multithreaded { + encode_rows_vertical_par_unordered(by).unwrap() + } else { + _get_rows_encoded_ca_unordered("", by).unwrap() + }; - if multithreaded { - let n_partitions = _set_partition_size(); - let split = _split_offsets(self.len(), n_partitions); - - let groups: PolarsResult<_> = POOL.install(|| { - let bytes_hashes = split - .into_par_iter() - .map(|(offset, len)| { - let ca = self.slice(offset as i64, len); - arr_to_hashes(&ca) - }) - .collect::>>()?; - let bytes_hashes = bytes_hashes.iter().collect::>(); - Ok(group_by_threaded_slice(bytes_hashes, n_partitions, sorted)) - }); - groups - } else { - let hashes = arr_to_hashes(self)?; - Ok(group_by(hashes.iter(), sorted)) - } - } - #[cfg(not(feature = "group_by_list"))] - { - panic!("activate 'group_by_list' feature") - } + ca.group_tuples(multithreaded, sorted) } } diff --git a/crates/polars-core/src/hashing/vector_hasher.rs b/crates/polars-core/src/hashing/vector_hasher.rs index dc20a2baa19d..5bdf8d126bd6 100644 --- a/crates/polars-core/src/hashing/vector_hasher.rs +++ b/crates/polars-core/src/hashing/vector_hasher.rs @@ -1,6 +1,4 @@ use arrow::bitmap::utils::get_bit_unchecked; -#[cfg(feature = "group_by_list")] -use arrow::legacy::kernels::list_bytes_iter::numeric_list_bytes_iter; use polars_utils::total_ord::{ToTotalOrd, TotalHash}; use rayon::prelude::*; use xxhash_rust::xxh3::xxh3_64_with_seed; @@ -379,58 +377,6 @@ impl VecHash for BooleanChunked { } } -#[cfg(feature = "group_by_list")] -impl VecHash for ListChunked { - fn vec_hash(&self, _random_state: RandomState, _buf: &mut Vec) -> PolarsResult<()> { - polars_ensure!( - self.inner_dtype().to_physical().is_numeric(), - ComputeError: "grouping on list type is only allowed if the inner type is numeric" - ); - _buf.clear(); - _buf.reserve(self.len()); - let null_h = get_null_hash_value(&_random_state); - - for arr in self.downcast_iter() { - _buf.extend( - numeric_list_bytes_iter(arr)?.map(|opt_bytes| match opt_bytes { - Some(s) => xxh3_64_with_seed(s, null_h), - None => null_h, - }), - ) - } - Ok(()) - } - - fn vec_hash_combine( - &self, - _random_state: RandomState, - _hashes: &mut [u64], - ) -> PolarsResult<()> { - polars_ensure!( - self.inner_dtype().to_physical().is_numeric(), - ComputeError: "grouping on list type is only allowed if the inner type is numeric" - ); - - let null_h = get_null_hash_value(&_random_state); - - let mut offset = 0; - self.downcast_iter().try_for_each(|arr| { - numeric_list_bytes_iter(arr)? - .zip(&mut _hashes[offset..]) - .for_each(|(opt_bytes, h)| { - let l = match opt_bytes { - Some(s) => xxh3_64_with_seed(s, null_h), - None => null_h, - }; - *h = _boost_hash_combine(l, *h) - }); - offset += arr.len(); - PolarsResult::Ok(()) - })?; - Ok(()) - } -} - #[cfg(feature = "object")] impl VecHash for ObjectChunked where diff --git a/crates/polars-core/src/series/implementations/list.rs b/crates/polars-core/src/series/implementations/list.rs index da0c2ce27366..f5a02973ce50 100644 --- a/crates/polars-core/src/series/implementations/list.rs +++ b/crates/polars-core/src/series/implementations/list.rs @@ -44,22 +44,6 @@ impl private::PrivateSeries for SeriesWrap { IntoGroupsProxy::group_tuples(&self.0, multithreaded, sorted) } - #[cfg(feature = "group_by_list")] - fn vec_hash(&self, _build_hasher: RandomState, _buf: &mut Vec) -> PolarsResult<()> { - self.0.vec_hash(_build_hasher, _buf)?; - Ok(()) - } - - #[cfg(feature = "group_by_list")] - fn vec_hash_combine( - &self, - _build_hasher: RandomState, - _hashes: &mut [u64], - ) -> PolarsResult<()> { - self.0.vec_hash_combine(_build_hasher, _hashes)?; - Ok(()) - } - fn into_total_eq_inner<'a>(&'a self) -> Box { (&self.0).into_total_eq_inner() } @@ -154,7 +138,6 @@ impl SeriesTrait for SeriesWrap { self.0.has_validity() } - #[cfg(feature = "group_by_list")] #[cfg(feature = "algorithm_group_by")] fn unique(&self) -> PolarsResult { if !self.inner_dtype().is_numeric() { @@ -171,7 +154,6 @@ impl SeriesTrait for SeriesWrap { Ok(unsafe { self.0.clone().into_series().agg_first(&groups?) }) } - #[cfg(feature = "group_by_list")] #[cfg(feature = "algorithm_group_by")] fn n_unique(&self) -> PolarsResult { if !self.inner_dtype().is_numeric() { @@ -189,7 +171,6 @@ impl SeriesTrait for SeriesWrap { } } - #[cfg(feature = "group_by_list")] #[cfg(feature = "algorithm_group_by")] fn arg_unique(&self) -> PolarsResult { if !self.inner_dtype().is_numeric() { diff --git a/crates/polars-ops/Cargo.toml b/crates/polars-ops/Cargo.toml index 19eb92154fe3..bf67349c7cd8 100644 --- a/crates/polars-ops/Cargo.toml +++ b/crates/polars-ops/Cargo.toml @@ -104,7 +104,6 @@ extract_jsonpath = ["serde_json", "jsonpath_lib", "polars-json"] log = [] hash = [] reinterpret = ["polars-core/reinterpret"] -group_by_list = ["polars-core/group_by_list"] rolling_window = ["polars-core/rolling_window"] moment = [] mode = [] diff --git a/crates/polars-ops/src/series/ops/is_first_distinct.rs b/crates/polars-ops/src/series/ops/is_first_distinct.rs index b75ae23dba1f..de5ce5b7a8a6 100644 --- a/crates/polars-ops/src/series/ops/is_first_distinct.rs +++ b/crates/polars-ops/src/series/ops/is_first_distinct.rs @@ -88,7 +88,6 @@ fn is_first_distinct_struct(s: &Series) -> PolarsResult { Ok(BooleanChunked::with_chunk(s.name(), arr)) } -#[cfg(feature = "group_by_list")] fn is_first_distinct_list(ca: &ListChunked) -> PolarsResult { let groups = ca.group_tuples(true, false)?; let first = groups.take_group_firsts(); @@ -136,7 +135,6 @@ pub fn is_first_distinct(s: &Series) -> PolarsResult { }, #[cfg(feature = "dtype-struct")] Struct(_) => return is_first_distinct_struct(&s), - #[cfg(feature = "group_by_list")] List(inner) if inner.is_numeric() => { let ca = s.list().unwrap(); return is_first_distinct_list(ca); diff --git a/crates/polars-ops/src/series/ops/is_last_distinct.rs b/crates/polars-ops/src/series/ops/is_last_distinct.rs index 40d57d438151..e110ed7f0421 100644 --- a/crates/polars-ops/src/series/ops/is_last_distinct.rs +++ b/crates/polars-ops/src/series/ops/is_last_distinct.rs @@ -40,7 +40,6 @@ pub fn is_last_distinct(s: &Series) -> PolarsResult { }, #[cfg(feature = "dtype-struct")] Struct(_) => return is_last_distinct_struct(&s), - #[cfg(feature = "group_by_list")] List(inner) if inner.is_numeric() => { let ca = s.list().unwrap(); return is_last_distinct_list(ca); @@ -157,7 +156,6 @@ fn is_last_distinct_struct(s: &Series) -> PolarsResult { Ok(BooleanChunked::with_chunk(s.name(), arr)) } -#[cfg(feature = "group_by_list")] fn is_last_distinct_list(ca: &ListChunked) -> PolarsResult { let groups = ca.group_tuples(true, false)?; // SAFETY: all groups have at least a single member diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml index 027213677801..60c8429201d7 100644 --- a/crates/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -160,7 +160,6 @@ extract_jsonpath = [ ] find_many = ["polars-plan/find_many"] fused = ["polars-ops/fused", "polars-lazy?/fused"] -group_by_list = ["polars-core/group_by_list", "polars-ops/group_by_list"] interpolate = ["polars-ops/interpolate", "polars-lazy?/interpolate"] is_between = ["polars-lazy?/is_between", "polars-ops/is_between"] is_first_distinct = ["polars-lazy?/is_first_distinct", "polars-ops/is_first_distinct"] diff --git a/crates/polars/src/lib.rs b/crates/polars/src/lib.rs index 3f0e84b3f9b5..1d7f43b8db36 100644 --- a/crates/polars/src/lib.rs +++ b/crates/polars/src/lib.rs @@ -225,7 +225,6 @@ //! - `asof_join` - Join ASOF, to join on nearest keys instead of exact equality match. //! - `cross_join` - Create the Cartesian product of two [`DataFrame`]s. //! - `semi_anti_join` - SEMI and ANTI joins. -//! - `group_by_list` - Allow group_by operation on keys of type List. //! - `row_hash` - Utility to hash [`DataFrame`] rows to [`UInt64Chunked`] //! - `diagonal_concat` - Concat diagonally thereby combining different schemas. //! - `dataframe_arithmetic` - Arithmetic on ([`Dataframe`] and [`DataFrame`]s) and ([`DataFrame`] on [`Series`]) diff --git a/docs/user-guide/installation.md b/docs/user-guide/installation.md index 83aa684bf92e..56c3276bf742 100644 --- a/docs/user-guide/installation.md +++ b/docs/user-guide/installation.md @@ -125,7 +125,6 @@ The opt-in features are: - `join_asof` - Join ASOF, to join on nearest keys instead of exact equality match. - `cross_join` - Create the Cartesian product of two DataFrames. - `semi_anti_join` - SEMI and ANTI joins. - - `group_by_list` - Allow group by operation on keys of type List. - `row_hash` - Utility to hash DataFrame rows to UInt64Chunked - `diagonal_concat` - Concat diagonally thereby combining different schemas. - `dataframe_arithmetic` - Arithmetic on (Dataframe and DataFrames) and (DataFrame on Series) diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index e8f80d83dc06..ed87aa1d7ba6 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -163,7 +163,6 @@ dtypes = [ "dtype-i16", "dtype-u8", "dtype-u16", - "polars/group_by_list", "object", ] diff --git a/py-polars/tests/unit/operations/test_join.py b/py-polars/tests/unit/operations/test_join.py index 97a6e6593e6f..2c9e441def3e 100644 --- a/py-polars/tests/unit/operations/test_join.py +++ b/py-polars/tests/unit/operations/test_join.py @@ -815,3 +815,23 @@ def test_join_projection_invalid_name_contains_suffix_15243() -> None: .select(pl.col("b").filter(pl.col("b") == pl.col("foo_right"))) .collect() ) + + +def test_join_list_non_numeric() -> None: + assert ( + pl.DataFrame( + { + "lists": [ + ["a", "b", "c"], + ["a", "c", "b"], + ["a", "c", "b"], + ["a", "c", "d"], + ] + } + ) + ).group_by("lists", maintain_order=True).agg(pl.len().alias("count")).to_dict( + as_series=False + ) == { + "lists": [["a", "b", "c"], ["a", "c", "b"], ["a", "c", "d"]], + "count": [1, 2, 1], + }