Skip to content

Commit

Permalink
add list groupby feature
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Jul 29, 2021
1 parent ff78941 commit 9e6b879
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 1 deletion.
1 change: 1 addition & 0 deletions polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ decompress = ["polars-io/decompress"]
mode = ["polars-core/mode", "polars-lazy/mode"]
take_opt_iter = ["polars-core/take_opt_iter"]
extract_jsonpath = ["polars-core/extract_jsonpath", "polars-core/strings"]
groupby_list = ["polars-core/groupby_list"]

# don't use this
private = []
Expand Down
2 changes: 2 additions & 0 deletions polars/polars-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ reinterpret = []
take_opt_iter = []
mode = []
extract_jsonpath = ["serde_json", "jsonpath_lib"]
# allow groupby operation on list type
groupby_list = []


# opt-in datatypes for Series
Expand Down
10 changes: 9 additions & 1 deletion polars/polars-core/src/frame/groupby/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ use self::hashing::*;
use crate::chunked_array::builder::PrimitiveChunkedBuilder;
use crate::frame::select::Selection;
use crate::prelude::*;
#[cfg(feature = "groupby_list")]
use crate::utils::Wrap;
use crate::utils::{accumulate_dataframes_vertical, set_partition_size, split_ca, NoNull};
use crate::vector_hasher::{AsU64, StrHash};
use crate::POOL;
Expand Down Expand Up @@ -169,7 +171,13 @@ impl IntoGroupTuples for CategoricalChunked {
}
}

// Empty impl: no methods are overridden here, so only the trait's provided
// (default) methods apply to `ListChunked`.
impl IntoGroupTuples for ListChunked {}
impl IntoGroupTuples for ListChunked {
    /// Computes group tuples for a list column.
    ///
    /// Only compiled with the `groupby_list` feature; without it this impl
    /// body is empty and the trait's default method is used instead.
    ///
    /// The `_multithreaded` flag is accepted for interface compatibility but
    /// is not consulted by this implementation.
    #[cfg(feature = "groupby_list")]
    fn group_tuples(&self, _multithreaded: bool) -> GroupTuples {
        // Each optional sub-series is wrapped in `Wrap` before being handed to
        // `groupby`; missing entries stay `None`.
        let keys = self
            .into_iter()
            .map(|maybe_series| maybe_series.map(|series| Wrap(series)));
        groupby(keys)
    }
}

#[cfg(feature = "object")]
impl<T> IntoGroupTuples for ObjectChunked<T>
where
Expand Down
25 changes: 25 additions & 0 deletions polars/polars-core/src/series/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,20 @@ pub mod implementations;
pub(crate) mod iterator;

use crate::chunked_array::{builder::get_list_builder, ChunkIdIter};
#[cfg(feature = "groupby_list")]
use crate::utils::Wrap;
use crate::utils::{split_ca, split_series};
use crate::{series::arithmetic::coerce_lhs_rhs, POOL};
#[cfg(feature = "groupby_list")]
use ahash::RandomState;
use arrow::compute::cast;
use itertools::Itertools;
use num::NumCast;
use rayon::prelude::*;
use std::any::Any;
use std::convert::TryFrom;
#[cfg(feature = "groupby_list")]
use std::hash::{Hash, Hasher};
use std::ops::Deref;
use std::sync::Arc;

Expand Down Expand Up @@ -1163,6 +1169,25 @@ impl<'a> (dyn SeriesTrait + 'a) {
/// Newtype around a reference-counted, dynamically dispatched `SeriesTrait` object.
///
/// `Clone` is derived, so cloning a `Series` clones the inner `Arc` handle.
#[derive(Clone)]
pub struct Series(pub Arc<dyn SeriesTrait>);

/// Equality for wrapped series, available only with the `groupby_list` feature.
///
/// Delegates to `series_equal_missing` on the wrapped `Series`.
#[cfg(feature = "groupby_list")]
impl PartialEq for Wrap<Series> {
    fn eq(&self, other: &Self) -> bool {
        // Unwrap both sides explicitly and compare the inner series.
        let lhs = &self.0;
        let rhs = &other.0;
        lhs.series_equal_missing(rhs)
    }
}

// Marker impl: declares the `PartialEq` relation on `Wrap<Series>` to be a full
// equivalence relation. Only compiled with the `groupby_list` feature.
#[cfg(feature = "groupby_list")]
impl Eq for Wrap<Series> {}

/// Hashing for wrapped series, available only with the `groupby_list` feature.
///
/// The per-element hashes produced by `vec_hash` are collected into a
/// `UInt64Chunked`, reduced with `sum()`, and the result is fed into `state`.
#[cfg(feature = "groupby_list")]
impl Hash for Wrap<Series> {
    fn hash<H: Hasher>(&self, state: &mut H) {
        // Constant seeds: every call builds the same hasher state, so the
        // element hashes do not vary between invocations.
        let fixed_state = RandomState::with_seeds(0, 0, 0, 0);
        let element_hashes = self.0.vec_hash(fixed_state);
        // Reduce all element hashes to one value.
        let combined = UInt64Chunked::new_from_aligned_vec("", element_hashes).sum();
        combined.hash(state);
    }
}

impl Series {
pub(crate) fn get_inner_mut(&mut self) -> &mut dyn SeriesTrait {
if Arc::weak_count(&self.0) + Arc::strong_count(&self.0) != 1 {
Expand Down
1 change: 1 addition & 0 deletions polars/polars-core/src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use rayon::prelude::*;
use std::borrow::Cow;
use std::ops::{Deref, DerefMut};

/// Generic single-field newtype wrapper with a public inner value.
///
/// `#[repr(transparent)]` gives `Wrap<T>` the same layout as `T`.
#[repr(transparent)]
pub struct Wrap<T>(pub T);

impl<T> Deref for Wrap<T> {
Expand Down
1 change: 1 addition & 0 deletions polars/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@
//! - `downsample` - [downsample operation](crate::frame::DataFrame::downsample) on `DataFrame`s
//! - `asof_join` - Join as of, to join on nearest keys instead of exact equality match.
//! - `cross_join` - Create the cartesian product of two DataFrames.
//! - `groupby_list` - Allow groupby operation on keys of type List.
//! * `Series` operations:
//! - `is_in` - [Check for membership in `Series`](crate::chunked_array::ops::IsIn)
//! - `zip_with` - [Zip two Series/ ChunkedArrays](crate::chunked_array::ops::ChunkZip)
Expand Down

0 comments on commit 9e6b879

Please sign in to comment.