Skip to content

Commit

Permalink
fix aggregation of empty list (#3527)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed May 29, 2022
1 parent efa6dac commit 0119e87
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 18 deletions.
2 changes: 1 addition & 1 deletion polars/polars-core/src/chunked_array/builder/list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ impl<'a> AnonymousListBuilder<'a> {
pub fn append_series(&mut self, s: &'a Series) {
// empty arrays tend to be null type and thus differ
// if we would push it the concat would fail.
if s.is_empty() {
if s.is_empty() && matches!(s.dtype(), DataType::Null) {
self.builder.push_empty()
} else {
match s.dtype() {
Expand Down
50 changes: 33 additions & 17 deletions polars/polars-core/src/chunked_array/upstream_traits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -460,21 +460,27 @@ impl FromParallelIterator<Option<Series>> for ListChunked {
I: IntoParallelIterator<Item = Option<Series>>,
{
let mut dtype = None;
let mut vectors = collect_into_linked_list(iter);
let capacity: usize = get_capacity_from_par_results(&vectors);
let mut builder = AnonymousListBuilder::new("collected", capacity, None);
for v in &mut vectors {
for val in v {
if let Some(s) = val {
if dtype.is_none() {
dtype = Some(s.dtype().clone());
}
builder.append_series(s);
} else {
builder.append_null();
}
}
}
let vectors = collect_into_linked_list(iter);

let list_capacity: usize = get_capacity_from_par_results(&vectors);
let value_capacity = vectors
.iter()
.map(|list| {
list.iter()
.map(|opt_s| {
opt_s
.as_ref()
.map(|s| {
if dtype.is_none() && !matches!(s.dtype(), DataType::Null) {
dtype = Some(s.dtype().clone())
}
s.len()
})
.unwrap_or(0)
})
.sum::<usize>()
})
.sum::<usize>();

match &dtype {
#[cfg(feature = "object")]
Expand All @@ -484,7 +490,7 @@ impl FromParallelIterator<Option<Series>> for ListChunked {
.flatten()
.find_map(|opt_s| opt_s.as_ref())
.unwrap();
let mut builder = s.get_list_builder("collected", capacity * 5, capacity);
let mut builder = s.get_list_builder("collected", value_capacity, list_capacity);

for v in vectors {
for val in v {
Expand All @@ -493,7 +499,17 @@ impl FromParallelIterator<Option<Series>> for ListChunked {
}
builder.finish()
}
_ => builder.finish(),
Some(dtype) => {
let mut builder =
get_list_builder(dtype, value_capacity, list_capacity, "collected").unwrap();
for v in &vectors {
for val in v {
builder.append_opt_series(val.as_ref());
}
}
builder.finish()
}
None => ListChunked::full_null_with_dtype("collected", list_capacity, &DataType::Null),
}
}
}
Expand Down
26 changes: 26 additions & 0 deletions py-polars/tests/test_lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,3 +214,29 @@ def test_list_eval_dtype_inference() -> None:
0.6666666865348816,
0.3333333432674408,
]


def test_list_empty_groupby_result_3521() -> None:
# Create a left relation where the join column contains a null value
left = pl.DataFrame().with_columns(
[
pl.lit(1).alias("groupby_column"),
pl.lit(None).cast(pl.Int32).alias("join_column"),
]
)

# Create a right relation where there is a column to count distinct on
right = pl.DataFrame().with_columns(
[
pl.lit(1).alias("join_column"),
pl.lit(1).alias("n_unique_column"),
]
)

# Calculate n_unique after dropping nulls
# This will panic on polars version 0.13.38 and 0.13.39
assert (
left.join(right, on="join_column", how="left")
.groupby("groupby_column")
.agg(pl.col("n_unique_column").drop_nulls())
).to_dict(False) == {"groupby_column": [1], "n_unique_column": [[]]}

0 comments on commit 0119e87

Please sign in to comment.