Skip to content

Commit

Permalink
fix bug in partitioned groupby ordering
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Jan 29, 2022
1 parent 0422873 commit c463228
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 2 deletions.
5 changes: 3 additions & 2 deletions polars/polars-lazy/src/physical_plan/executors/groupby.rs
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ fn run_partitions(
exec: &PartitionGroupByExec,
state: &ExecutionState,
n_threads: usize,
maintain_order: bool,
) -> Result<Vec<DataFrame>> {
// We do a partitioned groupby.
// Meaning that we first do the groupby operation arbitrarily
Expand All @@ -142,7 +143,7 @@ fn run_partitions(
.map(|df| {
let key = exec.key.evaluate(&df, state)?;
let phys_aggs = &exec.phys_aggs;
let gb = df.groupby_with_series(vec![key], false, false)?;
let gb = df.groupby_with_series(vec![key], false, maintain_order)?;
let groups = gb.get_groups();

let mut columns = gb.keys();
Expand Down Expand Up @@ -286,7 +287,7 @@ impl Executor for PartitionGroupByExec {

// Run the partitioned aggregations
let n_threads = POOL.current_num_threads();
let dfs = run_partitions(&original_df, self, state, n_threads)?;
let dfs = run_partitions(&original_df, self, state, n_threads, self.maintain_order)?;

// MERGE phase
// merge and hash aggregate again
Expand Down
8 changes: 8 additions & 0 deletions py-polars/tests/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -1527,6 +1527,14 @@ def test_groupby_order_dispatch() -> None:
assert df.groupby("x", maintain_order=True).agg_list().frame_equal(expected)


def test_partitioned_groupby_order() -> None:
# check if group ordering is maintained.
# we only have 30 groups, so this triggers a partitioned group by
df = pl.DataFrame({"x": [chr(v) for v in range(33, 63)], "y": range(30)})
out = df.groupby("x", maintain_order=True).agg(pl.all().list())
assert out["x"] == df["x"]


def test_schema() -> None:
df = pl.DataFrame(
{"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]}
Expand Down

0 comments on commit c463228

Please sign in to comment.