fix bug in partitioned groupby ordering

pola-rs · Jan 29, 2022 · c463228 · c463228
1 parent 0422873
commit c463228
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 2 deletions.
diff --git a/polars/polars-lazy/src/physical_plan/executors/groupby.rs b/polars/polars-lazy/src/physical_plan/executors/groupby.rs
@@ -131,6 +131,7 @@ fn run_partitions(
     exec: &PartitionGroupByExec,
     state: &ExecutionState,
     n_threads: usize,
+    maintain_order: bool,
 ) -> Result<Vec<DataFrame>> {
     // We do a partitioned groupby.
     // Meaning that we first do the groupby operation arbitrarily
@@ -142,7 +143,7 @@ fn run_partitions(
             .map(|df| {
                 let key = exec.key.evaluate(&df, state)?;
                 let phys_aggs = &exec.phys_aggs;
-                let gb = df.groupby_with_series(vec![key], false, false)?;
+                let gb = df.groupby_with_series(vec![key], false, maintain_order)?;
                 let groups = gb.get_groups();
 
                 let mut columns = gb.keys();
@@ -286,7 +287,7 @@ impl Executor for PartitionGroupByExec {
 
         // Run the partitioned aggregations
         let n_threads = POOL.current_num_threads();
-        let dfs = run_partitions(&original_df, self, state, n_threads)?;
+        let dfs = run_partitions(&original_df, self, state, n_threads, self.maintain_order)?;
 
         // MERGE phase
         // merge and hash aggregate again

diff --git a/py-polars/tests/test_df.py b/py-polars/tests/test_df.py
@@ -1527,6 +1527,14 @@ def test_groupby_order_dispatch() -> None:
     assert df.groupby("x", maintain_order=True).agg_list().frame_equal(expected)
 
 
+def test_partitioned_groupby_order() -> None:
+    # check if group ordering is maintained.
+    # we only have 30 groups, so this triggers a partitioned group by
+    df = pl.DataFrame({"x": [chr(v) for v in range(33, 63)], "y": range(30)})
+    out = df.groupby("x", maintain_order=True).agg(pl.all().list())
+    assert out["x"] == df["x"]
+
+
 def test_schema() -> None:
     df = pl.DataFrame(
         {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]}