diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index 7326d296f22b..679f99a50196 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -3991,7 +3991,10 @@ def map_batches( If set to true this can run in the streaming engine, but may yield incorrect results in group-by. Ensure you know what you are doing! agg_list - Aggregate list. + Aggregate the values of the expression into a list before applying the + function. This parameter only works in a group-by context. + The function will be invoked only once on a list of groups, rather than + once per group. Warnings -------- @@ -4020,6 +4023,46 @@ def map_batches( ╞══════╪════════╡ │ 1 ┆ 0 │ └──────┴────────┘ + + In a group-by context, the `agg_list` parameter can improve performance if used + correctly. The following example has `agg_list` set to `False`, which causes + the function to be applied once per group. The input of the function is a + Series of type `Int64`. Calling the function once per group is less efficient. + + >>> df = pl.DataFrame( + ... { + ... "a": [0, 1, 0, 1], + ... "b": [1, 2, 3, 4], + ... } + ... ) + >>> df.group_by("a").agg( + ... pl.col("b").map_batches(lambda x: x.max(), agg_list=False) + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ list[i64] │ + ╞═════╪═══════════╡ + │ 1 ┆ [4] │ + │ 0 ┆ [3] │ + └─────┴───────────┘ + + Using `agg_list=True` would be more efficient. In this example, the input of + the function is a Series of type `List(Int64)`. + + >>> df.group_by("a").agg( + ... pl.col("b").map_batches(lambda x: x.list.max(), agg_list=True) + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴─────┘ """ if return_dtype is not None: return_dtype = py_type_to_dtype(return_dtype)