From 64f536f0b66712b6d21d41d1e1c9d07e96042a9b Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Thu, 11 Jan 2024 17:14:01 +0800 Subject: [PATCH 1/2] docs(python): Clarify documentation for the `agg_list` argument in `Expr.map_batches` --- py-polars/polars/expr/expr.py | 45 ++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index 7326d296f22b..501f9dd508c1 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -3991,7 +3991,10 @@ def map_batches( If set to true this can run in the streaming engine, but may yield incorrect results in group-by. Ensure you know what you are doing! agg_list - Aggregate list. + Collect groups to a list and then apply. This parameter only works for + group-by context. + If set to true, the function is invoked only once on a list of groups. + Otherwise, the function is invoked per-group. Warnings -------- @@ -4020,6 +4023,46 @@ def map_batches( ╞══════╪════════╡ │ 1 ┆ 0 │ └──────┴────────┘ + + >>> df = pl.DataFrame( + ... { + ... "a": [0, 1, 0, 1], + ... "b": [1, 2, 3, 4], + ... } + ... ) + + The function is applied per-group, and the input of the function is a + Series[i64]. + >>> ( + ... df.group_by("a").agg( + ... pl.col("b").map_batches(lambda x: x.max(), agg_list=False) + ... ) + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬───────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ list[i64] │ + ╞═════╪═══════════╡ + │ 1 ┆ [4] │ + │ 0 ┆ [3] │ + └─────┴───────────┘ + The function is applied only once on a list of groups, and the input of + the function is a Series[list[i64]]. + >>> ( + ... df.group_by("a").agg( + ... pl.col("b").map_batches(lambda x: x.list.max(), agg_list=True) + ... ) + ... ) # doctest: +IGNORE_RESULT + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 0 ┆ 3 │ + │ 1 ┆ 4 │ + └─────┴─────┘ """ if return_dtype is not None: return_dtype = py_type_to_dtype(return_dtype) From 9f551c4118ba520eadc5efde2c1991acc1d7a69a Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Fri, 12 Jan 2024 11:16:57 +0100 Subject: [PATCH 2/2] Update wording / formatting --- py-polars/polars/expr/expr.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index 501f9dd508c1..679f99a50196 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -3991,10 +3991,10 @@ def map_batches( If set to true this can run in the streaming engine, but may yield incorrect results in group-by. Ensure you know what you are doing! agg_list - Collect groups to a list and then apply. This parameter only works for - group-by context. - If set to true, the function is invoked only once on a list of groups. - Otherwise, the function is invoked per-group. + Aggregate the values of the expression into a list before applying the + function. This parameter only works in a group-by context. + The function will be invoked only once on a list of groups, rather than + once per group. Warnings -------- @@ -4024,19 +4024,19 @@ def map_batches( │ 1 ┆ 0 │ └──────┴────────┘ + In a group-by context, the `agg_list` parameter can improve performance if used + correctly. The following example has `agg_list` set to `False`, which causes + the function to be applied once per group. The input of the function is a + Series of type `Int64`. This is less efficient. + >>> df = pl.DataFrame( ... { ... "a": [0, 1, 0, 1], ... "b": [1, 2, 3, 4], ... } ... ) - - The function is applied per-group, and the input of the function is a - Series[i64]. - >>> ( - ... df.group_by("a").agg( - ... pl.col("b").map_batches(lambda x: x.max(), agg_list=False) - ... ) + >>> df.group_by("a").agg( + ... pl.col("b").map_batches(lambda x: x.max(), agg_list=False) ... ) # doctest: +IGNORE_RESULT shape: (2, 2) ┌─────┬───────────┐ @@ -4047,12 +4047,12 @@ def map_batches( │ 1 ┆ [4] │ │ 0 ┆ [3] │ └─────┴───────────┘ - The function is applied only once on a list of groups, and the input of - the function is a Series[list[i64]]. - >>> ( - ... df.group_by("a").agg( - ... pl.col("b").map_batches(lambda x: x.list.max(), agg_list=True) - ... ) + + Using `agg_list=True` would be more efficient. In this example, the input of + the function is a Series of type `List(Int64)`. + + >>> df.group_by("a").agg( + ... pl.col("b").map_batches(lambda x: x.list.max(), agg_list=True) ... ) # doctest: +IGNORE_RESULT shape: (2, 2) ┌─────┬─────┐