docs[python]: enhanced docstrings for "apply" methods (#4942)

pola-rs · Sep 23, 2022 · 52824d9 · 52824d9
1 parent 8dcc2cc
commit 52824d9
Show file tree

Hide file tree

Showing 6 changed files with 79 additions and 41 deletions.
diff --git a/py-polars/polars/internals/dataframe/frame.py b/py-polars/polars/internals/dataframe/frame.py
@@ -3925,20 +3925,21 @@ def apply(
         inference_size: int = 256,
     ) -> DF:
         """
-        Apply a custom function over the rows of the DataFrame.
+        Apply a custom/user-defined function (UDF) over the rows of the DataFrame.
 
-        The rows are passed as tuple.
+        The UDF will receive each row as a tuple of values: ``udf(row)``.
 
-        Implementing logic using this .apply method is generally slower and more memory
-        intensive than implementing the same logic using the expression API because:
+        Implementing logic using a Python function is almost always *significantly*
+        slower and more memory intensive than implementing the same logic using
+        the native expression API because:
 
-        - with .apply the logic is implemented in Python but with an expression the
-          logic is implemented in Rust
-        - with .apply the DataFrame is materialized in memory
-        - expressions can be parallelised
-        - expressions can be optimised
+        - The native expression engine runs in Rust; UDFs run in Python.
+        - Use of Python UDFs forces the DataFrame to be materialized in memory.
+        - Polars-native expressions can be parallelised (UDFs cannot).
+        - Polars-native expressions can be logically optimised (UDFs cannot).
 
-        If possible, use the expression API for best performance.
+        Wherever possible you should strongly prefer the native expression API
+        to achieve the best performance.
 
         Parameters
         ----------
@@ -3950,6 +3951,13 @@ def apply(
             Only used in the case when the custom function returns rows.
             This uses the first `n` rows to determine the output schema
 
+        Notes
+        -----
+        The frame-level ``apply`` cannot track column names (as the UDF is a black box
+        that may arbitrarily drop, rearrange, transform, or add new columns); if you
+        want to apply a UDF such that column names are preserved, you should use the
+        expression-level ``apply`` syntax instead.
+
         Examples
         --------
         >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]})

diff --git a/py-polars/polars/internals/dataframe/groupby.py b/py-polars/polars/internals/dataframe/groupby.py
@@ -174,18 +174,19 @@ def _groups(self) -> DF:  # pragma: no cover
 
     def apply(self, f: Callable[[pli.DataFrame], pli.DataFrame]) -> DF:
         """
-        Apply a function over the groups as a sub-DataFrame.
+        Apply a custom/user-defined function (UDF) over the groups as a sub-DataFrame.
 
-        Implementing logic using this .apply method is generally slower and more memory
-        intensive than implementing the same logic using the expression API because:
+        Implementing logic using a Python function is almost always *significantly*
+        slower and more memory intensive than implementing the same logic using
+        the native expression API because:
 
-        - with .apply the logic is implemented in Python but with an expression the
-          logic is implemented in Rust
-        - with .apply the DataFrame is materialized in memory
-        - expressions can be parallelised
-        - expressions can be optimised
+        - The native expression engine runs in Rust; UDFs run in Python.
+        - Use of Python UDFs forces the DataFrame to be materialized in memory.
+        - Polars-native expressions can be parallelised (UDFs cannot).
+        - Polars-native expressions can be logically optimised (UDFs cannot).
 
-        If possible use the expression API for best performance.
+        Wherever possible you should strongly prefer the native expression API
+        to achieve the best performance.
 
         Parameters
         ----------

diff --git a/py-polars/polars/internals/expr/expr.py b/py-polars/polars/internals/expr/expr.py
@@ -3008,7 +3008,7 @@ def apply(
         return_dtype: type[DataType] | None = None,
     ) -> Expr:
         """
-        Apply a custom function in a GroupBy or Projection context.
+        Apply a custom/user-defined function (UDF) in a GroupBy or Projection context.
 
         Depending on the context it has the following behavior:
 
@@ -3019,17 +3019,17 @@ def apply(
             Expects `f` to be of type Callable[[Series], Series].
             Applies a python function over each group.
 
-        Implementing logic using the ``.apply`` method is generally slower and more
-        memory intensive than implementing the same logic using the expression API
-        because:
+        Implementing logic using a Python function is almost always *significantly*
+        slower and more memory intensive than implementing the same logic using
+        the native expression API because:
 
-        - with .apply the logic is implemented in Python but with an expression the
-          logic is implemented in Rust
-        - with ``.apply`` the DataFrame is materialized in memory
-        - expressions can be parallelised
-        - expressions can be optimised
+        - The native expression engine runs in Rust; UDFs run in Python.
+        - Use of Python UDFs forces the DataFrame to be materialized in memory.
+        - Polars-native expressions can be parallelised (UDFs cannot).
+        - Polars-native expressions can be logically optimised (UDFs cannot).
 
-        If possible, use the expression API for best performance.
+        Wherever possible you should strongly prefer the native expression API
+        to achieve the best performance.
 
         Parameters
         ----------

diff --git a/py-polars/polars/internals/lazy_functions.py b/py-polars/polars/internals/lazy_functions.py
@@ -878,7 +878,7 @@ def apply(
     return_dtype: type[DataType] | None = None,
 ) -> pli.Expr:
     """
-    Apply a custom function in a GroupBy context.
+    Apply a custom/user-defined function (UDF) in a GroupBy context.
 
     Depending on the context it has the following behavior:
 
@@ -1863,6 +1863,33 @@ def coalesce(
     exprs
         Expressions to coalesce.
 
+    Examples
+    --------
+    >>> df = pl.DataFrame(
+    ...     data=[
+    ...         (None, 1.0, 1.0),
+    ...         (None, 2.0, 2.0),
+    ...         (None, None, 3.0),
+    ...         (None, None, None),
+    ...     ],
+    ...     columns=[("a", pl.Float64), ("b", pl.Float64), ("c", pl.Float64)],
+    ... )
+    >>> df.with_column(pl.coalesce(["a", "b", "c", 99.9]).alias("d"))
+    shape: (4, 4)
+    ┌──────┬──────┬──────┬──────┐
+    │ a    ┆ b    ┆ c    ┆ d    │
+    │ ---  ┆ ---  ┆ ---  ┆ ---  │
+    │ f64  ┆ f64  ┆ f64  ┆ f64  │
+    ╞══════╪══════╪══════╪══════╡
+    │ null ┆ 1.0  ┆ 1.0  ┆ 1.0  │
+    ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+    │ null ┆ 2.0  ┆ 2.0  ┆ 2.0  │
+    ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+    │ null ┆ null ┆ 3.0  ┆ 3.0  │
+    ├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
+    │ null ┆ null ┆ null ┆ 99.9 │
+    └──────┴──────┴──────┴──────┘
+
     """
     exprs = pli.selection_to_pyexpr_list(exprs)
     return pli.wrap_expr(_coalesce_exprs(exprs))
diff --git a/py-polars/polars/internals/lazyframe/groupby.py b/py-polars/polars/internals/lazyframe/groupby.py
@@ -175,18 +175,19 @@ def apply(
         self, f: Callable[[pli.DataFrame], pli.DataFrame], schema: Schema | None
     ) -> LDF:
         """
-        Apply a function over the groups as a new `DataFrame`.
+        Apply a custom/user-defined function (UDF) over the groups as a new DataFrame.
 
-        Implementing logic using this .apply method is generally slower and more memory
-        intensive than implementing the same logic using the expression API because:
+        Implementing logic using a Python function is almost always *significantly*
+        slower and more memory intensive than implementing the same logic using
+        the native expression API because:
 
-        - with .apply the logic is implemented in Python but with an expression the
-          logic is implemented in Rust
-        - with .apply the DataFrame is materialized in memory
-        - expressions can be parallelised
-        - expressions can be optimised
+        - The native expression engine runs in Rust; UDFs run in Python.
+        - Use of Python UDFs forces the DataFrame to be materialized in memory.
+        - Polars-native expressions can be parallelised (UDFs cannot).
+        - Polars-native expressions can be logically optimised (UDFs cannot).
 
-        If possible use the expression API for best performance.
+        Wherever possible you should strongly prefer the native expression API
+        to achieve the best performance.
 
         Parameters
         ----------

diff --git a/py-polars/polars/internals/series/series.py b/py-polars/polars/internals/series/series.py
@@ -3004,7 +3004,8 @@ def apply(
         skip_nulls: bool = True,
     ) -> Series:
         """
-        Apply a function over elements in this Series and return a new Series.
+        Apply a custom/user-defined function (UDF) over elements in this Series and
+        return a new Series.
 
         If the function returns another datatype, the return_dtype arg should be set,
         otherwise the method will fail.
@@ -3037,7 +3038,7 @@ def apply(
         -------
         Series
 
-        """
+        """  # noqa: D400,D205
         if return_dtype is None:
             pl_return_dtype = None
         else: