docs(python): Improve some docstrings (#11644)
stinodego committed Oct 10, 2023
1 parent 7f1fec0 commit 35f3ff9
Showing 6 changed files with 139 additions and 39 deletions.
60 changes: 57 additions & 3 deletions py-polars/polars/config.py
@@ -399,7 +399,28 @@ def set_ascii_tables(cls, active: bool | None = True) -> type[Config]:

@classmethod
def set_auto_structify(cls, active: bool | None = False) -> type[Config]:
"""Allow multi-output expressions to be automatically turned into Structs."""
"""
Allow multi-output expressions to be automatically turned into Structs.
Examples
--------
>>> df = pl.DataFrame({"v": [1, 2, 3], "v2": [4, 5, 6]})
>>> with pl.Config(set_auto_structify=True):
... out = df.select(pl.all())
...
>>> out
shape: (3, 1)
┌───────────┐
│ v │
│ --- │
│ struct[2] │
╞═══════════╡
│ {1,4} │
│ {2,5} │
│ {3,6} │
└───────────┘
"""
if active is None:
os.environ.pop("POLARS_AUTO_STRUCTIFY", None)
else:
@@ -409,12 +430,45 @@ def set_auto_structify(cls, active: bool | None = False) -> type[Config]:
@classmethod
def set_fmt_float(cls, fmt: FloatFmt | None = "mixed") -> type[Config]:
"""
Control how floating point values are displayed.
Control how floating point values are displayed.
Parameters
----------
fmt : {"mixed", "full"}
How to format floating point numbers.
How to format floating point numbers:
- "mixed": Limit the number of decimal places and use scientific
notation for large/small values.
- "full": Print the full precision of the floating point number.
Examples
--------
"mixed" float formatting:
>>> s = pl.Series([1.2304980958725870923, 1e6, 1e-8])
>>> with pl.Config(set_fmt_float="mixed"):
... print(s)
...
shape: (3,)
Series: '' [f64]
[
1.230498
1e6
1.0000e-8
]
"full" float formatting:
>>> with pl.Config(set_fmt_float="full"):
... print(s)
...
shape: (3,)
Series: '' [f64]
[
1.230498095872587
1000000
0.00000001
]
"""
_set_float_fmt(fmt="mixed" if fmt is None else fmt)
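Editor's note on the snippet above: a minimal sketch of how the setting round-trips through the environment, assuming (as the `os.environ.pop` call suggests) that `set_auto_structify` is backed by the `POLARS_AUTO_STRUCTIFY` variable; the exact stored value is not asserted here.

import os
import polars as pl

# Enabling the option appears to set the backing environment variable
# (name taken from the os.environ.pop call above); the stored value is
# whatever Polars writes and is not asserted here.
pl.Config.set_auto_structify(True)
print(os.environ.get("POLARS_AUTO_STRUCTIFY"))

# Passing None removes the override and falls back to the default behaviour.
pl.Config.set_auto_structify(None)
print("POLARS_AUTO_STRUCTIFY" in os.environ)  # False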
4 changes: 3 additions & 1 deletion py-polars/polars/dataframe/frame.py
@@ -6524,7 +6524,9 @@ def clear(self, n: int = 0) -> Self:

def clone(self) -> Self:
"""
Cheap deepcopy/clone.
Create a copy of this DataFrame.
This is a cheap operation that does not copy data.
See Also
--------
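To illustrate the reworded `clone` docstring, a small usage sketch (not part of the diff):

import polars as pl

df = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# clone() duplicates only the frame's metadata; the column buffers are
# shared with the original, so the call stays cheap even for large frames.
df_copy = df.clone()
print(df_copy)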
6 changes: 3 additions & 3 deletions py-polars/polars/io/csv/functions.py
@@ -133,7 +133,7 @@ def read_csv(
``utf8-lossy``, the input is first decoded in memory with
python. Defaults to ``utf8``.
low_memory
Reduce memory usage at expense of performance.
Reduce memory pressure at the expense of performance.
rechunk
Make sure that all columns are contiguous in memory by
aggregating the chunks into a single array.
@@ -504,7 +504,7 @@ def read_csv_batched(
``utf8-lossy``, the input is first decoded in memory with
python. Defaults to ``utf8``.
low_memory
Reduce memory usage at expense of performance.
Reduce memory pressure at the expense of performance.
rechunk
Make sure that all columns are contiguous in memory by
aggregating the chunks into a single array.
@@ -787,7 +787,7 @@ def scan_csv(
Lossy means that invalid utf8 values are replaced with ``�``
characters. Defaults to "utf8".
low_memory
Reduce memory usage in expense of performance.
Reduce memory pressure at the expense of performance.
rechunk
Reallocate to contiguous memory when all chunks/ files are parsed.
skip_rows_after_header
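A hedged usage sketch for the `low_memory` flag documented above (the file path is hypothetical); the flag trades some parsing speed for a smaller peak memory footprint:

import polars as pl

# Eager read with a reduced memory footprint.
df = pl.read_csv("data/large_file.csv", low_memory=True)

# The same flag exists on the lazy reader; skipping the final rechunk
# avoids one extra copy at the end of the scan.
lf = pl.scan_csv("data/large_file.csv", low_memory=True, rechunk=False)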
17 changes: 12 additions & 5 deletions py-polars/polars/io/parquet/functions.py
@@ -236,21 +236,28 @@ def scan_parquet(
retries
Number of retries if accessing a cloud instance fails.
See Also
--------
read_parquet
scan_pyarrow_dataset
Examples
--------
Scan a local Parquet file.
>>> pl.scan_parquet("path/to/file.parquet") # doctest: +SKIP
Scan a file on AWS S3.
>>> source = "s3://bucket/*.parquet"
>>> pl.scan_parquet(source) # doctest: +SKIP
>>> storage_options = {
... "aws_access_key_id": "<secret>",
... "aws_secret_access_key": "<secret>",
... "aws_region": "us-east-1",
... }
>>> pl.scan_parquet(source, storage_options=storage_options) # doctest: +SKIP
See Also
--------
read_parquet
scan_pyarrow_dataset
"""
if isinstance(source, (str, Path)):
source = normalize_filepath(source)
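Complementing the `scan_parquet` examples above, a sketch of a typical lazy pipeline on a scanned Parquet file (path hypothetical), showing why scanning rather than reading lets the predicate and projection pushdown optimizations mentioned elsewhere in this commit take effect:

import polars as pl

lf = pl.scan_parquet("path/to/file.parquet")

# Only the filtered rows and the two selected columns need to be read,
# because the filter and the column selection are pushed into the scan.
result = lf.filter(pl.col("value") > 0).select(["id", "value"]).collect()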
87 changes: 61 additions & 26 deletions py-polars/polars/lazyframe/frame.py
@@ -1627,18 +1627,18 @@ def collect(
predicate_pushdown: bool = True,
projection_pushdown: bool = True,
simplify_expression: bool = True,
no_optimization: bool = False,
slice_pushdown: bool = True,
comm_subplan_elim: bool = True,
comm_subexpr_elim: bool = True,
no_optimization: bool = False,
streaming: bool = False,
_eager: bool = False,
) -> DataFrame:
"""
Collect into a DataFrame.
Materialize this LazyFrame into a DataFrame.
Note: use :func:`fetch` if you want to run your query on the first `n` rows
only. This can be a huge time saver in debugging queries.
By default, all query optimizations are enabled. Individual optimizations may
be disabled by setting the corresponding parameter to ``False``.
Parameters
----------
@@ -1650,21 +1650,38 @@ def collect(
Do projection pushdown optimization.
simplify_expression
Run simplify expressions optimization.
no_optimization
Turn off (certain) optimizations.
slice_pushdown
Slice pushdown optimization.
comm_subplan_elim
Will try to cache branching subplans that occur on self-joins or unions.
comm_subexpr_elim
Common subexpressions will be cached and reused.
no_optimization
Turn off (certain) optimizations.
streaming
Run parts of the query in a streaming fashion (this is in an alpha state)
Process the query in batches to handle larger-than-memory data.
If set to ``False`` (default), the entire query is processed in a single
batch.
.. warning::
This functionality is currently in an alpha state.
.. note::
Use :func:`explain` to see if Polars can process the query in streaming
mode.
Returns
-------
DataFrame
See Also
--------
fetch: Run the query on the first `n` rows only for debugging purposes.
explain : Print the query plan that is evaluated with collect.
profile : Collect the LazyFrame and time each node in the computation graph.
polars.collect_all : Collect multiple LazyFrames at the same time.
polars.Config.set_streaming_chunk_size : Set the size of streaming batches.
Examples
--------
>>> lf = pl.LazyFrame(
@@ -1674,7 +1691,23 @@ def collect(
... "c": [6, 5, 4, 3, 2, 1],
... }
... )
>>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).collect()
>>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP
shape: (3, 3)
┌─────┬─────┬─────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 │
╞═════╪═════╪═════╡
│ a ┆ 4 ┆ 10 │
│ b ┆ 11 ┆ 10 │
│ c ┆ 6 ┆ 1 │
└─────┴─────┴─────┘
Collect in streaming mode
>>> lf.group_by("a").agg(pl.all().sum()).collect(
... streaming=True
... ) # doctest: +SKIP
shape: (3, 3)
┌─────┬─────┬─────┐
│ a ┆ b ┆ c │
@@ -1882,11 +1915,11 @@ def sink_parquet(
predicate_pushdown: bool = True,
projection_pushdown: bool = True,
simplify_expression: bool = True,
no_optimization: bool = False,
slice_pushdown: bool = True,
no_optimization: bool = False,
) -> DataFrame:
"""
Persists a LazyFrame at the provided path.
Evaluate the query in streaming mode and write to a Parquet file.
This allows streaming results that are larger than RAM to be written to disk.
@@ -1927,10 +1960,10 @@ def sink_parquet(
Do projection pushdown optimization.
simplify_expression
Run simplify expressions optimization.
no_optimization
Turn off (certain) optimizations.
slice_pushdown
Slice pushdown optimization.
no_optimization
Turn off (certain) optimizations.
Returns
-------
@@ -1947,8 +1980,8 @@ def sink_parquet(
predicate_pushdown=predicate_pushdown,
projection_pushdown=projection_pushdown,
simplify_expression=simplify_expression,
no_optimization=no_optimization,
slice_pushdown=slice_pushdown,
no_optimization=no_optimization,
)

return lf.sink_parquet(
Expand All @@ -1971,11 +2004,11 @@ def sink_ipc(
predicate_pushdown: bool = True,
projection_pushdown: bool = True,
simplify_expression: bool = True,
no_optimization: bool = False,
slice_pushdown: bool = True,
no_optimization: bool = False,
) -> DataFrame:
"""
Persists a LazyFrame at the provided path.
Evaluate the query in streaming mode and write to an IPC file.
This allows streaming results that are larger than RAM to be written to disk.
@@ -1997,10 +2030,10 @@ def sink_ipc(
Do projection pushdown optimization.
simplify_expression
Run simplify expressions optimization.
no_optimization
Turn off (certain) optimizations.
slice_pushdown
Slice pushdown optimization.
no_optimization
Turn off (certain) optimizations.
Returns
-------
@@ -2017,8 +2050,8 @@ def sink_ipc(
predicate_pushdown=predicate_pushdown,
projection_pushdown=projection_pushdown,
simplify_expression=simplify_expression,
no_optimization=no_optimization,
slice_pushdown=slice_pushdown,
no_optimization=no_optimization,
)

return lf.sink_ipc(
@@ -2048,11 +2081,11 @@ def sink_csv(
predicate_pushdown: bool = True,
projection_pushdown: bool = True,
simplify_expression: bool = True,
no_optimization: bool = False,
slice_pushdown: bool = True,
no_optimization: bool = False,
) -> DataFrame:
"""
Persists a LazyFrame at the provided path.
Evaluate the query in streaming mode and write to a CSV file.
This allows streaming results that are larger than RAM to be written to disk.
@@ -2116,10 +2149,10 @@ def sink_csv(
Do projection pushdown optimization.
simplify_expression
Run simplify expressions optimization.
no_optimization
Turn off (certain) optimizations.
slice_pushdown
Slice pushdown optimization.
no_optimization
Turn off (certain) optimizations.
Returns
-------
@@ -2141,8 +2174,8 @@ def sink_csv(
predicate_pushdown=predicate_pushdown,
projection_pushdown=projection_pushdown,
simplify_expression=simplify_expression,
no_optimization=no_optimization,
slice_pushdown=slice_pushdown,
no_optimization=no_optimization,
)

return lf.sink_csv(
Expand All @@ -2168,8 +2201,8 @@ def _set_sink_optimizations(
predicate_pushdown: bool = True,
projection_pushdown: bool = True,
simplify_expression: bool = True,
no_optimization: bool = False,
slice_pushdown: bool = True,
no_optimization: bool = False,
) -> PyLazyFrame:
if no_optimization:
predicate_pushdown = False
@@ -2449,7 +2482,9 @@ def clear(self, n: int = 0) -> LazyFrame:

def clone(self) -> Self:
"""
Very cheap deepcopy/clone.
Create a copy of this LazyFrame.
This is a cheap operation that does not copy data.
See Also
--------
@@ -2698,7 +2733,7 @@ def group_by(
maintain_order
Ensure that the order of the groups is consistent with the input data.
This is slower than a default group by.
Settings this to ``True`` blocks the possibility
Setting this to ``True`` blocks the possibility
to run on the streaming engine.
Examples
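Tying together the `collect(streaming=True)` and `sink_*` docstrings changed above, a sketch of the streaming workflow (paths hypothetical; assumes the aggregation is supported by the streaming engine):

import polars as pl

lf = pl.scan_csv("data/very_large.csv")
query = lf.group_by("key").agg(pl.all().sum())

# Inspect the plan first; per the note in the collect() docstring,
# explain(streaming=True) shows which part of the plan can run in the
# streaming engine.
print(query.explain(streaming=True))

# Materialize in batches, or stream straight to disk when the result
# itself may not fit in memory.
df = query.collect(streaming=True)
query.sink_parquet("aggregated.parquet")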
4 changes: 3 additions & 1 deletion py-polars/polars/series/series.py
@@ -4499,7 +4499,9 @@ def clear(self, n: int = 0) -> Series:

def clone(self) -> Self:
"""
Very cheap deepcopy/clone.
Create a copy of this Series.
This is a cheap operation that does not copy data.
See Also
--------
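And the Series counterpart of the `clone` change, as a small sketch (not from the diff): the clone shares buffers with the original, so mutating one afterwards does not affect the other.

import polars as pl

s = pl.Series("a", [1, 2, 3])
s_copy = s.clone()

# Appending to the clone leaves the original untouched.
s_copy.append(pl.Series("a", [4]))
print(s.len(), s_copy.len())  # 3 4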
