docs: rework some MWE and minor formatting fixes (#4082)
thatlittleboy committed Jul 19, 2022
1 parent 5c3430f commit 05aac02
Showing 6 changed files with 307 additions and 129 deletions.
116 changes: 91 additions & 25 deletions py-polars/polars/internals/expr.py
@@ -3143,14 +3143,14 @@ def hash(
def reinterpret(self, signed: bool) -> Expr:
"""
Reinterpret the underlying bits as a signed/unsigned integer.
This operation is only allowed for 64-bit integers. For lower-bit integers,
you can safely use the cast operation.
Parameters
----------
signed
If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`.
"""
return wrap_expr(self._pyexpr.reinterpret(signed))
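For illustration (not part of this commit), a minimal sketch of how ``reinterpret`` can be combined with a cast; the column name and values are made up:

import polars as pl

df = pl.DataFrame({"a": [1, 1, 2]})
# Cast to UInt64 first, then reinterpret the same 64-bit pattern as a signed Int64.
out = df.select(pl.col("a").cast(pl.UInt64).reinterpret(signed=True))
print(out)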

@@ -3966,46 +3966,69 @@ def rank(self, method: str = "average", reverse: bool = False) -> Expr:
Parameters
----------
method : {'average', 'min', 'max', 'dense', 'ordinal', 'random'}, optional
The method used to assign ranks to tied elements.
The following methods are available (default is 'average'):
- 'average' : The average of the ranks that would have been assigned to
all the tied values is assigned to each value.
- 'min' : The minimum of the ranks that would have been assigned to all
the tied values is assigned to each value. (This is also referred to
as "competition" ranking.)
- 'max' : The maximum of the ranks that would have been assigned to all
the tied values is assigned to each value.
- 'dense' : Like 'min', but the rank of the next highest element is
assigned the rank immediately after those assigned to the tied
elements.
- 'ordinal' : All values are given a distinct rank, corresponding to
the order that the values occur in the Series.
- 'random' : Like 'ordinal', but the rank for ties is not dependent
on the order that the values occur in the Series.
reverse
Reverse the operation.
Examples
--------
>>> df = pl.DataFrame({"a": [0, 1, 2, 2, 4]})
The 'average' method:
>>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]})
>>> df.select(pl.col("a").rank())
shape: (5, 1)
┌─────┐
│ a │
│ --- │
│ f32 │
╞═════╡
1.0 │
3.0 │
├╌╌╌╌╌┤
│ 2.0 │
│ 4.5 │
├╌╌╌╌╌┤
│ 1.5 │
├╌╌╌╌╌┤
│ 1.5 │
├╌╌╌╌╌┤
│ 4.5 │
└─────┘
The 'ordinal' method:
>>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]})
>>> df.select(pl.col("a").rank("ordinal"))
shape: (5, 1)
┌─────┐
│ a │
│ --- │
│ u32 │
╞═════╡
│ 3 │
├╌╌╌╌╌┤
3.5
4
├╌╌╌╌╌┤
3.5
1
├╌╌╌╌╌┤
│ 5.0 │
│ 2 │
├╌╌╌╌╌┤
│ 5 │
└─────┘
"""
@@ -6400,11 +6423,54 @@ def slice(self, start: int, length: int | None = None) -> Expr:
Starting index of the slice (zero-indexed). Negative indexing
may be used.
length
Optional length of the slice. If None (default), the slice is taken to the end
of the string.
Returns
-------
Series of Utf8 type
Examples
--------
>>> df = pl.DataFrame({"s": ["pear", None, "papaya", "dragonfruit"]})
>>> df.with_column(
... pl.col("s").str.slice(-3).alias("s_sliced"),
... )
shape: (4, 2)
┌─────────────┬──────────┐
│ s ┆ s_sliced │
│ --- ┆ --- │
│ str ┆ str │
╞═════════════╪══════════╡
│ pear ┆ ear │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ null ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ papaya ┆ aya │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ dragonfruit ┆ uit │
└─────────────┴──────────┘
Using the optional `length` parameter
>>> df.with_column(
... pl.col("s").str.slice(4, length=3).alias("s_sliced"),
... )
shape: (4, 2)
┌─────────────┬──────────┐
│ s ┆ s_sliced │
│ --- ┆ --- │
│ str ┆ str │
╞═════════════╪══════════╡
│ pear ┆ │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ null ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ papaya ┆ ya │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ dragonfruit ┆ onf │
└─────────────┴──────────┘
"""
return wrap_expr(self._pyexpr.str_slice(start, length))

145 changes: 77 additions & 68 deletions py-polars/polars/internals/frame.py
@@ -654,8 +654,12 @@ def _read_parquet(
row_count_offset: int = 0,
low_memory: bool = False,
) -> DF:
"""
See Also: `pl.read_csv`
"""Read into a DataFrame from a parquet file.
See Also
--------
read_parquet
"""
if isinstance(file, (str, Path)):
file = format_path(file)
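As the See Also note points out, the public entry point is ``pl.read_parquet``; a minimal sketch of typical usage (the file path is illustrative, not from this commit):

import polars as pl

# Read a parquet file from disk into an eager DataFrame.
df = pl.read_parquet("data/example.parquet")
print(df.shape)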
@@ -1276,11 +1280,11 @@ def transpose(
Parameters
----------
include_header
If set, the column names will be added as first column.
header_name
If `include_header` is set, this determines the name of the column that will be inserted.
column_names
Optional generator/iterator that yields column names. Will be used to replace the columns in the DataFrame.
Notes
@@ -1306,7 +1310,7 @@ def transpose(
│ b ┆ 1 ┆ 2 ┆ 3 │
└────────┴──────────┴──────────┴──────────┘
Replace the auto-generated column names with a list
>>> df.transpose(include_header=False, column_names=["a", "b", "c"])
shape: (2, 3)
Expand Down Expand Up @@ -1336,7 +1340,7 @@ def transpose(
│ b ┆ 1 ┆ 2 ┆ 3 │
└─────┴─────┴─────┴─────┘
Replace the auto-generated column with column names from a generator function
>>> def name_generator():
... base_name = "my_column_"
@@ -1388,46 +1392,42 @@ def write_parquet(
**kwargs: Any,
) -> None:
"""
Write the DataFrame to disk in parquet format.
Parameters
----------
file
File path to which the file should be written.
compression
Compression method. Choose one of:
- "uncompressed" (not supported by pyarrow)
- "snappy"
- "gzip"
- "lzo"
- "brotli"
- "lz4"
- "zstd"
- "uncompressed" (not supported by pyarrow)
- "snappy"
- "gzip"
- "lzo"
- "brotli"
- "lz4"
- "zstd"
The default compression "lz4" (actually lz4raw) has very good performance, but may not yet be supported
by older readers. If you want more compatibility guarantees, consider using "snappy".
compression_level
The level of compression to use. Higher compression means smaller files on disk.
- "gzip" : min-level: 0, max-level: 10.
- "brotli" : min-level: 0, max-level: 11.
- "zstd" : min-level: 1, max-level: 22.
statistics
Write statistics to the parquet headers. This requires extra compute.
row_group_size
Size of the row groups. If None (default), the chunks of the `DataFrame` are used.
Writing in smaller chunks may reduce memory pressure and improve writing speeds.
This argument has no effect if 'pyarrow' is used.
use_pyarrow
Use the C++ parquet implementation instead of the Rust parquet implementation.
At the moment, the C++ implementation supports more features.
kwargs
Arguments are passed to ``pyarrow.parquet.write_table``.
"""
if compression is None:
compression = "uncompressed"
@@ -2653,6 +2653,11 @@ def drop_nulls(self: DF, subset: str | list[str] | None = None) -> DF:
"""
Return a new DataFrame where the null values are dropped.
Parameters
----------
subset
Subset of column(s) on which ``drop_nulls`` will be applied.
Examples
--------
>>> df = pl.DataFrame(
@@ -4854,13 +4859,17 @@ def lazy(self: DF) -> pli.LazyFrame[DF]:
Operations on a `LazyFrame` are not executed until this is requested by either calling:
* `.fetch()` (run on a small number of rows)
* `.collect()` (run on all data)
* `.describe_plan()` (print unoptimized query plan)
* `.describe_optimized_plan()` (print optimized query plan)
* `.show_graph()` (show (un)optimized query plan) as graphviz graph)
* :meth:`.fetch() <polars.LazyFrame.fetch>` (run on a small number of rows)
* :meth:`.collect() <polars.LazyFrame.collect>` (run on all data)
* :meth:`.describe_plan() <polars.LazyFrame.describe_plan>` (print unoptimized query plan)
* :meth:`.describe_optimized_plan() <polars.LazyFrame.describe_optimized_plan>` (print optimized query plan)
* :meth:`.show_graph() <polars.LazyFrame.show_graph>` (show (un)optimized query plan as graphviz graph)
Lazy operations are advised because they allow for query optimization and more parallelization.
Returns
-------
LazyFrame
"""
return self._lazyframe_class._from_pyldf(self._df.lazy())
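A brief sketch of the lazy workflow described above (illustrative data, not part of this commit):

import polars as pl

df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
ldf = df.lazy().filter(pl.col("a") > 1).select(pl.col("b") * 2)
print(ldf.describe_plan())  # inspect the unoptimized query plan
result = ldf.collect()      # execute the full query
print(result)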

@@ -5868,8 +5877,10 @@ def to_struct(self, name: str) -> pli.Series:

def unnest(self: DF, names: str | list[str]) -> DF:
"""
Decompose a struct into its fields.

The fields will be inserted into the `DataFrame` on the location of the
`struct` type.
Parameters
----------
@@ -5878,40 +5889,38 @@ def unnest(self: DF, names: str | list[str]) -> DF:
Examples
--------
>>> df = pl.DataFrame(
...     {
...         "before": ["foo", "bar"],
...         "t_a": [1, 2],
...         "t_b": ["a", "b"],
...         "t_c": [True, None],
...         "t_d": [[1, 2], [3]],
...         "after": ["baz", "womp"],
...     }
... ).select(["before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after"])
>>> df
shape: (2, 3)
┌────────┬─────────────────────┬───────┐
│ before ┆ t_struct            ┆ after │
│ ---    ┆ ---                 ┆ ---   │
│ str    ┆ struct[4]           ┆ str   │
╞════════╪═════════════════════╪═══════╡
│ foo    ┆ {1,"a",true,[1, 2]} ┆ baz   │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ bar    ┆ {2,"b",null,[3]}    ┆ womp  │
└────────┴─────────────────────┴───────┘
>>> df.unnest("t_struct")
shape: (2, 6)
┌────────┬─────┬─────┬──────┬───────────┬───────┐
│ before ┆ t_a ┆ t_b ┆ t_c  ┆ t_d       ┆ after │
│ ---    ┆ --- ┆ --- ┆ ---  ┆ ---       ┆ ---   │
│ str    ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str   │
╞════════╪═════╪═════╪══════╪═══════════╪═══════╡
│ foo    ┆ 1   ┆ a   ┆ true ┆ [1, 2]    ┆ baz   │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ bar    ┆ 2   ┆ b   ┆ null ┆ [3]       ┆ womp  │
└────────┴─────┴─────┴──────┴───────────┴───────┘
"""
if isinstance(names, str):
