docs: rework some MWE and minor formatting fixes (#4082)
thatlittleboy committed Jul 19, 2022
1 parent 5c3430f commit 05aac02
Showing 6 changed files with 307 additions and 129 deletions.
116 changes: 91 additions & 25 deletions py-polars/polars/internals/expr.py
@@ -3143,14 +3143,14 @@ def hash(
def reinterpret(self, signed: bool) -> Expr:
"""
Reinterpret the underlying bits as a signed/unsigned integer.
This operation is only allowed for 64-bit integers. For lower-bit integers,
you can safely use the cast operation.
Parameters
----------
signed
If True, reinterpret as `pl.Int64`. Otherwise, reinterpret as `pl.UInt64`.
"""
return wrap_expr(self._pyexpr.reinterpret(signed))
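For illustration (not part of this commit), a minimal sketch of how ``reinterpret`` can be combined with a cast; the column name and values are made up:

import polars as pl

df = pl.DataFrame({"a": [1, 1, 2]})
# Cast to UInt64 first, then reinterpret the same 64-bit pattern as a signed Int64.
out = df.select(pl.col("a").cast(pl.UInt64).reinterpret(signed=True))
print(out)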

@@ -3966,46 +3966,69 @@ def rank(self, method: str = "average", reverse: bool = False) -> Expr:
Parameters
----------
method : {'average', 'min', 'max', 'dense', 'ordinal', 'random'}, optional
The method used to assign ranks to tied elements.
The following methods are available (default is 'average'):
- 'average' : The average of the ranks that would have been assigned to
all the tied values is assigned to each value.
- 'min' : The minimum of the ranks that would have been assigned to all
the tied values is assigned to each value. (This is also referred to
as "competition" ranking.)
- 'max' : The maximum of the ranks that would have been assigned to all
the tied values is assigned to each value.
- 'dense' : Like 'min', but the rank of the next highest element is
assigned the rank immediately after those assigned to the tied
elements.
- 'ordinal' : All values are given a distinct rank, corresponding to
the order that the values occur in the Series.
- 'random' : Like 'ordinal', but the rank for ties is not dependent
on the order that the values occur in the Series.
reverse
Reverse the operation.
Examples
--------
>>> df = pl.DataFrame({"a": [0, 1, 2, 2, 4]})
The 'average' method:
>>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]})
>>> df.select(pl.col("a").rank())
shape: (5, 1)
┌─────┐
│ a │
│ --- │
│ f32 │
╞═════╡
1.0 │
3.0 │
├╌╌╌╌╌┤
│ 2.0 │
│ 4.5 │
├╌╌╌╌╌┤
│ 1.5 │
├╌╌╌╌╌┤
│ 1.5 │
├╌╌╌╌╌┤
│ 4.5 │
└─────┘
The 'ordinal' method:
>>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]})
>>> df.select(pl.col("a").rank("ordinal"))
shape: (5, 1)
┌─────┐
│ a │
│ --- │
│ u32 │
╞═════╡
│ 3 │
├╌╌╌╌╌┤
3.5
4
├╌╌╌╌╌┤
3.5
1
├╌╌╌╌╌┤
│ 5.0 │
│ 2 │
├╌╌╌╌╌┤
│ 5 │
└─────┘
"""
@@ -6400,11 +6423,54 @@ def slice(self, start: int, length: int | None = None) -> Expr:
Starting index of the slice (zero-indexed). Negative indexing
may be used.
length
Optional length of the slice. If None (default), the slice is taken to the end
of the string.
Returns
-------
Series of Utf8 type
Examples
--------
>>> df = pl.DataFrame({"s": ["pear", None, "papaya", "dragonfruit"]})
>>> df.with_column(
... pl.col("s").str.slice(-3).alias("s_sliced"),
... )
shape: (4, 2)
┌─────────────┬──────────┐
│ s ┆ s_sliced │
│ --- ┆ --- │
│ str ┆ str │
╞═════════════╪══════════╡
│ pear ┆ ear │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ null ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ papaya ┆ aya │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ dragonfruit ┆ uit │
└─────────────┴──────────┘
Using the optional `length` parameter
>>> df.with_column(
... pl.col("s").str.slice(4, length=3).alias("s_sliced"),
... )
shape: (4, 2)
┌─────────────┬──────────┐
│ s ┆ s_sliced │
│ --- ┆ --- │
│ str ┆ str │
╞═════════════╪══════════╡
│ pear ┆ │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ null ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ papaya ┆ ya │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ dragonfruit ┆ onf │
└─────────────┴──────────┘
"""
return wrap_expr(self._pyexpr.str_slice(start, length))

145 changes: 77 additions & 68 deletions py-polars/polars/internals/frame.py
@@ -654,8 +654,12 @@ def _read_parquet(
row_count_offset: int = 0,
low_memory: bool = False,
) -> DF:
"""
See Also: `pl.read_csv`
"""Read into a DataFrame from a parquet file.
See Also
--------
read_parquet
"""
if isinstance(file, (str, Path)):
file = format_path(file)
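As the See Also note points out, the public entry point is ``pl.read_parquet``; a minimal sketch of typical usage (the file path is illustrative, not from this commit):

import polars as pl

# Read a parquet file from disk into an eager DataFrame.
df = pl.read_parquet("data/example.parquet")
print(df.shape)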
@@ -1276,11 +1280,11 @@ def transpose(
Parameters
----------
include_header
If set, the column names will be added as first column.
header_name
If `include_header` is set, this determines the name of the column that will be inserted.
column_names
Optional generator/iterator that yields column names. Will be used to replace the columns in the DataFrame.
Notes
@@ -1306,7 +1310,7 @@ def transpose(
│ b ┆ 1 ┆ 2 ┆ 3 │
└────────┴──────────┴──────────┴──────────┘
Replace the auto-generated column names with a list
>>> df.transpose(include_header=False, column_names=["a", "b", "c"])
shape: (2, 3)
Expand Down Expand Up @@ -1336,7 +1340,7 @@ def transpose(
│ b ┆ 1 ┆ 2 ┆ 3 │
└─────┴─────┴─────┴─────┘
Replace the auto-generated column with column names from a generator function
>>> def name_generator():
... base_name = "my_column_"
@@ -1388,46 +1392,42 @@ def write_parquet(
**kwargs: Any,
) -> None:
"""
Write the DataFrame to disk in parquet format.
Parameters
----------
file
File path to which the file should be written.
compression
Compression method. Choose one of:
- "uncompressed" (not supported by pyarrow)
- "snappy"
- "gzip"
- "lzo"
- "brotli"
- "lz4"
- "zstd"
- "uncompressed" (not supported by pyarrow)
- "snappy"
- "gzip"
- "lzo"
- "brotli"
- "lz4"
- "zstd"
The default compression "lz4" (actually lz4raw) has very good performance, but may not yet be supported
by older readers. If you want more compatibility guarantees, consider using "snappy".
compression_level
The level of compression to use. Higher compression means smaller files on disk.
- "gzip" : min-level: 0, max-level: 10.
- "brotli" : min-level: 0, max-level: 11.
- "zstd" : min-level: 1, max-level: 22.
statistics
Write statistics to the parquet headers. This requires extra compute.
row_group_size
Size of the row groups. If None (default), the chunks of the `DataFrame` are used.
Writing in smaller chunks may reduce memory pressure and improve writing speeds.
This argument has no effect if 'pyarrow' is used.
use_pyarrow
Use the C++ parquet implementation instead of the Rust parquet implementation.
At the moment, the C++ implementation supports more features.
kwargs
Arguments are passed to ``pyarrow.parquet.write_table``.
"""
if compression is None:
compression = "uncompressed"
@@ -2653,6 +2653,11 @@ def drop_nulls(self: DF, subset: str | list[str] | None = None) -> DF:
"""
Return a new DataFrame where the null values are dropped.
Parameters
----------
subset
Subset of column(s) on which ``drop_nulls`` will be applied.
Examples
--------
>>> df = pl.DataFrame(
@@ -4854,13 +4859,17 @@ def lazy(self: DF) -> pli.LazyFrame[DF]:
Operations on a `LazyFrame` are not executed until this is requested by either calling:
* `.fetch()` (run on a small number of rows)
* `.collect()` (run on all data)
* `.describe_plan()` (print unoptimized query plan)
* `.describe_optimized_plan()` (print optimized query plan)
* `.show_graph()` (show (un)optimized query plan) as graphviz graph)
* :meth:`.fetch() <polars.LazyFrame.fetch>` (run on a small number of rows)
* :meth:`.collect() <polars.LazyFrame.collect>` (run on all data)
* :meth:`.describe_plan() <polars.LazyFrame.describe_plan>` (print unoptimized query plan)
* :meth:`.describe_optimized_plan() <polars.LazyFrame.describe_optimized_plan>` (print optimized query plan)
* :meth:`.show_graph() <polars.LazyFrame.show_graph>` (show (un)optimized query plan as graphviz graph)
Lazy operations are advised because they allow for query optimization and more parallelization.
Returns
-------
LazyFrame
"""
return self._lazyframe_class._from_pyldf(self._df.lazy())
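A brief sketch of the lazy workflow described above (illustrative data, not part of this commit):

import polars as pl

df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
ldf = df.lazy().filter(pl.col("a") > 1).select(pl.col("b") * 2)
print(ldf.describe_plan())  # inspect the unoptimized query plan
result = ldf.collect()      # execute the full query
print(result)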

@@ -5868,8 +5877,10 @@ def to_struct(self, name: str) -> pli.Series:

def unnest(self: DF, names: str | list[str]) -> DF:
"""
Decompose a struct into its fields.

The fields will be inserted into the `DataFrame` on the location of the
`struct` type.
Parameters
----------
@@ -5878,40 +5889,38 @@ def unnest(self: DF, names: str | list[str]) -> DF:
Examples
--------
>>> df = pl.DataFrame(
...     {
...         "before": ["foo", "bar"],
...         "t_a": [1, 2],
...         "t_b": ["a", "b"],
...         "t_c": [True, None],
...         "t_d": [[1, 2], [3]],
...         "after": ["baz", "womp"],
...     }
... ).select(["before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after"])
>>> df
shape: (2, 3)
┌────────┬─────────────────────┬───────┐
│ before ┆ t_struct            ┆ after │
│ ---    ┆ ---                 ┆ ---   │
│ str    ┆ struct[4]           ┆ str   │
╞════════╪═════════════════════╪═══════╡
│ foo    ┆ {1,"a",true,[1, 2]} ┆ baz   │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ bar    ┆ {2,"b",null,[3]}    ┆ womp  │
└────────┴─────────────────────┴───────┘
>>> df.unnest("t_struct")
shape: (2, 6)
┌────────┬─────┬─────┬──────┬───────────┬───────┐
│ before ┆ t_a ┆ t_b ┆ t_c  ┆ t_d       ┆ after │
│ ---    ┆ --- ┆ --- ┆ ---  ┆ ---       ┆ ---   │
│ str    ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str   │
╞════════╪═════╪═════╪══════╪═══════════╪═══════╡
│ foo    ┆ 1   ┆ a   ┆ true ┆ [1, 2]    ┆ baz   │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ bar    ┆ 2   ┆ b   ┆ null ┆ [3]       ┆ womp  │
└────────┴─────┴─────┴──────┴───────────┴───────┘
"""
if isinstance(names, str):
