docs(python): Improve some docstrings (#11644)
stinodego committed Oct 10, 2023
1 parent 7f1fec0 commit 35f3ff9
Showing 6 changed files with 139 additions and 39 deletions.
60 changes: 57 additions & 3 deletions py-polars/polars/config.py
@@ -399,7 +399,28 @@ def set_ascii_tables(cls, active: bool | None = True) -> type[Config]:

@classmethod
def set_auto_structify(cls, active: bool | None = False) -> type[Config]:
"""Allow multi-output expressions to be automatically turned into Structs."""
"""
Allow multi-output expressions to be automatically turned into Structs.
Examples
--------
>>> df = pl.DataFrame({"v": [1, 2, 3], "v2": [4, 5, 6]})
>>> with pl.Config(set_auto_structify=True):
... out = df.select(pl.all())
...
>>> out
shape: (3, 1)
┌───────────┐
│ v │
│ --- │
│ struct[2] │
╞═══════════╡
│ {1,4} │
│ {2,5} │
│ {3,6} │
└───────────┘
"""
if active is None:
os.environ.pop("POLARS_AUTO_STRUCTIFY", None)
else:
@@ -409,12 +430,45 @@ def set_auto_structify(cls, active: bool | None = False) -> type[Config]:
@classmethod
def set_fmt_float(cls, fmt: FloatFmt | None = "mixed") -> type[Config]:
"""
Control how floating point values are displayed.
Control how floating point values are displayed.
Parameters
----------
fmt : {"mixed", "full"}
How to format floating point numbers.
How to format floating point numbers:
- "mixed": Limit the number of decimal places and use scientific
notation for large/small values.
- "full": Print the full precision of the floating point number.
Examples
--------
"mixed" float formatting:
>>> s = pl.Series([1.2304980958725870923, 1e6, 1e-8])
>>> with pl.Config(set_fmt_float="mixed"):
... print(s)
...
shape: (3,)
Series: '' [f64]
[
1.230498
1e6
1.0000e-8
]
"full" float formatting:
>>> with pl.Config(set_fmt_float="full"):
... print(s)
...
shape: (3,)
Series: '' [f64]
[
1.230498095872587
1000000
0.00000001
]
"""
_set_float_fmt(fmt="mixed" if fmt is None else fmt)
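Editor's note on the snippet above: a minimal sketch of how the setting round-trips through the environment, assuming (as the `os.environ.pop` call suggests) that `set_auto_structify` is backed by the `POLARS_AUTO_STRUCTIFY` variable; the exact stored value is not asserted here.

import os
import polars as pl

# Enabling the option appears to set the backing environment variable
# (name taken from the os.environ.pop call above); the stored value is
# whatever Polars writes and is not asserted here.
pl.Config.set_auto_structify(True)
print(os.environ.get("POLARS_AUTO_STRUCTIFY"))

# Passing None removes the override and falls back to the default behaviour.
pl.Config.set_auto_structify(None)
print("POLARS_AUTO_STRUCTIFY" in os.environ)  # False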
4 changes: 3 additions & 1 deletion py-polars/polars/dataframe/frame.py
@@ -6524,7 +6524,9 @@ def clear(self, n: int = 0) -> Self:

def clone(self) -> Self:
"""
Cheap deepcopy/clone.
Create a copy of this DataFrame.
This is a cheap operation that does not copy data.
See Also
--------
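To illustrate the reworded `clone` docstring, a small usage sketch (not part of the diff):

import polars as pl

df = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# clone() duplicates only the frame's metadata; the column buffers are
# shared with the original, so the call stays cheap even for large frames.
df_copy = df.clone()
print(df_copy)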
6 changes: 3 additions & 3 deletions py-polars/polars/io/csv/functions.py
@@ -133,7 +133,7 @@ def read_csv(
``utf8-lossy``, the input is first decoded in memory with
python. Defaults to ``utf8``.
low_memory
Reduce memory usage at expense of performance.
Reduce memory pressure at the expense of performance.
rechunk
Make sure that all columns are contiguous in memory by
aggregating the chunks into a single array.
@@ -504,7 +504,7 @@ def read_csv_batched(
``utf8-lossy``, the input is first decoded in memory with
python. Defaults to ``utf8``.
low_memory
Reduce memory usage at expense of performance.
Reduce memory pressure at the expense of performance.
rechunk
Make sure that all columns are contiguous in memory by
aggregating the chunks into a single array.
@@ -787,7 +787,7 @@ def scan_csv(
Lossy means that invalid utf8 values are replaced with ``�``
characters. Defaults to "utf8".
low_memory
Reduce memory usage in expense of performance.
Reduce memory pressure at the expense of performance.
rechunk
Reallocate to contiguous memory when all chunks/ files are parsed.
skip_rows_after_header
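A hedged usage sketch for the `low_memory` flag documented above (the file path is hypothetical); the flag trades some parsing speed for a smaller peak memory footprint:

import polars as pl

# Eager read with a reduced memory footprint.
df = pl.read_csv("data/large_file.csv", low_memory=True)

# The same flag exists on the lazy reader; skipping the final rechunk
# avoids one extra copy at the end of the scan.
lf = pl.scan_csv("data/large_file.csv", low_memory=True, rechunk=False)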
17 changes: 12 additions & 5 deletions py-polars/polars/io/parquet/functions.py
@@ -236,21 +236,28 @@ def scan_parquet(
retries
Number of retries if accessing a cloud instance fails.
See Also
--------
read_parquet
scan_pyarrow_dataset
Examples
--------
Scan a local Parquet file.
>>> pl.scan_parquet("path/to/file.parquet") # doctest: +SKIP
Scan a file on AWS S3.
>>> source = "s3://bucket/*.parquet"
>>> pl.scan_parquet(source) # doctest: +SKIP
>>> storage_options = {
... "aws_access_key_id": "<secret>",
... "aws_secret_access_key": "<secret>",
... "aws_region": "us-east-1",
... }
>>> pl.scan_parquet(source, storage_options=storage_options) # doctest: +SKIP
See Also
--------
read_parquet
scan_pyarrow_dataset
"""
if isinstance(source, (str, Path)):
source = normalize_filepath(source)
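Complementing the `scan_parquet` examples above, a sketch of a typical lazy pipeline on a scanned Parquet file (path hypothetical), showing why scanning rather than reading lets the predicate and projection pushdown optimizations mentioned elsewhere in this commit take effect:

import polars as pl

lf = pl.scan_parquet("path/to/file.parquet")

# Only the filtered rows and the two selected columns need to be read,
# because the filter and the column selection are pushed into the scan.
result = lf.filter(pl.col("value") > 0).select(["id", "value"]).collect()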
87 changes: 61 additions & 26 deletions py-polars/polars/lazyframe/frame.py
@@ -1627,18 +1627,18 @@ def collect(
predicate_pushdown: bool = True,
projection_pushdown: bool = True,
simplify_expression: bool = True,
no_optimization: bool = False,
slice_pushdown: bool = True,
comm_subplan_elim: bool = True,
comm_subexpr_elim: bool = True,
no_optimization: bool = False,
streaming: bool = False,
_eager: bool = False,
) -> DataFrame:
"""
Collect into a DataFrame.
Materialize this LazyFrame into a DataFrame.
Note: use :func:`fetch` if you want to run your query on the first `n` rows
only. This can be a huge time saver in debugging queries.
By default, all query optimizations are enabled. Individual optimizations may
be disabled by setting the corresponding parameter to ``False``.
Parameters
----------
@@ -1650,21 +1650,38 @@ def collect(
Do projection pushdown optimization.
simplify_expression
Run simplify expressions optimization.
no_optimization
Turn off (certain) optimizations.
slice_pushdown
Slice pushdown optimization.
comm_subplan_elim
Will try to cache branching subplans that occur on self-joins or unions.
comm_subexpr_elim
Common subexpressions will be cached and reused.
no_optimization
Turn off (certain) optimizations.
streaming
Run parts of the query in a streaming fashion (this is in an alpha state)
Process the query in batches to handle larger-than-memory data.
If set to ``False`` (default), the entire query is processed in a single
batch.
.. warning::
This functionality is currently in an alpha state.
.. note::
Use :func:`explain` to see if Polars can process the query in streaming
mode.
Returns
-------
DataFrame
See Also
--------
fetch: Run the query on the first `n` rows only for debugging purposes.
explain : Print the query plan that is evaluated with collect.
profile : Collect the LazyFrame and time each node in the computation graph.
polars.collect_all : Collect multiple LazyFrames at the same time.
polars.Config.set_streaming_chunk_size : Set the size of streaming batches.
Examples
--------
>>> lf = pl.LazyFrame(
@@ -1674,7 +1691,23 @@ def collect(
... "c": [6, 5, 4, 3, 2, 1],
... }
... )
>>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).collect()
>>> lf.group_by("a").agg(pl.all().sum()).collect() # doctest: +SKIP
shape: (3, 3)
┌─────┬─────┬─────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 │
╞═════╪═════╪═════╡
│ a ┆ 4 ┆ 10 │
│ b ┆ 11 ┆ 10 │
│ c ┆ 6 ┆ 1 │
└─────┴─────┴─────┘
Collect in streaming mode
>>> lf.group_by("a").agg(pl.all().sum()).collect(
... streaming=True
... ) # doctest: +SKIP
shape: (3, 3)
┌─────┬─────┬─────┐
│ a ┆ b ┆ c │
@@ -1882,11 +1915,11 @@ def sink_parquet(
predicate_pushdown: bool = True,
projection_pushdown: bool = True,
simplify_expression: bool = True,
no_optimization: bool = False,
slice_pushdown: bool = True,
no_optimization: bool = False,
) -> DataFrame:
"""
Persists a LazyFrame at the provided path.
Evaluate the query in streaming mode and write to a Parquet file.
This allows streaming results that are larger than RAM to be written to disk.
@@ -1927,10 +1960,10 @@ def sink_parquet(
Do projection pushdown optimization.
simplify_expression
Run simplify expressions optimization.
no_optimization
Turn off (certain) optimizations.
slice_pushdown
Slice pushdown optimization.
no_optimization
Turn off (certain) optimizations.
Returns
-------
@@ -1947,8 +1980,8 @@ def sink_parquet(
predicate_pushdown=predicate_pushdown,
projection_pushdown=projection_pushdown,
simplify_expression=simplify_expression,
no_optimization=no_optimization,
slice_pushdown=slice_pushdown,
no_optimization=no_optimization,
)

return lf.sink_parquet(
Expand All @@ -1971,11 +2004,11 @@ def sink_ipc(
predicate_pushdown: bool = True,
projection_pushdown: bool = True,
simplify_expression: bool = True,
no_optimization: bool = False,
slice_pushdown: bool = True,
no_optimization: bool = False,
) -> DataFrame:
"""
Persists a LazyFrame at the provided path.
Evaluate the query in streaming mode and write to an IPC file.
This allows streaming results that are larger than RAM to be written to disk.
@@ -1997,10 +2030,10 @@ def sink_ipc(
Do projection pushdown optimization.
simplify_expression
Run simplify expressions optimization.
no_optimization
Turn off (certain) optimizations.
slice_pushdown
Slice pushdown optimization.
no_optimization
Turn off (certain) optimizations.
Returns
-------
@@ -2017,8 +2050,8 @@ def sink_ipc(
predicate_pushdown=predicate_pushdown,
projection_pushdown=projection_pushdown,
simplify_expression=simplify_expression,
no_optimization=no_optimization,
slice_pushdown=slice_pushdown,
no_optimization=no_optimization,
)

return lf.sink_ipc(
@@ -2048,11 +2081,11 @@ def sink_csv(
predicate_pushdown: bool = True,
projection_pushdown: bool = True,
simplify_expression: bool = True,
no_optimization: bool = False,
slice_pushdown: bool = True,
no_optimization: bool = False,
) -> DataFrame:
"""
Persists a LazyFrame at the provided path.
Evaluate the query in streaming mode and write to a CSV file.
This allows streaming results that are larger than RAM to be written to disk.
@@ -2116,10 +2149,10 @@ def sink_csv(
Do projection pushdown optimization.
simplify_expression
Run simplify expressions optimization.
no_optimization
Turn off (certain) optimizations.
slice_pushdown
Slice pushdown optimization.
no_optimization
Turn off (certain) optimizations.
Returns
-------
@@ -2141,8 +2174,8 @@ def sink_csv(
predicate_pushdown=predicate_pushdown,
projection_pushdown=projection_pushdown,
simplify_expression=simplify_expression,
no_optimization=no_optimization,
slice_pushdown=slice_pushdown,
no_optimization=no_optimization,
)

return lf.sink_csv(
Expand All @@ -2168,8 +2201,8 @@ def _set_sink_optimizations(
predicate_pushdown: bool = True,
projection_pushdown: bool = True,
simplify_expression: bool = True,
no_optimization: bool = False,
slice_pushdown: bool = True,
no_optimization: bool = False,
) -> PyLazyFrame:
if no_optimization:
predicate_pushdown = False
@@ -2449,7 +2482,9 @@ def clear(self, n: int = 0) -> LazyFrame:

def clone(self) -> Self:
"""
Very cheap deepcopy/clone.
Create a copy of this LazyFrame.
This is a cheap operation that does not copy data.
See Also
--------
@@ -2698,7 +2733,7 @@ def group_by(
maintain_order
Ensure that the order of the groups is consistent with the input data.
This is slower than a default group by.
Settings this to ``True`` blocks the possibility
Setting this to ``True`` blocks the possibility
to run on the streaming engine.
Examples
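Tying together the `collect(streaming=True)` and `sink_*` docstrings changed above, a sketch of the streaming workflow (paths hypothetical; assumes the aggregation is supported by the streaming engine):

import polars as pl

lf = pl.scan_csv("data/very_large.csv")
query = lf.group_by("key").agg(pl.all().sum())

# Inspect the plan first; per the note in the collect() docstring,
# explain(streaming=True) shows which part of the plan can run in the
# streaming engine.
print(query.explain(streaming=True))

# Materialize in batches, or stream straight to disk when the result
# itself may not fit in memory.
df = query.collect(streaming=True)
query.sink_parquet("aggregated.parquet")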
4 changes: 3 additions & 1 deletion py-polars/polars/series/series.py
@@ -4499,7 +4499,9 @@ def clear(self, n: int = 0) -> Series:

def clone(self) -> Self:
"""
Very cheap deepcopy/clone.
Create a copy of this Series.
This is a cheap operation that does not copy data.
See Also
--------
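And the Series counterpart of the `clone` change, as a small sketch (not from the diff): the clone shares buffers with the original, so mutating one afterwards does not affect the other.

import polars as pl

s = pl.Series("a", [1, 2, 3])
s_copy = s.clone()

# Appending to the clone leaves the original untouched.
s_copy.append(pl.Series("a", [4]))
print(s.len(), s_copy.len())  # 3 4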
