
feat: Default rechunk=False for read_parquet (#16427)
ritchie46 committed May 23, 2024
1 parent: ceb895e · commit: 12f7bd7
Showing 1 changed file with 3 additions and 11 deletions.
py-polars/polars/io/parquet/functions.py (3 additions, 11 deletions)
@@ -43,7 +43,7 @@ def read_parquet(
     hive_partitioning: bool = True,
     glob: bool = True,
     hive_schema: SchemaDict | None = None,
-    rechunk: bool = True,
+    rechunk: bool = False,
     low_memory: bool = False,
     storage_options: dict[str, Any] | None = None,
     retries: int = 0,
@@ -130,14 +130,6 @@ def read_parquet(
     --------
     scan_parquet
     scan_pyarrow_dataset
-
-    Notes
-    -----
-    * When benchmarking:
-      This operation defaults to a `rechunk` operation at the end, meaning that all
-      data will be stored continuously in memory. Set `rechunk=False` if you are
-      benchmarking the parquet-reader as `rechunk` can be an expensive operation
-      that should not contribute to the timings.
     """
     if hive_schema is not None:
         msg = "The `hive_schema` parameter of `read_parquet` is considered unstable."
@@ -242,7 +234,7 @@ def _read_parquet_binary(
     row_index_offset: int = 0,
     parallel: ParallelStrategy = "auto",
     use_statistics: bool = True,
-    rechunk: bool = True,
+    rechunk: bool = False,
     low_memory: bool = False,
 ) -> DataFrame:
     projection, columns = parse_columns_arg(columns)
@@ -417,7 +409,7 @@ def _scan_parquet_impl(
     n_rows: int | None = None,
     cache: bool = True,
     parallel: ParallelStrategy = "auto",
-    rechunk: bool = True,
+    rechunk: bool = False,
     row_index_name: str | None = None,
     row_index_offset: int = 0,
     storage_options: dict[str, object] | None = None,
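
In practice, the change means `pl.read_parquet` no longer concatenates the chunks produced by the reader (typically one per row group) into a single contiguous buffer by default. A minimal sketch of the difference, assuming a hypothetical local file `data.parquet` and illustrative chunk counts:

import polars as pl

# Hypothetical file path, used only for illustration.
path = "data.parquet"

# New default (rechunk=False): the DataFrame may keep several chunks,
# e.g. one per row group read from the file.
df = pl.read_parquet(path)
print(df.n_chunks())  # can be greater than 1 for multi-row-group files

# To restore the previous behavior, rechunk explicitly, either at read
# time or afterwards on the DataFrame itself.
df_contiguous = pl.read_parquet(path, rechunk=True)
df_contiguous = df.rechunk()

Skipping the rechunk avoids an extra copy of the data, which is presumably why the docstring's benchmarking note (advising `rechunk=False` to keep the rechunk cost out of reader timings) became obsolete and was removed.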
