
feat: Default rechunk=False for read_parquet (#16427)
ritchie46 committed May 23, 2024
1 parent: ceb895e · commit: 12f7bd7
Showing 1 changed file with 3 additions and 11 deletions.
py-polars/polars/io/parquet/functions.py (3 additions, 11 deletions)
@@ -43,7 +43,7 @@ def read_parquet(
     hive_partitioning: bool = True,
     glob: bool = True,
     hive_schema: SchemaDict | None = None,
-    rechunk: bool = True,
+    rechunk: bool = False,
     low_memory: bool = False,
     storage_options: dict[str, Any] | None = None,
     retries: int = 0,
@@ -130,14 +130,6 @@ def read_parquet(
     --------
     scan_parquet
     scan_pyarrow_dataset
-
-    Notes
-    -----
-    * When benchmarking:
-      This operation defaults to a `rechunk` operation at the end, meaning that all
-      data will be stored continuously in memory. Set `rechunk=False` if you are
-      benchmarking the parquet-reader as `rechunk` can be an expensive operation
-      that should not contribute to the timings.
     """
     if hive_schema is not None:
         msg = "The `hive_schema` parameter of `read_parquet` is considered unstable."
@@ -242,7 +234,7 @@ def _read_parquet_binary(
     row_index_offset: int = 0,
     parallel: ParallelStrategy = "auto",
     use_statistics: bool = True,
-    rechunk: bool = True,
+    rechunk: bool = False,
     low_memory: bool = False,
 ) -> DataFrame:
     projection, columns = parse_columns_arg(columns)
@@ -417,7 +409,7 @@ def _scan_parquet_impl(
     n_rows: int | None = None,
     cache: bool = True,
     parallel: ParallelStrategy = "auto",
-    rechunk: bool = True,
+    rechunk: bool = False,
     row_index_name: str | None = None,
     row_index_offset: int = 0,
     storage_options: dict[str, object] | None = None,
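
In practice, the change means `pl.read_parquet` no longer concatenates the chunks produced by the reader (typically one per row group) into a single contiguous buffer by default. A minimal sketch of the difference, assuming a hypothetical local file `data.parquet` and illustrative chunk counts:

import polars as pl

# Hypothetical file path, used only for illustration.
path = "data.parquet"

# New default (rechunk=False): the DataFrame may keep several chunks,
# e.g. one per row group read from the file.
df = pl.read_parquet(path)
print(df.n_chunks())  # can be greater than 1 for multi-row-group files

# To restore the previous behavior, rechunk explicitly, either at read
# time or afterwards on the DataFrame itself.
df_contiguous = pl.read_parquet(path, rechunk=True)
df_contiguous = df.rechunk()

Skipping the rechunk avoids an extra copy of the data, which is presumably why the docstring's benchmarking note (advising `rechunk=False` to keep the rechunk cost out of reader timings) became obsolete and was removed.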
