perf: Fix pathological small chunk parquet writing (#16433)
ritchie46 committed May 23, 2024
1 parent 462bc8b commit 717277e
Showing 6 changed files with 53 additions and 11 deletions.
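
For context, a minimal reproduction of the pathological case this commit targets, sketched against the public Rust API (a sketch, assuming the polars crate with its parquet feature; the column name, chunk counts, and output path are illustrative):

    use polars::prelude::*;

    fn main() -> PolarsResult<()> {
        // Build a frame out of 1000 one-row chunks. `vstack_mut` appends
        // the other frame's chunks instead of copying rows, mimicking an
        // ingest path that produces many tiny record batches.
        let unit = df!("a" => &[1i64])?;
        let mut frame = unit.clone();
        for _ in 0..999 {
            frame.vstack_mut(&unit)?;
        }

        // Before this commit the writer processed each tiny chunk on its
        // own; with it, small chunks are accumulated up to the row group
        // size before being written.
        let file = std::fs::File::create("/tmp/small_chunks.parquet")?;
        ParquetWriter::new(file)
            .with_row_group_size(Some(512))
            .finish(&mut frame)?;
        Ok(())
    }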
2 changes: 1 addition & 1 deletion crates/polars-core/src/frame/chunks.rs
@@ -22,7 +22,7 @@ impl TryFrom<(RecordBatch, &[ArrowField])> for DataFrame {
 }
 
 impl DataFrame {
-    pub fn split_chunks(mut self) -> impl Iterator<Item = DataFrame> {
+    pub fn split_chunks(&mut self) -> impl Iterator<Item = DataFrame> + '_ {
         self.align_chunks();
 
         (0..self.n_chunks()).map(move |i| unsafe {
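
The change above lets split_chunks borrow the frame (&mut self) and tie the returned iterator to that borrow (+ '_) instead of consuming the DataFrame, which is what allows chunk_df_for_writing below to iterate per-chunk frames through a &mut DataFrame. A standalone sketch of the pattern on a toy type (illustrative, not the polars source):

    struct Frame {
        chunks: Vec<Vec<i64>>,
    }

    impl Frame {
        // Borrow mutably and tie the iterator's lifetime to that borrow
        // with `+ '_`, mirroring the new `split_chunks` signature.
        fn split_chunks(&mut self) -> impl Iterator<Item = Vec<i64>> + '_ {
            self.chunks.iter().cloned()
        }
    }

    fn main() {
        let mut f = Frame { chunks: vec![vec![1, 2], vec![3]] };
        for chunk in f.split_chunks() {
            println!("{chunk:?}");
        }
        // `f` is still owned and usable here; the old consuming
        // signature (`mut self`) would have moved it away.
        assert_eq!(f.chunks.len(), 2);
    }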
51 changes: 48 additions & 3 deletions crates/polars-io/src/utils.rs
@@ -276,11 +276,56 @@ pub(crate) fn chunk_df_for_writing(
     // ensures all chunks are aligned.
     df.align_chunks();
 
+    // Accumulate many small chunks to the row group size.
+    // See: #16403
+    if !df.get_columns().is_empty()
+        && df.get_columns()[0]
+            .chunk_lengths()
+            .take(5)
+            .all(|len| len < row_group_size)
+    {
+        fn finish(scratch: &mut Vec<DataFrame>, new_chunks: &mut Vec<DataFrame>) {
+            let mut new = accumulate_dataframes_vertical_unchecked(scratch.drain(..));
+            new.as_single_chunk_par();
+            new_chunks.push(new);
+        }
+
+        let mut new_chunks = Vec::with_capacity(df.n_chunks()); // upper limit
+        let mut scratch = vec![];
+        let mut remaining = row_group_size;
+
+        for df in df.split_chunks() {
+            remaining = remaining.saturating_sub(df.height());
+            scratch.push(df);
+
+            if remaining == 0 {
+                remaining = row_group_size;
+                finish(&mut scratch, &mut new_chunks);
+            }
+        }
+        if !scratch.is_empty() {
+            finish(&mut scratch, &mut new_chunks);
+        }
+
+        return Ok(Cow::Owned(accumulate_dataframes_vertical_unchecked(
+            new_chunks,
+        )));
+    }
+
     let n_splits = df.height() / row_group_size;
     let result = if n_splits > 0 {
-        Cow::Owned(accumulate_dataframes_vertical_unchecked(split_df_as_ref(
-            df, n_splits, false,
-        )))
+        let mut splits = split_df_as_ref(df, n_splits, false);
+
+        for df in splits.iter_mut() {
+            // If the chunks are small enough, writing many small chunks
+            // leads to slow writing performance, so in that case we
+            // merge them.
+            let n_chunks = df.n_chunks();
+            if n_chunks > 1 && (df.estimated_size() / n_chunks < 128 * 1024) {
+                df.as_single_chunk_par();
+            }
+        }
+
+        Cow::Owned(accumulate_dataframes_vertical_unchecked(splits))
     } else {
         Cow::Borrowed(df)
     };
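
The accumulation loop above is easier to follow on plain chunk lengths. Below is a standalone sketch of the same scratch/remaining bookkeeping (illustrative names, not the polars API); the later estimated_size() / n_chunks < 128 * 1024 branch applies the same merge-small-chunks idea on the split path, keyed on average bytes per chunk instead of row counts:

    // Group chunk lengths into batches of at least `row_group_size` rows,
    // mirroring the scratch/remaining loop in `chunk_df_for_writing`.
    fn group_chunks(chunk_lens: &[usize], row_group_size: usize) -> Vec<Vec<usize>> {
        let mut groups = Vec::new();
        let mut scratch = Vec::new();
        let mut remaining = row_group_size;

        for &len in chunk_lens {
            remaining = remaining.saturating_sub(len);
            scratch.push(len);
            if remaining == 0 {
                // A full row group's worth of rows is buffered: flush it.
                groups.push(std::mem::take(&mut scratch));
                remaining = row_group_size;
            }
        }
        // Flush whatever is left as a final, possibly short, group.
        if !scratch.is_empty() {
            groups.push(scratch);
        }
        groups
    }

    fn main() {
        // Ten 1-row chunks with a row group size of 4 become
        // [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1]].
        assert_eq!(
            group_chunks(&[1; 10], 4),
            vec![vec![1; 4], vec![1; 4], vec![1; 2]]
        );
    }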
2 changes: 1 addition & 1 deletion crates/polars-lazy/src/physical_plan/executors/filter.rs
@@ -63,7 +63,7 @@ impl FilterExec {
 
     fn execute_impl(
         &mut self,
-        df: DataFrame,
+        mut df: DataFrame,
         state: &mut ExecutionState,
     ) -> PolarsResult<DataFrame> {
         let n_partitions = POOL.current_num_threads();
2 changes: 1 addition & 1 deletion crates/polars-lazy/src/physical_plan/executors/scan/csv.rs
@@ -64,7 +64,7 @@ impl CsvExec {
         for i in 0..self.paths.len() {
             let path = &self.paths[i];
 
-            let df = options_base
+            let mut df = options_base
                 .clone()
                 .with_row_index(self.file_options.row_index.clone().map(|mut ri| {
                     ri.offset += n_rows_read as IdxSize;
2 changes: 1 addition & 1 deletion py-polars/tests/unit/io/test_lazy_parquet.py
@@ -251,7 +251,7 @@ def test_parquet_statistics(monkeypatch: Any, capfd: Any, tmp_path: Path) -> None:
     assert df.n_chunks("all") == [4, 4]
 
     file_path = tmp_path / "stats.parquet"
-    df.write_parquet(file_path, statistics=True, use_pyarrow=False)
+    df.write_parquet(file_path, statistics=True, use_pyarrow=False, row_group_size=50)
 
     for pred in [
         pl.col("idx") < 50,
5 changes: 1 addition & 4 deletions py-polars/tests/unit/io/test_parquet.py
@@ -93,14 +93,11 @@ def small_parquet_path(io_files_path: Path) -> Path:
 def test_to_from_buffer(
     df: pl.DataFrame, compression: ParquetCompression, use_pyarrow: bool
 ) -> None:
-    print(df)
     df = df[["list_str"]]
-    print(df)
     buf = io.BytesIO()
     df.write_parquet(buf, compression=compression, use_pyarrow=use_pyarrow)
     buf.seek(0)
     read_df = pl.read_parquet(buf, use_pyarrow=use_pyarrow)
-    print(read_df)
     assert_frame_equal(df, read_df, categorical_as_str=True)
 
 
@@ -113,7 +110,7 @@ def test_read_parquet_respects_rechunk_16416(
     df = pl.DataFrame({"a": [1]})
     df = pl.concat([df, df, df])
     buf = io.BytesIO()
-    df.write_parquet(buf)
+    df.write_parquet(buf, row_group_size=1)
     buf.seek(0)
 
     rechunk, expected_chunks = rechunk_and_expected_chunks
