fix[rust]: don't divide by zero on parquet write row_groups > df.size

ritchie46 committed Aug 14, 2022
1 parent 98e46ba commit 8684945

Showing 4 changed files with 20 additions and 3 deletions.
3 changes: 3 additions & 0 deletions polars/polars-core/src/utils/mod.rs
```diff
@@ -158,6 +158,9 @@ fn flatten_df(df: &DataFrame) -> impl Iterator<Item = DataFrame> + '_ {
 #[cfg(feature = "private")]
 #[doc(hidden)]
 pub fn split_df(df: &DataFrame, n: usize) -> Result<Vec<DataFrame>> {
+    if n == 0 {
+        return Ok(vec![df.clone()]);
+    }
     let total_len = df.height();
     let chunk_size = total_len / n;
 
```
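The early return matters because of the line right after it: `chunk_size = total_len / n` panics in Rust when `n == 0`. A minimal Python sketch of the same chunking arithmetic, with the guard in place (`split_ranges` is a hypothetical mirror for illustration, not the polars API):

```python
def split_ranges(total_len: int, n: int) -> list[range]:
    # Hypothetical mirror of split_df's chunking logic.
    if n == 0:
        # Without this guard, `total_len // n` below raises
        # ZeroDivisionError (the Rust integer division panics likewise).
        return [range(0, total_len)]
    chunk_size = total_len // n
    ranges = []
    for i in range(n):
        start = i * chunk_size
        # The last chunk absorbs the remainder rows.
        stop = total_len if i == n - 1 else start + chunk_size
        ranges.append(range(start, stop))
    return ranges
```

With the guard, `split_ranges(3, 0)` returns the whole row range as a single chunk, matching the new `Ok(vec![df.clone()])` early return.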
7 changes: 5 additions & 2 deletions polars/polars-io/src/parquet/write.rs
```diff
@@ -55,7 +55,7 @@ where
         self
     }
 
-    /// Set the row group size during writing. This can reduce memory pressure and improve
+    /// Set the row group size (in number of rows) during writing. This can reduce memory pressure and improve
     /// writing performance.
    pub fn with_row_group_size(mut self, size: Option<usize>) -> Self {
        self.row_group_size = size;
@@ -68,7 +68,10 @@
        df.rechunk();
 
        if let Some(n) = self.row_group_size {
-            *df = accumulate_dataframes_vertical_unchecked(split_df(df, df.height() / n)?);
+            let n_splits = df.height() / n;
+            if n_splits > 0 {
+                *df = accumulate_dataframes_vertical_unchecked(split_df(df, n_splits)?);
+            }
        };
 
        let fields = df.schema().to_arrow().fields;
```
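Integer division is what made the old call dangerous: `df.height() / n` truncates to 0 whenever the requested row group size exceeds the frame's height, and that 0 was forwarded straight into `split_df`. A small Python sketch of the fixed control flow (the `effective_splits` name is illustrative, not the Rust API):

```python
def effective_splits(height: int, row_group_size: int) -> int:
    # Truncating division: 3 rows with row_group_size=1024 yields 0 splits.
    # The fix skips re-chunking entirely instead of calling split_df(df, 0).
    return height // row_group_size

assert effective_splits(100_000, 1024) == 97  # normal case: re-chunk
assert effective_splits(3, 1024) == 0         # saturated: leave df as-is
```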
3 changes: 2 additions & 1 deletion py-polars/polars/internals/frame.py
```diff
@@ -1333,7 +1333,8 @@ def write_parquet(
         statistics
             Write statistics to the parquet headers. This requires extra compute.
         row_group_size
-            Size of the row groups. If None (default), the chunks of the `DataFrame` are
+            Size of the row groups in number of rows.
+            If None (default), the chunks of the `DataFrame` are
             used. Writing in smaller chunks may reduce memory pressure and improve
             writing speeds. This argument has no effect if 'pyarrow' is used.
         use_pyarrow
```
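For reference, a minimal usage sketch of the documented parameter (the buffer and sizes are arbitrary):

```python
import io

import polars as pl

df = pl.DataFrame({"a": list(range(10_000))})
buf = io.BytesIO()
# Split the 10_000 rows into row groups of roughly 1_000 rows each;
# row_group_size=None would keep the DataFrame's existing chunks.
df.write_parquet(buf, row_group_size=1_000)
```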
10 changes: 10 additions & 0 deletions py-polars/tests/io/test_parquet.py
```diff
@@ -222,3 +222,13 @@ def test_nested_dictionary() -> None:
 
     read_df = pl.read_parquet(f)
     assert df.frame_equal(read_df)
+
+
+def test_row_group_size_saturation() -> None:
+    df = pl.DataFrame({"a": [1, 2, 3]})
+    f = io.BytesIO()
+
+    # request larger chunk than rows in df
+    df.write_parquet(f, row_group_size=1024)
+    f.seek(0)
+    assert pl.read_parquet(f).frame_equal(df)
```
