parquet statistics, don't skip if a column has no stats (#2532)
ritchie46 committed Feb 3, 2022
1 parent a8a1938 commit 21f287a
Showing 3 changed files with 46 additions and 2 deletions.
12 changes: 10 additions & 2 deletions polars/polars-io/src/parquet/read_impl.rs
@@ -51,8 +51,12 @@ pub fn read_parquet<R: MmapBytesReader>(
         if let Some(pred) = &predicate {
             if let Some(pred) = pred.as_stats_evaluator() {
                 if let Some(stats) = collect_statistics(md.columns(), schema)? {
-                    if !pred.should_read(&stats)? {
+                    let should_read = pred.should_read(&stats);
+                    // a parquet file may not have statistics of all columns
+                    if matches!(should_read, Ok(false)) {
                         continue;
+                    } else if !matches!(should_read, Err(PolarsError::NotFound(_))) {
+                        let _ = should_read?;
                     }
                 }
             }
@@ -157,8 +161,12 @@ pub(crate) fn parallel_read<R: MmapBytesReader>(
         if let Some(pred) = &predicate {
             if let Some(pred) = pred.as_stats_evaluator() {
                 if let Some(stats) = collect_statistics(md.columns(), arrow_schema)? {
-                    if !pred.should_read(&stats)? {
+                    let should_read = pred.should_read(&stats);
+                    // a parquet file may not have statistics of all columns
+                    if matches!(should_read, Ok(false)) {
                         continue;
+                    } else if !matches!(should_read, Err(PolarsError::NotFound(_))) {
+                        let _ = should_read?;
                     }
                 }
             }
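Both hunks encode the same three-way decision about a row group. The sketch below is a reading aid only: the decision written as an explicit match over the result of the statistics check, with StatsError as a hypothetical stand-in for PolarsError and keep_row_group as an invented helper that does not exist in the codebase.

// A minimal, self-contained sketch of the per-row-group decision made in both
// hunks, written as an explicit `match` instead of the `matches!` chains.
// `StatsError` is a hypothetical stand-in for `PolarsError`; `keep_row_group`
// stands in for the inlined handling of `pred.should_read(&stats)`.
#[derive(Debug)]
enum StatsError {
    // corresponds to PolarsError::NotFound: a column has no statistics
    NotFound(String),
    // any other failure while evaluating the predicate against statistics
    Other(String),
}

fn keep_row_group(should_read: Result<bool, StatsError>) -> Result<bool, StatsError> {
    match should_read {
        // statistics prove the predicate cannot match: skip the row group
        Ok(false) => Ok(false),
        // statistics cannot rule the row group out: read it
        Ok(true) => Ok(true),
        // a column has no statistics: do not error, read the row group anyway
        Err(StatsError::NotFound(_)) => Ok(true),
        // every other error is propagated to the caller
        Err(other) => Err(other),
    }
}

fn main() {
    // missing statistics no longer abort the scan; the row group is read
    assert_eq!(
        keep_row_group(Err(StatsError::NotFound("book".into()))).map_err(|_| ()),
        Ok(true)
    );
    // a predicate that statistics can disprove still skips the row group
    assert_eq!(keep_row_group(Ok(false)).map_err(|_| ()), Ok(false));
    // other errors still surface
    assert!(keep_row_group(Err(StatsError::Other("corrupt stats".into()))).is_err());
}

The NotFound arm carries the behavioral change: in the old code a missing column statistic would propagate through the `?` operator, while the new code reads the row group instead.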
1 change: 1 addition & 0 deletions py-polars/tests/lazy_io/.gitignore
@@ -0,0 +1 @@
*.parquet
35 changes: 35 additions & 0 deletions py-polars/tests/lazy_io/test_parquet.py
@@ -0,0 +1,35 @@
from os import path

import polars as pl


def test_categorical_parquet_statistics() -> None:
    file = path.join(path.dirname(__file__), "books.parquet")
    (
        pl.DataFrame(
            {
                "book": [
                    "bookA",
                    "bookA",
                    "bookB",
                    "bookA",
                    "bookA",
                    "bookC",
                    "bookC",
                    "bookC",
                ],
                "transaction_id": [1, 2, 3, 4, 5, 6, 7, 8],
                "user": ["bob", "bob", "bob", "tim", "lucy", "lucy", "lucy", "lucy"],
            }
        )
        .with_column(pl.col("book").cast(pl.Categorical))
        .to_parquet(file, statistics=True)
    )

    for par in [True, False]:
        df = (
            pl.scan_parquet(file, parallel=par)
            .filter(pl.col("book") == "bookA")
            .collect()
        )
        assert df.shape == (4, 3)
