parquet statistics, don't skip if a column has no stats (#2532)
ritchie46 committed Feb 3, 2022
1 parent a8a1938 commit 21f287a
Showing 3 changed files with 46 additions and 2 deletions.
12 changes: 10 additions & 2 deletions polars/polars-io/src/parquet/read_impl.rs
@@ -51,8 +51,12 @@ pub fn read_parquet<R: MmapBytesReader>(
         if let Some(pred) = &predicate {
             if let Some(pred) = pred.as_stats_evaluator() {
                 if let Some(stats) = collect_statistics(md.columns(), schema)? {
-                    if !pred.should_read(&stats)? {
+                    let should_read = pred.should_read(&stats);
+                    // a parquet file may not have statistics of all columns
+                    if matches!(should_read, Ok(false)) {
                         continue;
+                    } else if !matches!(should_read, Err(PolarsError::NotFound(_))) {
+                        let _ = should_read?;
                     }
                 }
             }
@@ -157,8 +161,12 @@ pub(crate) fn parallel_read<R: MmapBytesReader>(
         if let Some(pred) = &predicate {
             if let Some(pred) = pred.as_stats_evaluator() {
                 if let Some(stats) = collect_statistics(md.columns(), arrow_schema)? {
-                    if !pred.should_read(&stats)? {
+                    let should_read = pred.should_read(&stats);
+                    // a parquet file may not have statistics of all columns
+                    if matches!(should_read, Ok(false)) {
                         continue;
+                    } else if !matches!(should_read, Err(PolarsError::NotFound(_))) {
+                        let _ = should_read?;
                     }
                 }
             }
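Both hunks encode the same three-way decision about a row group. The sketch below is a reading aid only: the decision written as an explicit match over the result of the statistics check, with StatsError as a hypothetical stand-in for PolarsError and keep_row_group as an invented helper that does not exist in the codebase.

// A minimal, self-contained sketch of the per-row-group decision made in both
// hunks, written as an explicit `match` instead of the `matches!` chains.
// `StatsError` is a hypothetical stand-in for `PolarsError`; `keep_row_group`
// stands in for the inlined handling of `pred.should_read(&stats)`.
#[derive(Debug)]
enum StatsError {
    // corresponds to PolarsError::NotFound: a column has no statistics
    NotFound(String),
    // any other failure while evaluating the predicate against statistics
    Other(String),
}

fn keep_row_group(should_read: Result<bool, StatsError>) -> Result<bool, StatsError> {
    match should_read {
        // statistics prove the predicate cannot match: skip the row group
        Ok(false) => Ok(false),
        // statistics cannot rule the row group out: read it
        Ok(true) => Ok(true),
        // a column has no statistics: do not error, read the row group anyway
        Err(StatsError::NotFound(_)) => Ok(true),
        // every other error is propagated to the caller
        Err(other) => Err(other),
    }
}

fn main() {
    // missing statistics no longer abort the scan; the row group is read
    assert_eq!(
        keep_row_group(Err(StatsError::NotFound("book".into()))).map_err(|_| ()),
        Ok(true)
    );
    // a predicate that statistics can disprove still skips the row group
    assert_eq!(keep_row_group(Ok(false)).map_err(|_| ()), Ok(false));
    // other errors still surface
    assert!(keep_row_group(Err(StatsError::Other("corrupt stats".into()))).is_err());
}

The NotFound arm carries the behavioral change: in the old code a missing column statistic would propagate through the `?` operator, while the new code reads the row group instead.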
1 change: 1 addition & 0 deletions py-polars/tests/lazy_io/.gitignore
@@ -0,0 +1 @@
*.parquet
35 changes: 35 additions & 0 deletions py-polars/tests/lazy_io/test_parquet.py
@@ -0,0 +1,35 @@
from os import path

import polars as pl


def test_categorical_parquet_statistics() -> None:
    file = path.join(path.dirname(__file__), "books.parquet")
    (
        pl.DataFrame(
            {
                "book": [
                    "bookA",
                    "bookA",
                    "bookB",
                    "bookA",
                    "bookA",
                    "bookC",
                    "bookC",
                    "bookC",
                ],
                "transaction_id": [1, 2, 3, 4, 5, 6, 7, 8],
                "user": ["bob", "bob", "bob", "tim", "lucy", "lucy", "lucy", "lucy"],
            }
        )
        .with_column(pl.col("book").cast(pl.Categorical))
        .to_parquet(file, statistics=True)
    )

    for par in [True, False]:
        df = (
            pl.scan_parquet(file, parallel=par)
            .filter(pl.col("book") == "bookA")
            .collect()
        )
        assert df.shape == (4, 3)
