expose writing parquet statistics (#2320)
ritchie46 committed Jan 8, 2022
1 parent eee62da commit 30f4608
Showing 3 changed files with 19 additions and 4 deletions.
9 changes: 8 additions & 1 deletion polars/polars-io/src/parquet/write.rs
@@ -45,6 +45,7 @@ impl FallibleStreamingIterator for Bla {
 pub struct ParquetWriter<W> {
     writer: W,
     compression: write::Compression,
+    statistics: bool,
 }
 
 pub use write::Compression as ParquetCompression;
@@ -61,6 +62,7 @@ where
         ParquetWriter {
             writer,
             compression: write::Compression::Snappy,
+            statistics: false,
         }
     }

@@ -70,13 +72,18 @@ where
         self
     }
 
+    pub fn with_statistics(mut self, statistics: bool) -> Self {
+        self.statistics = statistics;
+        self
+    }
+
     /// Write the given DataFrame to the writer `W`.
     pub fn finish(mut self, df: &DataFrame) -> Result<()> {
         let fields = df.schema().to_arrow().fields().clone();
         let rb_iter = df.iter_chunks();
 
         let options = write::WriteOptions {
-            write_statistics: false,
+            write_statistics: self.statistics,
             compression: self.compression,
             version: write::Version::V2,
         };
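For reference, a minimal sketch of how the new builder flag is used from Rust. The frame contents and file path are illustrative, and it assumes the `parquet` feature is enabled so that `ParquetWriter` and `ParquetCompression` are re-exported through the prelude:

```rust
use std::fs::File;

use polars::prelude::*;

fn main() -> Result<()> {
    // Any DataFrame works; this one is illustrative.
    let df = df!(
        "a" => &[1i64, 2, 3],
        "b" => &["x", "y", "z"]
    )?;

    let file = File::create("statistics.parquet").expect("could not create file");

    // Statistics stay off by default (`statistics: false` in `new`),
    // matching the previously hard-coded `write_statistics: false`;
    // opting in costs extra compute when the column chunks are written.
    ParquetWriter::new(file)
        .with_compression(ParquetCompression::Snappy)
        .with_statistics(true)
        .finish(&df)
}
```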
11 changes: 9 additions & 2 deletions py-polars/polars/internals/frame.py
@@ -1068,6 +1068,7 @@ def to_parquet(
                 str,
             ]
         ] = "snappy",
+        statistics: bool = False,
         use_pyarrow: bool = False,
         **kwargs: Any,
     ) -> None:
@@ -1087,6 +1088,8 @@
             - "brotli"
             - "lz4"
             - "zstd"
+        statistics
+            Write statistics to the parquet headers. This requires extra compute.
         use_pyarrow
             Use the C++ parquet implementation instead of the Rust one.
             At the moment the C++ implementation supports more features.
@@ -1119,10 +1122,14 @@
             tbl = pa.table(data)
 
             pa.parquet.write_table(
-                table=tbl, where=file, compression=compression, **kwargs
+                table=tbl,
+                where=file,
+                compression=compression,
+                write_statistics=statistics,
+                **kwargs,
             )
         else:
-            self._df.to_parquet(file, compression)
+            self._df.to_parquet(file, compression, statistics)
 
     def to_numpy(self) -> np.ndarray:
         """
3 changes: 2 additions & 1 deletion py-polars/src/dataframe.rs
@@ -350,7 +350,7 @@ impl PyDataFrame {
     }
 
     #[cfg(feature = "parquet")]
-    pub fn to_parquet(&self, py_f: PyObject, compression: &str) -> PyResult<()> {
+    pub fn to_parquet(&self, py_f: PyObject, compression: &str, statistics: bool) -> PyResult<()> {
         let compression = match compression {
             "uncompressed" => ParquetCompression::Uncompressed,
             "snappy" => ParquetCompression::Snappy,
@@ -365,6 +365,7 @@
 
         ParquetWriter::new(buf)
             .with_compression(compression)
+            .with_statistics(statistics)
             .finish(&self.df)
             .map_err(PyPolarsEr::from)?;
         Ok(())
