Skip to content

Commit

Permalink
add bytes_estimate for binary push in parquet deserialize (jorgecarle…
Browse files Browse the repository at this point in the history
  • Loading branch information
sundy-li committed Dec 12, 2022
1 parent 1417f88 commit 1fcfd7c
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 3 deletions.
8 changes: 6 additions & 2 deletions src/io/parquet/read/deserialize/binary/basic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -477,9 +477,13 @@ impl<'a, O: Offset> utils::Decoder<'a> for BinaryDecoder<O> {

pub(super) fn finish<O: Offset, A: TraitBinaryArray<O>>(
data_type: &DataType,
values: Binary<O>,
validity: MutableBitmap,
mut values: Binary<O>,
mut validity: MutableBitmap,
) -> Result<A> {
values.offsets.shrink_to_fit();
values.values.shrink_to_fit();
validity.shrink_to_fit();

A::try_new(
data_type.clone(),
values.offsets.into(),
Expand Down
10 changes: 9 additions & 1 deletion src/io/parquet/read/deserialize/binary/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,20 @@ impl<O: Offset> Binary<O> {
/// Creates an empty `Binary` whose offsets buffer is requested with room for
/// `capacity` entries and whose values buffer is pre-allocated assuming
/// 24 bytes per value.
///
/// NOTE(review): this listing is a rendered diff with the `+`/`-` markers
/// stripped, so both versions of the `values:` initializer appear below. The
/// hunk header reports 1 deletion for this file; presumably the first
/// `values:` line is the removed one and the `capacity.min(100)` line is its
/// replacement — confirm against the actual file.
pub fn with_capacity(capacity: usize) -> Self {
Self {
offsets: Offsets::with_capacity(capacity),
// Presumably the removed version: reserves 24 bytes for every requested row.
values: Vec::with_capacity(capacity * 24),
// Presumably the added version: the up-front reservation is capped at
// 100 rows' worth (24 bytes each), i.e. at most 2400 bytes.
values: Vec::with_capacity(capacity.min(100) * 24),
}
}

/// Appends `v` as the next binary value.
///
/// When `self.offsets.len()` reaches 100 and the offsets buffer was allocated
/// for more than 100 entries, the average value size seen so far is used to
/// estimate the total number of bytes needed for `self.offsets.capacity()`
/// values, and the values buffer is grown once to that estimate instead of
/// being re-grown repeatedly by later pushes.
///
/// # Panics
///
/// Panics if `Offsets::try_push_usize` returns an error for `v.len()`.
#[inline]
pub fn push(&mut self, v: &[u8]) {
    // NOTE(review): the divisor below assumes 100 values have been written when
    // `offsets.len() == 100`; confirm what `Offsets::len` counts.
    if self.offsets.len() == 100 && self.offsets.capacity() > 100 {
        // `+ 1` rounds the average up so the estimate is never zero bytes per row.
        let bytes_per_row = self.values.len() / 100 + 1;
        let bytes_estimate = bytes_per_row * self.offsets.capacity();
        if bytes_estimate > self.values.capacity() {
            // `Vec::reserve(additional)` guarantees `capacity >= len + additional`,
            // so the headroom must be measured from `len()`. Measuring it from
            // `capacity()` under-reserves whenever the buffer is not full.
            // No underflow: `bytes_estimate > capacity() >= len()`.
            self.values.reserve(bytes_estimate - self.values.len());
        }
    }

    self.values.extend_from_slice(v);
    self.offsets.try_push_usize(v.len()).unwrap()
}
Expand Down

0 comments on commit 1fcfd7c

Please sign in to comment.