From d09164f39713363184330a68302ac7f3cb0db8d7 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 13 Nov 2022 05:18:53 +0100 Subject: [PATCH] Improved ZipValidity iterators (#1284) --- src/array/binary/mod.rs | 2 +- src/array/boolean/iterator.rs | 3 ++- src/array/boolean/mod.rs | 5 +---- src/array/dictionary/mod.rs | 5 +---- src/array/fixed_size_binary/iterator.rs | 7 ++----- src/array/fixed_size_list/iterator.rs | 5 +---- src/array/list/iterator.rs | 5 +---- src/array/map/iterator.rs | 5 +---- src/array/primitive/iterator.rs | 3 ++- src/array/primitive/mod.rs | 5 +---- src/array/struct_/iterator.rs | 5 +---- src/array/utf8/mod.rs | 2 +- src/bitmap/utils/zip_validity.rs | 19 ++++++++++++++++++- src/io/avro/write/serialize.rs | 4 ++-- src/io/json/write/serialize.rs | 7 ++----- 15 files changed, 37 insertions(+), 45 deletions(-) diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index 24d91057c0f..e7a4e9fe8c0 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -119,7 +119,7 @@ impl BinaryArray { /// Returns an iterator of `Option<&[u8]>` over every element of this array. pub fn iter(&self) -> ZipValidity<&[u8], BinaryValueIter, BitmapIter> { - ZipValidity::new(self.values_iter(), self.validity.as_ref().map(|x| x.iter())) + ZipValidity::new_with_validity(self.values_iter(), self.validity.as_ref()) } /// Returns an iterator of `&[u8]` over every element of this array, ignoring the validity diff --git a/src/array/boolean/iterator.rs b/src/array/boolean/iterator.rs index 019ba5b388f..8243a8d985f 100644 --- a/src/array/boolean/iterator.rs +++ b/src/array/boolean/iterator.rs @@ -22,7 +22,8 @@ impl IntoIterator for BooleanArray { fn into_iter(self) -> Self::IntoIter { let (_, values, validity) = self.into_inner(); let values = values.into_iter(); - let validity = validity.map(|x| x.into_iter()); + let validity = + validity.and_then(|validity| (validity.unset_bits() > 0).then(|| validity.into_iter())); ZipValidity::new(values, validity) } } diff --git a/src/array/boolean/mod.rs b/src/array/boolean/mod.rs index d5fe08fbc04..1d364cfe567 100644 --- a/src/array/boolean/mod.rs +++ b/src/array/boolean/mod.rs @@ -88,10 +88,7 @@ impl BooleanArray { /// Returns an iterator over the optional values of this [`BooleanArray`]. #[inline] pub fn iter(&self) -> ZipValidity { - ZipValidity::new( - self.values().iter(), - self.validity.as_ref().map(|x| x.iter()), - ) + ZipValidity::new_with_validity(self.values().iter(), self.validity()) } /// Returns an iterator over the values of this [`BooleanArray`]. diff --git a/src/array/dictionary/mod.rs b/src/array/dictionary/mod.rs index 02d72ccd260..8b62c91dce9 100644 --- a/src/array/dictionary/mod.rs +++ b/src/array/dictionary/mod.rs @@ -191,10 +191,7 @@ impl DictionaryArray { /// This function will allocate a new [`Scalar`] per item and is usually not performant. /// Consider calling `keys_iter` and `values`, downcasting `values`, and iterating over that. pub fn iter(&self) -> ZipValidity, DictionaryValuesIter, BitmapIter> { - ZipValidity::new( - DictionaryValuesIter::new(self), - self.keys.validity().as_ref().map(|x| x.iter()), - ) + ZipValidity::new_with_validity(DictionaryValuesIter::new(self), self.keys.validity()) } /// Returns an iterator of [`Box`] diff --git a/src/array/fixed_size_binary/iterator.rs b/src/array/fixed_size_binary/iterator.rs index ad3c36968fa..0d456e0ba35 100644 --- a/src/array/fixed_size_binary/iterator.rs +++ b/src/array/fixed_size_binary/iterator.rs @@ -19,7 +19,7 @@ impl<'a> FixedSizeBinaryArray { pub fn iter( &'a self, ) -> ZipValidity<&'a [u8], std::slice::ChunksExact<'a, u8>, BitmapIter<'a>> { - ZipValidity::new(self.values_iter(), self.validity.as_ref().map(|x| x.iter())) + ZipValidity::new_with_validity(self.values_iter(), self.validity()) } /// Returns iterator over the values of [`FixedSizeBinaryArray`] @@ -42,10 +42,7 @@ impl<'a> MutableFixedSizeBinaryArray { pub fn iter( &'a self, ) -> ZipValidity<&'a [u8], std::slice::ChunksExact<'a, u8>, BitmapIter<'a>> { - ZipValidity::new( - self.iter_values(), - self.validity().as_ref().map(|x| x.iter()), - ) + ZipValidity::new(self.iter_values(), self.validity().map(|x| x.iter())) } /// Returns iterator over the values of [`MutableFixedSizeBinaryArray`] diff --git a/src/array/fixed_size_list/iterator.rs b/src/array/fixed_size_list/iterator.rs index 4ee178da5a7..2f2b70b2eff 100644 --- a/src/array/fixed_size_list/iterator.rs +++ b/src/array/fixed_size_list/iterator.rs @@ -36,10 +36,7 @@ impl<'a> IntoIterator for &'a FixedSizeListArray { impl<'a> FixedSizeListArray { /// Returns an iterator of `Option>` pub fn iter(&'a self) -> ZipIter<'a> { - ZipValidity::new( - FixedSizeListValuesIter::new(self), - self.validity.as_ref().map(|x| x.iter()), - ) + ZipValidity::new_with_validity(FixedSizeListValuesIter::new(self), self.validity()) } /// Returns an iterator of `Box` diff --git a/src/array/list/iterator.rs b/src/array/list/iterator.rs index 4aaa444665a..82b5c7dca5f 100644 --- a/src/array/list/iterator.rs +++ b/src/array/list/iterator.rs @@ -35,10 +35,7 @@ impl<'a, O: Offset> IntoIterator for &'a ListArray { impl<'a, O: Offset> ListArray { /// Returns an iterator of `Option>` pub fn iter(&'a self) -> ZipIter<'a, O> { - ZipValidity::new( - ListValuesIter::new(self), - self.validity.as_ref().map(|x| x.iter()), - ) + ZipValidity::new_with_validity(ListValuesIter::new(self), self.validity.as_ref()) } /// Returns an iterator of `Box` diff --git a/src/array/map/iterator.rs b/src/array/map/iterator.rs index 8bd8ce820ed..5867372c886 100644 --- a/src/array/map/iterator.rs +++ b/src/array/map/iterator.rs @@ -72,10 +72,7 @@ impl<'a> IntoIterator for &'a MapArray { impl<'a> MapArray { /// Returns an iterator of `Option>` pub fn iter(&'a self) -> ZipValidity, MapValuesIter<'a>, BitmapIter<'a>> { - ZipValidity::new( - MapValuesIter::new(self), - self.validity.as_ref().map(|x| x.iter()), - ) + ZipValidity::new_with_validity(MapValuesIter::new(self), self.validity()) } /// Returns an iterator of `Box` diff --git a/src/array/primitive/iterator.rs b/src/array/primitive/iterator.rs index 89a8842e8fd..18e213b563d 100644 --- a/src/array/primitive/iterator.rs +++ b/src/array/primitive/iterator.rs @@ -16,7 +16,8 @@ impl IntoIterator for PrimitiveArray { fn into_iter(self) -> Self::IntoIter { let (_, values, validity) = self.into_inner(); let values = values.into_iter(); - let validity = validity.map(|x| x.into_iter()); + let validity = + validity.and_then(|validity| (validity.unset_bits() > 0).then(|| validity.into_iter())); ZipValidity::new(values, validity) } } diff --git a/src/array/primitive/mod.rs b/src/array/primitive/mod.rs index a35ae530a2e..6b62fe220cd 100644 --- a/src/array/primitive/mod.rs +++ b/src/array/primitive/mod.rs @@ -141,10 +141,7 @@ impl PrimitiveArray { /// Returns an iterator over the values and validity, `Option<&T>`. #[inline] pub fn iter(&self) -> ZipValidity<&T, std::slice::Iter, BitmapIter> { - ZipValidity::new( - self.values().iter(), - self.validity().as_ref().map(|x| x.iter()), - ) + ZipValidity::new_with_validity(self.values().iter(), self.validity()) } /// Returns an iterator of the values, `&T`, ignoring the arrays' validity. diff --git a/src/array/struct_/iterator.rs b/src/array/struct_/iterator.rs index c5c75a6b925..2b6849d2e26 100644 --- a/src/array/struct_/iterator.rs +++ b/src/array/struct_/iterator.rs @@ -89,10 +89,7 @@ impl<'a> IntoIterator for &'a StructArray { impl<'a> StructArray { /// Returns an iterator of `Option>` pub fn iter(&'a self) -> ZipIter<'a> { - ZipValidity::new( - StructValueIter::new(self), - self.validity.as_ref().map(|x| x.iter()), - ) + ZipValidity::new_with_validity(StructValueIter::new(self), self.validity()) } /// Returns an iterator of `Box` diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index 3c07bef7c93..f91e1466451 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -133,7 +133,7 @@ impl Utf8Array { /// Returns an iterator of `Option<&str>` pub fn iter(&self) -> ZipValidity<&str, Utf8ValuesIter, BitmapIter> { - ZipValidity::new(self.values_iter(), self.validity.as_ref().map(|x| x.iter())) + ZipValidity::new_with_validity(self.values_iter(), self.validity()) } /// Returns an iterator of `&str` diff --git a/src/bitmap/utils/zip_validity.rs b/src/bitmap/utils/zip_validity.rs index 1b6dd3901a6..40965bab411 100644 --- a/src/bitmap/utils/zip_validity.rs +++ b/src/bitmap/utils/zip_validity.rs @@ -1,3 +1,5 @@ +use crate::bitmap::utils::BitmapIter; +use crate::bitmap::Bitmap; use crate::trusted_len::TrustedLen; /// An [`Iterator`] over validity and values. @@ -111,7 +113,22 @@ where pub fn new(values: I, validity: Option) -> Self { match validity { Some(validity) => Self::Optional(ZipValidityIter::new(values, validity)), - None => Self::Required(values), + _ => Self::Required(values), + } + } +} + +impl<'a, T, I> ZipValidity> +where + I: Iterator, +{ + /// Returns a new [`ZipValidity`] and drops the `validity` if all values + /// are valid. + pub fn new_with_validity(values: I, validity: Option<&'a Bitmap>) -> Self { + // only if the validity has nulls we take the optional branch. + match validity.and_then(|validity| (validity.unset_bits() > 0).then(|| validity.iter())) { + Some(validity) => Self::Optional(ZipValidityIter::new(values, validity)), + _ => Self::Required(values), } } } diff --git a/src/io/avro/write/serialize.rs b/src/io/avro/write/serialize.rs index f3a56a22501..cb94b78d28b 100644 --- a/src/io/avro/write/serialize.rs +++ b/src/io/avro/write/serialize.rs @@ -126,7 +126,7 @@ fn list_optional<'a, O: Offset>(array: &'a ListArray, schema: &AvroSchema) -> .offsets() .windows(2) .map(|w| (w[1] - w[0]).to_usize() as i64); - let lengths = ZipValidity::new(lengths, array.validity().as_ref().map(|x| x.iter())); + let lengths = ZipValidity::new_with_validity(lengths, array.validity()); Box::new(BufStreamingIterator::new( lengths, @@ -180,7 +180,7 @@ fn struct_optional<'a>(array: &'a StructArray, schema: &Record) -> BoxSerializer .map(|(x, schema)| new_serializer(x.as_ref(), schema)) .collect::>(); - let iterator = ZipValidity::new(0..array.len(), array.validity().as_ref().map(|x| x.iter())); + let iterator = ZipValidity::new_with_validity(0..array.len(), array.validity()); Box::new(BufStreamingIterator::new( iterator, diff --git a/src/io/json/write/serialize.rs b/src/io/json/write/serialize.rs index ebdb2f3b81b..aa323109e9b 100644 --- a/src/io/json/write/serialize.rs +++ b/src/io/json/write/serialize.rs @@ -103,7 +103,7 @@ fn struct_serializer<'a>( let names = array.fields().iter().map(|f| f.name.as_str()); Box::new(BufStreamingIterator::new( - ZipValidity::new(0..array.len(), array.validity().map(|x| x.iter())), + ZipValidity::new_with_validity(0..array.len(), array.validity()), move |maybe, buf| { if maybe.is_some() { let names = names.clone(); @@ -140,10 +140,7 @@ fn list_serializer<'a, O: Offset>( let mut serializer = new_serializer(array.values().as_ref()); Box::new(BufStreamingIterator::new( - ZipValidity::new( - array.offsets().windows(2), - array.validity().map(|x| x.iter()), - ), + ZipValidity::new_with_validity(array.offsets().windows(2), array.validity()), move |offset, buf| { if let Some(offset) = offset { let length = (offset[1] - offset[0]).to_usize();