From e68f2c026429e59d88f95e2e7b77afebba64e2d5 Mon Sep 17 00:00:00 2001 From: Jorge Leitao Date: Sat, 10 Dec 2022 06:08:36 +0100 Subject: [PATCH] Added `Offsets` and `OffsetsBuffer` (#1316) --- benches/iter_list.rs | 5 +- src/array/binary/ffi.rs | 16 +- src/array/binary/fmt.rs | 3 +- src/array/binary/from.rs | 2 +- src/array/binary/iterator.rs | 3 +- src/array/binary/mod.rs | 231 +++----- src/array/binary/mutable.rs | 70 +-- src/array/binary/mutable_values.rs | 118 ++--- src/array/dictionary/mutable.rs | 15 +- src/array/equal/binary.rs | 3 +- src/array/equal/list.rs | 3 +- src/array/equal/mod.rs | 1 + src/array/equal/utf8.rs | 3 +- src/array/growable/binary.rs | 29 +- src/array/growable/list.rs | 78 +-- src/array/growable/utf8.rs | 31 +- src/array/growable/utils.rs | 15 +- src/array/list/ffi.rs | 15 +- src/array/list/fmt.rs | 2 +- src/array/list/iterator.rs | 2 +- src/array/list/mod.rs | 131 +---- src/array/list/mutable.rs | 155 ++---- src/array/map/ffi.rs | 11 +- src/array/map/mod.rs | 47 +- src/array/mod.rs | 3 +- src/array/ord.rs | 1 + src/array/physical_binary.rs | 140 +---- src/array/specification.rs | 183 ++++--- src/array/utf8/ffi.rs | 16 +- src/array/utf8/fmt.rs | 3 +- src/array/utf8/from.rs | 2 +- src/array/utf8/iterator.rs | 3 +- src/array/utf8/mod.rs | 74 +-- src/array/utf8/mutable.rs | 20 +- src/array/utf8/mutable_values.rs | 45 +- src/compute/aggregate/memory.rs | 2 +- src/compute/aggregate/min_max.rs | 3 +- src/compute/cast/binary_to.rs | 23 +- src/compute/cast/boolean_to.rs | 3 +- src/compute/cast/mod.rs | 30 +- src/compute/cast/primitive_to.rs | 11 +- src/compute/cast/utf8_to.rs | 28 +- src/compute/comparison/binary.rs | 3 +- src/compute/comparison/utf8.rs | 3 +- src/compute/contains.rs | 3 +- src/compute/hash.rs | 3 +- src/compute/length.rs | 2 + src/compute/like.rs | 3 +- src/compute/regex_match.rs | 6 +- src/compute/sort/binary.rs | 3 +- src/compute/sort/mod.rs | 1 + src/compute/sort/row/mod.rs | 3 +- src/compute/sort/utf8.rs | 4 +- src/compute/substring.rs | 13 +- src/compute/take/binary.rs | 3 +- src/compute/take/generic_binary.rs | 76 +-- src/compute/take/list.rs | 3 +- src/compute/take/utf8.rs | 3 +- src/compute/utf8.rs | 3 +- src/ffi/mmap.rs | 3 +- src/io/avro/read/nested.rs | 36 +- src/io/avro/write/serialize.rs | 3 + src/io/csv/read_utils.rs | 15 +- src/io/csv/write/serialize.rs | 6 +- src/io/ipc/read/array/binary.rs | 5 +- src/io/ipc/read/array/list.rs | 5 +- src/io/ipc/read/array/map.rs | 2 +- src/io/ipc/read/array/utf8.rs | 5 +- src/io/ipc/write/serialize.rs | 16 +- src/io/json/read/deserialize.rs | 79 +-- src/io/json/write/serialize.rs | 3 +- src/io/json_integration/read/array.rs | 16 +- src/io/odbc/read/deserialize.rs | 12 +- src/io/odbc/write/serialize.rs | 3 + src/io/orc/read/mod.rs | 40 +- .../parquet/read/deserialize/binary/basic.rs | 21 +- .../read/deserialize/binary/dictionary.rs | 8 +- .../parquet/read/deserialize/binary/nested.rs | 4 +- .../parquet/read/deserialize/binary/utils.rs | 53 +- src/io/parquet/read/deserialize/mod.rs | 8 +- src/io/parquet/read/deserialize/nested.rs | 2 +- src/io/parquet/read/statistics/binary.rs | 3 +- src/io/parquet/read/statistics/list.rs | 11 +- src/io/parquet/read/statistics/map.rs | 2 +- src/io/parquet/read/statistics/utf8.rs | 3 +- src/io/parquet/write/binary/basic.rs | 5 +- src/io/parquet/write/binary/nested.rs | 3 +- src/io/parquet/write/nested/def.rs | 2 +- src/io/parquet/write/nested/mod.rs | 2 +- src/io/parquet/write/pages.rs | 9 +- src/io/parquet/write/utf8/basic.rs | 5 +- src/io/parquet/write/utf8/nested.rs | 3 +- src/lib.rs | 1 + src/offset.rs | 496 ++++++++++++++++++ src/scalar/binary.rs | 2 +- src/scalar/list.rs | 2 +- src/scalar/utf8.rs | 2 +- src/temporal_conversions.rs | 3 +- src/types/index.rs | 8 + src/util/bench_util.rs | 2 +- tests/it/array/binary/mod.rs | 19 +- tests/it/array/binary/mutable.rs | 16 +- tests/it/array/binary/mutable_values.rs | 29 +- tests/it/array/binary/to_mutable.rs | 8 +- tests/it/array/equal/list.rs | 5 +- tests/it/array/equal/utf8.rs | 1 + tests/it/array/list/mod.rs | 10 +- tests/it/array/list/mutable.rs | 6 +- tests/it/array/map/mod.rs | 7 +- tests/it/array/utf8/mod.rs | 76 +-- tests/it/array/utf8/mutable.rs | 15 +- tests/it/array/utf8/mutable_values.rs | 31 +- tests/it/array/utf8/to_mutable.rs | 12 +- tests/it/compute/length.rs | 1 + tests/it/compute/regex_match.rs | 3 +- tests/it/compute/substring.rs | 2 +- tests/it/compute/take.rs | 14 +- tests/it/compute/utf8.rs | 2 +- tests/it/ffi/data.rs | 6 +- tests/it/io/avro/write.rs | 4 +- tests/it/io/ipc/read/file.rs | 6 + tests/it/io/json/write.rs | 2 +- tests/it/io/ndjson/mod.rs | 3 +- tests/it/io/parquet/mod.rs | 18 +- 124 files changed, 1449 insertions(+), 1470 deletions(-) create mode 100644 src/offset.rs diff --git a/benches/iter_list.rs b/benches/iter_list.rs index ba576cb67b0..f77c9536e6d 100644 --- a/benches/iter_list.rs +++ b/benches/iter_list.rs @@ -16,8 +16,7 @@ fn add_benchmark(c: &mut Criterion) { let values = Buffer::from_iter(0..size as i32); let values = PrimitiveArray::::from_data(DataType::Int32, values, None); - let mut offsets = (0..size as i32).step_by(2).collect::>(); - offsets.push(size as i32); + let offsets = (0..=size as i32).step_by(2).collect::>(); let validity = (0..(offsets.len() - 1)) .map(|i| i % 4 == 0) @@ -26,7 +25,7 @@ fn add_benchmark(c: &mut Criterion) { let data_type = ListArray::::default_datatype(DataType::Int32); let array = ListArray::::from_data( data_type, - offsets.into(), + offsets.try_into().unwrap(), Box::new(values), Some(validity), ); diff --git a/src/array/binary/ffi.rs b/src/array/binary/ffi.rs index 2c6792237ad..6f971c4226f 100644 --- a/src/array/binary/ffi.rs +++ b/src/array/binary/ffi.rs @@ -1,7 +1,8 @@ use crate::{ - array::{FromFfi, Offset, ToFfi}, + array::{FromFfi, ToFfi}, bitmap::align, ffi, + offset::{Offset, OffsetsBuffer}, }; use crate::error::Result; @@ -12,13 +13,13 @@ unsafe impl ToFfi for BinaryArray { fn buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - Some(self.offsets.as_ptr().cast::()), + Some(self.offsets.buffer().as_ptr().cast::()), Some(self.values.as_ptr().cast::()), ] } fn offset(&self) -> Option { - let offset = self.offsets.offset(); + let offset = self.offsets.buffer().offset(); if let Some(bitmap) = self.validity.as_ref() { if bitmap.offset() == offset { Some(offset) @@ -31,7 +32,7 @@ unsafe impl ToFfi for BinaryArray { } fn to_ffi_aligned(&self) -> Self { - let offset = self.offsets.offset(); + let offset = self.offsets.buffer().offset(); let validity = self.validity.as_ref().map(|bitmap| { if bitmap.offset() == offset { @@ -58,8 +59,9 @@ impl FromFfi for BinaryArray { let offsets = unsafe { array.buffer::(1) }?; let values = unsafe { array.buffer::(2) }?; - Ok(Self::from_data_unchecked( - data_type, offsets, values, validity, - )) + // assumption that data from FFI is well constructed + let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets) }; + + Ok(Self::new(data_type, offsets, values, validity)) } } diff --git a/src/array/binary/fmt.rs b/src/array/binary/fmt.rs index c068d0d3656..c2ec8737fb2 100644 --- a/src/array/binary/fmt.rs +++ b/src/array/binary/fmt.rs @@ -1,7 +1,8 @@ use std::fmt::{Debug, Formatter, Result, Write}; +use crate::offset::Offset; + use super::super::fmt::write_vec; -use super::super::Offset; use super::BinaryArray; pub fn write_value(array: &BinaryArray, index: usize, f: &mut W) -> Result { diff --git a/src/array/binary/from.rs b/src/array/binary/from.rs index aa575ccd9cc..8556da6906f 100644 --- a/src/array/binary/from.rs +++ b/src/array/binary/from.rs @@ -1,6 +1,6 @@ use std::iter::FromIterator; -use crate::array::Offset; +use crate::offset::Offset; use super::{BinaryArray, MutableBinaryArray}; diff --git a/src/array/binary/iterator.rs b/src/array/binary/iterator.rs index 2af79e6a296..042913a71d9 100644 --- a/src/array/binary/iterator.rs +++ b/src/array/binary/iterator.rs @@ -1,6 +1,7 @@ use crate::{ - array::{ArrayAccessor, ArrayValuesIter, Offset}, + array::{ArrayAccessor, ArrayValuesIter}, bitmap::utils::{BitmapIter, ZipValidity}, + offset::Offset, }; use super::{BinaryArray, MutableBinaryValuesArray}; diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index e7a4e9fe8c0..1a5abdcc330 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -6,15 +6,13 @@ use crate::{ buffer::Buffer, datatypes::DataType, error::Error, + offset::{Offset, OffsetsBuffer}, trusted_len::TrustedLen, }; use either::Either; -use super::{ - specification::{try_check_offsets, try_check_offsets_bounds}, - Array, GenericBinaryArray, Offset, -}; +use super::{specification::try_check_offsets_bounds, Array, GenericBinaryArray}; mod ffi; pub(super) mod fmt; @@ -42,7 +40,7 @@ pub use mutable::*; /// assert_eq!(array.values_iter().collect::>(), vec![[1, 2].as_ref(), &[], &[3]]); /// // the underlying representation: /// assert_eq!(array.values(), &Buffer::from(vec![1, 2, 3])); -/// assert_eq!(array.offsets(), &Buffer::from(vec![0, 2, 2, 3])); +/// assert_eq!(array.offsets().buffer(), &Buffer::from(vec![0, 2, 2, 3])); /// assert_eq!(array.validity(), Some(&Bitmap::from([true, false, true]))); /// ``` /// @@ -59,7 +57,7 @@ pub use mutable::*; #[derive(Clone)] pub struct BinaryArray { data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, } @@ -69,23 +67,22 @@ impl BinaryArray { /// /// # Errors /// This function returns an error iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. + /// * the validity's length is not equal to `offsets.len()`. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`. /// # Implementation - /// This function is `O(N)` - checking monotinicity is `O(N)` + /// This function is `O(1)` pub fn try_new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Result { - try_check_offsets(&offsets, values.len())?; + try_check_offsets_bounds(&offsets, values.len())?; if validity .as_ref() - .map_or(false, |validity| validity.len() != offsets.len() - 1) + .map_or(false, |validity| validity.len() != offsets.len()) { return Err(Error::oos( "validity mask length must match the number of values", @@ -130,7 +127,7 @@ impl BinaryArray { /// Returns the length of this array #[inline] pub fn len(&self) -> usize { - self.offsets.len() - 1 + self.offsets.len() } /// Returns the element at index `i` @@ -148,8 +145,7 @@ impl BinaryArray { #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] { // soundness: the invariant of the function - let start = self.offsets.get_unchecked(i).to_usize(); - let end = self.offsets.get_unchecked(i + 1).to_usize(); + let (start, end) = self.offsets.start_end_unchecked(i); // soundness: the invariant of the struct self.values.get_unchecked(start..end) @@ -169,7 +165,7 @@ impl BinaryArray { /// Returns the offsets of this [`BinaryArray`]. #[inline] - pub fn offsets(&self) -> &Buffer { + pub fn offsets(&self) -> &OffsetsBuffer { &self.offsets } @@ -249,90 +245,78 @@ impl BinaryArray { if let Some(bitmap) = self.validity { match bitmap.into_mut() { // Safety: invariants are preserved - Left(bitmap) => Left(unsafe { - BinaryArray::new_unchecked( - self.data_type, - self.offsets, - self.values, - Some(bitmap), - ) - }), + Left(bitmap) => Left(BinaryArray::new( + self.data_type, + self.offsets, + self.values, + Some(bitmap), + )), Right(mutable_bitmap) => match ( self.values.get_mut().map(std::mem::take), - self.offsets.get_mut().map(std::mem::take), + self.offsets.get_mut(), ) { - (None, None) => { - // Safety: invariants are preserved - Left(unsafe { - BinaryArray::new_unchecked( - self.data_type, - self.offsets, - self.values, - Some(mutable_bitmap.into()), - ) - }) - } - (None, Some(offsets)) => { - // Safety: invariants are preserved - Left(unsafe { - BinaryArray::new_unchecked( - self.data_type, - offsets.into(), - self.values, - Some(mutable_bitmap.into()), - ) - }) - } - (Some(mutable_values), None) => { - // Safety: invariants are preserved - Left(unsafe { - BinaryArray::new_unchecked( - self.data_type, - self.offsets, - mutable_values.into(), - Some(mutable_bitmap.into()), - ) - }) - } - (Some(values), Some(offsets)) => Right(unsafe { - MutableBinaryArray::from_data( - self.data_type, - offsets, - values, - Some(mutable_bitmap), - ) - }), + (None, None) => Left(BinaryArray::new( + self.data_type, + self.offsets, + self.values, + Some(mutable_bitmap.into()), + )), + (None, Some(offsets)) => Left(BinaryArray::new( + self.data_type, + offsets.into(), + self.values, + Some(mutable_bitmap.into()), + )), + (Some(mutable_values), None) => Left(BinaryArray::new( + self.data_type, + self.offsets, + mutable_values.into(), + Some(mutable_bitmap.into()), + )), + (Some(values), Some(offsets)) => Right(MutableBinaryArray::from_data( + self.data_type, + offsets, + values, + Some(mutable_bitmap), + )), }, } } else { match ( self.values.get_mut().map(std::mem::take), - self.offsets.get_mut().map(std::mem::take), + self.offsets.get_mut(), ) { - (None, None) => Left(unsafe { - BinaryArray::new_unchecked(self.data_type, self.offsets, self.values, None) - }), - (None, Some(offsets)) => Left(unsafe { - BinaryArray::new_unchecked(self.data_type, offsets.into(), self.values, None) - }), - (Some(values), None) => Left(unsafe { - BinaryArray::new_unchecked(self.data_type, self.offsets, values.into(), None) - }), - (Some(values), Some(offsets)) => Right(unsafe { - MutableBinaryArray::from_data(self.data_type, offsets, values, None) - }), + (None, None) => Left(BinaryArray::new( + self.data_type, + self.offsets, + self.values, + None, + )), + (None, Some(offsets)) => Left(BinaryArray::new( + self.data_type, + offsets.into(), + self.values, + None, + )), + (Some(values), None) => Left(BinaryArray::new( + self.data_type, + self.offsets, + values.into(), + None, + )), + (Some(values), Some(offsets)) => Right(MutableBinaryArray::from_data( + self.data_type, + offsets, + values, + None, + )), } } } /// Creates an empty [`BinaryArray`], i.e. whose `.len` is zero. pub fn new_empty(data_type: DataType) -> Self { - Self::new( - data_type, - Buffer::from(vec![O::zero()]), - Buffer::new(), - None, - ) + Self::new(data_type, OffsetsBuffer::new(), Buffer::new(), None) } /// Creates an null [`BinaryArray`], i.e. whose `.null_count() == .len()`. @@ -340,7 +324,7 @@ impl BinaryArray { pub fn new_null(data_type: DataType, length: usize) -> Self { Self::new( data_type, - vec![O::default(); 1 + length].into(), + vec![O::default(); 1 + length].try_into().unwrap(), Buffer::new(), Some(Bitmap::new_zeroed(length)), ) @@ -355,72 +339,16 @@ impl BinaryArray { } } - /// Creates a new [`BinaryArray`] without checking for offsets monotinicity. - /// - /// # Errors - /// This function returns an error iff: - /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. - /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`. - /// # Safety - /// This function is unsafe iff: - /// * the offsets are not monotonically increasing - /// # Implementation - /// This function is `O(1)` - pub unsafe fn try_new_unchecked( - data_type: DataType, - offsets: Buffer, - values: Buffer, - validity: Option, - ) -> Result { - try_check_offsets_bounds(&offsets, values.len())?; - - if validity - .as_ref() - .map_or(false, |validity| validity.len() != offsets.len() - 1) - { - return Err(Error::oos( - "validity mask length must match the number of values", - )); - } - - if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { - return Err(Error::oos( - "BinaryArray can only be initialized with DataType::Binary or DataType::LargeBinary", - )); - } - - Ok(Self { - data_type, - offsets, - values, - validity, - }) - } - /// Alias for unwrapping [`Self::try_new`] pub fn new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Self { Self::try_new(data_type, offsets, values, validity).unwrap() } - /// Alias for unwrapping [`Self::try_new_unchecked`] - /// # Safety - /// This function is unsafe iff: - /// * the offsets are not monotonically increasing - pub unsafe fn new_unchecked( - data_type: DataType, - offsets: Buffer, - values: Buffer, - validity: Option, - ) -> Self { - Self::try_new_unchecked(data_type, offsets, values, validity).unwrap() - } - /// Returns a [`BinaryArray`] from an iterator of trusted length. /// /// The [`BinaryArray`] is guaranteed to not have a validity @@ -486,23 +414,10 @@ impl BinaryArray { unsafe { Self::try_from_trusted_len_iter_unchecked(iter) } } - /// Alias for [`Self::new_unchecked`] - /// # Safety - /// This function is unsafe iff: - /// * the offsets are not monotonically increasing - pub unsafe fn from_data_unchecked( - data_type: DataType, - offsets: Buffer, - values: Buffer, - validity: Option, - ) -> Self { - Self::new_unchecked(data_type, offsets, values, validity) - } - /// Alias for `new` pub fn from_data( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Self { @@ -557,6 +472,6 @@ unsafe impl GenericBinaryArray for BinaryArray { #[inline] fn offsets(&self) -> &[O] { - self.offsets() + self.offsets().buffer() } } diff --git a/src/array/binary/mutable.rs b/src/array/binary/mutable.rs index 15035aed0ed..0f8655e33bf 100644 --- a/src/array/binary/mutable.rs +++ b/src/array/binary/mutable.rs @@ -1,13 +1,14 @@ use std::{iter::FromIterator, sync::Arc}; use crate::{ - array::{Array, MutableArray, Offset, TryExtend, TryExtendFromSelf, TryPush}, + array::{Array, MutableArray, TryExtend, TryExtendFromSelf, TryPush}, bitmap::{ utils::{BitmapIter, ZipValidity}, Bitmap, MutableBitmap, }, datatypes::DataType, error::{Error, Result}, + offset::{Offset, Offsets}, trusted_len::TrustedLen, }; @@ -53,15 +54,14 @@ impl MutableBinaryArray { /// /// # Errors /// This function returns an error iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. + /// * the validity's length is not equal to `offsets.len()`. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`. /// # Implementation - /// This function is `O(N)` - checking monotinicity is `O(N)` + /// This function is `O(1)` pub fn try_new( data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Vec, validity: Option, ) -> Result { @@ -79,26 +79,6 @@ impl MutableBinaryArray { Ok(Self { values, validity }) } - /// Create a [`MutableBinaryArray`] out of its inner attributes. - /// # Safety - /// The caller must ensure that every value between offsets is a valid utf8. - /// # Panics - /// This function panics iff: - /// * The `offsets` and `values` are inconsistent - /// * The validity is not `None` and its length is different from `offsets`'s length minus one. - pub unsafe fn new_unchecked( - data_type: DataType, - offsets: Vec, - values: Vec, - validity: Option, - ) -> Self { - let values = MutableBinaryValuesArray::new_unchecked(data_type, offsets, values); - if let Some(ref validity) = validity { - assert_eq!(values.len(), validity.len()); - } - Self { values, validity } - } - /// Creates a new [`MutableBinaryArray`] from a slice of optional `&[u8]`. // Note: this can't be `impl From` because Rust does not allow double `AsRef` on it. pub fn from, P: AsRef<[Option]>>(slice: P) -> Self { @@ -184,7 +164,7 @@ impl MutableBinaryArray { /// Equivalent to `Self::try_new(...).unwrap()` pub fn from_data( data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Vec, validity: Option, ) -> Self { @@ -199,7 +179,7 @@ impl MutableBinaryArray { } /// returns its offsets. - pub fn offsets(&self) -> &Vec { + pub fn offsets(&self) -> &Offsets { self.values.offsets() } @@ -224,34 +204,24 @@ impl MutableArray for MutableBinaryArray { } fn as_box(&mut self) -> Box { - // Safety: - // `MutableBinaryArray` has the same invariants as `BinaryArray` and thus - // `BinaryArray` can be safely created from `MutableBinaryArray` without checks. let (data_type, offsets, values) = std::mem::take(&mut self.values).into_inner(); - unsafe { - BinaryArray::new_unchecked( - data_type, - offsets.into(), - values.into(), - std::mem::take(&mut self.validity).map(|x| x.into()), - ) - } + BinaryArray::new( + data_type, + offsets.into(), + values.into(), + std::mem::take(&mut self.validity).map(|x| x.into()), + ) .boxed() } fn as_arc(&mut self) -> Arc { - // Safety: - // `MutableBinaryArray` has the same invariants as `BinaryArray` and thus - // `BinaryArray` can be safely created from `MutableBinaryArray` without checks. let (data_type, offsets, values) = std::mem::take(&mut self.values).into_inner(); - unsafe { - BinaryArray::new_unchecked( - data_type, - offsets.into(), - values.into(), - std::mem::take(&mut self.validity).map(|x| x.into()), - ) - } + BinaryArray::new( + data_type, + offsets.into(), + values.into(), + std::mem::take(&mut self.validity).map(|x| x.into()), + ) .arced() } @@ -322,7 +292,7 @@ impl MutableBinaryArray { pub unsafe fn from_trusted_len_values_iter_unchecked, I: Iterator>( iterator: I, ) -> Self { - let (offsets, values) = unsafe { trusted_len_values_iter(iterator) }; + let (offsets, values) = trusted_len_values_iter(iterator); Self::from_data(Self::default_data_type(), offsets, values, None) } diff --git a/src/array/binary/mutable_values.rs b/src/array/binary/mutable_values.rs index 53a43d69c7a..e52516877dd 100644 --- a/src/array/binary/mutable_values.rs +++ b/src/array/binary/mutable_values.rs @@ -2,13 +2,13 @@ use std::{iter::FromIterator, sync::Arc}; use crate::{ array::{ - specification::{check_offsets_minimal, try_check_offsets}, - Array, ArrayAccessor, ArrayValuesIter, MutableArray, Offset, TryExtend, TryExtendFromSelf, - TryPush, + specification::try_check_offsets_bounds, Array, ArrayAccessor, ArrayValuesIter, + MutableArray, TryExtend, TryExtendFromSelf, TryPush, }, bitmap::MutableBitmap, datatypes::DataType, error::{Error, Result}, + offset::{Offset, Offsets}, trusted_len::TrustedLen, }; @@ -20,38 +20,24 @@ use crate::array::physical_binary::*; #[derive(Debug, Clone)] pub struct MutableBinaryValuesArray { data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Vec, } impl From> for BinaryArray { fn from(other: MutableBinaryValuesArray) -> Self { - // Safety: - // `MutableBinaryValuesArray` has the same invariants as `BinaryArray` and thus - // `BinaryArray` can be safely created from `MutableBinaryValuesArray` without checks. - unsafe { - BinaryArray::::from_data_unchecked( - other.data_type, - other.offsets.into(), - other.values.into(), - None, - ) - } + BinaryArray::::new( + other.data_type, + other.offsets.into(), + other.values.into(), + None, + ) } } impl From> for MutableBinaryArray { fn from(other: MutableBinaryValuesArray) -> Self { - // Safety: - // `MutableBinaryValuesArray` has the same invariants as `MutableBinaryArray` - unsafe { - MutableBinaryArray::::new_unchecked( - other.data_type, - other.offsets, - other.values, - None, - ) - } + MutableBinaryArray::::from_data(other.data_type, other.offsets, other.values, None) } } @@ -66,7 +52,7 @@ impl MutableBinaryValuesArray { pub fn new() -> Self { Self { data_type: Self::default_data_type(), - offsets: vec![O::default()], + offsets: Offsets::new(), values: Vec::::new(), } } @@ -75,13 +61,13 @@ impl MutableBinaryValuesArray { /// /// # Errors /// This function returns an error iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the values' length. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`. /// # Implementation - /// This function is `O(N)` - checking monotinicity is `O(N)` - pub fn try_new(data_type: DataType, offsets: Vec, values: Vec) -> Result { - try_check_offsets(&offsets, values.len())?; + /// This function is `O(1)` + pub fn try_new(data_type: DataType, offsets: Offsets, values: Vec) -> Result { + try_check_offsets_bounds(&offsets, values.len())?; + if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { return Err(Error::oos( "MutableBinaryValuesArray can only be initialized with DataType::Binary or DataType::LargeBinary", @@ -95,31 +81,6 @@ impl MutableBinaryValuesArray { }) } - /// Returns a [`MutableBinaryValuesArray`] created from its internal representation. - /// - /// # Panic - /// This function does not panic iff: - /// * The last offset is equal to the values' length. - /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is equal to either `Binary` or `LargeBinary`. - /// # Safety - /// This function is safe iff: - /// * the offsets are monotonically increasing - /// # Implementation - /// This function is `O(1)` - pub unsafe fn new_unchecked(data_type: DataType, offsets: Vec, values: Vec) -> Self { - check_offsets_minimal(&offsets, values.len()); - - if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { - panic!("MutableBinaryValuesArray can only be initialized with DataType::Binary or DataType::LargeBinary") - } - - Self { - data_type, - offsets, - values, - } - } - /// Returns the default [`DataType`] of this container: [`DataType::Utf8`] or [`DataType::LargeUtf8`] /// depending on the generic [`Offset`]. pub fn default_data_type() -> DataType { @@ -133,12 +94,9 @@ impl MutableBinaryValuesArray { /// Initializes a new [`MutableBinaryValuesArray`] with a pre-allocated capacity of items and values. pub fn with_capacities(capacity: usize, values: usize) -> Self { - let mut offsets = Vec::::with_capacity(capacity + 1); - offsets.push(O::default()); - Self { data_type: Self::default_data_type(), - offsets, + offsets: Offsets::::with_capacity(capacity), values: Vec::::with_capacity(values), } } @@ -151,26 +109,26 @@ impl MutableBinaryValuesArray { /// returns its offsets. #[inline] - pub fn offsets(&self) -> &Vec { + pub fn offsets(&self) -> &Offsets { &self.offsets } /// Reserves `additional` elements and `additional_values` on the values. #[inline] pub fn reserve(&mut self, additional: usize, additional_values: usize) { - self.offsets.reserve(additional + 1); + self.offsets.reserve(additional); self.values.reserve(additional_values); } /// Returns the capacity in number of items pub fn capacity(&self) -> usize { - self.offsets.capacity() - 1 + self.offsets.capacity() } /// Returns the length of this array #[inline] pub fn len(&self) -> usize { - self.offsets.len() - 1 + self.offsets.len() } /// Pushes a new item to the array. @@ -188,7 +146,7 @@ impl MutableBinaryValuesArray { return None; } self.offsets.pop()?; - let start = self.offsets.last()?.to_usize(); + let start = self.offsets.last().to_usize(); let value = self.values.split_off(start); Some(value.to_vec()) } @@ -208,8 +166,7 @@ impl MutableBinaryValuesArray { #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] { // soundness: the invariant of the function - let start = self.offsets.get_unchecked(i).to_usize(); - let end = self.offsets.get_unchecked(i + 1).to_usize(); + let (start, end) = self.offsets.start_end(i); // soundness: the invariant of the struct self.values.get_unchecked(start..end) @@ -227,7 +184,7 @@ impl MutableBinaryValuesArray { } /// Extract the low-end APIs from the [`MutableBinaryValuesArray`]. - pub fn into_inner(self) -> (DataType, Vec, Vec) { + pub fn into_inner(self) -> (DataType, Offsets, Vec) { (self.data_type, self.offsets, self.values) } } @@ -242,21 +199,13 @@ impl MutableArray for MutableBinaryValuesArray { } fn as_box(&mut self) -> Box { - // Safety: - // `MutableBinaryValuesArray` has the same invariants as `BinaryArray` and thus - // `BinaryArray` can be safely created from `MutableBinaryValuesArray` without checks. let (data_type, offsets, values) = std::mem::take(self).into_inner(); - unsafe { BinaryArray::from_data_unchecked(data_type, offsets.into(), values.into(), None) } - .boxed() + BinaryArray::new(data_type, offsets.into(), values.into(), None).boxed() } fn as_arc(&mut self) -> Arc { - // Safety: - // `MutableBinaryValuesArray` has the same invariants as `BinaryArray` and thus - // `BinaryArray` can be safely created from `MutableBinaryValuesArray` without checks. let (data_type, offsets, values) = std::mem::take(self).into_inner(); - unsafe { BinaryArray::from_data_unchecked(data_type, offsets.into(), values.into(), None) } - .arced() + BinaryArray::new(data_type, offsets.into(), values.into(), None).arced() } fn data_type(&self) -> &DataType { @@ -288,8 +237,7 @@ impl MutableArray for MutableBinaryValuesArray { impl> FromIterator

for MutableBinaryValuesArray { fn from_iter>(iter: I) -> Self { let (offsets, values) = values_iter(iter.into_iter()); - // soundness: T: AsRef<[u8]> and offsets are monotonically increasing - unsafe { Self::new_unchecked(Self::default_data_type(), offsets, values) } + Self::try_new(Self::default_data_type(), offsets, values).unwrap() } } @@ -349,9 +297,7 @@ impl MutableBinaryValuesArray { I: Iterator, { let (offsets, values) = trusted_len_values_iter(iterator); - - // soundness: offsets are monotonically increasing - Self::new_unchecked(Self::default_data_type(), offsets, values) + Self::try_new(Self::default_data_type(), offsets, values).unwrap() } /// Returns a new [`MutableBinaryValuesArray`] from an iterator. @@ -388,11 +334,7 @@ impl> TryPush for MutableBinaryValuesArray { fn try_push(&mut self, value: T) -> Result<()> { let bytes = value.as_ref(); self.values.extend_from_slice(bytes); - - let size = O::from_usize(self.values.len()).ok_or(Error::Overflow)?; - - self.offsets.push(size); - Ok(()) + self.offsets.try_push_usize(bytes.len()) } } @@ -413,6 +355,6 @@ unsafe impl<'a, O: Offset> ArrayAccessor<'a> for MutableBinaryValuesArray { impl TryExtendFromSelf for MutableBinaryValuesArray { fn try_extend_from_self(&mut self, other: &Self) -> Result<()> { self.values.extend_from_slice(&other.values); - try_extend_offsets(&mut self.offsets, &other.offsets) + self.offsets.try_extend_from_self(&other.offsets) } } diff --git a/src/array/dictionary/mutable.rs b/src/array/dictionary/mutable.rs index 665d6fc9234..444de34bcc4 100644 --- a/src/array/dictionary/mutable.rs +++ b/src/array/dictionary/mutable.rs @@ -157,15 +157,12 @@ impl MutableDictionaryArray { } fn take_into(&mut self) -> DictionaryArray { - // Safety - the invariant of this struct ensures that this is up-held - unsafe { - DictionaryArray::::try_new( - self.data_type.clone(), - std::mem::take(&mut self.keys).into(), - self.values.as_box(), - ) - .unwrap() - } + DictionaryArray::::try_new( + self.data_type.clone(), + std::mem::take(&mut self.keys).into(), + self.values.as_box(), + ) + .unwrap() } } diff --git a/src/array/equal/binary.rs b/src/array/equal/binary.rs index 1c86fab6dce..bed8588efb5 100644 --- a/src/array/equal/binary.rs +++ b/src/array/equal/binary.rs @@ -1,4 +1,5 @@ -use crate::array::{BinaryArray, Offset}; +use crate::array::BinaryArray; +use crate::offset::Offset; pub(super) fn equal(lhs: &BinaryArray, rhs: &BinaryArray) -> bool { lhs.data_type() == rhs.data_type() && lhs.len() == rhs.len() && lhs.iter().eq(rhs.iter()) diff --git a/src/array/equal/list.rs b/src/array/equal/list.rs index 4eee0b821e2..26faa1598fa 100644 --- a/src/array/equal/list.rs +++ b/src/array/equal/list.rs @@ -1,4 +1,5 @@ -use crate::array::{Array, ListArray, Offset}; +use crate::array::{Array, ListArray}; +use crate::offset::Offset; pub(super) fn equal(lhs: &ListArray, rhs: &ListArray) -> bool { lhs.data_type() == rhs.data_type() && lhs.len() == rhs.len() && lhs.iter().eq(rhs.iter()) diff --git a/src/array/equal/mod.rs b/src/array/equal/mod.rs index aa2ea602882..2bb3ba77f1f 100644 --- a/src/array/equal/mod.rs +++ b/src/array/equal/mod.rs @@ -1,3 +1,4 @@ +use crate::offset::Offset; use crate::types::NativeType; use super::*; diff --git a/src/array/equal/utf8.rs b/src/array/equal/utf8.rs index 3a8f0e5f012..1327221ca33 100644 --- a/src/array/equal/utf8.rs +++ b/src/array/equal/utf8.rs @@ -1,4 +1,5 @@ -use crate::array::{Offset, Utf8Array}; +use crate::array::Utf8Array; +use crate::offset::Offset; pub(super) fn equal(lhs: &Utf8Array, rhs: &Utf8Array) -> bool { lhs.data_type() == rhs.data_type() && lhs.len() == rhs.len() && lhs.iter().eq(rhs.iter()) diff --git a/src/array/growable/binary.rs b/src/array/growable/binary.rs index c3403877e9e..aebfb1580cd 100644 --- a/src/array/growable/binary.rs +++ b/src/array/growable/binary.rs @@ -1,13 +1,14 @@ use std::sync::Arc; use crate::{ - array::{Array, BinaryArray, Offset}, + array::{Array, BinaryArray}, bitmap::MutableBitmap, datatypes::DataType, + offset::{Offset, Offsets}, }; use super::{ - utils::{build_extend_null_bits, extend_offset_values, extend_offsets, ExtendNullBits}, + utils::{build_extend_null_bits, extend_offset_values, ExtendNullBits}, Growable, }; @@ -17,8 +18,7 @@ pub struct GrowableBinary<'a, O: Offset> { data_type: DataType, validity: MutableBitmap, values: Vec, - offsets: Vec, - length: O, // always equal to the last offset at `offsets`. + offsets: Offsets, extend_null_bits: Vec>, } @@ -40,16 +40,11 @@ impl<'a, O: Offset> GrowableBinary<'a, O> { .map(|array| build_extend_null_bits(*array, use_validity)) .collect(); - let mut offsets = Vec::with_capacity(capacity + 1); - let length = O::default(); - offsets.push(length); - Self { arrays, data_type, values: Vec::with_capacity(0), - offsets, - length, + offsets: Offsets::with_capacity(capacity), validity: MutableBitmap::with_capacity(capacity), extend_null_bits, } @@ -73,18 +68,16 @@ impl<'a, O: Offset> Growable<'a> for GrowableBinary<'a, O> { let offsets = array.offsets(); let values = array.values(); - extend_offsets::( - &mut self.offsets, - &mut self.length, - &offsets[start..start + len + 1], - ); + self.offsets + .try_extend_from_slice(offsets, start, len) + .unwrap(); + // values - extend_offset_values::(&mut self.values, offsets, values, start, len); + extend_offset_values::(&mut self.values, offsets.buffer(), values, start, len); } fn extend_validity(&mut self, additional: usize) { - self.offsets - .resize(self.offsets.len() + additional, self.length); + self.offsets.extend_constant(additional); self.validity.extend_constant(additional, false); } diff --git a/src/array/growable/list.rs b/src/array/growable/list.rs index 0e1e7ceb5f8..bc78a2d8e86 100644 --- a/src/array/growable/list.rs +++ b/src/array/growable/list.rs @@ -1,13 +1,14 @@ use std::sync::Arc; use crate::{ - array::{Array, ListArray, Offset}, + array::{Array, ListArray}, bitmap::MutableBitmap, + offset::{Offset, Offsets}, }; use super::{ make_growable, - utils::{build_extend_null_bits, extend_offsets, ExtendNullBits}, + utils::{build_extend_null_bits, ExtendNullBits}, Growable, }; @@ -20,37 +21,15 @@ fn extend_offset_values( let array = growable.arrays[index]; let offsets = array.offsets(); - if array.null_count() == 0 { - // offsets - extend_offsets::( - &mut growable.offsets, - &mut growable.last_offset, - &offsets[start..start + len + 1], - ); - - let end = offsets[start + len].to_usize(); - let start = offsets[start].to_usize(); - let len = end - start; - growable.values.extend(index, start, len) - } else { - growable.offsets.reserve(len); - - let new_offsets = &mut growable.offsets; - let inner_values = &mut growable.values; - let last_offset = &mut growable.last_offset; - (start..start + len).for_each(|i| { - if array.is_valid(i) { - let len = offsets[i + 1] - offsets[i]; - // compute the new offset - *last_offset += len; - - // append value - inner_values.extend(index, offsets[i].to_usize(), len.to_usize()); - } - // append offset - new_offsets.push(*last_offset); - }) - } + growable + .offsets + .try_extend_from_slice(offsets, start, len) + .unwrap(); + + let end = offsets.buffer()[start + len].to_usize(); + let start = offsets.buffer()[start].to_usize(); + let len = end - start; + growable.values.extend(index, start, len); } /// Concrete [`Growable`] for the [`ListArray`]. @@ -58,8 +37,7 @@ pub struct GrowableList<'a, O: Offset> { arrays: Vec<&'a ListArray>, validity: MutableBitmap, values: Box + 'a>, - offsets: Vec, - last_offset: O, // always equal to the last offset at `offsets`. + offsets: Offsets, extend_null_bits: Vec>, } @@ -85,16 +63,11 @@ impl<'a, O: Offset> GrowableList<'a, O> { .collect::>(); let values = make_growable(&inner, use_validity, 0); - let mut offsets = Vec::with_capacity(capacity + 1); - let length = O::default(); - offsets.push(length); - Self { arrays, - offsets, + offsets: Offsets::with_capacity(capacity), values, validity: MutableBitmap::with_capacity(capacity), - last_offset: O::default(), extend_null_bits, } } @@ -104,20 +77,12 @@ impl<'a, O: Offset> GrowableList<'a, O> { let offsets = std::mem::take(&mut self.offsets); let values = self.values.as_box(); - #[cfg(debug_assertions)] - { - crate::array::specification::try_check_offsets(&offsets, values.len()).unwrap(); - } - - // Safety - the invariant of this struct ensures that this is up-held - unsafe { - ListArray::::new_unchecked( - self.arrays[0].data_type().clone(), - offsets.into(), - values, - validity.into(), - ) - } + ListArray::::new( + self.arrays[0].data_type().clone(), + offsets.into(), + values, + validity.into(), + ) } } @@ -128,8 +93,7 @@ impl<'a, O: Offset> Growable<'a> for GrowableList<'a, O> { } fn extend_validity(&mut self, additional: usize) { - self.offsets - .resize(self.offsets.len() + additional, self.last_offset); + self.offsets.extend_constant(additional); self.validity.extend_constant(additional, false); } diff --git a/src/array/growable/utf8.rs b/src/array/growable/utf8.rs index adfdba2cd53..5e901577901 100644 --- a/src/array/growable/utf8.rs +++ b/src/array/growable/utf8.rs @@ -1,12 +1,13 @@ use std::sync::Arc; use crate::{ - array::{Array, Offset, Utf8Array}, + array::{Array, Utf8Array}, bitmap::MutableBitmap, + offset::{Offset, Offsets}, }; use super::{ - utils::{build_extend_null_bits, extend_offset_values, extend_offsets, ExtendNullBits}, + utils::{build_extend_null_bits, extend_offset_values, ExtendNullBits}, Growable, }; @@ -15,8 +16,7 @@ pub struct GrowableUtf8<'a, O: Offset> { arrays: Vec<&'a Utf8Array>, validity: MutableBitmap, values: Vec, - offsets: Vec, - length: O, // always equal to the last offset at `offsets`. + offsets: Offsets, extend_null_bits: Vec>, } @@ -36,15 +36,10 @@ impl<'a, O: Offset> GrowableUtf8<'a, O> { .map(|array| build_extend_null_bits(*array, use_validity)) .collect(); - let mut offsets = Vec::with_capacity(capacity + 1); - let length = O::default(); - offsets.push(length); - Self { arrays: arrays.to_vec(), values: Vec::with_capacity(0), - offsets, - length, + offsets: Offsets::with_capacity(capacity), validity: MutableBitmap::with_capacity(capacity), extend_null_bits, } @@ -57,7 +52,7 @@ impl<'a, O: Offset> GrowableUtf8<'a, O> { #[cfg(debug_assertions)] { - crate::array::specification::try_check_offsets_and_utf8(&offsets, &values).unwrap(); + crate::array::specification::try_check_utf8(&offsets, &values).unwrap(); } unsafe { @@ -80,18 +75,16 @@ impl<'a, O: Offset> Growable<'a> for GrowableUtf8<'a, O> { let offsets = array.offsets(); let values = array.values(); - extend_offsets::( - &mut self.offsets, - &mut self.length, - &offsets[start..start + len + 1], - ); + self.offsets + .try_extend_from_slice(offsets, start, len) + .unwrap(); + // values - extend_offset_values::(&mut self.values, offsets, values, start, len); + extend_offset_values::(&mut self.values, offsets.as_slice(), values, start, len); } fn extend_validity(&mut self, additional: usize) { - self.offsets - .resize(self.offsets.len() + additional, self.length); + self.offsets.extend_constant(additional); self.validity.extend_constant(additional, false); } diff --git a/src/array/growable/utils.rs b/src/array/growable/utils.rs index 7e39df295de..06a85cd9ad4 100644 --- a/src/array/growable/utils.rs +++ b/src/array/growable/utils.rs @@ -1,17 +1,4 @@ -use crate::{ - array::{Array, Offset}, - bitmap::MutableBitmap, -}; - -pub(super) fn extend_offsets(buffer: &mut Vec, last_offset: &mut T, offsets: &[T]) { - buffer.reserve(offsets.len() - 1); - offsets.windows(2).for_each(|offsets| { - // compute the new offset - let length = offsets[1] - offsets[0]; - *last_offset += length; - buffer.push(*last_offset); - }); -} +use crate::{array::Array, bitmap::MutableBitmap, offset::Offset}; // function used to extend nulls from arrays. This function's lifetime is bound to the array // because it reads nulls from it. diff --git a/src/array/list/ffi.rs b/src/array/list/ffi.rs index c6f1b2985dd..2b6be75e782 100644 --- a/src/array/list/ffi.rs +++ b/src/array/list/ffi.rs @@ -1,13 +1,15 @@ use crate::{array::FromFfi, bitmap::align, error::Result, ffi}; -use super::super::{ffi::ToFfi, Array, Offset}; +use crate::offset::{Offset, OffsetsBuffer}; + +use super::super::{ffi::ToFfi, Array}; use super::ListArray; unsafe impl ToFfi for ListArray { fn buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - Some(self.offsets.as_ptr().cast::()), + Some(self.offsets.buffer().as_ptr().cast::()), ] } @@ -16,7 +18,7 @@ unsafe impl ToFfi for ListArray { } fn offset(&self) -> Option { - let offset = self.offsets.offset(); + let offset = self.offsets.buffer().offset(); if let Some(bitmap) = self.validity.as_ref() { if bitmap.offset() == offset { Some(offset) @@ -29,7 +31,7 @@ unsafe impl ToFfi for ListArray { } fn to_ffi_aligned(&self) -> Self { - let offset = self.offsets.offset(); + let offset = self.offsets.buffer().offset(); let validity = self.validity.as_ref().map(|bitmap| { if bitmap.offset() == offset { @@ -56,6 +58,9 @@ impl FromFfi for ListArray { let child = unsafe { array.child(0)? }; let values = ffi::try_from(child)?; - Ok(Self::from_data(data_type, offsets, values, validity)) + // assumption that data from FFI is well constructed + let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets) }; + + Ok(Self::new(data_type, offsets, values, validity)) } } diff --git a/src/array/list/fmt.rs b/src/array/list/fmt.rs index e6103ded6cb..4b10fc8f936 100644 --- a/src/array/list/fmt.rs +++ b/src/array/list/fmt.rs @@ -1,6 +1,6 @@ use std::fmt::{Debug, Formatter, Result, Write}; -use crate::array::Offset; +use crate::offset::Offset; use super::super::fmt::{get_display, write_vec}; use super::ListArray; diff --git a/src/array/list/iterator.rs b/src/array/list/iterator.rs index 82b5c7dca5f..86a12dfe769 100644 --- a/src/array/list/iterator.rs +++ b/src/array/list/iterator.rs @@ -1,6 +1,6 @@ -use crate::array::Offset; use crate::array::{Array, ArrayAccessor, ArrayValuesIter}; use crate::bitmap::utils::{BitmapIter, ZipValidity}; +use crate::offset::Offset; use super::ListArray; diff --git a/src/array/list/mod.rs b/src/array/list/mod.rs index feb5a5df93b..7c7e96c9c04 100644 --- a/src/array/list/mod.rs +++ b/src/array/list/mod.rs @@ -1,16 +1,12 @@ use crate::{ bitmap::Bitmap, - buffer::Buffer, datatypes::{DataType, Field}, error::Error, + offset::{Offset, OffsetsBuffer}, }; use std::sync::Arc; -use super::{ - new_empty_array, - specification::{try_check_offsets, try_check_offsets_bounds}, - Array, Offset, -}; +use super::{new_empty_array, specification::try_check_offsets_bounds, Array}; mod ffi; pub(super) mod fmt; @@ -23,7 +19,7 @@ pub use mutable::*; #[derive(Clone)] pub struct ListArray { data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Box, validity: Option, } @@ -33,24 +29,23 @@ impl ListArray { /// /// # Errors /// This function returns an error iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. + /// * the validity's length is not equal to `offsets.len()`. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either [`crate::datatypes::PhysicalType::List`] or [`crate::datatypes::PhysicalType::LargeList`]. /// * The `data_type`'s inner field's data type is not equal to `values.data_type`. /// # Implementation - /// This function is `O(N)` - checking monotinicity is `O(N)` + /// This function is `O(1)` pub fn try_new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Box, validity: Option, ) -> Result { - try_check_offsets(&offsets, values.len())?; + try_check_offsets_bounds(&offsets, values.len())?; if validity .as_ref() - .map_or(false, |validity| validity.len() != offsets.len() - 1) + .map_or(false, |validity| validity.len() != offsets.len()) { return Err(Error::oos( "validity mask length must match the number of values", @@ -77,16 +72,15 @@ impl ListArray { /// /// # Panics /// This function panics iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. + /// * the validity's length is not equal to `offsets.len()`. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either [`crate::datatypes::PhysicalType::List`] or [`crate::datatypes::PhysicalType::LargeList`]. /// * The `data_type`'s inner field's data type is not equal to `values.data_type`. /// # Implementation - /// This function is `O(N)` - checking monotinicity is `O(N)` + /// This function is `O(1)` pub fn new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Box, validity: Option, ) -> Self { @@ -96,7 +90,7 @@ impl ListArray { /// Alias of `new` pub fn from_data( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Box, validity: Option, ) -> Self { @@ -106,7 +100,7 @@ impl ListArray { /// Returns a new empty [`ListArray`]. pub fn new_empty(data_type: DataType) -> Self { let values = new_empty_array(Self::get_child_type(&data_type).clone()); - Self::new(data_type, Buffer::from(vec![O::zero()]), values, None) + Self::new(data_type, OffsetsBuffer::default(), values, None) } /// Returns a new null [`ListArray`]. @@ -115,7 +109,7 @@ impl ListArray { let child = Self::get_child_type(&data_type).clone(); Self::new( data_type, - vec![O::default(); 1 + length].into(), + vec![O::zero(); 1 + length].try_into().unwrap(), new_empty_array(child), Some(Bitmap::new_zeroed(length)), ) @@ -132,77 +126,6 @@ impl ListArray { } } -// unsafe construtors -impl ListArray { - /// Creates a new [`ListArray`]. - /// - /// # Errors - /// This function returns an error iff: - /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. - /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either [`crate::datatypes::PhysicalType::List`] or [`crate::datatypes::PhysicalType::LargeList`]. - /// * The `data_type`'s inner field's data type is not equal to `values.data_type`. - /// # Safety - /// This function is unsafe iff: - /// * the offsets are not monotonically increasing - /// # Implementation - /// This function is `O(1)` - pub unsafe fn try_new_unchecked( - data_type: DataType, - offsets: Buffer, - values: Box, - validity: Option, - ) -> Result { - try_check_offsets_bounds(&offsets, values.len())?; - - if validity - .as_ref() - .map_or(false, |validity| validity.len() != offsets.len() - 1) - { - return Err(Error::oos( - "validity mask length must match the number of values", - )); - } - - let child_data_type = Self::try_get_child(&data_type)?.data_type(); - let values_data_type = values.data_type(); - if child_data_type != values_data_type { - return Err(Error::oos( - format!("ListArray's child's DataType must match. However, the expected DataType is {child_data_type:?} while it got {values_data_type:?}."), - )); - } - - Ok(Self { - data_type, - offsets, - values, - validity, - }) - } - - /// Creates a new [`ListArray`]. - /// - /// # Panics - /// This function panics iff: - /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. - /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either [`crate::datatypes::PhysicalType::List`] or [`crate::datatypes::PhysicalType::LargeList`]. - /// * The `data_type`'s inner field's data type is not equal to `values.data_type`. - /// # Safety - /// This function is unsafe iff: - /// * the offsets are not monotonically increasing - /// # Implementation - /// This function is `O(1)` - pub unsafe fn new_unchecked( - data_type: DataType, - offsets: Buffer, - values: Box, - validity: Option, - ) -> Self { - Self::try_new_unchecked(data_type, offsets, values, validity).unwrap() - } -} - impl ListArray { /// Returns a slice of this [`ListArray`]. /// # Panics @@ -258,20 +181,17 @@ impl ListArray { /// Returns the length of this array #[inline] pub fn len(&self) -> usize { - self.offsets.len() - 1 + self.offsets.len() } /// Returns the element at index `i` + /// # Panic + /// Panics iff `i >= self.len()` #[inline] pub fn value(&self, i: usize) -> Box { - let offset = self.offsets[i]; - let offset_1 = self.offsets[i + 1]; - let length = (offset_1 - offset).to_usize(); - - // Safety: - // One of the invariants of the struct - // is that offsets are in bounds - unsafe { self.values.slice_unchecked(offset.to_usize(), length) } + assert!(i < self.len()); + // Safety: invariant of this function + unsafe { self.value_unchecked(i) } } /// Returns the element at index `i` as &str @@ -279,11 +199,12 @@ impl ListArray { /// Assumes that the `i < self.len`. #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> Box { - let offset = *self.offsets.get_unchecked(i); - let offset_1 = *self.offsets.get_unchecked(i + 1); - let length = (offset_1 - offset).to_usize(); + // safety: the invariant of the function + let (start, end) = self.offsets.start_end_unchecked(i); + let length = end - start; - self.values.slice_unchecked(offset.to_usize(), length) + // safety: the invariant of the struct + self.values.slice_unchecked(start, length) } /// The optional validity. @@ -294,7 +215,7 @@ impl ListArray { /// The offsets [`Buffer`]. #[inline] - pub fn offsets(&self) -> &Buffer { + pub fn offsets(&self) -> &OffsetsBuffer { &self.offsets } diff --git a/src/array/list/mutable.rs b/src/array/list/mutable.rs index 57017998d0b..98280dc89f7 100644 --- a/src/array/list/mutable.rs +++ b/src/array/list/mutable.rs @@ -2,13 +2,13 @@ use std::sync::Arc; use crate::{ array::{ - physical_binary::{extend_validity, try_extend_offsets}, - specification::try_check_offsets, - Array, MutableArray, Offset, TryExtend, TryExtendFromSelf, TryPush, + physical_binary::extend_validity, Array, MutableArray, TryExtend, TryExtendFromSelf, + TryPush, }, bitmap::MutableBitmap, datatypes::{DataType, Field}, error::{Error, Result}, + offset::{Offset, Offsets}, trusted_len::TrustedLen, }; @@ -18,7 +18,7 @@ use super::ListArray; #[derive(Debug, Clone)] pub struct MutableListArray { data_type: DataType, - offsets: Vec, + offsets: Offsets, values: M, validity: Option, } @@ -36,8 +36,7 @@ impl MutableListArray { let values = M::default(); let data_type = ListArray::::default_datatype(values.data_type().clone()); - let mut offsets = Vec::::with_capacity(capacity + 1); - offsets.push(O::default()); + let offsets = Offsets::::with_capacity(capacity); Self { data_type, offsets, @@ -55,16 +54,12 @@ impl Default for MutableListArray { impl From> for ListArray { fn from(mut other: MutableListArray) -> Self { - // Safety: - // MutableListArray has monotonically increasing offsets - unsafe { - ListArray::new_unchecked( - other.data_type, - other.offsets.into(), - other.values.as_box(), - other.validity.map(|x| x.into()), - ) - } + ListArray::new( + other.data_type, + other.offsets.into(), + other.values.as_box(), + other.validity.map(|x| x.into()), + ) } } @@ -112,16 +107,14 @@ where extend_validity(self.len(), &mut self.validity, &other.validity); self.values.try_extend_from_self(&other.values)?; - - try_extend_offsets(&mut self.offsets, &other.offsets) + self.offsets.try_extend_from_self(&other.offsets) } } impl MutableListArray { /// Creates a new [`MutableListArray`] from a [`MutableArray`] and capacity. pub fn new_from(values: M, data_type: DataType, capacity: usize) -> Self { - let mut offsets = Vec::::with_capacity(capacity + 1); - offsets.push(O::default()); + let offsets = Offsets::::with_capacity(capacity); assert_eq!(values.len(), 0); ListArray::::get_child_field(&data_type); Self { @@ -153,11 +146,11 @@ impl MutableListArray { /// Needs to be called when a valid value was extended to this array. /// This is a relatively low level function, prefer `try_push` when you can. pub fn try_push_valid(&mut self) -> Result<()> { - let size = self.values.len(); - let size = O::from_usize(size).ok_or(Error::Overflow)?; - assert!(size >= *self.offsets.last().unwrap()); + let total_length = self.values.len(); + let offset = self.offsets.last().to_usize(); + let length = total_length.checked_sub(offset).ok_or(Error::Overflow)?; - self.offsets.push(size); + self.offsets.try_push_usize(length)?; if let Some(validity) = &mut self.validity { validity.push(true) } @@ -166,7 +159,7 @@ impl MutableListArray { #[inline] fn push_null(&mut self) { - self.offsets.push(self.last_offset()); + self.offsets.extend_constant(1); match &mut self.validity { Some(validity) => validity.push(false), None => self.init_validity(), @@ -175,79 +168,30 @@ impl MutableListArray { /// Expand this array, using elements from the underlying backing array. /// Assumes the expansion begins at the highest previous offset, or zero if - /// this [MutableListArray] is currently empty. + /// this [`MutableListArray`] is currently empty. /// /// Panics if: /// - the new offsets are not in monotonic increasing order. /// - any new offset is not in bounds of the backing array. /// - the passed iterator has no upper bound. #[allow(dead_code)] - pub(crate) fn extend_offsets(&mut self, expansion: II) - where - II: TrustedLen>, - { - let current_len = self.offsets.len(); - let (_, upper) = expansion.size_hint(); - let upper = upper.expect("iterator must have upper bound"); - if current_len == 0 && upper > 0 { - self.offsets.push(O::zero()); - } - // safety: checked below - unsafe { self.unsafe_extend_offsets(expansion) }; - if self.offsets.len() > current_len { - // check all inserted offsets - try_check_offsets(&self.offsets[current_len..], self.values.len()) - .expect("invalid offsets"); - } - // else expansion is empty, and this is trivially safe. - } - - /// Expand this array, using elements from the underlying backing array. - /// Assumes the expansion begins at the highest previous offset, or zero if - /// this [MutableListArray] is currently empty. - /// - /// # Safety - /// - /// Assumes that `offsets` are in order, and do not overrun the underlying - /// `values` backing array. - /// - /// Also assumes the expansion begins at the highest previous offset, or - /// zero if the array is currently empty. - /// - /// Panics if the passed iterator has no upper bound. - #[allow(dead_code)] - pub(crate) unsafe fn unsafe_extend_offsets(&mut self, expansion: II) + pub(crate) fn try_extend_from_lengths(&mut self, iterator: II) -> Result<()> where - II: TrustedLen>, + II: TrustedLen> + Clone, { - let (_, upper) = expansion.size_hint(); - let upper = upper.expect("iterator must have upper bound"); - let final_size = self.len() + upper; - self.offsets.reserve(upper); - - for item in expansion { - match item { - Some(offset) => { - self.offsets.push(offset); - if let Some(validity) = &mut self.validity { - validity.push(true); - } - } - None => self.push_null(), - } - - if let Some(validity) = &mut self.validity { - if validity.capacity() < final_size { - validity.reserve(final_size - validity.capacity()); - } - } + self.offsets + .try_extend_from_lengths(iterator.clone().map(|x| x.unwrap_or_default()))?; + if let Some(validity) = &mut self.validity { + validity.extend_from_trusted_len_iter(iterator.map(|x| x.is_some())) } + assert_eq!(self.offsets.last().to_usize(), self.values.len()); + Ok(()) } /// Returns the length of this array #[inline] pub fn len(&self) -> usize { - self.offsets.len() - 1 + self.offsets.len() } /// The values @@ -256,7 +200,7 @@ impl MutableListArray { } /// The offsets - pub fn offsets(&self) -> &Vec { + pub fn offsets(&self) -> &Offsets { &self.offsets } @@ -265,13 +209,8 @@ impl MutableListArray { &self.values } - #[inline] - fn last_offset(&self) -> O { - *self.offsets.last().unwrap() - } - fn init_validity(&mut self) { - let len = self.offsets.len() - 1; + let len = self.offsets.len(); let mut validity = MutableBitmap::with_capacity(self.offsets.capacity()); validity.extend_constant(len, true); @@ -319,29 +258,23 @@ impl MutableArray for MutableListArray Box { - // Safety: - // MutableListArray has monotonically increasing offsets - Box::new(unsafe { - ListArray::new_unchecked( - self.data_type.clone(), - std::mem::take(&mut self.offsets).into(), - self.values.as_box(), - std::mem::take(&mut self.validity).map(|x| x.into()), - ) - }) + ListArray::new( + self.data_type.clone(), + std::mem::take(&mut self.offsets).into(), + self.values.as_box(), + std::mem::take(&mut self.validity).map(|x| x.into()), + ) + .boxed() } fn as_arc(&mut self) -> Arc { - // Safety: - // MutableListArray has monotonically increasing offsets - Arc::new(unsafe { - ListArray::new_unchecked( - self.data_type.clone(), - std::mem::take(&mut self.offsets).into(), - self.values.as_box(), - std::mem::take(&mut self.validity).map(|x| x.into()), - ) - }) + ListArray::new( + self.data_type.clone(), + std::mem::take(&mut self.offsets).into(), + self.values.as_box(), + std::mem::take(&mut self.validity).map(|x| x.into()), + ) + .arced() } fn data_type(&self) -> &DataType { diff --git a/src/array/map/ffi.rs b/src/array/map/ffi.rs index bbf3846999a..09920419c21 100644 --- a/src/array/map/ffi.rs +++ b/src/array/map/ffi.rs @@ -1,4 +1,4 @@ -use crate::{array::FromFfi, bitmap::align, error::Result, ffi}; +use crate::{array::FromFfi, bitmap::align, error::Result, ffi, offset::OffsetsBuffer}; use super::super::{ffi::ToFfi, Array}; use super::MapArray; @@ -7,7 +7,7 @@ unsafe impl ToFfi for MapArray { fn buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - Some(self.offsets.as_ptr().cast::()), + Some(self.offsets.buffer().as_ptr().cast::()), ] } @@ -16,7 +16,7 @@ unsafe impl ToFfi for MapArray { } fn offset(&self) -> Option { - let offset = self.offsets.offset(); + let offset = self.offsets.buffer().offset(); if let Some(bitmap) = self.validity.as_ref() { if bitmap.offset() == offset { Some(offset) @@ -29,7 +29,7 @@ unsafe impl ToFfi for MapArray { } fn to_ffi_aligned(&self) -> Self { - let offset = self.offsets.offset(); + let offset = self.offsets.buffer().offset(); let validity = self.validity.as_ref().map(|bitmap| { if bitmap.offset() == offset { @@ -56,6 +56,9 @@ impl FromFfi for MapArray { let child = array.child(0)?; let values = ffi::try_from(child)?; + // assumption that data from FFI is well constructed + let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets) }; + Self::try_new(data_type, offsets, values, validity) } } diff --git a/src/array/map/mod.rs b/src/array/map/mod.rs index 51ca56a8459..fe8adb1deab 100644 --- a/src/array/map/mod.rs +++ b/src/array/map/mod.rs @@ -1,11 +1,11 @@ use crate::{ bitmap::Bitmap, - buffer::Buffer, datatypes::{DataType, Field}, error::Error, + offset::OffsetsBuffer, }; -use super::{new_empty_array, specification::try_check_offsets, Array}; +use super::{new_empty_array, specification::try_check_offsets_bounds, Array}; mod ffi; mod fmt; @@ -16,8 +16,8 @@ pub use iterator::*; #[derive(Clone)] pub struct MapArray { data_type: DataType, - // invariant: field.len() == offsets.len() - 1 - offsets: Buffer, + // invariant: field.len() == offsets.len() + offsets: OffsetsBuffer, field: Box, // invariant: offsets.len() - 1 == Bitmap::len() validity: Option, @@ -27,18 +27,17 @@ impl MapArray { /// Returns a new [`MapArray`]. /// # Errors /// This function errors iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the field' length /// * The `data_type`'s physical type is not [`crate::datatypes::PhysicalType::Map`] /// * The fields' `data_type` is not equal to the inner field of `data_type` /// * The validity is not `None` and its length is different from `offsets.len() - 1`. pub fn try_new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, field: Box, validity: Option, ) -> Result { - try_check_offsets(&offsets, field.len())?; + try_check_offsets_bounds(&offsets, field.len())?; let inner_field = Self::try_get_field(&data_type)?; if let DataType::Struct(inner) = inner_field.data_type() { @@ -60,7 +59,7 @@ impl MapArray { if validity .as_ref() - .map_or(false, |validity| validity.len() != offsets.len() - 1) + .map_or(false, |validity| validity.len() != offsets.len()) { return Err(Error::oos( "validity mask length must match the number of values", @@ -77,13 +76,12 @@ impl MapArray { /// Creates a new [`MapArray`]. /// # Panics - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the field' length. /// * The `data_type`'s physical type is not [`crate::datatypes::PhysicalType::Map`], /// * The validity is not `None` and its length is different from `offsets.len() - 1`. pub fn new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, field: Box, validity: Option, ) -> Self { @@ -93,7 +91,7 @@ impl MapArray { /// Alias for `new` pub fn from_data( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, field: Box, validity: Option, ) -> Self { @@ -105,7 +103,7 @@ impl MapArray { let field = new_empty_array(Self::get_field(&data_type).data_type().clone()); Self::new( data_type, - vec![0i32; 1 + length].into(), + vec![0i32; 1 + length].try_into().unwrap(), field, Some(Bitmap::new_zeroed(length)), ) @@ -114,7 +112,7 @@ impl MapArray { /// Returns a new empty [`MapArray`]. pub fn new_empty(data_type: DataType) -> Self { let field = new_empty_array(Self::get_field(&data_type).data_type().clone()); - Self::new(data_type, Buffer::from(vec![0i32]), field, None) + Self::new(data_type, OffsetsBuffer::default(), field, None) } /// Returns this [`MapArray`] with a new validity. @@ -197,12 +195,12 @@ impl MapArray { /// Returns the length of this array #[inline] pub fn len(&self) -> usize { - self.offsets.len() - 1 + self.offsets.len() } /// returns the offsets #[inline] - pub fn offsets(&self) -> &Buffer { + pub fn offsets(&self) -> &OffsetsBuffer { &self.offsets } @@ -215,14 +213,8 @@ impl MapArray { /// Returns the element at index `i`. #[inline] pub fn value(&self, i: usize) -> Box { - let offset = self.offsets[i]; - let offset_1 = self.offsets[i + 1]; - let length = (offset_1 - offset) as usize; - - // Safety: - // One of the invariants of the struct - // is that offsets are in bounds - unsafe { self.field.slice_unchecked(offset as usize, length) } + assert!(i < self.len()); + unsafe { self.value_unchecked(i) } } /// Returns the element at index `i`. @@ -230,11 +222,12 @@ impl MapArray { /// Assumes that the `i < self.len`. #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> Box { - let offset = *self.offsets.get_unchecked(i); - let offset_1 = *self.offsets.get_unchecked(i + 1); - let length = (offset_1 - offset) as usize; + // soundness: the invariant of the function + let (start, end) = self.offsets.start_end_unchecked(i); + let length = end - start; - self.field.slice_unchecked(offset as usize, length) + // soundness: the invariant of the struct + self.field.slice_unchecked(start, length) } } diff --git a/src/array/mod.rs b/src/array/mod.rs index 45e83a80803..5aa1dd1eb64 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -439,7 +439,6 @@ pub use iterator::ArrayValuesIter; pub use equal::equal; pub use fmt::{get_display, get_value_display}; -pub use crate::types::Offset; pub use binary::{BinaryArray, BinaryValueIter, MutableBinaryArray, MutableBinaryValuesArray}; pub use boolean::{BooleanArray, MutableBooleanArray}; pub use dictionary::{DictionaryArray, DictionaryKey, MutableDictionaryArray}; @@ -483,7 +482,7 @@ pub trait TryExtendFromSelf { /// 1. `offsets.len() > 0` /// 2. `offsets[i] >= offsets[i-1] for all i` /// 3. `offsets[i] < values.len() for all i` -pub unsafe trait GenericBinaryArray: Array { +pub unsafe trait GenericBinaryArray: Array { /// The values of the array fn values(&self) -> &[u8]; /// The offsets of the array diff --git a/src/array/ord.rs b/src/array/ord.rs index 639317165ab..e42c7fa0fc7 100644 --- a/src/array/ord.rs +++ b/src/array/ord.rs @@ -4,6 +4,7 @@ use std::cmp::Ordering; use crate::datatypes::*; use crate::error::{Error, Result}; +use crate::offset::Offset; use crate::{array::*, types::NativeType}; /// Compare the values at two arbitrary indices in two arrays. diff --git a/src/array/physical_binary.rs b/src/array/physical_binary.rs index bf883f4e6e8..adbf62d6c27 100644 --- a/src/array/physical_binary.rs +++ b/src/array/physical_binary.rs @@ -1,6 +1,5 @@ -use crate::array::Offset; use crate::bitmap::MutableBitmap; -use crate::error::Error; +use crate::offset::{Offset, Offsets}; /// # Safety /// The caller must ensure that `iterator` is `TrustedLen`. @@ -8,7 +7,7 @@ use crate::error::Error; #[allow(clippy::type_complexity)] pub(crate) unsafe fn try_trusted_len_unzip( iterator: I, -) -> std::result::Result<(Option, Vec, Vec), E> +) -> std::result::Result<(Option, Offsets, Vec), E> where O: Offset, P: AsRef<[u8]>, @@ -45,7 +44,7 @@ where ); offsets.set_len(len + 1); - Ok((null.into(), offsets, values)) + Ok((null.into(), Offsets::new_unchecked(offsets), values)) } /// Creates [`MutableBitmap`] and two [`Vec`]s from an iterator of `Option`. @@ -56,7 +55,7 @@ where #[inline] pub(crate) unsafe fn trusted_len_unzip( iterator: I, -) -> (Option, Vec, Vec) +) -> (Option, Offsets, Vec) where O: Offset, P: AsRef<[u8]>, @@ -65,12 +64,10 @@ where let (_, upper) = iterator.size_hint(); let len = upper.expect("trusted_len_unzip requires an upper limit"); - let mut offsets = Vec::::with_capacity(len + 1); + let mut offsets = Offsets::::with_capacity(len); let mut values = Vec::::new(); let mut validity = MutableBitmap::new(); - offsets.push(O::default()); - extend_from_trusted_len_iter(&mut offsets, &mut values, &mut validity, iterator); let validity = if validity.unset_bits() > 0 { @@ -87,7 +84,7 @@ where /// # Safety /// The caller must ensure that `iterator` is [`TrustedLen`]. #[inline] -pub(crate) unsafe fn trusted_len_values_iter(iterator: I) -> (Vec, Vec) +pub(crate) unsafe fn trusted_len_values_iter(iterator: I) -> (Offsets, Vec) where O: Offset, P: AsRef<[u8]>, @@ -96,11 +93,9 @@ where let (_, upper) = iterator.size_hint(); let len = upper.expect("trusted_len_unzip requires an upper limit"); - let mut offsets = Vec::::with_capacity(len + 1); + let mut offsets = Offsets::::with_capacity(len); let mut values = Vec::::new(); - offsets.push(O::default()); - extend_from_trusted_len_values_iter(&mut offsets, &mut values, iterator); (offsets, values) @@ -112,7 +107,7 @@ where // The caller must ensure the `iterator` is [`TrustedLen`] #[inline] pub(crate) unsafe fn extend_from_trusted_len_values_iter( - offsets: &mut Vec, + offsets: &mut Offsets, values: &mut Vec, iterator: I, ) where @@ -120,42 +115,13 @@ pub(crate) unsafe fn extend_from_trusted_len_values_iter( P: AsRef<[u8]>, I: Iterator, { - let (_, upper) = iterator.size_hint(); - let additional = upper.expect("extend_from_trusted_len_values_iter requires an upper limit"); - - offsets.reserve(additional); - - // Read in the last offset, will be used to increment and store - // new values later on - let mut length = *offsets.last().unwrap(); - - // Get a mutable pointer to the `offsets`, and move the pointer - // to the position, where a new value will be written - let mut dst = offsets.as_mut_ptr(); - dst = dst.add(offsets.len()); - - for item in iterator { + let lengths = iterator.map(|item| { let s = item.as_ref(); - - // Calculate the new offset value - length += O::from_usize(s.len()).unwrap(); - // Push new entries for both `values` and `offsets` buffer values.extend_from_slice(s); - std::ptr::write(dst, length); - - // Move to the next position in offset buffer - dst = dst.add(1); - } - - debug_assert_eq!( - dst.offset_from(offsets.as_ptr()) as usize, - offsets.len() + additional, - "TrustedLen iterator's length was not accurately reported" - ); - - // We make sure to set the new length for the `offsets` buffer - offsets.set_len(offsets.len() + additional); + s.len() + }); + offsets.try_extend_from_lengths(lengths).unwrap(); } // Populates `offsets` and `values` [`Vec`]s with information extracted @@ -163,7 +129,7 @@ pub(crate) unsafe fn extend_from_trusted_len_values_iter( // the return value indicates how many items were added. #[inline] pub(crate) fn extend_from_values_iter( - offsets: &mut Vec, + offsets: &mut Offsets, values: &mut Vec, iterator: I, ) -> usize @@ -176,18 +142,12 @@ where offsets.reserve(size_hint); - // Read in the last offset, will be used to increment and store - // new values later on - let mut length = *offsets.last().unwrap(); let start_index = offsets.len(); for item in iterator { - let s = item.as_ref(); - // Calculate the new offset value - length += O::from_usize(s.len()).unwrap(); - - values.extend_from_slice(s); - offsets.push(length); + let bytes = item.as_ref(); + values.extend_from_slice(bytes); + offsets.try_push_usize(bytes.len()).unwrap(); } offsets.len() - start_index } @@ -199,7 +159,7 @@ where // The caller must ensure that `iterator` is [`TrustedLen`] #[inline] pub(crate) unsafe fn extend_from_trusted_len_iter( - offsets: &mut Vec, + offsets: &mut Offsets, values: &mut Vec, validity: &mut MutableBitmap, iterator: I, @@ -214,51 +174,24 @@ pub(crate) unsafe fn extend_from_trusted_len_iter( offsets.reserve(additional); validity.reserve(additional); - // Read in the last offset, will be used to increment and store - // new values later on - let mut length = *offsets.last().unwrap(); - - // Get a mutable pointer to the `offsets`, and move the pointer - // to the position, where a new value will be written - let mut dst = offsets.as_mut_ptr(); - dst = dst.add(offsets.len()); - - for item in iterator { + let lengths = iterator.map(|item| { if let Some(item) = item { let bytes = item.as_ref(); - - // Calculate new offset value - length += O::from_usize(bytes.len()).unwrap(); - - // Push new values for `values` and `validity` buffer values.extend_from_slice(bytes); validity.push_unchecked(true); + bytes.len() } else { - // If `None`, update only `validity` validity.push_unchecked(false); + 0 } - - // Push new offset or old offset depending on the `item` - std::ptr::write(dst, length); - - // Move to the next position in offset buffer - dst = dst.add(1); - } - - debug_assert_eq!( - dst.offset_from(offsets.as_ptr()) as usize, - offsets.len() + additional, - "TrustedLen iterator's length was not accurately reported" - ); - - // We make sure to set the new length for the `offsets` buffer - offsets.set_len(offsets.len() + additional); + }); + offsets.try_extend_from_lengths(lengths).unwrap(); } /// Creates two [`Vec`]s from an iterator of `&[u8]`. /// The first buffer corresponds to a offset buffer, the second to a values buffer. #[inline] -pub(crate) fn values_iter(iterator: I) -> (Vec, Vec) +pub(crate) fn values_iter(iterator: I) -> (Offsets, Vec) where O: Offset, P: AsRef<[u8]>, @@ -266,40 +199,17 @@ where { let (lower, _) = iterator.size_hint(); - let mut offsets = Vec::::with_capacity(lower + 1); + let mut offsets = Offsets::::with_capacity(lower); let mut values = Vec::::new(); - let mut length = O::default(); - offsets.push(length); - for item in iterator { let s = item.as_ref(); - length += O::from_usize(s.len()).unwrap(); values.extend_from_slice(s); - - offsets.push(length) + offsets.try_push_usize(s.len()).unwrap(); } (offsets, values) } -/// Extends `offsets` with all offsets from `other` -#[inline] -pub(crate) fn try_extend_offsets(offsets: &mut Vec, other: &[O]) -> Result<(), Error> -where - O: Offset, -{ - let lengths = other.windows(2).map(|w| w[1] - w[0]); - let mut last = *offsets.last().unwrap(); - - offsets.reserve(other.len() - 1); - for length in lengths { - let r = last.checked_add(&length).ok_or(Error::Overflow)?; - last += length; - offsets.push(r) - } - Ok(()) -} - /// Extends `validity` with all items from `other` pub(crate) fn extend_validity( length: usize, diff --git a/src/array/specification.rs b/src/array/specification.rs index 7b1e0d86640..021cbd5c80c 100644 --- a/src/array/specification.rs +++ b/src/array/specification.rs @@ -1,110 +1,109 @@ use crate::error::{Error, Result}; -use crate::types::Offset; +use crate::offset::{Offset, Offsets, OffsetsBuffer}; -pub fn try_check_offsets_bounds(offsets: &[O], values_len: usize) -> Result { - if let Some(last_offset) = offsets.last() { - if last_offset.to_usize() > values_len { - Err(Error::oos("offsets must not exceed the values length")) - } else { - Ok(last_offset.to_usize()) - } - } else { - Err(Error::oos("offsets must have at least one element")) +/// Helper trait to support `Offset` and `OffsetBuffer` +pub(crate) trait OffsetsContainer { + fn last(&self) -> usize; + fn as_slice(&self) -> &[O]; +} + +impl OffsetsContainer for OffsetsBuffer { + #[inline] + fn last(&self) -> usize { + self.last().to_usize() + } + + #[inline] + fn as_slice(&self) -> &[O] { + self.buffer() } } -pub fn check_offsets_minimal(offsets: &[O], values_len: usize) -> usize { - assert!( - !offsets.is_empty(), - "The length of the offset buffer must be larger than 1" - ); - let len = offsets.len() - 1; - - let last_offset = offsets[len]; - let last_offset = last_offset.to_usize(); - - assert_eq!( - values_len, last_offset, - "The length of the values must be equal to the last offset value" - ); - len +impl OffsetsContainer for Offsets { + #[inline] + fn last(&self) -> usize { + self.last().to_usize() + } + + #[inline] + fn as_slice(&self) -> &[O] { + self.as_slice() + } } -/// # Panics iff: -/// * the `offsets` is not monotonically increasing, or -/// * any slice of `values` between two consecutive pairs from `offsets` is invalid `utf8`, or -/// * any offset is larger or equal to `values_len`. -pub fn try_check_offsets_and_utf8(offsets: &[O], values: &[u8]) -> Result<()> { - if values.is_ascii() { - try_check_offsets(offsets, values.len()) +pub(crate) fn try_check_offsets_bounds>( + offsets: &C, + values_len: usize, +) -> Result<()> { + if offsets.last() > values_len { + Err(Error::oos("offsets must not exceed the values length")) } else { - simdutf8::basic::from_utf8(values)?; + Ok(()) + } +} - for window in offsets.windows(2) { - let start = window[0].to_usize(); - let end = window[1].to_usize(); +/// # Error +/// * any offset is larger or equal to `values_len`. +/// * any slice of `values` between two consecutive pairs from `offsets` is invalid `utf8`, or +pub(crate) fn try_check_utf8>( + offsets: &C, + values: &[u8], +) -> Result<()> { + if offsets.as_slice().len() == 1 { + return Ok(()); + } - // check monotonicity - if start > end { - return Err(Error::oos("offsets must be monotonically increasing")); - } + try_check_offsets_bounds(offsets, values.len())?; - let first = values.get(start); + if values.is_ascii() { + Ok(()) + } else { + simdutf8::basic::from_utf8(values)?; - if let Some(&b) = first { - // A valid code-point iff it does not start with 0b10xxxxxx - // Bit-magic taken from `std::str::is_char_boundary` - if (b as i8) < -0x40 { - return Err(Error::oos("Non-valid char boundary detected")); - } - } - } - // check bounds - if offsets - .last() - .map_or(true, |last| last.to_usize() > values.len()) - { - return Err(Error::oos( - "offsets must have at least one element and must not exceed values length", - )); + // offsets can be == values.len() + // find first offset from the end that is smaller + // Example: + // values.len() = 10 + // offsets = [0, 5, 10, 10] + let offsets = offsets.as_slice(); + let last = offsets + .iter() + .enumerate() + .skip(1) + .rev() + .find_map(|(i, offset)| (offset.to_usize() < values.len()).then(|| i)); + + let last = if let Some(last) = last { + // following the example: last = 1 (offset = 5) + last + } else { + // given `l = values.len()`, this branch is hit iff either: + // * `offsets = [0, l, l, ...]`, which was covered by `from_utf8(values)` above + // * `offsets = [0]`, which never happens because offsets.as_slice().len() == 1 is short-circuited above + return Ok(()); }; - Ok(()) - } -} + // trucate to relevant offsets. Note: `=last` because last was computed skipping the first item + // following the example: starts = [0, 5] + let starts = unsafe { offsets.get_unchecked(..=last) }; -/// Checks that `offsets` is monotonically increasing, and all offsets are less than or equal to -/// `values_len`. -pub fn try_check_offsets(offsets: &[O], values_len: usize) -> Result<()> { - // this code is carefully constructed to auto-vectorize, don't change naively! - match offsets.first() { - None => Err(Error::oos("offsets must have at least one element")), - Some(first) => { - let mut previous = *first; - let mut any_invalid = false; - - // This loop will auto-vectorize because there is not any break, - // an invalid value will be returned once the whole offsets buffer is processed. - for offset in offsets { - if previous > *offset { - any_invalid = true - } - previous = *offset; - } + let mut any_invalid = false; + for start in starts { + let start = start.to_usize(); + + // Safety: `try_check_offsets_bounds` just checked for bounds + let b = *unsafe { values.get_unchecked(start) }; - if any_invalid { - Err(Error::oos("offsets must be monotonically increasing")) - } else if offsets - .last() - .map_or(true, |last| last.to_usize() > values_len) - { - Err(Error::oos( - "offsets must have at least one element and must not exceed values length", - )) - } else { - Ok(()) + // A valid code-point iff it does not start with 0b10xxxxxx + // Bit-magic taken from `std::str::is_char_boundary` + if (b as i8) < -0x40 { + any_invalid = true } } + if any_invalid { + return Err(Error::oos("Non-valid char boundary detected")); + } + Ok(()) } } @@ -136,18 +135,18 @@ mod tests { proptest! { // a bit expensive, feel free to run it when changing the code above - //#![proptest_config(ProptestConfig::with_cases(100000))] + // #![proptest_config(ProptestConfig::with_cases(100000))] #[test] #[cfg_attr(miri, ignore)] // miri and proptest do not work well fn check_utf8_validation(values in binary_strategy()) { for offset in 0..values.len() - 1 { - let offsets = vec![0, offset as i32, values.len() as i32]; + let offsets = vec![0, offset as i32, values.len() as i32].try_into().unwrap(); let mut is_valid = std::str::from_utf8(&values[..offset]).is_ok(); is_valid &= std::str::from_utf8(&values[offset..]).is_ok(); - assert_eq!(try_check_offsets_and_utf8::(&offsets, &values).is_ok(), is_valid) + assert_eq!(try_check_utf8::>(&offsets, &values).is_ok(), is_valid) } } } diff --git a/src/array/utf8/ffi.rs b/src/array/utf8/ffi.rs index abaed6b28e7..3611678da57 100644 --- a/src/array/utf8/ffi.rs +++ b/src/array/utf8/ffi.rs @@ -1,8 +1,9 @@ use crate::{ - array::{FromFfi, Offset, ToFfi}, + array::{FromFfi, ToFfi}, bitmap::align, error::Result, ffi, + offset::{Offset, OffsetsBuffer}, }; use super::Utf8Array; @@ -11,13 +12,13 @@ unsafe impl ToFfi for Utf8Array { fn buffers(&self) -> Vec> { vec![ self.validity.as_ref().map(|x| x.as_ptr()), - Some(self.offsets.as_ptr().cast::()), + Some(self.offsets.buffer().as_ptr().cast::()), Some(self.values.as_ptr().cast::()), ] } fn offset(&self) -> Option { - let offset = self.offsets.offset(); + let offset = self.offsets.buffer().offset(); if let Some(bitmap) = self.validity.as_ref() { if bitmap.offset() == offset { Some(offset) @@ -30,7 +31,7 @@ unsafe impl ToFfi for Utf8Array { } fn to_ffi_aligned(&self) -> Self { - let offset = self.offsets.offset(); + let offset = self.offsets.buffer().offset(); let validity = self.validity.as_ref().map(|bitmap| { if bitmap.offset() == offset { @@ -56,8 +57,9 @@ impl FromFfi for Utf8Array { let offsets = unsafe { array.buffer::(1) }?; let values = unsafe { array.buffer::(2)? }; - Ok(Self::from_data_unchecked( - data_type, offsets, values, validity, - )) + // assumption that data from FFI is well constructed + let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets) }; + + Ok(Self::new_unchecked(data_type, offsets, values, validity)) } } diff --git a/src/array/utf8/fmt.rs b/src/array/utf8/fmt.rs index 6ea28feae12..1b6868c4a0e 100644 --- a/src/array/utf8/fmt.rs +++ b/src/array/utf8/fmt.rs @@ -1,7 +1,8 @@ use std::fmt::{Debug, Formatter, Result, Write}; +use crate::offset::Offset; + use super::super::fmt::write_vec; -use super::super::Offset; use super::Utf8Array; pub fn write_value(array: &Utf8Array, index: usize, f: &mut W) -> Result { diff --git a/src/array/utf8/from.rs b/src/array/utf8/from.rs index 1a0a0a1f7e2..f6866998312 100644 --- a/src/array/utf8/from.rs +++ b/src/array/utf8/from.rs @@ -1,6 +1,6 @@ use std::iter::FromIterator; -use crate::array::Offset; +use crate::offset::Offset; use super::{MutableUtf8Array, Utf8Array}; diff --git a/src/array/utf8/iterator.rs b/src/array/utf8/iterator.rs index 071fc54d8ae..2a5ba87c3e8 100644 --- a/src/array/utf8/iterator.rs +++ b/src/array/utf8/iterator.rs @@ -1,5 +1,6 @@ -use crate::array::{ArrayAccessor, ArrayValuesIter, Offset}; +use crate::array::{ArrayAccessor, ArrayValuesIter}; use crate::bitmap::utils::{BitmapIter, ZipValidity}; +use crate::offset::Offset; use super::{MutableUtf8Array, MutableUtf8ValuesArray, Utf8Array}; diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index f91e1466451..f8b8b86a8b8 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -6,14 +6,15 @@ use crate::{ buffer::Buffer, datatypes::DataType, error::{Error, Result}, + offset::{Offset, OffsetsBuffer}, trusted_len::TrustedLen, }; use either::Either; use super::{ - specification::{try_check_offsets_and_utf8, try_check_offsets_bounds}, - Array, GenericBinaryArray, Offset, + specification::{try_check_offsets_bounds, try_check_utf8}, + Array, GenericBinaryArray, }; mod ffi; @@ -50,7 +51,7 @@ impl> AsRef<[u8]> for StrAsBytes { /// // the underlying representation /// assert_eq!(array.validity(), Some(&Bitmap::from([true, false, true]))); /// assert_eq!(array.values(), &Buffer::from(b"hithere".to_vec())); -/// assert_eq!(array.offsets(), &Buffer::from(vec![0, 2, 2, 2 + 5])); +/// assert_eq!(array.offsets().buffer(), &Buffer::from(vec![0, 2, 2, 2 + 5])); /// # } /// ``` /// @@ -68,7 +69,7 @@ impl> AsRef<[u8]> for StrAsBytes { #[derive(Clone)] pub struct Utf8Array { data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, } @@ -79,23 +80,22 @@ impl Utf8Array { /// /// # Errors /// This function returns an error iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. + /// * the validity's length is not equal to `offsets.len()`. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`. /// * The `values` between two consecutive `offsets` are not valid utf8 /// # Implementation - /// This function is `O(N)` - checking monotinicity and utf8 is `O(N)` + /// This function is `O(N)` - checking utf8 is `O(N)` pub fn try_new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Result { - try_check_offsets_and_utf8(&offsets, &values)?; + try_check_utf8(&offsets, &values)?; if validity .as_ref() - .map_or(false, |validity| validity.len() != offsets.len() - 1) + .map_or(false, |validity| validity.len() != offsets.len()) { return Err(Error::oos( "validity mask length must match the number of values", @@ -144,7 +144,7 @@ impl Utf8Array { /// Returns the length of this array #[inline] pub fn len(&self) -> usize { - self.offsets.len() - 1 + self.offsets.len() } /// Returns the value of the element at index `i`, ignoring the array's validity. @@ -162,8 +162,7 @@ impl Utf8Array { #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> &str { // soundness: the invariant of the function - let start = self.offsets.get_unchecked(i).to_usize(); - let end = self.offsets.get_unchecked(i + 1).to_usize(); + let (start, end) = self.offsets.start_end_unchecked(i); // soundness: the invariant of the struct let slice = self.values.get_unchecked(start..end); @@ -186,7 +185,7 @@ impl Utf8Array { /// Returns the offsets of this [`Utf8Array`]. #[inline] - pub fn offsets(&self) -> &Buffer { + pub fn offsets(&self) -> &OffsetsBuffer { &self.offsets } @@ -277,7 +276,7 @@ impl Utf8Array { }), Right(mutable_bitmap) => match ( self.values.get_mut().map(std::mem::take), - self.offsets.get_mut().map(std::mem::take), + self.offsets.get_mut(), ) { (None, None) => { // Safety: invariants are preserved @@ -325,7 +324,7 @@ impl Utf8Array { } else { match ( self.values.get_mut().map(std::mem::take), - self.offsets.get_mut().map(std::mem::take), + self.offsets.get_mut(), ) { (None, None) => Left(unsafe { Utf8Array::new_unchecked(self.data_type, self.offsets, self.values, None) @@ -348,14 +347,7 @@ impl Utf8Array { /// The array is guaranteed to have no elements nor validity. #[inline] pub fn new_empty(data_type: DataType) -> Self { - unsafe { - Self::from_data_unchecked( - data_type, - Buffer::from(vec![O::zero()]), - Buffer::new(), - None, - ) - } + unsafe { Self::from_data_unchecked(data_type, OffsetsBuffer::new(), Buffer::new(), None) } } /// Returns a new [`Utf8Array`] whose all slots are null / `None`. @@ -363,7 +355,7 @@ impl Utf8Array { pub fn new_null(data_type: DataType, length: usize) -> Self { Self::new( data_type, - vec![O::default(); 1 + length].into(), + vec![O::default(); 1 + length].try_into().unwrap(), Buffer::new(), Some(Bitmap::new_zeroed(length)), ) @@ -383,17 +375,16 @@ impl Utf8Array { /// # Errors /// This function returns an error iff: /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. + /// * the validity's length is not equal to `offsets.len()`. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`. /// # Safety /// This function is unsound iff: - /// * the offsets are not monotonically increasing /// * The `values` between two consecutive `offsets` are not valid utf8 /// # Implementation /// This function is `O(1)` pub unsafe fn try_new_unchecked( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Result { @@ -401,7 +392,7 @@ impl Utf8Array { if validity .as_ref() - .map_or(false, |validity| validity.len() != offsets.len() - 1) + .map_or(false, |validity| validity.len() != offsets.len()) { return Err(Error::oos( "validity mask length must match the number of values", @@ -425,16 +416,15 @@ impl Utf8Array { /// Creates a new [`Utf8Array`]. /// # Panics /// This function panics iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. + /// * the validity's length is not equal to `offsets.len()`. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`. /// * The `values` between two consecutive `offsets` are not valid utf8 /// # Implementation - /// This function is `O(N)` - checking monotinicity and utf8 is `O(N)` + /// This function is `O(N)` - checking utf8 is `O(N)` pub fn new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Self { @@ -446,7 +436,7 @@ impl Utf8Array { /// # Errors /// This function returns an error iff: /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. + /// * the validity's length is not equal to `offsets.len()`. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`. /// # Safety /// This function is unsound iff: @@ -456,7 +446,7 @@ impl Utf8Array { /// This function is `O(1)` pub unsafe fn new_unchecked( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Self { @@ -529,7 +519,7 @@ impl Utf8Array { /// Alias for `new` pub fn from_data( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Self { @@ -539,11 +529,10 @@ impl Utf8Array { /// Alias for [`Self::new_unchecked`] /// # Safety /// This function is unsafe iff: - /// * the offsets are not monotonically increasing /// * The `values` between two consecutive `offsets` are not valid utf8 pub unsafe fn from_data_unchecked( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Self { @@ -599,7 +588,7 @@ unsafe impl GenericBinaryArray for Utf8Array { #[inline] fn offsets(&self) -> &[O] { - self.offsets() + self.offsets().buffer() } } @@ -610,11 +599,6 @@ impl Default for Utf8Array { } else { DataType::Utf8 }; - Utf8Array::new( - data_type, - vec![O::from_usize(0).unwrap()].into(), - Default::default(), - None, - ) + Utf8Array::new(data_type, Default::default(), Default::default(), None) } } diff --git a/src/array/utf8/mutable.rs b/src/array/utf8/mutable.rs index 27d28dc64b2..cb66f056dd1 100644 --- a/src/array/utf8/mutable.rs +++ b/src/array/utf8/mutable.rs @@ -2,13 +2,14 @@ use std::{iter::FromIterator, sync::Arc}; use crate::array::{physical_binary::*, TryExtendFromSelf}; use crate::{ - array::{Array, MutableArray, Offset, TryExtend, TryPush}, + array::{Array, MutableArray, TryExtend, TryPush}, bitmap::{ utils::{BitmapIter, ZipValidity}, Bitmap, MutableBitmap, }, datatypes::DataType, error::{Error, Result}, + offset::{Offset, Offsets}, trusted_len::TrustedLen, }; @@ -52,16 +53,15 @@ impl MutableUtf8Array { /// /// # Errors /// This function returns an error iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. + /// * the validity's length is not equal to `offsets.len()`. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`. /// * The `values` between two consecutive `offsets` are not valid utf8 /// # Implementation - /// This function is `O(N)` - checking monotinicity and utf8 is `O(N)` + /// This function is `O(N)` - checking utf8 is `O(N)` pub fn try_new( data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Vec, validity: Option, ) -> Result { @@ -88,7 +88,7 @@ impl MutableUtf8Array { /// * The validity is not `None` and its length is different from `offsets`'s length minus one. pub unsafe fn new_unchecked( data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Vec, validity: Option, ) -> Self { @@ -104,7 +104,7 @@ impl MutableUtf8Array { /// The caller must ensure that every value between offsets is a valid utf8. pub unsafe fn from_data_unchecked( data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Vec, validity: Option, ) -> Self { @@ -119,7 +119,7 @@ impl MutableUtf8Array { /// * The validity is not `None` and its length is different from `offsets`'s length minus one. pub fn from_data( data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Vec, validity: Option, ) -> Self { @@ -230,7 +230,7 @@ impl MutableUtf8Array { } /// Extract the low-end APIs from the [`MutableUtf8Array`]. - pub fn into_data(self) -> (DataType, Vec, Vec, Option) { + pub fn into_data(self) -> (DataType, Offsets, Vec, Option) { let (data_type, offsets, values) = self.values.into_inner(); (data_type, offsets, values, self.validity) } @@ -248,7 +248,7 @@ impl MutableUtf8Array { } /// returns its offsets. - pub fn offsets(&self) -> &Vec { + pub fn offsets(&self) -> &Offsets { self.values.offsets() } } diff --git a/src/array/utf8/mutable_values.rs b/src/array/utf8/mutable_values.rs index 5e33144452f..47c6b9e3eec 100644 --- a/src/array/utf8/mutable_values.rs +++ b/src/array/utf8/mutable_values.rs @@ -2,12 +2,13 @@ use std::{iter::FromIterator, sync::Arc}; use crate::{ array::{ - specification::{check_offsets_minimal, try_check_offsets_and_utf8}, - Array, ArrayValuesIter, MutableArray, Offset, TryExtend, TryExtendFromSelf, TryPush, + specification::{try_check_offsets_bounds, try_check_utf8}, + Array, ArrayValuesIter, MutableArray, TryExtend, TryExtendFromSelf, TryPush, }, bitmap::MutableBitmap, datatypes::DataType, error::{Error, Result}, + offset::{Offset, Offsets}, trusted_len::TrustedLen, }; @@ -19,7 +20,7 @@ use crate::array::physical_binary::*; #[derive(Debug, Clone)] pub struct MutableUtf8ValuesArray { data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Vec, } @@ -65,7 +66,7 @@ impl MutableUtf8ValuesArray { pub fn new() -> Self { Self { data_type: Self::default_data_type(), - offsets: vec![O::default()], + offsets: Offsets::new(), values: Vec::::new(), } } @@ -74,14 +75,13 @@ impl MutableUtf8ValuesArray { /// /// # Errors /// This function returns an error iff: - /// * the offsets are not monotonically increasing /// * The last offset is not equal to the values' length. /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`. /// * The `values` between two consecutive `offsets` are not valid utf8 /// # Implementation - /// This function is `O(N)` - checking monotinicity and utf8 is `O(N)` - pub fn try_new(data_type: DataType, offsets: Vec, values: Vec) -> Result { - try_check_offsets_and_utf8(&offsets, &values)?; + /// This function is `O(N)` - checking utf8 is `O(N)` + pub fn try_new(data_type: DataType, offsets: Offsets, values: Vec) -> Result { + try_check_utf8(&offsets, &values)?; if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { return Err(Error::oos( "MutableUtf8ValuesArray can only be initialized with DataType::Utf8 or DataType::LargeUtf8", @@ -107,8 +107,9 @@ impl MutableUtf8ValuesArray { /// * The `values` between two consecutive `offsets` are not valid utf8 /// # Implementation /// This function is `O(1)` - pub unsafe fn new_unchecked(data_type: DataType, offsets: Vec, values: Vec) -> Self { - check_offsets_minimal(&offsets, values.len()); + pub unsafe fn new_unchecked(data_type: DataType, offsets: Offsets, values: Vec) -> Self { + try_check_offsets_bounds(&offsets, values.len()) + .expect("The length of the values must be equal to the last offset value"); if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { panic!("MutableUtf8ValuesArray can only be initialized with DataType::Utf8 or DataType::LargeUtf8") @@ -134,12 +135,9 @@ impl MutableUtf8ValuesArray { /// Initializes a new [`MutableUtf8ValuesArray`] with a pre-allocated capacity of items and values. pub fn with_capacities(capacity: usize, values: usize) -> Self { - let mut offsets = Vec::::with_capacity(capacity + 1); - offsets.push(O::default()); - Self { data_type: Self::default_data_type(), - offsets, + offsets: Offsets::::with_capacity(capacity), values: Vec::::with_capacity(values), } } @@ -152,7 +150,7 @@ impl MutableUtf8ValuesArray { /// returns its offsets. #[inline] - pub fn offsets(&self) -> &Vec { + pub fn offsets(&self) -> &Offsets { &self.offsets } @@ -171,7 +169,7 @@ impl MutableUtf8ValuesArray { /// Returns the length of this array #[inline] pub fn len(&self) -> usize { - self.offsets.len() - 1 + self.offsets.len() } /// Pushes a new item to the array. @@ -189,7 +187,7 @@ impl MutableUtf8ValuesArray { return None; } self.offsets.pop()?; - let start = self.offsets.last()?.to_usize(); + let start = self.offsets.last().to_usize(); let value = self.values.split_off(start); // Safety: utf8 is validated on initialization Some(unsafe { String::from_utf8_unchecked(value) }) @@ -210,8 +208,7 @@ impl MutableUtf8ValuesArray { #[inline] pub unsafe fn value_unchecked(&self, i: usize) -> &str { // soundness: the invariant of the function - let start = self.offsets.get_unchecked(i).to_usize(); - let end = self.offsets.get_unchecked(i + 1).to_usize(); + let (start, end) = self.offsets.start_end(i); // soundness: the invariant of the struct let slice = self.values.get_unchecked(start..end); @@ -232,7 +229,7 @@ impl MutableUtf8ValuesArray { } /// Extract the low-end APIs from the [`MutableUtf8ValuesArray`]. - pub fn into_inner(self) -> (DataType, Vec, Vec) { + pub fn into_inner(self) -> (DataType, Offsets, Vec) { (self.data_type, self.offsets, self.values) } } @@ -400,17 +397,13 @@ impl> TryPush for MutableUtf8ValuesArray { fn try_push(&mut self, value: T) -> Result<()> { let bytes = value.as_ref().as_bytes(); self.values.extend_from_slice(bytes); - - let size = O::from_usize(self.values.len()).ok_or(Error::Overflow)?; - - self.offsets.push(size); - Ok(()) + self.offsets.try_push_usize(bytes.len()) } } impl TryExtendFromSelf for MutableUtf8ValuesArray { fn try_extend_from_self(&mut self, other: &Self) -> Result<()> { self.values.extend_from_slice(&other.values); - try_extend_offsets(&mut self.offsets, &other.offsets) + self.offsets.try_extend_from_self(&other.offsets) } } diff --git a/src/compute/aggregate/memory.rs b/src/compute/aggregate/memory.rs index 5b2e0e07e2d..90a2bc3a762 100644 --- a/src/compute/aggregate/memory.rs +++ b/src/compute/aggregate/memory.rs @@ -9,7 +9,7 @@ fn validity_size(validity: Option<&Bitmap>) -> usize { macro_rules! dyn_binary { ($array:expr, $ty:ty, $o:ty) => {{ let array = $array.as_any().downcast_ref::<$ty>().unwrap(); - let offsets = array.offsets(); + let offsets = array.offsets().buffer(); // in case of Binary/Utf8/List the offsets are sliced, // not the values buffer diff --git a/src/compute/aggregate/min_max.rs b/src/compute/aggregate/min_max.rs index 00f76b052bc..886cd509dd6 100644 --- a/src/compute/aggregate/min_max.rs +++ b/src/compute/aggregate/min_max.rs @@ -1,11 +1,12 @@ use crate::bitmap::utils::{BitChunkIterExact, BitChunksExact}; use crate::datatypes::{DataType, PhysicalType, PrimitiveType}; use crate::error::{Error, Result}; +use crate::offset::Offset; use crate::scalar::*; use crate::types::simd::*; use crate::types::NativeType; use crate::{ - array::{Array, BinaryArray, BooleanArray, Offset, PrimitiveArray, Utf8Array}, + array::{Array, BinaryArray, BooleanArray, PrimitiveArray, Utf8Array}, bitmap::Bitmap, }; diff --git a/src/compute/cast/binary_to.rs b/src/compute/cast/binary_to.rs index 44224321b55..98cf4105b4b 100644 --- a/src/compute/cast/binary_to.rs +++ b/src/compute/cast/binary_to.rs @@ -1,6 +1,5 @@ -use std::convert::TryFrom; - -use crate::error::{Error, Result}; +use crate::error::Result; +use crate::offset::Offset; use crate::{array::*, datatypes::DataType, types::NativeType}; use super::CastOptions; @@ -8,11 +7,9 @@ use super::CastOptions; /// Conversion of binary pub fn binary_to_large_binary(from: &BinaryArray, to_data_type: DataType) -> BinaryArray { let values = from.values().clone(); - let offsets = from.offsets().iter().map(|x| *x as i64).collect::>(); - // todo: use `new_unchecked` since all invariants are preserved BinaryArray::::new( to_data_type, - offsets.into(), + from.offsets().into(), values, from.validity().cloned(), ) @@ -24,13 +21,10 @@ pub fn binary_large_to_binary( to_data_type: DataType, ) -> Result> { let values = from.values().clone(); - let _ = i32::try_from(*from.offsets().last().unwrap()).map_err(Error::from_external_error)?; - - let offsets = from.offsets().iter().map(|x| *x as i32).collect::>(); - // todo: use `new_unchecked` since all invariants are preserved + let offsets = from.offsets().try_into()?; Ok(BinaryArray::::new( to_data_type, - offsets.into(), + offsets, values, from.validity().cloned(), )) @@ -57,12 +51,7 @@ pub fn binary_to_large_utf8( to_data_type: DataType, ) -> Result> { let values = from.values().clone(); - let offsets = from - .offsets() - .iter() - .map(|x| *x as i64) - .collect::>() - .into(); + let offsets = from.offsets().into(); Utf8Array::::try_new(to_data_type, offsets, values, from.validity().cloned()) } diff --git a/src/compute/cast/boolean_to.rs b/src/compute/cast/boolean_to.rs index ef24e4b4dff..1ce45c87118 100644 --- a/src/compute/cast/boolean_to.rs +++ b/src/compute/cast/boolean_to.rs @@ -1,6 +1,7 @@ use crate::{ - array::{Array, BinaryArray, BooleanArray, Offset, PrimitiveArray, Utf8Array}, + array::{Array, BinaryArray, BooleanArray, PrimitiveArray, Utf8Array}, error::Result, + offset::Offset, types::NativeType, }; diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index edb6c223568..d4d47380752 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -18,6 +18,7 @@ use crate::{ array::*, datatypes::*, error::{Error, Result}, + offset::{Offset, Offsets}, }; /// options defining how Cast kernels behave @@ -323,28 +324,18 @@ fn cast_list( } fn cast_list_to_large_list(array: &ListArray, to_type: &DataType) -> ListArray { - let offets = array - .offsets() - .iter() - .map(|x| *x as i64) - .collect::>() - .into(); + let offsets = array.offsets().into(); ListArray::::new( to_type.clone(), - offets, + offsets, array.values().clone(), array.validity().cloned(), ) } fn cast_large_to_list(array: &ListArray, to_type: &DataType) -> ListArray { - let offsets = array - .offsets() - .iter() - .map(|x| *x as i32) - .collect::>() - .into(); + let offsets = array.offsets().try_into().expect("Conver me to error"); ListArray::::new( to_type.clone(), @@ -365,14 +356,15 @@ fn cast_fixed_size_list_to_list( options, )?; - let offsets = (0..(fixed.len() + 1)) + let offsets = (0..=fixed.len()) .map(|ix| (ix * fixed.size()) as i32) - .collect::>() - .into(); + .collect::>(); + // Safety: offsets _are_ monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }; Ok(ListArray::::new( to_type.clone(), - offsets, + offsets.into(), new_values, fixed.validity().cloned(), )) @@ -384,7 +376,7 @@ fn cast_list_to_fixed_size_list( size: usize, options: CastOptions, ) -> Result { - let offsets = list.offsets().iter(); + let offsets = list.offsets().buffer().iter(); let expected = (0..list.len()).map(|ix| (ix * size) as i32); match offsets @@ -477,6 +469,8 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu let values = cast(array, &to.data_type, options)?; // create offsets, where if array.len() = 2, we have [0,1,2] let offsets = (0..=array.len() as i32).collect::>(); + // Safety: offsets _are_ monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }; let list_array = ListArray::::new(to_type.clone(), offsets.into(), values, None); diff --git a/src/compute/cast/primitive_to.rs b/src/compute/cast/primitive_to.rs index a36f16e3deb..4feb5aaba6c 100644 --- a/src/compute/cast/primitive_to.rs +++ b/src/compute/cast/primitive_to.rs @@ -4,6 +4,7 @@ use num_traits::{AsPrimitive, Float, ToPrimitive}; use crate::datatypes::IntervalUnit; use crate::error::Result; +use crate::offset::{Offset, Offsets}; use crate::types::{days_ms, f16, months_days_ns}; use crate::{ array::*, @@ -41,7 +42,9 @@ pub fn primitive_to_binary( } values.set_len(offset); values.shrink_to_fit(); - BinaryArray::::from_data_unchecked( + // Safety: offsets _are_ monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }; + BinaryArray::::new( BinaryArray::::default_data_type(), offsets.into(), values.into(), @@ -103,11 +106,13 @@ pub fn primitive_to_utf8( let len = lexical_core::write_unchecked(*x, bytes).len(); offset += len; - offsets.push(O::from_usize(offset as usize).unwrap()); + offsets.push(O::from_usize(offset).unwrap()); } values.set_len(offset); values.shrink_to_fit(); - Utf8Array::::from_data_unchecked( + // Safety: offsets _are_ monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }; + Utf8Array::::new_unchecked( Utf8Array::::default_data_type(), offsets.into(), values.into(), diff --git a/src/compute/cast/utf8_to.rs b/src/compute/cast/utf8_to.rs index 996889174a2..165c24a1025 100644 --- a/src/compute/cast/utf8_to.rs +++ b/src/compute/cast/utf8_to.rs @@ -1,11 +1,10 @@ -use std::convert::TryFrom; - use chrono::Datelike; use crate::{ array::*, datatypes::DataType, - error::{Error, Result}, + error::Result, + offset::Offset, temporal_conversions::{ utf8_to_naive_timestamp_ns as utf8_to_naive_timestamp_ns_, utf8_to_timestamp_ns as utf8_to_timestamp_ns_, EPOCH_DAYS_FROM_CE, @@ -149,13 +148,9 @@ pub fn utf8_to_large_utf8(from: &Utf8Array) -> Utf8Array { let data_type = Utf8Array::::default_data_type(); let validity = from.validity().cloned(); let values = from.values().clone(); - let offsets = from - .offsets() - .iter() - .map(|x| *x as i64) - .collect::>() - .into(); - // Safety: sound because `offsets` fulfills the same invariants as `from.offsets()` + + let offsets = from.offsets().into(); + // Safety: sound because `values` fulfills the same invariants as `from.values()` unsafe { Utf8Array::::from_data_unchecked(data_type, offsets, values, validity) } } @@ -164,22 +159,17 @@ pub fn utf8_large_to_utf8(from: &Utf8Array) -> Result> { let data_type = Utf8Array::::default_data_type(); let validity = from.validity().cloned(); let values = from.values().clone(); - let _ = i32::try_from(*from.offsets().last().unwrap()).map_err(Error::from_external_error)?; + let offsets = from.offsets().try_into()?; - let offsets = from - .offsets() - .iter() - .map(|x| *x as i32) - .collect::>() - .into(); - // Safety: sound because `offsets` fulfills the same invariants as `from.offsets()` + // Safety: sound because `values` fulfills the same invariants as `from.values()` Ok(unsafe { Utf8Array::::from_data_unchecked(data_type, offsets, values, validity) }) } /// Conversion to binary pub fn utf8_to_binary(from: &Utf8Array, to_data_type: DataType) -> BinaryArray { + // Safety: erasure of an invariant is always safe unsafe { - BinaryArray::::new_unchecked( + BinaryArray::::new( to_data_type, from.offsets().clone(), from.values().clone(), diff --git a/src/compute/comparison/binary.rs b/src/compute/comparison/binary.rs index 0317a9b7783..6787e184618 100644 --- a/src/compute/comparison/binary.rs +++ b/src/compute/comparison/binary.rs @@ -1,9 +1,10 @@ //! Comparison functions for [`BinaryArray`] use crate::compute::comparison::{finish_eq_validities, finish_neq_validities}; use crate::{ - array::{BinaryArray, BooleanArray, Offset}, + array::{BinaryArray, BooleanArray}, bitmap::Bitmap, datatypes::DataType, + offset::Offset, }; use super::super::utils::combine_validities; diff --git a/src/compute/comparison/utf8.rs b/src/compute/comparison/utf8.rs index 31983fa816c..05d84d803ff 100644 --- a/src/compute/comparison/utf8.rs +++ b/src/compute/comparison/utf8.rs @@ -1,9 +1,10 @@ //! Comparison functions for [`Utf8Array`] use crate::compute::comparison::{finish_eq_validities, finish_neq_validities}; use crate::{ - array::{BooleanArray, Offset, Utf8Array}, + array::{BooleanArray, Utf8Array}, bitmap::Bitmap, datatypes::DataType, + offset::Offset, }; use super::super::utils::combine_validities; diff --git a/src/compute/contains.rs b/src/compute/contains.rs index a1ede1dd29d..738a8e11e57 100644 --- a/src/compute/contains.rs +++ b/src/compute/contains.rs @@ -1,10 +1,11 @@ //! Declares the [`contains`] operator use crate::{ - array::{Array, BinaryArray, BooleanArray, ListArray, Offset, PrimitiveArray, Utf8Array}, + array::{Array, BinaryArray, BooleanArray, ListArray, PrimitiveArray, Utf8Array}, bitmap::Bitmap, datatypes::DataType, error::{Error, Result}, + offset::Offset, types::NativeType, }; diff --git a/src/compute/hash.rs b/src/compute/hash.rs index 68232b4d015..d5f8370d8b4 100644 --- a/src/compute/hash.rs +++ b/src/compute/hash.rs @@ -12,9 +12,10 @@ macro_rules! new_state { } use crate::{ - array::{Array, BinaryArray, BooleanArray, Offset, PrimitiveArray, Utf8Array}, + array::{Array, BinaryArray, BooleanArray, PrimitiveArray, Utf8Array}, datatypes::{DataType, PhysicalType, PrimitiveType}, error::{Error, Result}, + offset::Offset, types::NativeType, }; diff --git a/src/compute/length.rs b/src/compute/length.rs index c52541a8917..9dc7e0b1c12 100644 --- a/src/compute/length.rs +++ b/src/compute/length.rs @@ -21,6 +21,7 @@ use crate::{ array::*, datatypes::DataType, error::{Error, Result}, + offset::Offset, types::NativeType, }; @@ -31,6 +32,7 @@ where { let values = array .offsets() + .buffer() .windows(2) .map(|offset| op(offset[1] - offset[0])) .collect::>(); diff --git a/src/compute/like.rs b/src/compute/like.rs index bf363972a1a..5c736cd8b5f 100644 --- a/src/compute/like.rs +++ b/src/compute/like.rs @@ -5,11 +5,12 @@ use regex::bytes::Regex as BytesRegex; use regex::Regex; use crate::{ - array::{BinaryArray, BooleanArray, Offset, Utf8Array}, + array::{BinaryArray, BooleanArray, Utf8Array}, bitmap::Bitmap, compute::utils::combine_validities, datatypes::DataType, error::{Error, Result}, + offset::Offset, }; #[inline] diff --git a/src/compute/regex_match.rs b/src/compute/regex_match.rs index 41cacb293bf..371eb073e46 100644 --- a/src/compute/regex_match.rs +++ b/src/compute/regex_match.rs @@ -3,11 +3,13 @@ use ahash::AHashMap; use regex::Regex; -use super::utils::combine_validities; -use crate::array::{BooleanArray, Offset, Utf8Array}; +use crate::array::{BooleanArray, Utf8Array}; use crate::bitmap::Bitmap; use crate::datatypes::DataType; use crate::error::{Error, Result}; +use crate::offset::Offset; + +use super::utils::combine_validities; /// Regex matches pub fn regex_match(values: &Utf8Array, regex: &Utf8Array) -> Result { diff --git a/src/compute/sort/binary.rs b/src/compute/sort/binary.rs index 766efc678b2..cf0992b4b99 100644 --- a/src/compute/sort/binary.rs +++ b/src/compute/sort/binary.rs @@ -1,4 +1,5 @@ -use crate::array::{BinaryArray, Offset, PrimitiveArray}; +use crate::array::{BinaryArray, PrimitiveArray}; +use crate::offset::Offset; use crate::types::Index; use super::common; diff --git a/src/compute/sort/mod.rs b/src/compute/sort/mod.rs index be0f15a1ae2..5394e370280 100644 --- a/src/compute/sort/mod.rs +++ b/src/compute/sort/mod.rs @@ -5,6 +5,7 @@ use crate::array::ord; use crate::compute::take; use crate::datatypes::*; use crate::error::{Error, Result}; +use crate::offset::Offset; use crate::{ array::*, types::{Index, NativeType}, diff --git a/src/compute/sort/row/mod.rs b/src/compute/sort/row/mod.rs index 8d4833af089..005e046fc92 100644 --- a/src/compute/sort/row/mod.rs +++ b/src/compute/sort/row/mod.rs @@ -637,9 +637,10 @@ mod tests { use super::*; use crate::{ - array::{Array, DictionaryKey, Float32Array, Int16Array, NullArray, Offset}, + array::{Array, DictionaryKey, Float32Array, Int16Array, NullArray}, compute::sort::build_compare, datatypes::DataType, + offset::Offset, types::NativeType, }; diff --git a/src/compute/sort/utf8.rs b/src/compute/sort/utf8.rs index e2e2da1bc56..0d7190eb23f 100644 --- a/src/compute/sort/utf8.rs +++ b/src/compute/sort/utf8.rs @@ -1,5 +1,5 @@ -use crate::array::{DictionaryArray, DictionaryKey}; -use crate::array::{Offset, PrimitiveArray, Utf8Array}; +use crate::array::{DictionaryArray, DictionaryKey, PrimitiveArray, Utf8Array}; +use crate::offset::Offset; use crate::types::Index; use super::common; diff --git a/src/compute/substring.rs b/src/compute/substring.rs index d879a971707..95618c6a007 100644 --- a/src/compute/substring.rs +++ b/src/compute/substring.rs @@ -21,6 +21,7 @@ use crate::{ array::*, datatypes::DataType, error::{Error, Result}, + offset::{Offset, Offsets}, }; fn utf8_substring(array: &Utf8Array, start: O, length: &Option) -> Utf8Array { @@ -76,13 +77,10 @@ fn binary_substring( let offsets = array.offsets(); let values = array.values(); - let mut new_offsets = Vec::::with_capacity(array.len() + 1); + let mut new_offsets = Offsets::::with_capacity(array.len()); let mut new_values = Vec::::new(); // we have no way to estimate how much this will be. - let mut length_so_far = O::zero(); - new_offsets.push(length_so_far); - - offsets.windows(2).for_each(|windows| { + offsets.buffer().windows(2).for_each(|windows| { let length_i: O = windows[1] - windows[0]; // compute where we should start slicing this entry @@ -98,8 +96,9 @@ fn binary_substring( .unwrap_or(length_i) // .max(0) is not needed as it is guaranteed .min(windows[1] - start); // so we do not go beyond this entry - length_so_far += length; - new_offsets.push(length_so_far); + new_offsets + .try_push(length) + .expect("Substring is always smaller than original - overflow never happens"); // we need usize for ranges let start = start.to_usize(); diff --git a/src/compute/take/binary.rs b/src/compute/take/binary.rs index 0651b6bba13..4d116d650df 100644 --- a/src/compute/take/binary.rs +++ b/src/compute/take/binary.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. -use crate::array::{Array, BinaryArray, Offset, PrimitiveArray}; +use crate::array::{Array, BinaryArray, PrimitiveArray}; +use crate::offset::Offset; use super::generic_binary::*; use super::Index; diff --git a/src/compute/take/generic_binary.rs b/src/compute/take/generic_binary.rs index ef78184f6b8..a9cf9c199c2 100644 --- a/src/compute/take/generic_binary.rs +++ b/src/compute/take/generic_binary.rs @@ -1,20 +1,26 @@ use crate::{ - array::{GenericBinaryArray, Offset, PrimitiveArray}, + array::{GenericBinaryArray, PrimitiveArray}, bitmap::{Bitmap, MutableBitmap}, buffer::Buffer, + offset::{Offset, Offsets, OffsetsBuffer}, }; use super::Index; -pub fn take_values(length: O, starts: &[O], offsets: &[O], values: &[u8]) -> Buffer { +pub fn take_values( + length: O, + starts: &[O], + offsets: &OffsetsBuffer, + values: &[u8], +) -> Buffer { let new_len = length.to_usize(); let mut buffer = Vec::with_capacity(new_len); starts .iter() - .zip(offsets.windows(2)) - .for_each(|(start_, window)| { - let start = start_.to_usize(); - let end = (*start_ + (window[1] - window[0])).to_usize(); + .map(|start| start.to_usize()) + .zip(offsets.lengths()) + .for_each(|(start, length)| { + let end = start + length; buffer.extend_from_slice(&values[start..end]); }); buffer.into() @@ -22,36 +28,27 @@ pub fn take_values(length: O, starts: &[O], offsets: &[O], values: &[ // take implementation when neither values nor indices contain nulls pub fn take_no_validity( - offsets: &[O], + offsets: &OffsetsBuffer, values: &[u8], indices: &[I], -) -> (Buffer, Buffer, Option) { - let mut length = O::default(); +) -> (OffsetsBuffer, Buffer, Option) { let mut buffer = Vec::::new(); - let offsets = indices.iter().map(|index| { - let index = index.to_usize(); - let start = offsets[index]; - let length_h = offsets[index + 1] - start; - length += length_h; - - let _start = start.to_usize(); - let end = (start + length_h).to_usize(); - buffer.extend_from_slice(&values[_start..end]); - length + let lengths = indices.iter().map(|index| index.to_usize()).map(|index| { + let (start, end) = offsets.start_end(index); + // todo: remove this bound check + buffer.extend_from_slice(&values[start..end]); + end - start }); - let offsets = std::iter::once(O::default()) - .chain(offsets) - .collect::>() - .into(); + let offsets = Offsets::try_from_lengths(lengths).expect(""); - (offsets, buffer.into(), None) + (offsets.into(), buffer.into(), None) } // take implementation when only values contain nulls pub fn take_values_validity>( values: &A, indices: &[I], -) -> (Buffer, Buffer, Option) { +) -> (OffsetsBuffer, Buffer, Option) { let validity_values = values.validity().unwrap(); let validity = indices .iter() @@ -74,20 +71,24 @@ pub fn take_values_validity>( let offsets = std::iter::once(O::default()) .chain(offsets) .collect::>(); + // Safety: by construction offsets are monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }.into(); - let buffer = take_values(length, starts.as_slice(), offsets.as_slice(), values_values); + let buffer = take_values(length, starts.as_slice(), &offsets, values_values); - (offsets.into(), buffer, validity.into()) + (offsets, buffer, validity.into()) } // take implementation when only indices contain nulls pub fn take_indices_validity( - offsets: &[O], + offsets: &OffsetsBuffer, values: &[u8], indices: &PrimitiveArray, -) -> (Buffer, Buffer, Option) { +) -> (OffsetsBuffer, Buffer, Option) { let mut length = O::default(); + let offsets = offsets.buffer(); + let mut starts = Vec::::with_capacity(indices.len()); let offsets = indices.values().iter().map(|index| { let index = index.to_usize(); @@ -104,18 +105,19 @@ pub fn take_indices_validity( let offsets = std::iter::once(O::default()) .chain(offsets) .collect::>(); - let starts: Buffer = starts.into(); + // Safety: by construction offsets are monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }.into(); - let buffer = take_values(length, starts.as_slice(), offsets.as_slice(), values); + let buffer = take_values(length, &starts, &offsets, values); - (offsets.into(), buffer, indices.validity().cloned()) + (offsets, buffer, indices.validity().cloned()) } // take implementation when both indices and values contain nulls pub fn take_values_indices_validity>( values: &A, indices: &PrimitiveArray, -) -> (Buffer, Buffer, Option) { +) -> (OffsetsBuffer, Buffer, Option) { let mut length = O::default(); let mut validity = MutableBitmap::with_capacity(indices.len()); @@ -147,10 +149,10 @@ pub fn take_values_indices_validity>(); + // Safety: by construction offsets are monotonically increasing + let offsets = unsafe { Offsets::new_unchecked(offsets) }.into(); - let starts: Buffer = starts.into(); - - let buffer = take_values(length, starts.as_slice(), offsets.as_slice(), values_values); + let buffer = take_values(length, &starts, &offsets, values_values); - (offsets.into(), buffer, validity.into()) + (offsets, buffer, validity.into()) } diff --git a/src/compute/take/list.rs b/src/compute/take/list.rs index 5e8b1d10e7c..6abc1d10155 100644 --- a/src/compute/take/list.rs +++ b/src/compute/take/list.rs @@ -17,8 +17,9 @@ use crate::array::{ growable::{Growable, GrowableList}, - ListArray, Offset, PrimitiveArray, + ListArray, PrimitiveArray, }; +use crate::offset::Offset; use super::Index; diff --git a/src/compute/take/utf8.rs b/src/compute/take/utf8.rs index 99fc091f491..490e76bf4b8 100644 --- a/src/compute/take/utf8.rs +++ b/src/compute/take/utf8.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. -use crate::array::{Array, Offset, PrimitiveArray, Utf8Array}; +use crate::array::{Array, PrimitiveArray, Utf8Array}; +use crate::offset::Offset; use super::generic_binary::*; use super::Index; diff --git a/src/compute/utf8.rs b/src/compute/utf8.rs index 37c1c6ff47e..2e480016ef5 100644 --- a/src/compute/utf8.rs +++ b/src/compute/utf8.rs @@ -1,9 +1,10 @@ //! Defines common maps to a [`Utf8Array`] use crate::{ - array::{Array, Offset, Utf8Array}, + array::{Array, Utf8Array}, datatypes::DataType, error::{Error, Result}, + offset::Offset, }; /// utf8_apply will apply `Fn(&str) -> String` to every value in Utf8Array. diff --git a/src/ffi/mmap.rs b/src/ffi/mmap.rs index 4bd006393f7..32be7d763eb 100644 --- a/src/ffi/mmap.rs +++ b/src/ffi/mmap.rs @@ -1,9 +1,10 @@ use std::collections::VecDeque; use std::sync::Arc; -use crate::array::{Array, DictionaryKey, FixedSizeListArray, ListArray, Offset, StructArray}; +use crate::array::{Array, DictionaryKey, FixedSizeListArray, ListArray, StructArray}; use crate::datatypes::DataType; use crate::error::Error; +use crate::offset::Offset; use crate::io::ipc::read::{Dictionaries, OutOfSpecKind}; use crate::io::ipc::read::{IpcBuffer, Node}; diff --git a/src/io/avro/read/nested.rs b/src/io/avro/read/nested.rs index 649a702ff9c..7886bd0b81b 100644 --- a/src/io/avro/read/nested.rs +++ b/src/io/avro/read/nested.rs @@ -2,25 +2,24 @@ use crate::array::*; use crate::bitmap::*; use crate::datatypes::*; use crate::error::*; +use crate::offset::{Offset, Offsets}; /// Auxiliary struct #[derive(Debug)] pub struct DynMutableListArray { data_type: DataType, - offsets: Vec, + offsets: Offsets, values: Box, validity: Option, } impl DynMutableListArray { pub fn new_from(values: Box, data_type: DataType, capacity: usize) -> Self { - let mut offsets = Vec::::with_capacity(capacity + 1); - offsets.push(O::default()); assert_eq!(values.len(), 0); ListArray::::get_child_field(&data_type); Self { data_type, - offsets, + offsets: Offsets::::with_capacity(capacity), values, validity: None, } @@ -33,11 +32,11 @@ impl DynMutableListArray { #[inline] pub fn try_push_valid(&mut self) -> Result<()> { - let size = self.values.len(); - let size = O::from_usize(size).ok_or(Error::Overflow)?; - assert!(size >= *self.offsets.last().unwrap()); + let total_length = self.values.len(); + let offset = self.offsets.last().to_usize(); + let length = total_length.checked_sub(offset).ok_or(Error::Overflow)?; - self.offsets.push(size); + self.offsets.try_push_usize(length)?; if let Some(validity) = &mut self.validity { validity.push(true) } @@ -46,20 +45,15 @@ impl DynMutableListArray { #[inline] fn push_null(&mut self) { - self.offsets.push(self.last_offset()); + self.offsets.extend_constant(1); match &mut self.validity { Some(validity) => validity.push(false), None => self.init_validity(), } } - #[inline] - fn last_offset(&self) -> O { - *self.offsets.last().unwrap() - } - fn init_validity(&mut self) { - let len = self.offsets.len() - 1; + let len = self.offsets.len(); let mut validity = MutableBitmap::new(); validity.extend_constant(len, true); @@ -70,7 +64,7 @@ impl DynMutableListArray { impl MutableArray for DynMutableListArray { fn len(&self) -> usize { - self.offsets.len() - 1 + self.offsets.len() } fn validity(&self) -> Option<&MutableBitmap> { @@ -78,21 +72,23 @@ impl MutableArray for DynMutableListArray { } fn as_box(&mut self) -> Box { - Box::new(ListArray::new( + ListArray::new( self.data_type.clone(), std::mem::take(&mut self.offsets).into(), self.values.as_box(), std::mem::take(&mut self.validity).map(|x| x.into()), - )) + ) + .boxed() } fn as_arc(&mut self) -> std::sync::Arc { - std::sync::Arc::new(ListArray::new( + ListArray::new( self.data_type.clone(), std::mem::take(&mut self.offsets).into(), self.values.as_box(), std::mem::take(&mut self.validity).map(|x| x.into()), - )) + ) + .arced() } fn data_type(&self) -> &DataType { diff --git a/src/io/avro/write/serialize.rs b/src/io/avro/write/serialize.rs index cb94b78d28b..32dc18cc8eb 100644 --- a/src/io/avro/write/serialize.rs +++ b/src/io/avro/write/serialize.rs @@ -3,6 +3,7 @@ use avro_schema::write::encode; use crate::bitmap::utils::ZipValidity; use crate::datatypes::{IntervalUnit, PhysicalType, PrimitiveType}; +use crate::offset::Offset; use crate::types::months_days_ns; use crate::{array::*, datatypes::DataType}; @@ -99,6 +100,7 @@ fn list_required<'a, O: Offset>(array: &'a ListArray, schema: &AvroSchema) -> let mut inner = new_serializer(array.values().as_ref(), schema); let lengths = array .offsets() + .buffer() .windows(2) .map(|w| (w[1] - w[0]).to_usize() as i64); @@ -124,6 +126,7 @@ fn list_optional<'a, O: Offset>(array: &'a ListArray, schema: &AvroSchema) -> let mut inner = new_serializer(array.values().as_ref(), schema); let lengths = array .offsets() + .buffer() .windows(2) .map(|w| (w[1] - w[0]).to_usize() as i64); let lengths = ZipValidity::new_with_validity(lengths, array.validity()); diff --git a/src/io/csv/read_utils.rs b/src/io/csv/read_utils.rs index 27bdcff163d..278dd32cb21 100644 --- a/src/io/csv/read_utils.rs +++ b/src/io/csv/read_utils.rs @@ -1,23 +1,24 @@ use chrono::Datelike; -// Ideally this trait should not be needed and both `csv` and `csv_async` crates would share -// the same `ByteRecord` struct. Unfortunately, they do not and thus we must use generics -// over this trait and materialize the generics for each struct. -pub(crate) trait ByteRecordGeneric { - fn get(&self, index: usize) -> Option<&[u8]>; -} - use crate::{ array::*, chunk::Chunk, datatypes::*, error::{Error, Result}, + offset::Offset, temporal_conversions, types::NativeType, }; use super::utils::RFC3339; +// Ideally this trait should not be needed and both `csv` and `csv_async` crates would share +// the same `ByteRecord` struct. Unfortunately, they do not and thus we must use generics +// over this trait and materialize the generics for each struct. +pub(crate) trait ByteRecordGeneric { + fn get(&self, index: usize) -> Option<&[u8]>; +} + #[inline] fn to_utf8(bytes: &[u8]) -> Option<&str> { simdutf8::basic::from_utf8(bytes).ok() diff --git a/src/io/csv/write/serialize.rs b/src/io/csv/write/serialize.rs index 6f704d1cc08..46addc378f0 100644 --- a/src/io/csv/write/serialize.rs +++ b/src/io/csv/write/serialize.rs @@ -5,13 +5,15 @@ use crate::temporal_conversions; use crate::types::NativeType; use crate::util::lexical_to_bytes_mut; use crate::{ - array::{Array, BinaryArray, BooleanArray, PrimitiveArray, Utf8Array}, + array::{ + Array, BinaryArray, BooleanArray, DictionaryArray, DictionaryKey, PrimitiveArray, Utf8Array, + }, datatypes::{DataType, TimeUnit}, error::Result, + offset::Offset, }; use super::super::super::iterator::{BufStreamingIterator, StreamingIterator}; -use crate::array::{DictionaryArray, DictionaryKey, Offset}; use csv_core::WriteResult; use std::any::Any; use std::fmt::{Debug, Write}; diff --git a/src/io/ipc/read/array/binary.rs b/src/io/ipc/read/array/binary.rs index 3ee6c4ba288..68c5b40d078 100644 --- a/src/io/ipc/read/array/binary.rs +++ b/src/io/ipc/read/array/binary.rs @@ -1,10 +1,11 @@ use std::collections::VecDeque; use std::io::{Read, Seek}; -use crate::array::{BinaryArray, Offset}; +use crate::array::BinaryArray; use crate::buffer::Buffer; use crate::datatypes::DataType; use crate::error::{Error, Result}; +use crate::offset::Offset; use super::super::read_basic::*; use super::super::{Compression, IpcBuffer, Node, OutOfSpecKind}; @@ -68,7 +69,7 @@ pub fn read_binary( scratch, )?; - BinaryArray::::try_new(data_type, offsets, values, validity) + BinaryArray::::try_new(data_type, offsets.try_into()?, values, validity) } pub fn skip_binary( diff --git a/src/io/ipc/read/array/list.rs b/src/io/ipc/read/array/list.rs index 7870b85a7b1..1b45b10730d 100644 --- a/src/io/ipc/read/array/list.rs +++ b/src/io/ipc/read/array/list.rs @@ -2,10 +2,11 @@ use std::collections::VecDeque; use std::convert::TryInto; use std::io::{Read, Seek}; -use crate::array::{ListArray, Offset}; +use crate::array::ListArray; use crate::buffer::Buffer; use crate::datatypes::DataType; use crate::error::{Error, Result}; +use crate::offset::Offset; use super::super::super::IpcField; use super::super::deserialize::{read, skip}; @@ -84,7 +85,7 @@ where version, scratch, )?; - ListArray::try_new(data_type, offsets, values, validity) + ListArray::try_new(data_type, offsets.try_into()?, values, validity) } pub fn skip_list( diff --git a/src/io/ipc/read/array/map.rs b/src/io/ipc/read/array/map.rs index b0803ca5f2f..4acec42ef6b 100644 --- a/src/io/ipc/read/array/map.rs +++ b/src/io/ipc/read/array/map.rs @@ -80,7 +80,7 @@ pub fn read_map( version, scratch, )?; - MapArray::try_new(data_type, offsets, field, validity) + MapArray::try_new(data_type, offsets.try_into()?, field, validity) } pub fn skip_map( diff --git a/src/io/ipc/read/array/utf8.rs b/src/io/ipc/read/array/utf8.rs index f57e51ecc5a..398184e3e55 100644 --- a/src/io/ipc/read/array/utf8.rs +++ b/src/io/ipc/read/array/utf8.rs @@ -1,10 +1,11 @@ use std::collections::VecDeque; use std::io::{Read, Seek}; -use crate::array::{Offset, Utf8Array}; +use crate::array::Utf8Array; use crate::buffer::Buffer; use crate::datatypes::DataType; use crate::error::{Error, Result}; +use crate::offset::Offset; use super::super::read_basic::*; use super::super::{Compression, IpcBuffer, Node, OutOfSpecKind}; @@ -69,7 +70,7 @@ pub fn read_utf8( scratch, )?; - Utf8Array::::try_new(data_type, offsets, values, validity) + Utf8Array::::try_new(data_type, offsets.try_into()?, values, validity) } pub fn skip_utf8( diff --git a/src/io/ipc/write/serialize.rs b/src/io/ipc/write/serialize.rs index 28647bd9111..03e716ce83a 100644 --- a/src/io/ipc/write/serialize.rs +++ b/src/io/ipc/write/serialize.rs @@ -2,7 +2,12 @@ use arrow_format::ipc; use crate::{ - array::*, bitmap::Bitmap, datatypes::PhysicalType, trusted_len::TrustedLen, types::NativeType, + array::*, + bitmap::Bitmap, + datatypes::PhysicalType, + offset::{Offset, OffsetsBuffer}, + trusted_len::TrustedLen, + types::NativeType, }; use super::super::compression; @@ -65,7 +70,7 @@ fn write_boolean( #[allow(clippy::too_many_arguments)] fn write_generic_binary( validity: Option<&Bitmap>, - offsets: &[O], + offsets: &OffsetsBuffer, values: &[u8], buffers: &mut Vec, arrow_data: &mut Vec, @@ -73,6 +78,7 @@ fn write_generic_binary( is_little_endian: bool, compression: Option, ) { + let offsets = offsets.buffer(); write_bitmap( validity, offsets.len() - 1, @@ -181,7 +187,7 @@ fn write_list( is_little_endian: bool, compression: Option, ) { - let offsets = array.offsets(); + let offsets = array.offsets().buffer(); let validity = array.validity(); write_bitmap( @@ -195,7 +201,7 @@ fn write_list( let first = *offsets.first().unwrap(); let last = *offsets.last().unwrap(); - if first == O::default() { + if first == O::zero() { write_buffer( offsets, buffers, @@ -309,7 +315,7 @@ fn write_map( is_little_endian: bool, compression: Option, ) { - let offsets = array.offsets(); + let offsets = array.offsets().buffer(); let validity = array.validity(); write_bitmap( diff --git a/src/io/json/read/deserialize.rs b/src/io/json/read/deserialize.rs index ea5d25a0a5d..73eac81d4f9 100644 --- a/src/io/json/read/deserialize.rs +++ b/src/io/json/read/deserialize.rs @@ -12,6 +12,7 @@ use crate::{ chunk::Chunk, datatypes::{DataType, Field, IntervalUnit, Schema}, error::Error, + offset::{Offset, Offsets}, types::{f16, NativeType}, }; @@ -226,24 +227,19 @@ fn deserialize_list<'a, O: Offset, A: Borrow>>( let child = ListArray::::get_child_type(&data_type); let mut validity = MutableBitmap::with_capacity(rows.len()); - let mut offsets = Vec::::with_capacity(rows.len() + 1); + let mut offsets = Offsets::::with_capacity(rows.len()); let mut inner = vec![]; - offsets.push(O::zero()); - rows.iter().fold(O::zero(), |mut length, row| { - match row.borrow() { - Value::Array(value) => { - inner.extend(value.iter()); - validity.push(true); - // todo make this an Err - length += O::from_usize(value.len()).expect("List offset is too large :/"); - offsets.push(length); - length - } - _ => { - validity.push(false); - offsets.push(length); - length - } + rows.iter().for_each(|row| match row.borrow() { + Value::Array(value) => { + inner.extend(value.iter()); + validity.push(true); + offsets + .try_push_usize(value.len()) + .expect("List offset is too large :/"); + } + _ => { + validity.push(false); + offsets.extend_constant(1); } }); @@ -258,39 +254,25 @@ fn deserialize_list_into<'a, O: Offset, A: Borrow>>( target: &mut MutableListArray>, rows: &[A], ) { - let start = { - let empty = vec![]; - let inner: Vec<_> = rows - .iter() - .flat_map(|row| match row.borrow() { - Value::Array(value) => value.iter(), - _ => empty.iter(), - }) - .collect(); - - let child = target.mut_values(); - let start_len = child.len(); - deserialize_into(child, &inner); + let empty = vec![]; + let inner: Vec<_> = rows + .iter() + .flat_map(|row| match row.borrow() { + Value::Array(value) => value.iter(), + _ => empty.iter(), + }) + .collect(); - // todo make this an Err - O::from_usize(start_len).expect("Child list size too large") - }; + deserialize_into(target.mut_values(), &inner); - let mut position = start; - let arrays = rows.iter().map(|row| { - match row.borrow() { - Value::Array(value) => { - // todo make this an Err - position += O::from_usize(value.len()).expect("List offset is too large :/"); - Some(position) - } - _ => None, - } + let lengths = rows.iter().map(|row| match row.borrow() { + Value::Array(value) => Some(value.len()), + _ => None, }); - // though this will always be safe, we cannot use unsafe_extend_offsets here - // due to `#![forbid(unsafe_code)]` on the io module - target.extend_offsets(arrays); + target + .try_extend_from_lengths(lengths) + .expect("Offsets overflow"); } fn deserialize_fixed_size_list_into<'a, A: Borrow>>( @@ -301,10 +283,7 @@ fn deserialize_fixed_size_list_into<'a, A: Borrow>>( match row.borrow() { Value::Array(value) => { if value.len() == target.size() { - { - let child = target.mut_values(); - deserialize_into(child, value); - } + deserialize_into(target.mut_values(), value); // unless alignment is already off, the if above should // prevent this from ever happening. target.try_push_valid().expect("unaligned backing array"); diff --git a/src/io/json/write/serialize.rs b/src/io/json/write/serialize.rs index aa323109e9b..ec1b0d5ee49 100644 --- a/src/io/json/write/serialize.rs +++ b/src/io/json/write/serialize.rs @@ -6,6 +6,7 @@ use streaming_iterator::StreamingIterator; use crate::bitmap::utils::ZipValidity; use crate::datatypes::TimeUnit; use crate::io::iterator::BufStreamingIterator; +use crate::offset::Offset; use crate::temporal_conversions::{ date32_to_date, date64_to_date, timestamp_ms_to_datetime, timestamp_ns_to_datetime, timestamp_s_to_datetime, timestamp_us_to_datetime, @@ -140,7 +141,7 @@ fn list_serializer<'a, O: Offset>( let mut serializer = new_serializer(array.values().as_ref()); Box::new(BufStreamingIterator::new( - ZipValidity::new_with_validity(array.offsets().windows(2), array.validity()), + ZipValidity::new_with_validity(array.offsets().buffer().windows(2), array.validity()), move |offset, buf| { if let Some(offset) = offset { let length = (offset[1] - offset[0]).to_usize(); diff --git a/src/io/json_integration/read/array.rs b/src/io/json_integration/read/array.rs index 90cd709cbf9..67af89a18b2 100644 --- a/src/io/json_integration/read/array.rs +++ b/src/io/json_integration/read/array.rs @@ -10,6 +10,7 @@ use crate::{ datatypes::{DataType, PhysicalType, PrimitiveType, Schema}, error::{Error, Result}, io::ipc::IpcField, + offset::Offset, types::{days_ms, i256, months_days_ns, NativeType}, }; @@ -189,7 +190,7 @@ fn to_binary(json_col: &ArrowJsonColumn, data_type: DataType) -> Box< .iter() .flat_map(|value| value.as_str().map(|x| hex::decode(x).unwrap()).unwrap()) .collect(); - Box::new(BinaryArray::new(data_type, offsets, values, validity)) + BinaryArray::new(data_type, offsets.try_into().unwrap(), values, validity).boxed() } fn to_utf8(json_col: &ArrowJsonColumn, data_type: DataType) -> Box { @@ -202,7 +203,7 @@ fn to_utf8(json_col: &ArrowJsonColumn, data_type: DataType) -> Box( @@ -222,9 +223,7 @@ fn to_list( dictionaries, )?; let offsets = to_offsets::(json_col.offset.as_ref()); - Ok(Box::new(ListArray::::new( - data_type, offsets, values, validity, - ))) + Ok(ListArray::::new(data_type, offsets.try_into()?, values, validity).boxed()) } fn to_map( @@ -244,7 +243,12 @@ fn to_map( dictionaries, )?; let offsets = to_offsets::(json_col.offset.as_ref()); - Ok(Box::new(MapArray::new(data_type, offsets, field, validity))) + Ok(Box::new(MapArray::new( + data_type, + offsets.try_into().unwrap(), + field, + validity, + ))) } fn to_dictionary( diff --git a/src/io/odbc/read/deserialize.rs b/src/io/odbc/read/deserialize.rs index b98596850c2..7ebf79b8b9a 100644 --- a/src/io/odbc/read/deserialize.rs +++ b/src/io/odbc/read/deserialize.rs @@ -6,6 +6,7 @@ use crate::array::{Array, BinaryArray, BooleanArray, PrimitiveArray, Utf8Array}; use crate::bitmap::{Bitmap, MutableBitmap}; use crate::buffer::Buffer; use crate::datatypes::{DataType, TimeUnit}; +use crate::offset::{Offsets, OffsetsBuffer}; use crate::types::NativeType; use super::super::api::buffers::AnyColumnView; @@ -118,22 +119,23 @@ fn bool_optional(data_type: DataType, values: &[Bit], indicators: &[isize]) -> B fn binary_generic<'a>( iter: impl Iterator>, -) -> (Buffer, Buffer, Option) { +) -> (OffsetsBuffer, Buffer, Option) { let length = iter.size_hint().0; let mut validity = MutableBitmap::with_capacity(length); let mut values = Vec::::with_capacity(0); - let mut offsets = Vec::with_capacity(length + 1); - offsets.push(0i32); - + let mut offsets = Offsets::::with_capacity(length); for item in iter { if let Some(item) = item { values.extend_from_slice(item); + offsets + .try_push_usize(item.len()) + .expect("List to contain less than i32::MAX items."); validity.push(true); } else { + offsets.extend_constant(1); validity.push(false); } - offsets.push(values.len() as i32) } (offsets.into(), values.into(), validity.into()) diff --git a/src/io/odbc/write/serialize.rs b/src/io/odbc/write/serialize.rs index 01767e6d52f..f92326ba89c 100644 --- a/src/io/odbc/write/serialize.rs +++ b/src/io/odbc/write/serialize.rs @@ -4,6 +4,7 @@ use crate::array::*; use crate::bitmap::Bitmap; use crate::datatypes::DataType; use crate::error::{Error, Result}; +use crate::offset::Offset; use crate::types::NativeType; use super::super::api; @@ -159,6 +160,7 @@ fn fixed_binary(array: &FixedSizeBinaryArray, writer: &mut BinColumnWriter) { fn binary(array: &BinaryArray, writer: &mut BinColumnWriter) { let max_len = array .offsets() + .buffer() .windows(2) .map(|x| (x[1] - x[0]).to_usize()) .max() @@ -170,6 +172,7 @@ fn binary(array: &BinaryArray, writer: &mut BinColumnWriter) { fn utf8(array: &Utf8Array, writer: &mut TextColumnWriter) { let max_len = array .offsets() + .buffer() .windows(2) .map(|x| (x[1] - x[0]).to_usize()) .max() diff --git a/src/io/orc/read/mod.rs b/src/io/orc/read/mod.rs index ca6ce7354c3..3fe4abb7f63 100644 --- a/src/io/orc/read/mod.rs +++ b/src/io/orc/read/mod.rs @@ -1,12 +1,11 @@ //! APIs to read from [ORC format](https://orc.apache.org). use std::io::Read; -use crate::array::{ - Array, BinaryArray, BooleanArray, Int64Array, Offset, PrimitiveArray, Utf8Array, -}; +use crate::array::{Array, BinaryArray, BooleanArray, Int64Array, PrimitiveArray, Utf8Array}; use crate::bitmap::{Bitmap, MutableBitmap}; use crate::datatypes::{DataType, Field, Schema}; use crate::error::Error; +use crate::offset::{Offset, Offsets}; use crate::types::NativeType; use orc_format::proto::stream::Kind; @@ -251,23 +250,19 @@ where #[inline] fn try_extend, I: Iterator>( - offsets: &mut Vec, - length: &mut O, + offsets: &mut Offsets, iter: I, -) -> Result<(), orc_format::error::Error> { +) -> Result<(), Error> { for item in iter { - let item: O = item - .try_into() - .map_err(|_| orc_format::error::Error::OutOfSpec)?; - *length += item; - offsets.push(*length) + let length: O = item.try_into().map_err(|_| Error::Overflow)?; + offsets.try_push(length)? } Ok(()) } fn deserialize_binary_generic>( column: &Column, -) -> Result<(Vec, Vec, Option), Error> { +) -> Result<(Offsets, Vec, Option), Error> { let num_rows = column.number_of_rows(); let mut scratch = vec![]; @@ -275,9 +270,7 @@ fn deserialize_binary_generic>( let lengths = column.get_stream(Kind::Length, scratch)?; - let mut offsets = Vec::with_capacity(num_rows + 1); - let mut length = O::default(); - offsets.push(length); + let mut offsets = Offsets::with_capacity(num_rows); if let Some(validity) = &validity { let mut iter = decode::UnsignedRleV2Iter::new(lengths, validity.len() - validity.unset_bits(), vec![]); @@ -287,34 +280,35 @@ fn deserialize_binary_generic>( .next() .transpose()? .ok_or(orc_format::error::Error::OutOfSpec)?; - let item: O = item + let length: O = item .try_into() .map_err(|_| Error::ExternalFormat("value uncastable".to_string()))?; - length += item; + offsets.try_push(length)?; + } else { + offsets.extend_constant(1) } - offsets.push(length); } let (lengths, _) = iter.into_inner(); scratch = std::mem::take(&mut lengths.into_inner()); } else { let mut iter = decode::UnsignedRleV2RunIter::new(lengths, num_rows, vec![]); iter.try_for_each(|run| { - run.and_then(|run| match run { + run.map_err(Error::from).and_then(|run| match run { decode::UnsignedRleV2Run::Direct(values_iter) => { - try_extend(&mut offsets, &mut length, values_iter) + try_extend(&mut offsets, values_iter) } decode::UnsignedRleV2Run::Delta(values_iter) => { - try_extend(&mut offsets, &mut length, values_iter) + try_extend(&mut offsets, values_iter) } decode::UnsignedRleV2Run::ShortRepeat(values_iter) => { - try_extend(&mut offsets, &mut length, values_iter) + try_extend(&mut offsets, values_iter) } }) })?; let (lengths, _) = iter.into_inner(); scratch = std::mem::take(&mut lengths.into_inner()); } - let length = length.to_usize(); + let length = offsets.last().to_usize(); let mut values = vec![0; length]; let mut data = column.get_stream(Kind::Data, scratch)?; diff --git a/src/io/parquet/read/deserialize/binary/basic.rs b/src/io/parquet/read/deserialize/binary/basic.rs index 8790b43ea45..14c805158f8 100644 --- a/src/io/parquet/read/deserialize/binary/basic.rs +++ b/src/io/parquet/read/deserialize/binary/basic.rs @@ -9,11 +9,12 @@ use parquet2::{ }; use crate::{ - array::{Array, BinaryArray, Offset, Utf8Array}, + array::{Array, BinaryArray, Utf8Array}, bitmap::{Bitmap, MutableBitmap}, buffer::Buffer, datatypes::DataType, error::{Error, Result}, + offset::{Offset, OffsetsBuffer}, }; use super::super::utils::{ @@ -227,7 +228,7 @@ impl<'a> utils::PageState<'a> for State<'a> { pub trait TraitBinaryArray: Array + 'static { fn try_new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Result @@ -238,7 +239,7 @@ pub trait TraitBinaryArray: Array + 'static { impl TraitBinaryArray for BinaryArray { fn try_new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Result { @@ -249,7 +250,7 @@ impl TraitBinaryArray for BinaryArray { impl TraitBinaryArray for Utf8Array { fn try_new( data_type: DataType, - offsets: Buffer, + offsets: OffsetsBuffer, values: Buffer, validity: Option, ) -> Result { @@ -372,22 +373,18 @@ impl<'a, O: Offset> utils::Decoder<'a> for BinaryDecoder { let Binary { offsets, values: values_, - last_offset, } = values; - let offset = *last_offset; + let last_offset = *offsets.last(); extend_from_decoder( validity, page_validity, Some(additional), offsets, - page_values.lengths.by_ref().map(|x| { - *last_offset += O::from_usize(x).unwrap(); - *last_offset - }), + page_values.lengths.by_ref(), ); - let length = *last_offset - offset; + let length = *offsets.last() - last_offset; let (consumed, remaining) = page_values.values.split_at(length.to_usize()); page_values.values = remaining; @@ -485,7 +482,7 @@ pub(super) fn finish>( ) -> Result { A::try_new( data_type.clone(), - values.offsets.0.into(), + values.offsets.into(), values.values.into(), validity.into(), ) diff --git a/src/io/parquet/read/deserialize/binary/dictionary.rs b/src/io/parquet/read/deserialize/binary/dictionary.rs index 3000a7ca7d2..6f883528ef8 100644 --- a/src/io/parquet/read/deserialize/binary/dictionary.rs +++ b/src/io/parquet/read/deserialize/binary/dictionary.rs @@ -3,11 +3,12 @@ use std::collections::VecDeque; use parquet2::page::DictPage; use crate::{ - array::{Array, BinaryArray, DictionaryArray, DictionaryKey, Offset, Utf8Array}, + array::{Array, BinaryArray, DictionaryArray, DictionaryKey, Utf8Array}, bitmap::MutableBitmap, datatypes::{DataType, PhysicalType}, error::Result, io::parquet::read::deserialize::nested_utils::{InitNested, NestedState}, + offset::Offset, }; use super::super::Pages; @@ -66,11 +67,10 @@ fn read_dict(data_type: DataType, dict: &DictPage) -> Box match data_type.to_physical_type() { PhysicalType::Utf8 | PhysicalType::LargeUtf8 => { - Utf8Array::::new(data_type, data.offsets.0.into(), data.values.into(), None).boxed() + Utf8Array::::new(data_type, data.offsets.into(), data.values.into(), None).boxed() } PhysicalType::Binary | PhysicalType::LargeBinary => { - BinaryArray::::new(data_type, data.offsets.0.into(), data.values.into(), None) - .boxed() + BinaryArray::::new(data_type, data.offsets.into(), data.values.into(), None).boxed() } _ => unreachable!(), } diff --git a/src/io/parquet/read/deserialize/binary/nested.rs b/src/io/parquet/read/deserialize/binary/nested.rs index 8c0f5ef419e..2d345140db7 100644 --- a/src/io/parquet/read/deserialize/binary/nested.rs +++ b/src/io/parquet/read/deserialize/binary/nested.rs @@ -7,8 +7,8 @@ use parquet2::{ }; use crate::{ - array::Offset, bitmap::MutableBitmap, datatypes::DataType, error::Result, - io::parquet::read::Pages, + bitmap::MutableBitmap, datatypes::DataType, error::Result, io::parquet::read::Pages, + offset::Offset, }; use super::super::utils::MaybeNext; diff --git a/src/io/parquet/read/deserialize/binary/utils.rs b/src/io/parquet/read/deserialize/binary/utils.rs index a47cb967a55..ddf7abc2a06 100644 --- a/src/io/parquet/read/deserialize/binary/utils.rs +++ b/src/io/parquet/read/deserialize/binary/utils.rs @@ -1,4 +1,4 @@ -use crate::array::Offset; +use crate::offset::{Offset, Offsets}; use super::super::utils::Pushable; @@ -7,70 +7,51 @@ use super::super::utils::Pushable; pub struct Binary { pub offsets: Offsets, pub values: Vec, - pub last_offset: O, } -#[derive(Debug)] -pub struct Offsets(pub Vec); - -impl Offsets { - #[inline] - pub fn extend_lengths>(&mut self, lengths: I) { - let mut last_offset = *self.0.last().unwrap(); - self.0.extend(lengths.map(|length| { - last_offset += O::from_usize(length).unwrap(); - last_offset - })); - } -} - -impl Pushable for Offsets { +impl Pushable for Offsets { fn reserve(&mut self, additional: usize) { - self.0.reserve(additional) + self.reserve(additional) } #[inline] fn len(&self) -> usize { - self.0.len() - 1 + self.len() } #[inline] - fn push(&mut self, value: O) { - self.0.push(value) + fn push(&mut self, value: usize) { + self.try_push_usize(value).unwrap() } #[inline] fn push_null(&mut self) { - self.0.push(*self.0.last().unwrap()) + self.extend_constant(1); } #[inline] - fn extend_constant(&mut self, additional: usize, value: O) { - self.0.extend_constant(additional, value) + fn extend_constant(&mut self, additional: usize, _: usize) { + self.extend_constant(additional) } } impl Binary { #[inline] pub fn with_capacity(capacity: usize) -> Self { - let mut offsets = Vec::with_capacity(1 + capacity); - offsets.push(O::default()); Self { - offsets: Offsets(offsets), + offsets: Offsets::with_capacity(capacity), values: Vec::with_capacity(capacity * 24), - last_offset: O::default(), } } #[inline] pub fn push(&mut self, v: &[u8]) { self.values.extend(v); - self.last_offset += O::from_usize(v.len()).unwrap(); - self.offsets.push(self.last_offset) + self.offsets.try_push_usize(v.len()).unwrap() } #[inline] pub fn extend_constant(&mut self, additional: usize) { - self.offsets.extend_constant(additional, self.last_offset); + self.offsets.extend_constant(additional); } #[inline] @@ -80,10 +61,10 @@ impl Binary { #[inline] pub fn extend_lengths>(&mut self, lengths: I, values: &mut &[u8]) { - let current_offset = self.last_offset; - self.offsets.extend_lengths(lengths); - self.last_offset = *self.offsets.0.last().unwrap(); // guaranteed to have one - let length = self.last_offset.to_usize() - current_offset.to_usize(); + let current_offset = *self.offsets.last(); + self.offsets.try_extend_from_lengths(lengths).unwrap(); + let new_offset = *self.offsets.last(); + let length = new_offset.to_usize() - current_offset.to_usize(); let (consumed, remaining) = values.split_at(length); *values = remaining; self.values.extend_from_slice(consumed); @@ -93,7 +74,7 @@ impl Binary { impl<'a, O: Offset> Pushable<&'a [u8]> for Binary { #[inline] fn reserve(&mut self, additional: usize) { - let avg_len = self.values.len() / std::cmp::max(self.last_offset.to_usize(), 1); + let avg_len = self.values.len() / std::cmp::max(self.offsets.last().to_usize(), 1); self.values.reserve(additional * avg_len); self.offsets.reserve(additional); } diff --git a/src/io/parquet/read/deserialize/mod.rs b/src/io/parquet/read/deserialize/mod.rs index b16d1a6e83d..d3baa7879be 100644 --- a/src/io/parquet/read/deserialize/mod.rs +++ b/src/io/parquet/read/deserialize/mod.rs @@ -18,6 +18,7 @@ use crate::{ array::{Array, DictionaryKey, FixedSizeListArray, ListArray}, datatypes::{DataType, Field, IntervalUnit}, error::Result, + offset::Offsets, }; use self::nested_utils::{InitNested, NestedArrayIter, NestedState}; @@ -53,6 +54,11 @@ fn create_list( offsets.push(values.len() as i64); let offsets = offsets.iter().map(|x| *x as i32).collect::>(); + + let offsets: Offsets = offsets + .try_into() + .expect("i64 offsets do not fit in i32 offsets"); + Box::new(ListArray::::new( data_type, offsets.into(), @@ -65,7 +71,7 @@ fn create_list( Box::new(ListArray::::new( data_type, - offsets.into(), + offsets.try_into().expect("List too large"), values, validity.and_then(|x| x.into()), )) diff --git a/src/io/parquet/read/deserialize/nested.rs b/src/io/parquet/read/deserialize/nested.rs index 389d0479fec..065190fbb7e 100644 --- a/src/io/parquet/read/deserialize/nested.rs +++ b/src/io/parquet/read/deserialize/nested.rs @@ -295,7 +295,7 @@ where let (nested, inner) = x?; let array = MapArray::new( field.data_type().clone(), - vec![0, inner.len() as i32].into(), + vec![0, inner.len() as i32].try_into().unwrap(), inner, None, ); diff --git a/src/io/parquet/read/statistics/binary.rs b/src/io/parquet/read/statistics/binary.rs index 1786477e6bb..aeb43a6b3e0 100644 --- a/src/io/parquet/read/statistics/binary.rs +++ b/src/io/parquet/read/statistics/binary.rs @@ -1,7 +1,8 @@ -use crate::array::{MutableArray, MutableBinaryArray, Offset}; use parquet2::statistics::{BinaryStatistics, Statistics as ParquetStatistics}; +use crate::array::{MutableArray, MutableBinaryArray}; use crate::error::Result; +use crate::offset::Offset; pub(super) fn push( from: Option<&dyn ParquetStatistics>, diff --git a/src/io/parquet/read/statistics/list.rs b/src/io/parquet/read/statistics/list.rs index 9d0adbcb0cc..047b5e07700 100644 --- a/src/io/parquet/read/statistics/list.rs +++ b/src/io/parquet/read/statistics/list.rs @@ -1,6 +1,7 @@ use crate::array::*; use crate::datatypes::DataType; use crate::error::Result; +use crate::offset::Offsets; use super::make_mutable; @@ -40,19 +41,21 @@ impl MutableArray for DynMutableListArray { match self.data_type.to_logical_type() { DataType::List(_) => { - let offsets = (0..=inner.len() as i32).collect::>().into(); + let offsets = + Offsets::try_from_lengths(std::iter::repeat(1).take(inner.len())).unwrap(); Box::new(ListArray::::new( self.data_type.clone(), - offsets, + offsets.into(), inner, None, )) } DataType::LargeList(_) => { - let offsets = (0..=inner.len() as i64).collect::>().into(); + let offsets = + Offsets::try_from_lengths(std::iter::repeat(1).take(inner.len())).unwrap(); Box::new(ListArray::::new( self.data_type.clone(), - offsets, + offsets.into(), inner, None, )) diff --git a/src/io/parquet/read/statistics/map.rs b/src/io/parquet/read/statistics/map.rs index c31dfb9d9e7..db3678510f8 100644 --- a/src/io/parquet/read/statistics/map.rs +++ b/src/io/parquet/read/statistics/map.rs @@ -40,7 +40,7 @@ impl MutableArray for DynMutableMapArray { fn as_box(&mut self) -> Box { Box::new(MapArray::new( self.data_type.clone(), - vec![0, self.inner.len() as i32].into(), + vec![0, self.inner.len() as i32].try_into().unwrap(), self.inner.as_box(), None, )) diff --git a/src/io/parquet/read/statistics/utf8.rs b/src/io/parquet/read/statistics/utf8.rs index 7a447e2334f..da9fcb6e111 100644 --- a/src/io/parquet/read/statistics/utf8.rs +++ b/src/io/parquet/read/statistics/utf8.rs @@ -1,7 +1,8 @@ -use crate::array::{MutableArray, MutableUtf8Array, Offset}; use parquet2::statistics::{BinaryStatistics, Statistics as ParquetStatistics}; +use crate::array::{MutableArray, MutableUtf8Array}; use crate::error::Result; +use crate::offset::Offset; pub(super) fn push( from: Option<&dyn ParquetStatistics>, diff --git a/src/io/parquet/write/binary/basic.rs b/src/io/parquet/write/binary/basic.rs index 58156c901e6..55886d0f630 100644 --- a/src/io/parquet/write/binary/basic.rs +++ b/src/io/parquet/write/binary/basic.rs @@ -8,10 +8,11 @@ use parquet2::{ use super::super::utils; use super::super::WriteOptions; use crate::{ - array::{Array, BinaryArray, Offset}, + array::{Array, BinaryArray}, bitmap::Bitmap, error::{Error, Result}, io::parquet::read::schema::is_nullable, + offset::Offset, }; pub(crate) fn encode_plain( @@ -63,7 +64,7 @@ pub fn array_to_page( Encoding::Plain => encode_plain(array, is_optional, &mut buffer), Encoding::DeltaLengthByteArray => encode_delta( array.values(), - array.offsets(), + array.offsets().buffer(), array.validity(), is_optional, &mut buffer, diff --git a/src/io/parquet/write/binary/nested.rs b/src/io/parquet/write/binary/nested.rs index 55f5d2ef247..950ea4190ca 100644 --- a/src/io/parquet/write/binary/nested.rs +++ b/src/io/parquet/write/binary/nested.rs @@ -6,8 +6,9 @@ use super::basic::{build_statistics, encode_plain}; use crate::io::parquet::read::schema::is_nullable; use crate::io::parquet::write::Nested; use crate::{ - array::{Array, BinaryArray, Offset}, + array::{Array, BinaryArray}, error::Result, + offset::Offset, }; pub fn array_to_page( diff --git a/src/io/parquet/write/nested/def.rs b/src/io/parquet/write/nested/def.rs index ea62c1c14dc..eb945a4c683 100644 --- a/src/io/parquet/write/nested/def.rs +++ b/src/io/parquet/write/nested/def.rs @@ -1,4 +1,4 @@ -use crate::{array::Offset, bitmap::Bitmap}; +use crate::{bitmap::Bitmap, offset::Offset}; use super::super::pages::{ListNested, Nested}; use super::rep::num_values; diff --git a/src/io/parquet/write/nested/mod.rs b/src/io/parquet/write/nested/mod.rs index d9f01e3f487..5f6cd4d3524 100644 --- a/src/io/parquet/write/nested/mod.rs +++ b/src/io/parquet/write/nested/mod.rs @@ -3,7 +3,7 @@ mod rep; use parquet2::{encoding::hybrid_rle::encode_u32, read::levels::get_bit_width, write::Version}; -use crate::{array::Offset, error::Result}; +use crate::{error::Result, offset::Offset}; use super::Nested; diff --git a/src/io/parquet/write/pages.rs b/src/io/parquet/write/pages.rs index caaa92866be..e3e1eec410f 100644 --- a/src/io/parquet/write/pages.rs +++ b/src/io/parquet/write/pages.rs @@ -1,10 +1,11 @@ use parquet2::schema::types::{ParquetType, PrimitiveType as ParquetPrimitiveType}; use parquet2::{page::Page, write::DynIter}; -use crate::array::{ListArray, Offset, StructArray}; +use crate::array::{ListArray, StructArray}; use crate::bitmap::Bitmap; use crate::datatypes::PhysicalType; use crate::io::parquet::read::schema::is_nullable; +use crate::offset::Offset; use crate::{ array::Array, error::{Error, Result}, @@ -105,7 +106,7 @@ fn to_nested_recursive<'a>( }; parents.push(Nested::List(ListNested::new( - array.offsets(), + array.offsets().buffer(), array.validity(), is_optional, ))); @@ -128,7 +129,7 @@ fn to_nested_recursive<'a>( }; parents.push(Nested::LargeList(ListNested::new( - array.offsets(), + array.offsets().buffer(), array.validity(), is_optional, ))); @@ -417,7 +418,7 @@ mod tests { let array = ListArray::new( DataType::List(Box::new(Field::new("l", array.data_type().clone(), true))), - vec![0i32, 2, 4].into(), + vec![0i32, 2, 4].try_into().unwrap(), Box::new(array), None, ); diff --git a/src/io/parquet/write/utf8/basic.rs b/src/io/parquet/write/utf8/basic.rs index c1e7b9321bc..744d3bce7b1 100644 --- a/src/io/parquet/write/utf8/basic.rs +++ b/src/io/parquet/write/utf8/basic.rs @@ -9,9 +9,10 @@ use super::super::binary::{encode_delta, ord_binary}; use super::super::utils; use super::super::WriteOptions; use crate::{ - array::{Array, Offset, Utf8Array}, + array::{Array, Utf8Array}, error::{Error, Result}, io::parquet::read::schema::is_nullable, + offset::Offset, }; pub(crate) fn encode_plain( @@ -62,7 +63,7 @@ pub fn array_to_page( Encoding::Plain => encode_plain(array, is_optional, &mut buffer), Encoding::DeltaLengthByteArray => encode_delta( array.values(), - array.offsets(), + array.offsets().buffer(), array.validity(), is_optional, &mut buffer, diff --git a/src/io/parquet/write/utf8/nested.rs b/src/io/parquet/write/utf8/nested.rs index 42babd46cd7..2792ef35712 100644 --- a/src/io/parquet/write/utf8/nested.rs +++ b/src/io/parquet/write/utf8/nested.rs @@ -6,8 +6,9 @@ use super::basic::{build_statistics, encode_plain}; use crate::io::parquet::read::schema::is_nullable; use crate::io::parquet::write::Nested; use crate::{ - array::{Array, Offset, Utf8Array}, + array::{Array, Utf8Array}, error::Result, + offset::Offset, }; pub fn array_to_page( diff --git a/src/lib.rs b/src/lib.rs index 64771b954cf..bef2e6e53c1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,6 +24,7 @@ pub mod error; #[cfg_attr(docsrs, doc(cfg(feature = "io_ipc")))] pub mod mmap; +pub mod offset; pub mod scalar; pub mod trusted_len; pub mod types; diff --git a/src/offset.rs b/src/offset.rs new file mode 100644 index 00000000000..2337f082218 --- /dev/null +++ b/src/offset.rs @@ -0,0 +1,496 @@ +//! Contains the declaration of [`Offset`] +use std::hint::unreachable_unchecked; + +use crate::buffer::Buffer; +use crate::error::Error; +pub use crate::types::Offset; + +/// A wrapper type of [`Vec`] representing the invariants of Arrow's offsets. +/// It is guaranteed to (sound to assume that): +/// * every element is `>= 0` +/// * element at position `i` is >= than element at position `i-1`. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Offsets(Vec); + +impl Default for Offsets { + #[inline] + fn default() -> Self { + Self::new() + } +} + +impl TryFrom> for Offsets { + type Error = Error; + + #[inline] + fn try_from(offsets: Vec) -> Result { + try_check_offsets(&offsets)?; + Ok(Self(offsets)) + } +} + +impl TryFrom> for OffsetsBuffer { + type Error = Error; + + #[inline] + fn try_from(offsets: Buffer) -> Result { + try_check_offsets(&offsets)?; + Ok(Self(offsets)) + } +} + +impl TryFrom> for OffsetsBuffer { + type Error = Error; + + #[inline] + fn try_from(offsets: Vec) -> Result { + try_check_offsets(&offsets)?; + Ok(Self(offsets.into())) + } +} + +impl From> for OffsetsBuffer { + #[inline] + fn from(offsets: Offsets) -> Self { + Self(offsets.0.into()) + } +} + +impl Offsets { + /// Returns an empty [`Offsets`] (i.e. with a single element, the zero) + #[inline] + pub fn new() -> Self { + Self(vec![O::zero()]) + } + + /// Creates a new [`Offsets`] from an iterator of lengths + #[inline] + pub fn try_from_iter>(iter: I) -> Result { + let iterator = iter.into_iter(); + let (lower, _) = iterator.size_hint(); + let mut offsets = Self::with_capacity(lower); + for item in iterator { + offsets.try_push_usize(item)? + } + Ok(offsets) + } + + /// Returns a new [`Offsets`] with a capacity, allocating at least `capacity + 1` entries. + pub fn with_capacity(capacity: usize) -> Self { + let mut offsets = Vec::with_capacity(capacity + 1); + offsets.push(O::zero()); + Self(offsets) + } + + /// Returns the capacity of [`Offsets`]. + pub fn capacity(&self) -> usize { + self.0.capacity() - 1 + } + + /// Reserves `additional` entries. + pub fn reserve(&mut self, additional: usize) { + self.0.reserve(additional); + } + + /// Shrinks the capacity of self to fit. + pub fn shrink_to_fit(&mut self) { + self.0.shrink_to_fit(); + } + + /// Pushes a new element with a given length. + /// # Error + /// This function errors iff the new last item is larger than what `O` supports. + /// # Panic + /// This function asserts that `length > 0`. + #[inline] + pub fn try_push(&mut self, length: O) -> Result<(), Error> { + let old_length = self.last(); + assert!(length >= O::zero()); + let new_length = old_length.checked_add(&length).ok_or(Error::Overflow)?; + self.0.push(new_length); + Ok(()) + } + + /// Pushes a new element with a given length. + /// # Error + /// This function errors iff the new last item is larger than what `O` supports. + /// # Implementation + /// This function: + /// * checks that this length does not overflow + #[inline] + pub fn try_push_usize(&mut self, length: usize) -> Result<(), Error> { + let length = O::from_usize(length).ok_or(Error::Overflow)?; + + let old_length = self.last(); + let new_length = old_length.checked_add(&length).ok_or(Error::Overflow)?; + self.0.push(new_length); + Ok(()) + } + + /// Returns [`Offsets`] assuming that `offsets` fulfills its invariants + /// # Safety + /// This is safe iff the invariants of this struct are guaranteed in `offsets`. + #[inline] + pub unsafe fn new_unchecked(offsets: Vec) -> Self { + Self(offsets) + } + + /// Returns the last offset of this container. + #[inline] + pub fn last(&self) -> &O { + match self.0.last() { + Some(element) => element, + None => unsafe { unreachable_unchecked() }, + } + } + + /// Returns a range (start, end) corresponding to the position `index` + /// # Panic + /// This function panics iff `index >= self.len()` + #[inline] + pub fn start_end(&self, index: usize) -> (usize, usize) { + // soundness: the invariant of the function + assert!(index < self.len()); + unsafe { self.start_end_unchecked(index) } + } + + /// Returns a range (start, end) corresponding to the position `index` + /// # Safety + /// `index` must be `< self.len()` + #[inline] + pub unsafe fn start_end_unchecked(&self, index: usize) -> (usize, usize) { + // soundness: the invariant of the function + let start = self.0.get_unchecked(index).to_usize(); + let end = self.0.get_unchecked(index + 1).to_usize(); + (start, end) + } + + /// Returns the length of this container + #[inline] + pub fn len(&self) -> usize { + self.0.len() - 1 + } + + /// Returns the byte slice stored in this buffer + #[inline] + pub fn as_slice(&self) -> &[O] { + self.0.as_slice() + } + + /// Pops the last element + #[inline] + pub fn pop(&mut self) -> Option { + if self.len() == 0 { + None + } else { + self.0.pop() + } + } + + /// Extends itself with `additional` elements equal to the last offset. + /// This is useful to extend offsets with empty values, e.g. for null slots. + #[inline] + pub fn extend_constant(&mut self, additional: usize) { + let offset = *self.last(); + if additional == 1 { + self.0.push(offset) + } else { + self.0.resize(self.len() + additional, offset) + } + } + + /// Try to create a new [`Offsets`] from a sequence of `lengths` + /// # Errors + /// This function errors iff this operation overflows for the maximum value of `O`. + #[inline] + pub fn try_from_lengths>(lengths: I) -> Result { + let mut self_ = Self::with_capacity(lengths.size_hint().0); + self_.try_extend_from_lengths(lengths)?; + Ok(self_) + } + + /// Try extend from an iterator of lengths + /// # Errors + /// This function errors iff this operation overflows for the maximum value of `O`. + #[inline] + pub fn try_extend_from_lengths>( + &mut self, + lengths: I, + ) -> Result<(), Error> { + let mut total_length = 0; + let mut offset = *self.last(); + let original_offset = offset.to_usize(); + + let lengths = lengths.map(|length| { + total_length += length; + O::from_as_usize(length) + }); + + let offsets = lengths.map(|length| { + offset += length; // this may overflow, checked below + offset + }); + self.0.extend(offsets); + + let last_offset = original_offset + .checked_add(total_length) + .ok_or(Error::Overflow)?; + O::from_usize(last_offset).ok_or(Error::Overflow)?; + Ok(()) + } + + /// Extends itself from another [`Offsets`] + /// # Errors + /// This function errors iff this operation overflows for the maximum value of `O`. + pub fn try_extend_from_self(&mut self, other: &Self) -> Result<(), Error> { + let mut length = *self.last(); + let other_length = *other.last(); + // check if the operation would overflow + length.checked_add(&other_length).ok_or(Error::Overflow)?; + + let lengths = other.as_slice().windows(2).map(|w| w[1] - w[0]); + let offsets = lengths.map(|new_length| { + length += new_length; + length + }); + self.0.extend(offsets); + Ok(()) + } + + /// Extends itself from another [`Offsets`] sliced by `start, length` + /// # Errors + /// This function errors iff this operation overflows for the maximum value of `O`. + pub fn try_extend_from_slice( + &mut self, + other: &OffsetsBuffer, + start: usize, + length: usize, + ) -> Result<(), Error> { + if length == 0 { + return Ok(()); + } + let other = &other.0[start..start + length + 1]; + let other_length = other.last().expect("Length to be non-zero"); + let mut length = *self.last(); + // check if the operation would overflow + length.checked_add(other_length).ok_or(Error::Overflow)?; + + let lengths = other.windows(2).map(|w| w[1] - w[0]); + let offsets = lengths.map(|new_length| { + length += new_length; + length + }); + self.0.extend(offsets); + Ok(()) + } + + /// Returns the inner [`Vec`]. + #[inline] + pub fn into_inner(self) -> Vec { + self.0 + } +} + +/// Checks that `offsets` is monotonically increasing. +fn try_check_offsets(offsets: &[O]) -> Result<(), Error> { + // this code is carefully constructed to auto-vectorize, don't change naively! + match offsets.first() { + None => Err(Error::oos("offsets must have at least one element")), + Some(first) => { + if *first < O::zero() { + return Err(Error::oos("offsets must be larger than 0")); + } + let mut previous = *first; + let mut any_invalid = false; + + // This loop will auto-vectorize because there is not any break, + // an invalid value will be returned once the whole offsets buffer is processed. + for offset in offsets { + if previous > *offset { + any_invalid = true + } + previous = *offset; + } + + if any_invalid { + Err(Error::oos("offsets must be monotonically increasing")) + } else { + Ok(()) + } + } + } +} + +/// A wrapper type of [`Buffer`] that is guaranteed to: +/// * Always contain an element +/// * Every element is `>0` +/// * element at position `i` is >= than element at position `i-1`. +#[derive(Clone, PartialEq, Debug)] +pub struct OffsetsBuffer(Buffer); + +impl Default for OffsetsBuffer { + #[inline] + fn default() -> Self { + Self(vec![O::zero()].into()) + } +} + +impl OffsetsBuffer { + /// # Safety + /// This is safe iff the invariants of this struct are guaranteed in `offsets`. + #[inline] + pub unsafe fn new_unchecked(offsets: Buffer) -> Self { + Self(offsets) + } + + /// Returns an empty [`OffsetsBuffer`] (i.e. with a single element, the zero) + #[inline] + pub fn new() -> Self { + Self(vec![O::zero()].into()) + } + + /// Copy-on-write API to convert [`OffsetsBuffer`] into [`Offsets`]. + #[inline] + pub fn get_mut(&mut self) -> Option> { + self.0 + .get_mut() + .map(|x| { + let mut new = vec![O::zero()]; + std::mem::swap(x, &mut new); + new + }) + // Safety: Offsets and OffsetsBuffer share invariants + .map(|offsets| unsafe { Offsets::new_unchecked(offsets) }) + } + + /// Returns a reference to its internal [`Buffer`]. + #[inline] + pub fn buffer(&self) -> &Buffer { + &self.0 + } + + /// Returns the length of this container + #[inline] + pub fn len(&self) -> usize { + self.0.len() - 1 + } + + /// Returns the byte slice stored in this buffer + #[inline] + pub fn as_slice(&self) -> &[O] { + self.0.as_slice() + } + + /// Returns the last offset of this container, which is guaranteed to exist. + #[inline] + pub fn last(&self) -> &O { + match self.0.last() { + Some(element) => element, + None => unsafe { unreachable_unchecked() }, + } + } + + /// Returns a range (start, end) corresponding to the position `index` + /// # Panic + /// This function panics iff `index >= self.len()` + #[inline] + pub fn start_end(&self, index: usize) -> (usize, usize) { + // soundness: the invariant of the function + assert!(index < self.len()); + unsafe { self.start_end_unchecked(index) } + } + + /// Returns a range (start, end) corresponding to the position `index` + /// # Safety + /// `index` must be `< self.len()` + #[inline] + pub unsafe fn start_end_unchecked(&self, index: usize) -> (usize, usize) { + // soundness: the invariant of the function + let start = self.0.get_unchecked(index).to_usize(); + let end = self.0.get_unchecked(index + 1).to_usize(); + (start, end) + } + + /// Returns a new [`OffsetsBuffer`] that is a slice of this buffer starting at `offset`. + /// Doing so allows the same memory region to be shared between buffers. + /// # Safety + /// The caller must ensure `offset + length <= self.len()` + #[inline] + pub unsafe fn slice_unchecked(self, offset: usize, length: usize) -> Self { + Self(self.0.slice_unchecked(offset, length)) + } + + /// Returns an iterator with the lengths of the offsets + #[inline] + pub fn lengths(&self) -> impl Iterator + '_ { + self.0.windows(2).map(|w| (w[1] - w[0]).to_usize()) + } + + /// Returns the inner [`Buffer`]. + #[inline] + pub fn into_inner(self) -> Buffer { + self.0 + } +} + +impl From<&OffsetsBuffer> for OffsetsBuffer { + fn from(offsets: &OffsetsBuffer) -> Self { + // this conversion is lossless and uphelds all invariants + Self( + offsets + .buffer() + .iter() + .map(|x| *x as i64) + .collect::>() + .into(), + ) + } +} + +impl TryFrom<&OffsetsBuffer> for OffsetsBuffer { + type Error = Error; + + fn try_from(offsets: &OffsetsBuffer) -> Result { + i32::try_from(*offsets.last()).map_err(|_| Error::Overflow)?; + + // this conversion is lossless and uphelds all invariants + Ok(Self( + offsets + .buffer() + .iter() + .map(|x| *x as i32) + .collect::>() + .into(), + )) + } +} + +impl From> for Offsets { + fn from(offsets: Offsets) -> Self { + // this conversion is lossless and uphelds all invariants + Self( + offsets + .as_slice() + .iter() + .map(|x| *x as i64) + .collect::>(), + ) + } +} + +impl TryFrom> for Offsets { + type Error = Error; + + fn try_from(offsets: Offsets) -> Result { + i32::try_from(*offsets.last()).map_err(|_| Error::Overflow)?; + + // this conversion is lossless and uphelds all invariants + Ok(Self( + offsets + .as_slice() + .iter() + .map(|x| *x as i32) + .collect::>(), + )) + } +} diff --git a/src/scalar/binary.rs b/src/scalar/binary.rs index 70a9be40374..cfbdbd04b0a 100644 --- a/src/scalar/binary.rs +++ b/src/scalar/binary.rs @@ -1,4 +1,4 @@ -use crate::{array::*, datatypes::DataType}; +use crate::{datatypes::DataType, offset::Offset}; use super::Scalar; diff --git a/src/scalar/list.rs b/src/scalar/list.rs index 438e13f7f55..98ee497d949 100644 --- a/src/scalar/list.rs +++ b/src/scalar/list.rs @@ -1,6 +1,6 @@ use std::any::Any; -use crate::{array::*, datatypes::DataType}; +use crate::{array::*, datatypes::DataType, offset::Offset}; use super::Scalar; diff --git a/src/scalar/utf8.rs b/src/scalar/utf8.rs index f62581eccbc..73ea98b729e 100644 --- a/src/scalar/utf8.rs +++ b/src/scalar/utf8.rs @@ -1,4 +1,4 @@ -use crate::{array::*, datatypes::DataType}; +use crate::{datatypes::DataType, offset::Offset}; use super::Scalar; diff --git a/src/temporal_conversions.rs b/src/temporal_conversions.rs index af643eaf7fd..8eae7213fc8 100644 --- a/src/temporal_conversions.rs +++ b/src/temporal_conversions.rs @@ -7,8 +7,9 @@ use chrono::{ use crate::error::Result; use crate::{ - array::{Offset, PrimitiveArray, Utf8Array}, + array::{PrimitiveArray, Utf8Array}, error::Error, + offset::Offset, }; use crate::{ datatypes::{DataType, TimeUnit}, diff --git a/src/types/index.rs b/src/types/index.rs index b44b3957e79..264720fbe7a 100644 --- a/src/types/index.rs +++ b/src/types/index.rs @@ -21,6 +21,9 @@ pub trait Index: /// Convert itself from [`usize`]. fn from_usize(index: usize) -> Option; + /// Convert itself from [`usize`]. + fn from_as_usize(index: usize) -> Self; + /// An iterator from (inclusive) `start` to (exclusive) `end`. fn range(start: usize, end: usize) -> Option> { let start = Self::from_usize(start); @@ -44,6 +47,11 @@ macro_rules! index { fn from_usize(value: usize) -> Option { Self::try_from(value).ok() } + + #[inline] + fn from_as_usize(value: usize) -> Self { + value as $t + } } }; } diff --git a/src/util/bench_util.rs b/src/util/bench_util.rs index 3ba63ce37be..6ac87ed3f6a 100644 --- a/src/util/bench_util.rs +++ b/src/util/bench_util.rs @@ -3,7 +3,7 @@ use rand::distributions::{Alphanumeric, Distribution, Standard}; use rand::{rngs::StdRng, Rng, SeedableRng}; -use crate::{array::*, types::NativeType}; +use crate::{array::*, offset::Offset, types::NativeType}; /// Returns fixed seedable RNG pub fn seedable_rng() -> StdRng { diff --git a/tests/it/array/binary/mod.rs b/tests/it/array/binary/mod.rs index 03cee86be1b..7609058afa5 100644 --- a/tests/it/array/binary/mod.rs +++ b/tests/it/array/binary/mod.rs @@ -3,6 +3,7 @@ use arrow2::{ bitmap::Bitmap, buffer::Buffer, datatypes::DataType, + offset::OffsetsBuffer, }; mod mutable; @@ -98,7 +99,7 @@ fn with_validity() { #[test] #[should_panic] fn wrong_offsets() { - let offsets = Buffer::from(vec![0, 5, 4]); // invalid offsets + let offsets = vec![0, 5, 4].try_into().unwrap(); // invalid offsets let values = Buffer::from(b"abbbbb".to_vec()); BinaryArray::::from_data(DataType::Binary, offsets, values, None); } @@ -106,7 +107,7 @@ fn wrong_offsets() { #[test] #[should_panic] fn wrong_data_type() { - let offsets = Buffer::from(vec![0, 4]); + let offsets = vec![0, 4].try_into().unwrap(); let values = Buffer::from(b"abbb".to_vec()); BinaryArray::::from_data(DataType::Int8, offsets, values, None); } @@ -114,7 +115,7 @@ fn wrong_data_type() { #[test] #[should_panic] fn value_with_wrong_offsets_panics() { - let offsets = Buffer::from(vec![0, 10, 11, 4]); + let offsets = vec![0, 10, 11, 4].try_into().unwrap(); let values = Buffer::from(b"abbb".to_vec()); // the 10-11 is not checked let array = BinaryArray::::from_data(DataType::Binary, offsets, values, None); @@ -127,7 +128,7 @@ fn value_with_wrong_offsets_panics() { #[test] #[should_panic] fn index_out_of_bounds_panics() { - let offsets = Buffer::from(vec![0, 1, 2, 4]); + let offsets = vec![0, 1, 2, 4].try_into().unwrap(); let values = Buffer::from(b"abbb".to_vec()); let array = BinaryArray::::from_data(DataType::Utf8, offsets, values, None); @@ -137,7 +138,7 @@ fn index_out_of_bounds_panics() { #[test] #[should_panic] fn value_unchecked_with_wrong_offsets_panics() { - let offsets = Buffer::from(vec![0, 10, 11, 4]); + let offsets = vec![0, 10, 11, 4].try_into().unwrap(); let values = Buffer::from(b"abbb".to_vec()); // the 10-11 is not checked let array = BinaryArray::::from_data(DataType::Binary, offsets, values, None); @@ -157,7 +158,7 @@ fn debug() { #[test] fn into_mut_1() { - let offsets = Buffer::from(vec![0, 1]); + let offsets = vec![0, 1].try_into().unwrap(); let values = Buffer::from(b"a".to_vec()); let a = values.clone(); // cloned values assert_eq!(a, values); @@ -167,7 +168,7 @@ fn into_mut_1() { #[test] fn into_mut_2() { - let offsets = Buffer::from(vec![0, 1]); + let offsets: OffsetsBuffer = vec![0, 1].try_into().unwrap(); let values = Buffer::from(b"a".to_vec()); let a = offsets.clone(); // cloned offsets assert_eq!(a, offsets); @@ -177,7 +178,7 @@ fn into_mut_2() { #[test] fn into_mut_3() { - let offsets = Buffer::from(vec![0, 1]); + let offsets = vec![0, 1].try_into().unwrap(); let values = Buffer::from(b"a".to_vec()); let validity = Some([true].into()); let a = validity.clone(); // cloned validity @@ -188,7 +189,7 @@ fn into_mut_3() { #[test] fn into_mut_4() { - let offsets = Buffer::from(vec![0, 1]); + let offsets = vec![0, 1].try_into().unwrap(); let values = Buffer::from(b"a".to_vec()); let validity = Some([true].into()); let array = BinaryArray::::new(DataType::Binary, offsets, values, validity); diff --git a/tests/it/array/binary/mutable.rs b/tests/it/array/binary/mutable.rs index 8968d1cb15c..012e79d5e4a 100644 --- a/tests/it/array/binary/mutable.rs +++ b/tests/it/array/binary/mutable.rs @@ -10,12 +10,12 @@ fn new() { let a = MutableBinaryArray::::with_capacity(2); assert_eq!(a.len(), 0); - assert!(a.offsets().capacity() >= 3); + assert!(a.offsets().capacity() >= 2); assert_eq!(a.values().capacity(), 0); let a = MutableBinaryArray::::with_capacities(2, 60); assert_eq!(a.len(), 0); - assert!(a.offsets().capacity() >= 3); + assert!(a.offsets().capacity() >= 2); assert!(a.values().capacity() >= 60); } @@ -24,12 +24,12 @@ fn from_iter() { let iter = (0..3u8).map(|x| Some(vec![x; x as usize])); let a: MutableBinaryArray = iter.clone().collect(); assert_eq!(a.values().deref(), &[1u8, 2, 2]); - assert_eq!(a.offsets().deref(), &[0, 0, 1, 3]); + assert_eq!(a.offsets().as_slice(), &[0, 0, 1, 3]); assert_eq!(a.validity(), None); let a = unsafe { MutableBinaryArray::::from_trusted_len_iter_unchecked(iter) }; assert_eq!(a.values().deref(), &[1u8, 2, 2]); - assert_eq!(a.offsets().deref(), &[0, 0, 1, 3]); + assert_eq!(a.offsets().as_slice(), &[0, 0, 1, 3]); assert_eq!(a.validity(), None); } @@ -38,12 +38,12 @@ fn from_trusted_len_iter() { let data = vec![vec![0; 0], vec![1; 1], vec![2; 2]]; let a: MutableBinaryArray = data.iter().cloned().map(Some).collect(); assert_eq!(a.values().deref(), &[1u8, 2, 2]); - assert_eq!(a.offsets().deref(), &[0, 0, 1, 3]); + assert_eq!(a.offsets().as_slice(), &[0, 0, 1, 3]); assert_eq!(a.validity(), None); let a = MutableBinaryArray::::from_trusted_len_iter(data.iter().cloned().map(Some)); assert_eq!(a.values().deref(), &[1u8, 2, 2]); - assert_eq!(a.offsets().deref(), &[0, 0, 1, 3]); + assert_eq!(a.offsets().as_slice(), &[0, 0, 1, 3]); assert_eq!(a.validity(), None); let a = MutableBinaryArray::::try_from_trusted_len_iter::( @@ -51,12 +51,12 @@ fn from_trusted_len_iter() { ) .unwrap(); assert_eq!(a.values().deref(), &[1u8, 2, 2]); - assert_eq!(a.offsets().deref(), &[0, 0, 1, 3]); + assert_eq!(a.offsets().as_slice(), &[0, 0, 1, 3]); assert_eq!(a.validity(), None); let a = MutableBinaryArray::::from_trusted_len_values_iter(data.iter().cloned()); assert_eq!(a.values().deref(), &[1u8, 2, 2]); - assert_eq!(a.offsets().deref(), &[0, 0, 1, 3]); + assert_eq!(a.offsets().as_slice(), &[0, 0, 1, 3]); assert_eq!(a.validity(), None); } diff --git a/tests/it/array/binary/mutable_values.rs b/tests/it/array/binary/mutable_values.rs index af02b1d54b3..0bf532bc21c 100644 --- a/tests/it/array/binary/mutable_values.rs +++ b/tests/it/array/binary/mutable_values.rs @@ -7,35 +7,28 @@ fn capacity() { let mut b = MutableBinaryValuesArray::::with_capacity(100); assert_eq!(b.values().capacity(), 0); - assert!(b.offsets().capacity() >= 101); + assert!(b.offsets().capacity() >= 100); b.shrink_to_fit(); - assert!(b.offsets().capacity() < 101); -} - -#[test] -fn offsets_must_be_monotonic_increasing() { - let offsets = vec![0, 5, 4]; - let values = b"abbbbb".to_vec(); - assert!(MutableBinaryValuesArray::::try_new(DataType::Binary, offsets, values).is_err()); + assert!(b.offsets().capacity() < 100); } #[test] fn offsets_must_be_in_bounds() { - let offsets = vec![0, 10]; + let offsets = vec![0, 10].try_into().unwrap(); let values = b"abbbbb".to_vec(); assert!(MutableBinaryValuesArray::::try_new(DataType::Binary, offsets, values).is_err()); } #[test] fn data_type_must_be_consistent() { - let offsets = vec![0, 4]; + let offsets = vec![0, 4].try_into().unwrap(); let values = b"abbb".to_vec(); assert!(MutableBinaryValuesArray::::try_new(DataType::Int32, offsets, values).is_err()); } #[test] fn as_box() { - let offsets = vec![0, 2]; + let offsets = vec![0, 2].try_into().unwrap(); let values = b"ab".to_vec(); let mut b = MutableBinaryValuesArray::::try_new(DataType::Binary, offsets, values).unwrap(); @@ -44,7 +37,7 @@ fn as_box() { #[test] fn as_arc() { - let offsets = vec![0, 2]; + let offsets = vec![0, 2].try_into().unwrap(); let values = b"ab".to_vec(); let mut b = MutableBinaryValuesArray::::try_new(DataType::Binary, offsets, values).unwrap(); @@ -53,13 +46,13 @@ fn as_arc() { #[test] fn extend_trusted_len() { - let offsets = vec![0, 2]; + let offsets = vec![0, 2].try_into().unwrap(); let values = b"ab".to_vec(); let mut b = MutableBinaryValuesArray::::try_new(DataType::Binary, offsets, values).unwrap(); b.extend_trusted_len(vec!["a", "b"].into_iter()); - let offsets = vec![0, 2, 3, 4]; + let offsets = vec![0, 2, 3, 4].try_into().unwrap(); let values = b"abab".to_vec(); assert_eq!( b.as_box(), @@ -73,7 +66,7 @@ fn extend_trusted_len() { fn from_trusted_len() { let mut b = MutableBinaryValuesArray::::from_trusted_len_iter(vec!["a", "b"].into_iter()); - let offsets = vec![0, 1, 2]; + let offsets = vec![0, 1, 2].try_into().unwrap(); let values = b"ab".to_vec(); assert_eq!( b.as_box(), @@ -85,7 +78,7 @@ fn from_trusted_len() { #[test] fn extend_from_iter() { - let offsets = vec![0, 2]; + let offsets = vec![0, 2].try_into().unwrap(); let values = b"ab".to_vec(); let mut b = MutableBinaryValuesArray::::try_new(DataType::Binary, offsets, values).unwrap(); @@ -94,7 +87,7 @@ fn extend_from_iter() { let a = b.clone(); b.extend_trusted_len(a.iter()); - let offsets = vec![0, 2, 3, 4, 6, 7, 8]; + let offsets = vec![0, 2, 3, 4, 6, 7, 8].try_into().unwrap(); let values = b"abababab".to_vec(); assert_eq!( b.as_box(), diff --git a/tests/it/array/binary/to_mutable.rs b/tests/it/array/binary/to_mutable.rs index b553d85d050..1773c83a362 100644 --- a/tests/it/array/binary/to_mutable.rs +++ b/tests/it/array/binary/to_mutable.rs @@ -12,7 +12,7 @@ fn shared_validity() { let validity = Bitmap::from([true]); let array = BinaryArray::::new( DataType::Binary, - vec![0, 1].into(), + vec![0, 1].try_into().unwrap(), b"a".to_vec().into(), Some(validity.clone()), ); @@ -25,7 +25,7 @@ fn shared_values() { let values: Buffer = b"a".to_vec().into(); let array = BinaryArray::::new( DataType::Binary, - vec![0, 1].into(), + vec![0, 1].try_into().unwrap(), values.clone(), Some(Bitmap::from([true])), ); @@ -39,7 +39,7 @@ fn shared_offsets_values() { let values: Buffer = b"a".to_vec().into(); let array = BinaryArray::::new( DataType::Binary, - offsets.clone(), + offsets.clone().try_into().unwrap(), values.clone(), Some(Bitmap::from([true])), ); @@ -52,7 +52,7 @@ fn shared_offsets() { let offsets: Buffer = vec![0, 1].into(); let array = BinaryArray::::new( DataType::Binary, - offsets.clone(), + offsets.clone().try_into().unwrap(), b"a".to_vec().into(), Some(Bitmap::from([true])), ); diff --git a/tests/it/array/equal/list.rs b/tests/it/array/equal/list.rs index 17427946383..67a458017b4 100644 --- a/tests/it/array/equal/list.rs +++ b/tests/it/array/equal/list.rs @@ -1,6 +1,5 @@ use arrow2::array::{Int32Array, ListArray, MutableListArray, MutablePrimitiveArray, TryExtend}; use arrow2::bitmap::Bitmap; -use arrow2::buffer::Buffer; use arrow2::datatypes::DataType; use super::test_equal; @@ -67,7 +66,7 @@ fn test_list_offsets() { #[test] fn test_bla() { - let offsets = Buffer::from(vec![0, 3, 3, 6]); + let offsets = vec![0, 3, 3, 6].try_into().unwrap(); let data_type = ListArray::::default_datatype(DataType::Int32); let values = Box::new(Int32Array::from([ Some(1), @@ -81,7 +80,7 @@ fn test_bla() { let lhs = ListArray::::from_data(data_type, offsets, values, Some(validity)); let lhs = lhs.slice(1, 2); - let offsets = Buffer::from(vec![0, 0, 3]); + let offsets = vec![0, 0, 3].try_into().unwrap(); let data_type = ListArray::::default_datatype(DataType::Int32); let values = Box::new(Int32Array::from([Some(4), None, Some(6)])); let validity = Bitmap::from([false, true]); diff --git a/tests/it/array/equal/utf8.rs b/tests/it/array/equal/utf8.rs index 2be9ebef83c..28e13d4a2ec 100644 --- a/tests/it/array/equal/utf8.rs +++ b/tests/it/array/equal/utf8.rs @@ -1,4 +1,5 @@ use arrow2::array::*; +use arrow2::offset::Offset; use super::{binary_cases, test_equal}; diff --git a/tests/it/array/list/mod.rs b/tests/it/array/list/mod.rs index ef07860edb2..cf8aec30f9f 100644 --- a/tests/it/array/list/mod.rs +++ b/tests/it/array/list/mod.rs @@ -12,7 +12,7 @@ fn debug() { let data_type = ListArray::::default_datatype(DataType::Int32); let array = ListArray::::from_data( data_type, - Buffer::from(vec![0, 2, 2, 3, 5]), + vec![0, 2, 2, 3, 5].try_into().unwrap(), Box::new(values), None, ); @@ -29,7 +29,7 @@ fn test_nested_panic() { let data_type = ListArray::::default_datatype(DataType::Int32); let array = ListArray::::from_data( data_type.clone(), - Buffer::from(vec![0, 2, 2, 3, 5]), + vec![0, 2, 2, 3, 5].try_into().unwrap(), Box::new(values), None, ); @@ -38,7 +38,7 @@ fn test_nested_panic() { // the nested structure of the child data let _ = ListArray::::from_data( data_type, - Buffer::from(vec![0, 2, 4]), + vec![0, 2, 4].try_into().unwrap(), Box::new(array), None, ); @@ -52,7 +52,7 @@ fn test_nested_display() { let data_type = ListArray::::default_datatype(DataType::Int32); let array = ListArray::::from_data( data_type, - Buffer::from(vec![0, 2, 4, 7, 7, 8, 10]), + vec![0, 2, 4, 7, 7, 8, 10].try_into().unwrap(), Box::new(values), None, ); @@ -60,7 +60,7 @@ fn test_nested_display() { let data_type = ListArray::::default_datatype(array.data_type().clone()); let nested = ListArray::::from_data( data_type, - Buffer::from(vec![0, 2, 5, 6]), + vec![0, 2, 5, 6].try_into().unwrap(), Box::new(array), None, ); diff --git a/tests/it/array/list/mutable.rs b/tests/it/array/list/mutable.rs index 63cf3f8c1e5..4d7483fe31a 100644 --- a/tests/it/array/list/mutable.rs +++ b/tests/it/array/list/mutable.rs @@ -21,7 +21,7 @@ fn basics() { let data_type = ListArray::::default_datatype(DataType::Int32); let expected = ListArray::::from_data( data_type, - Buffer::from(vec![0, 3, 3, 6]), + vec![0, 3, 3, 6].try_into().unwrap(), Box::new(values), Some(Bitmap::from([true, false, true])), ); @@ -32,7 +32,7 @@ fn basics() { fn with_capacity() { let array = MutableListArray::>::with_capacity(10); assert!(array.offsets().capacity() >= 10); - assert_eq!(array.offsets().len(), 1); + assert_eq!(array.offsets().len(), 0); assert_eq!(array.values().values().capacity(), 0); assert_eq!(array.validity(), None); } @@ -45,7 +45,7 @@ fn push() { .unwrap(); assert_eq!(array.len(), 1); assert_eq!(array.values().values().as_ref(), [1, 2, 3]); - assert_eq!(array.offsets().as_ref(), [0, 3]); + assert_eq!(array.offsets().as_slice(), [0, 3]); assert_eq!(array.validity(), None); } diff --git a/tests/it/array/map/mod.rs b/tests/it/array/map/mod.rs index 38fde84367e..f58936dc3f6 100644 --- a/tests/it/array/map/mod.rs +++ b/tests/it/array/map/mod.rs @@ -20,7 +20,12 @@ fn basics() { None, ); - let array = MapArray::new(data_type, vec![0, 1, 2].into(), Box::new(field), None); + let array = MapArray::new( + data_type, + vec![0, 1, 2].try_into().unwrap(), + Box::new(field), + None, + ); assert_eq!( array.value(0), diff --git a/tests/it/array/utf8/mod.rs b/tests/it/array/utf8/mod.rs index 8c9b41e8b92..e60a9e28b83 100644 --- a/tests/it/array/utf8/mod.rs +++ b/tests/it/array/utf8/mod.rs @@ -1,4 +1,7 @@ -use arrow2::{array::*, bitmap::Bitmap, buffer::Buffer, datatypes::DataType, error::Result}; +use arrow2::{ + array::*, bitmap::Bitmap, buffer::Buffer, datatypes::DataType, error::Result, + offset::OffsetsBuffer, +}; mod mutable; mod mutable_values; @@ -60,8 +63,8 @@ fn from() { fn from_slice() { let b = Utf8Array::::from_slice(["a", "b", "cc"]); - let offsets = Buffer::from(vec![0, 1, 2, 4]); - let values = Buffer::from(b"abcc".to_vec()); + let offsets = vec![0, 1, 2, 4].try_into().unwrap(); + let values = b"abcc".to_vec().into(); assert_eq!( b, Utf8Array::::from_data(DataType::Utf8, offsets, values, None) @@ -72,8 +75,8 @@ fn from_slice() { fn from_iter_values() { let b = Utf8Array::::from_iter_values(["a", "b", "cc"].iter()); - let offsets = Buffer::from(vec![0, 1, 2, 4]); - let values = Buffer::from(b"abcc".to_vec()); + let offsets = vec![0, 1, 2, 4].try_into().unwrap(); + let values = b"abcc".to_vec().into(); assert_eq!( b, Utf8Array::::from_data(DataType::Utf8, offsets, values, None) @@ -85,8 +88,8 @@ fn from_trusted_len_iter() { let b = Utf8Array::::from_trusted_len_iter(vec![Some("a"), Some("b"), Some("cc")].into_iter()); - let offsets = Buffer::from(vec![0, 1, 2, 4]); - let values = Buffer::from(b"abcc".to_vec()); + let offsets = vec![0, 1, 2, 4].try_into().unwrap(); + let values = b"abcc".to_vec().into(); assert_eq!( b, Utf8Array::::from_data(DataType::Utf8, offsets, values, None) @@ -102,8 +105,8 @@ fn try_from_trusted_len_iter() { ) .unwrap(); - let offsets = Buffer::from(vec![0, 1, 2, 4]); - let values = Buffer::from(b"abcc".to_vec()); + let offsets = vec![0, 1, 2, 4].try_into().unwrap(); + let values = b"abcc".to_vec().into(); assert_eq!( b, Utf8Array::::from_data(DataType::Utf8, offsets, values, None) @@ -112,59 +115,38 @@ fn try_from_trusted_len_iter() { #[test] fn not_utf8() { - let offsets = Buffer::from(vec![0, 4]); - let values = Buffer::from(vec![0, 159, 146, 150]); // invalid utf8 + let offsets = vec![0, 4].try_into().unwrap(); + let values = vec![0, 159, 146, 150].into(); // invalid utf8 assert!(Utf8Array::::try_new(DataType::Utf8, offsets, values, None).is_err()); } #[test] fn not_utf8_individually() { - let offsets = Buffer::from(vec![0, 1, 2]); - let values = Buffer::from(vec![207, 128]); // each is invalid utf8, but together is valid - assert!(Utf8Array::::try_new(DataType::Utf8, offsets, values, None).is_err()); -} - -#[test] -fn wrong_offsets() { - let offsets = Buffer::from(vec![0, 5, 4]); // invalid offsets - let values = Buffer::from(b"abbbbb".to_vec()); + let offsets = vec![0, 1, 2].try_into().unwrap(); + let values = vec![207, 128].into(); // each is invalid utf8, but together is valid assert!(Utf8Array::::try_new(DataType::Utf8, offsets, values, None).is_err()); } #[test] fn wrong_data_type() { - let offsets = Buffer::from(vec![0, 4]); - let values = Buffer::from(b"abbb".to_vec()); + let offsets = vec![0, 4].try_into().unwrap(); + let values = b"abbb".to_vec().into(); assert!(Utf8Array::::try_new(DataType::Int32, offsets, values, None).is_err()); } #[test] fn out_of_bounds_offsets_panics() { // the 10 is out of bounds - let offsets = Buffer::from(vec![0, 10, 11]); - let values = Buffer::from(b"abbb".to_vec()); - assert!(Utf8Array::::try_new(DataType::Utf8, offsets, values, None).is_err()); -} - -#[test] -fn decreasing_offset_and_ascii_panics() { - let offsets = Buffer::from(vec![0, 2, 1]); - let values = Buffer::from(b"abbb".to_vec()); - assert!(Utf8Array::::try_new(DataType::Utf8, offsets, values, None).is_err()); -} - -#[test] -fn decreasing_offset_and_utf8_panics() { - let offsets = Buffer::from(vec![0, 2, 4, 2]); // not increasing - let values = Buffer::from(vec![207, 128, 207, 128, 207, 128]); // valid utf8 + let offsets = vec![0, 10, 11].try_into().unwrap(); + let values = b"abbb".to_vec().into(); assert!(Utf8Array::::try_new(DataType::Utf8, offsets, values, None).is_err()); } #[test] #[should_panic] fn index_out_of_bounds_panics() { - let offsets = Buffer::from(vec![0, 1, 2, 4]); - let values = Buffer::from(b"abbb".to_vec()); + let offsets = vec![0, 1, 2, 4].try_into().unwrap(); + let values = b"abbb".to_vec().into(); let array = Utf8Array::::from_data(DataType::Utf8, offsets, values, None); array.value(3); @@ -179,7 +161,7 @@ fn debug() { #[test] fn into_mut_1() { - let offsets = Buffer::from(vec![0, 1]); + let offsets = vec![0, 1].try_into().unwrap(); let values = Buffer::from(b"a".to_vec()); let a = values.clone(); // cloned values assert_eq!(a, values); @@ -189,8 +171,8 @@ fn into_mut_1() { #[test] fn into_mut_2() { - let offsets = Buffer::from(vec![0, 1]); - let values = Buffer::from(b"a".to_vec()); + let offsets: OffsetsBuffer = vec![0, 1].try_into().unwrap(); + let values = b"a".to_vec().into(); let a = offsets.clone(); // cloned offsets assert_eq!(a, offsets); let array = Utf8Array::::from_data(DataType::Utf8, offsets, values, None); @@ -199,8 +181,8 @@ fn into_mut_2() { #[test] fn into_mut_3() { - let offsets = Buffer::from(vec![0, 1]); - let values = Buffer::from(b"a".to_vec()); + let offsets = vec![0, 1].try_into().unwrap(); + let values = b"a".to_vec().into(); let validity = Some([true].into()); let a = validity.clone(); // cloned validity assert_eq!(a, validity); @@ -210,8 +192,8 @@ fn into_mut_3() { #[test] fn into_mut_4() { - let offsets = Buffer::from(vec![0, 1]); - let values = Buffer::from(b"a".to_vec()); + let offsets = vec![0, 1].try_into().unwrap(); + let values = b"a".to_vec().into(); let validity = Some([true].into()); let array = Utf8Array::::new(DataType::Utf8, offsets, values, validity); assert!(array.into_mut().is_right()); diff --git a/tests/it/array/utf8/mutable.rs b/tests/it/array/utf8/mutable.rs index 57c188fb808..faa4868a58f 100644 --- a/tests/it/array/utf8/mutable.rs +++ b/tests/it/array/utf8/mutable.rs @@ -7,7 +7,7 @@ fn capacities() { let b = MutableUtf8Array::::with_capacities(1, 10); assert!(b.values().capacity() >= 10); - assert!(b.offsets().capacity() >= 2); + assert!(b.offsets().capacity() >= 1); } #[test] @@ -69,24 +69,15 @@ fn pop_all_some() { #[test] #[should_panic] fn not_utf8() { - let offsets = vec![0, 4]; + let offsets = vec![0, 4].try_into().unwrap(); let values = vec![0, 159, 146, 150]; // invalid utf8 MutableUtf8Array::::from_data(DataType::Utf8, offsets, values, None); } -/// Safety guarantee -#[test] -#[should_panic] -fn wrong_offsets() { - let offsets = vec![0, 5, 4]; // invalid offsets - let values = vec![0, 1, 2, 3, 4, 5]; - MutableUtf8Array::::from_data(DataType::Utf8, offsets, values, None); -} - #[test] #[should_panic] fn wrong_data_type() { - let offsets = vec![0, 4]; // invalid offsets + let offsets = vec![0, 4].try_into().unwrap(); let values = vec![1, 2, 3, 4]; MutableUtf8Array::::from_data(DataType::Int8, offsets, values, None); } diff --git a/tests/it/array/utf8/mutable_values.rs b/tests/it/array/utf8/mutable_values.rs index 1ad783b607d..6bf04726a36 100644 --- a/tests/it/array/utf8/mutable_values.rs +++ b/tests/it/array/utf8/mutable_values.rs @@ -7,35 +7,28 @@ fn capacity() { let mut b = MutableUtf8ValuesArray::::with_capacity(100); assert_eq!(b.values().capacity(), 0); - assert!(b.offsets().capacity() >= 101); + assert!(b.offsets().capacity() >= 100); b.shrink_to_fit(); - assert!(b.offsets().capacity() < 101); -} - -#[test] -fn offsets_must_be_monotonic_increasing() { - let offsets = vec![0, 5, 4]; - let values = b"abbbbb".to_vec(); - assert!(MutableUtf8ValuesArray::::try_new(DataType::Utf8, offsets, values).is_err()); + assert!(b.offsets().capacity() < 100); } #[test] fn offsets_must_be_in_bounds() { - let offsets = vec![0, 10]; + let offsets = vec![0, 10].try_into().unwrap(); let values = b"abbbbb".to_vec(); assert!(MutableUtf8ValuesArray::::try_new(DataType::Utf8, offsets, values).is_err()); } #[test] fn data_type_must_be_consistent() { - let offsets = vec![0, 4]; + let offsets = vec![0, 4].try_into().unwrap(); let values = b"abbb".to_vec(); assert!(MutableUtf8ValuesArray::::try_new(DataType::Int32, offsets, values).is_err()); } #[test] fn must_be_utf8() { - let offsets = vec![0, 4]; + let offsets = vec![0, 4].try_into().unwrap(); let values = vec![0, 159, 146, 150]; assert!(std::str::from_utf8(&values).is_err()); assert!(MutableUtf8ValuesArray::::try_new(DataType::Utf8, offsets, values).is_err()); @@ -43,7 +36,7 @@ fn must_be_utf8() { #[test] fn as_box() { - let offsets = vec![0, 2]; + let offsets = vec![0, 2].try_into().unwrap(); let values = b"ab".to_vec(); let mut b = MutableUtf8ValuesArray::::try_new(DataType::Utf8, offsets, values).unwrap(); let _ = b.as_box(); @@ -51,7 +44,7 @@ fn as_box() { #[test] fn as_arc() { - let offsets = vec![0, 2]; + let offsets = vec![0, 2].try_into().unwrap(); let values = b"ab".to_vec(); let mut b = MutableUtf8ValuesArray::::try_new(DataType::Utf8, offsets, values).unwrap(); let _ = b.as_arc(); @@ -59,12 +52,12 @@ fn as_arc() { #[test] fn extend_trusted_len() { - let offsets = vec![0, 2]; + let offsets = vec![0, 2].try_into().unwrap(); let values = b"ab".to_vec(); let mut b = MutableUtf8ValuesArray::::try_new(DataType::Utf8, offsets, values).unwrap(); b.extend_trusted_len(vec!["a", "b"].into_iter()); - let offsets = vec![0, 2, 3, 4]; + let offsets = vec![0, 2, 3, 4].try_into().unwrap(); let values = b"abab".to_vec(); assert_eq!( b.as_box(), @@ -78,7 +71,7 @@ fn extend_trusted_len() { fn from_trusted_len() { let mut b = MutableUtf8ValuesArray::::from_trusted_len_iter(vec!["a", "b"].into_iter()); - let offsets = vec![0, 1, 2]; + let offsets = vec![0, 1, 2].try_into().unwrap(); let values = b"ab".to_vec(); assert_eq!( b.as_box(), @@ -90,7 +83,7 @@ fn from_trusted_len() { #[test] fn extend_from_iter() { - let offsets = vec![0, 2]; + let offsets = vec![0, 2].try_into().unwrap(); let values = b"ab".to_vec(); let mut b = MutableUtf8ValuesArray::::try_new(DataType::Utf8, offsets, values).unwrap(); b.extend_trusted_len(vec!["a", "b"].into_iter()); @@ -98,7 +91,7 @@ fn extend_from_iter() { let a = b.clone(); b.extend_trusted_len(a.iter()); - let offsets = vec![0, 2, 3, 4, 6, 7, 8]; + let offsets = vec![0, 2, 3, 4, 6, 7, 8].try_into().unwrap(); let values = b"abababab".to_vec(); assert_eq!( b.as_box(), diff --git a/tests/it/array/utf8/to_mutable.rs b/tests/it/array/utf8/to_mutable.rs index c4c822b62d8..97ee0fb2055 100644 --- a/tests/it/array/utf8/to_mutable.rs +++ b/tests/it/array/utf8/to_mutable.rs @@ -1,4 +1,6 @@ -use arrow2::{array::Utf8Array, bitmap::Bitmap, buffer::Buffer, datatypes::DataType}; +use arrow2::{ + array::Utf8Array, bitmap::Bitmap, buffer::Buffer, datatypes::DataType, offset::OffsetsBuffer, +}; #[test] fn not_shared() { @@ -12,7 +14,7 @@ fn shared_validity() { let validity = Bitmap::from([true]); let array = Utf8Array::::new( DataType::Utf8, - vec![0, 1].into(), + vec![0, 1].try_into().unwrap(), b"a".to_vec().into(), Some(validity.clone()), ); @@ -25,7 +27,7 @@ fn shared_values() { let values: Buffer = b"a".to_vec().into(); let array = Utf8Array::::new( DataType::Utf8, - vec![0, 1].into(), + vec![0, 1].try_into().unwrap(), values.clone(), Some(Bitmap::from([true])), ); @@ -35,7 +37,7 @@ fn shared_values() { #[test] #[allow(clippy::redundant_clone)] fn shared_offsets_values() { - let offsets: Buffer = vec![0, 1].into(); + let offsets: OffsetsBuffer = vec![0, 1].try_into().unwrap(); let values: Buffer = b"a".to_vec().into(); let array = Utf8Array::::new( DataType::Utf8, @@ -49,7 +51,7 @@ fn shared_offsets_values() { #[test] #[allow(clippy::redundant_clone)] fn shared_offsets() { - let offsets: Buffer = vec![0, 1].into(); + let offsets: OffsetsBuffer = vec![0, 1].try_into().unwrap(); let array = Utf8Array::::new( DataType::Utf8, offsets.clone(), diff --git a/tests/it/compute/length.rs b/tests/it/compute/length.rs index 9bb37576956..0a6b5e51e8e 100644 --- a/tests/it/compute/length.rs +++ b/tests/it/compute/length.rs @@ -1,6 +1,7 @@ use arrow2::array::*; use arrow2::compute::length::*; use arrow2::datatypes::*; +use arrow2::offset::Offset; fn length_test_string() { vec![ diff --git a/tests/it/compute/regex_match.rs b/tests/it/compute/regex_match.rs index 141a87ad3e6..66f28d03b9b 100644 --- a/tests/it/compute/regex_match.rs +++ b/tests/it/compute/regex_match.rs @@ -1,6 +1,7 @@ -use arrow2::array::{BooleanArray, Offset, Utf8Array}; +use arrow2::array::{BooleanArray, Utf8Array}; use arrow2::compute::regex_match::*; use arrow2::error::Result; +use arrow2::offset::Offset; fn test_generic, &Utf8Array) -> Result>( lhs: Vec<&str>, diff --git a/tests/it/compute/substring.rs b/tests/it/compute/substring.rs index 365615cd51f..5b76a0ac348 100644 --- a/tests/it/compute/substring.rs +++ b/tests/it/compute/substring.rs @@ -1,4 +1,4 @@ -use arrow2::{array::*, compute::substring::*, error::Result}; +use arrow2::{array::*, compute::substring::*, error::Result, offset::Offset}; fn with_nulls_utf8() -> Result<()> { let cases = vec![ diff --git a/tests/it/compute/take.rs b/tests/it/compute/take.rs index 0e1719fb4eb..75b55d76f53 100644 --- a/tests/it/compute/take.rs +++ b/tests/it/compute/take.rs @@ -176,7 +176,7 @@ fn list_with_no_none() { let data_type = ListArray::::default_datatype(DataType::Int32); let array = ListArray::::from_data( data_type, - Buffer::from(vec![0, 2, 2, 6, 9, 10]), + vec![0, 2, 2, 6, 9, 10].try_into().unwrap(), Box::new(values), None, ); @@ -189,7 +189,7 @@ fn list_with_no_none() { let expected_type = ListArray::::default_datatype(DataType::Int32); let expected = ListArray::::from_data( expected_type, - Buffer::from(vec![0, 1, 1, 4]), + vec![0, 1, 1, 4].try_into().unwrap(), Box::new(expected_values), None, ); @@ -208,7 +208,7 @@ fn list_with_none() { let data_type = ListArray::::default_datatype(DataType::Int32); let array = ListArray::::from_data( data_type, - Buffer::from(vec![0, 2, 2, 6, 9, 10]), + vec![0, 2, 2, 6, 9, 10].try_into().unwrap(), Box::new(values), Some(validity), ); @@ -267,7 +267,7 @@ fn test_nested() { let data_type = ListArray::::default_datatype(DataType::Int32); let array = ListArray::::from_data( data_type, - Buffer::from(vec![0, 2, 4, 7, 7, 8, 10]), + vec![0, 2, 4, 7, 7, 8, 10].try_into().unwrap(), Box::new(values), None, ); @@ -275,7 +275,7 @@ fn test_nested() { let data_type = ListArray::::default_datatype(array.data_type().clone()); let nested = ListArray::::from_data( data_type, - Buffer::from(vec![0, 2, 5, 6]), + vec![0, 2, 5, 6].try_into().unwrap(), Box::new(array), None, ); @@ -290,7 +290,7 @@ fn test_nested() { let expected_data_type = ListArray::::default_datatype(DataType::Int32); let expected_array = ListArray::::from_data( expected_data_type, - Buffer::from(vec![0, 2, 4, 7, 7, 8]), + vec![0, 2, 4, 7, 7, 8].try_into().unwrap(), Box::new(expected_values), None, ); @@ -298,7 +298,7 @@ fn test_nested() { let expected_data_type = ListArray::::default_datatype(expected_array.data_type().clone()); let expected = ListArray::::from_data( expected_data_type, - Buffer::from(vec![0, 2, 5]), + vec![0, 2, 5].try_into().unwrap(), Box::new(expected_array), None, ); diff --git a/tests/it/compute/utf8.rs b/tests/it/compute/utf8.rs index 864dc0eca27..e9d8613ddeb 100644 --- a/tests/it/compute/utf8.rs +++ b/tests/it/compute/utf8.rs @@ -1,4 +1,4 @@ -use arrow2::{array::*, compute::utf8::*, error::Result}; +use arrow2::{array::*, compute::utf8::*, error::Result, offset::Offset}; fn with_nulls_utf8_lower() -> Result<()> { let cases = vec![ diff --git a/tests/it/ffi/data.rs b/tests/it/ffi/data.rs index bc87eda68c8..165c7107926 100644 --- a/tests/it/ffi/data.rs +++ b/tests/it/ffi/data.rs @@ -115,7 +115,7 @@ fn utf8_sliced() -> Result<()> { let bitmap = Bitmap::from([true, false, false, true]).slice(1, 3); let data = Utf8Array::::try_new( DataType::Utf8, - vec![0, 1, 1, 2].into(), + vec![0, 1, 1, 2].try_into().unwrap(), b"ab".to_vec().into(), Some(bitmap), )?; @@ -146,7 +146,7 @@ fn binary_sliced() -> Result<()> { let bitmap = Bitmap::from([true, false, false, true]).slice(1, 3); let data = BinaryArray::::try_new( DataType::Binary, - vec![0, 1, 1, 2].into(), + vec![0, 1, 1, 2].try_into().unwrap(), b"ab".to_vec().into(), Some(bitmap), )?; @@ -213,7 +213,7 @@ fn list_sliced() -> Result<()> { let array = ListArray::::try_new( DataType::List(Box::new(Field::new("a", DataType::Int32, true))), - vec![0, 1, 1, 2].into(), + vec![0, 1, 1, 2].try_into().unwrap(), Box::new(PrimitiveArray::::from_vec(vec![1, 2])), Some(bitmap), )?; diff --git a/tests/it/io/avro/write.rs b/tests/it/io/avro/write.rs index c47b1b9782e..7cff7740fbb 100644 --- a/tests/it/io/avro/write.rs +++ b/tests/it/io/avro/write.rs @@ -86,7 +86,7 @@ pub(super) fn data() -> Chunk> { ])), Box::new(ListArray::::new( list_dt, - vec![0, 2, 5].into(), + vec![0, 2, 5].try_into().unwrap(), Box::new(PrimitiveArray::::from([ None, Some(1), @@ -98,7 +98,7 @@ pub(super) fn data() -> Chunk> { )), Box::new(ListArray::::new( list_dt1, - vec![0, 2, 2].into(), + vec![0, 2, 2].try_into().unwrap(), Box::new(PrimitiveArray::::from([None, Some(1)])), Some([true, false].into()), )), diff --git a/tests/it/io/ipc/read/file.rs b/tests/it/io/ipc/read/file.rs index 9d21d051f2a..515a6ede92f 100644 --- a/tests/it/io/ipc/read/file.rs +++ b/tests/it/io/ipc/read/file.rs @@ -106,6 +106,12 @@ fn read_generated_100_decimal() -> Result<()> { test_file("1.0.0-bigendian", "generated_decimal") } +#[test] +fn read_generated_duplicate_fieldnames() -> Result<()> { + test_file("1.0.0-littleendian", "generated_duplicate_fieldnames")?; + test_file("1.0.0-bigendian", "generated_duplicate_fieldnames") +} + #[test] fn read_generated_100_interval() -> Result<()> { test_file("1.0.0-littleendian", "generated_interval")?; diff --git a/tests/it/io/json/write.rs b/tests/it/io/json/write.rs index 8555f0081c6..3b0695cf631 100644 --- a/tests/it/io/json/write.rs +++ b/tests/it/io/json/write.rs @@ -321,7 +321,7 @@ fn list_of_struct() -> Result<()> { // [{"c11": 5, "c12": {"c121": "g"}}] let c1 = ListArray::::from_data( c1_datatype, - Buffer::from(vec![0, 2, 2, 3]), + Buffer::from(vec![0, 2, 2, 3]).try_into().unwrap(), Box::new(s), Some(Bitmap::from_u8_slice([0b00000101], 3)), ); diff --git a/tests/it/io/ndjson/mod.rs b/tests/it/io/ndjson/mod.rs index 632d3015383..b2e2b2fc895 100644 --- a/tests/it/io/ndjson/mod.rs +++ b/tests/it/io/ndjson/mod.rs @@ -2,7 +2,6 @@ mod read; use arrow2::array::*; use arrow2::bitmap::Bitmap; -use arrow2::buffer::Buffer; use arrow2::datatypes::*; use arrow2::error::Result; use arrow2::io::ndjson::write as ndjson_write; @@ -286,7 +285,7 @@ fn case_nested_list() -> (String, Box) { ); let expected = ListArray::from_data( a_list_data_type, - Buffer::from(vec![0i32, 2, 3, 6, 6, 6]), + vec![0i32, 2, 3, 6, 6, 6].try_into().unwrap(), a_struct.boxed(), Some(Bitmap::from_u8_slice([0b00010111], 5)), ); diff --git a/tests/it/io/parquet/mod.rs b/tests/it/io/parquet/mod.rs index 9097d32fb30..4790479cc2f 100644 --- a/tests/it/io/parquet/mod.rs +++ b/tests/it/io/parquet/mod.rs @@ -3,13 +3,13 @@ use std::io::{Cursor, Read, Seek}; use arrow2::{ array::*, bitmap::Bitmap, - buffer::Buffer, chunk::Chunk, datatypes::*, error::Result, io::parquet::read as p_read, io::parquet::read::statistics::*, io::parquet::write::*, + offset::Offset, types::{days_ms, NativeType}, }; @@ -74,7 +74,7 @@ pub fn pyarrow_nested_edge(column: &str) -> Box { // [["a", "b", None, "c"]] let a = ListArray::::new( DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), - vec![0, 4].into(), + vec![0, 4].try_into().unwrap(), Utf8Array::::from([Some("a"), Some("b"), None, Some("c")]).boxed(), None, ); @@ -90,7 +90,7 @@ pub fn pyarrow_nested_edge(column: &str) -> Box { } pub fn pyarrow_nested_nullable(column: &str) -> Box { - let offsets = Buffer::from(vec![0, 2, 2, 5, 8, 8, 11, 11, 12]); + let offsets = vec![0, 2, 2, 5, 8, 8, 11, 11, 12].try_into().unwrap(); let values = match column { "list_int64" => { @@ -581,7 +581,7 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { array.data_type().clone(), nullable, ))), - vec![0, array.len() as i32].into(), + vec![0, array.len() as i32].try_into().unwrap(), array, None, ) @@ -684,7 +684,7 @@ pub fn pyarrow_nested_edge_statistics(column: &str) -> Statistics { array.data_type().clone(), true, ))), - vec![0, array.len() as i32].into(), + vec![0, array.len() as i32].try_into().unwrap(), array, None, ) @@ -989,7 +989,7 @@ pub fn pyarrow_map(column: &str) -> Box { ]); MapArray::try_new( DataType::Map(Box::new(Field::new("entries", dt.clone(), false)), false), - vec![0, 2].into(), + vec![0, 2].try_into().unwrap(), StructArray::try_new( dt, vec![ @@ -1014,7 +1014,7 @@ pub fn pyarrow_map(column: &str) -> Box { ]); MapArray::try_new( DataType::Map(Box::new(Field::new("entries", dt.clone(), false)), false), - vec![0, 2].into(), + vec![0, 2].try_into().unwrap(), StructArray::try_new( dt, vec![ @@ -1046,7 +1046,7 @@ pub fn pyarrow_map_statistics(column: &str) -> Statistics { Box::new(Field::new("items", DataType::Struct(fields.clone()), false)), false, ), - vec![0, arrays[0].len() as i32].into(), + vec![0, arrays[0].len() as i32].try_into().unwrap(), StructArray::new(DataType::Struct(fields), arrays, None).boxed(), None, ) @@ -1510,7 +1510,7 @@ fn nested_dict_data(data_type: DataType) -> Result<(Schema, Chunk values.data_type().clone(), false, ))), - vec![0i32, 0, 0, 2, 3].into(), + vec![0i32, 0, 0, 2, 3].try_into().unwrap(), values.boxed(), Some([true, false, true, true].into()), )?;