Skip to content

Commit

Permalink
Merge branch 'main' into lazy-hconcat
Browse files Browse the repository at this point in the history
  • Loading branch information
adamreeve committed Dec 27, 2023
2 parents b119d1e + 8d610b1 commit 4b1e68f
Show file tree
Hide file tree
Showing 433 changed files with 4,804 additions and 2,975 deletions.
138 changes: 64 additions & 74 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ atoi = "2"
avro-schema = { version = "0.3" }
base64 = "0.21.2"
bitflags = "2"
bytemuck = { version = "1", features = ["derive", "extern_crate_alloc"] }
bytemuck = { version = "1.11", features = ["derive", "extern_crate_alloc"] }
chrono = { version = "0.4.31", default-features = false, features = ["std"] }
chrono-tz = "0.8.1"
ciborium = "0.2"
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-arrow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ tokio = { workspace = true, features = ["macros", "rt", "fs", "io-util"] }
tokio-util = { workspace = true, features = ["compat"] }

[build-dependencies]
rustc_version = "0.4.0"
version_check = { workspace = true }

[target.wasm32-unknown-unknown.dependencies]
getrandom = { version = "0.2", features = ["js"] }
Expand Down
7 changes: 7 additions & 0 deletions crates/polars-arrow/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
fn main() {
println!("cargo:rerun-if-changed=build.rs");
let channel = version_check::Channel::read().unwrap();
if channel.is_nightly() {
println!("cargo:rustc-cfg=feature=\"nightly\"");
}
}
1 change: 1 addition & 0 deletions crates/polars-arrow/src/array/dictionary/value_map.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ impl<K: DictionaryKey, M: MutableArray> ValueMap<K, M> {
// safety: we only iterate within bounds
let value = unsafe { values.value_unchecked_at(index) };
let hash = ahash_hash(value.borrow());

match map.raw_entry_mut().from_hash(hash, |item| {
// safety: invariant of the struct, it's always in bounds since we maintain it
let stored_value = unsafe { values.value_unchecked_at(item.key.as_usize()) };
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-arrow/src/array/null.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ impl NullArray {
/// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to [`crate::datatypes::PhysicalType::Null`].
pub fn try_new(data_type: ArrowDataType, length: usize) -> PolarsResult<Self> {
if data_type.to_physical_type() != PhysicalType::Null {
polars_bail!(ComputeError: "NullArray can only be initialized with a DataType whose physical type is Boolean");
polars_bail!(ComputeError: "NullArray can only be initialized with a DataType whose physical type is Null");
}

Ok(Self { data_type, length })
Expand Down
20 changes: 8 additions & 12 deletions crates/polars-arrow/src/bitmap/bitmask.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
#[cfg(feature = "simd")]
use std::simd::ToBitMask;

#[cfg(feature = "simd")]
use num_traits::AsPrimitive;
use std::simd::{LaneCount, Mask, MaskElement, SupportedLaneCount};

use crate::bitmap::Bitmap;

Expand Down Expand Up @@ -136,33 +133,32 @@ impl<'a> BitMask<'a> {

#[cfg(feature = "simd")]
#[inline]
pub fn get_simd<T>(&self, idx: usize) -> T
pub fn get_simd<T, const N: usize>(&self, idx: usize) -> Mask<T, N>
where
T: ToBitMask,
<T as ToBitMask>::BitMask: Copy + 'static,
u64: AsPrimitive<<T as ToBitMask>::BitMask>,
T: MaskElement,
LaneCount<N>: SupportedLaneCount,
{
// We don't support 64-lane masks because then we couldn't load our
// bitwise mask as a u64 and then do the byteshift on it.

let lanes = std::mem::size_of::<T::BitMask>() * 8;
let lanes = LaneCount::<N>::BITMASK_LEN;
assert!(lanes < 64);

let start_byte_idx = (self.offset + idx) / 8;
let byte_shift = (self.offset + idx) % 8;
if idx + lanes <= self.len {
// SAFETY: fast path, we know this is completely in-bounds.
let mask = load_padded_le_u64(unsafe { self.bytes.get_unchecked(start_byte_idx..) });
T::from_bitmask((mask >> byte_shift).as_())
Mask::from_bitmask(mask >> byte_shift)
} else if idx < self.len {
// SAFETY: we know that at least the first byte is in-bounds.
// This is partially out of bounds, we have to do extra masking.
let mask = load_padded_le_u64(unsafe { self.bytes.get_unchecked(start_byte_idx..) });
let num_out_of_bounds = idx + lanes - self.len;
let shifted = (mask << num_out_of_bounds) >> (num_out_of_bounds + byte_shift);
T::from_bitmask(shifted.as_())
Mask::from_bitmask(shifted)
} else {
T::from_bitmask((0u64).as_())
Mask::from_bitmask(0u64)
}
}

Expand Down
2 changes: 1 addition & 1 deletion crates/polars-arrow/src/compute/aggregate/simd/packed.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::simd::{SimdFloat as _, SimdInt as _, SimdUint as _};
use std::simd::prelude::{SimdFloat as _, SimdInt as _, SimdUint as _};

use super::super::sum::Sum;
use crate::types::simd::*;
Expand Down
89 changes: 62 additions & 27 deletions crates/polars-arrow/src/compute/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ pub use binary_to::*;
pub use boolean_to::*;
pub use decimal_to::*;
pub use dictionary_to::*;
use polars_error::{polars_bail, polars_err, PolarsResult};
use polars_error::{polars_bail, polars_ensure, polars_err, PolarsResult};
pub use primitive_to::*;
pub use utf8_to::*;

Expand Down Expand Up @@ -391,32 +391,67 @@ fn cast_list_to_fixed_size_list<O: Offset>(
size: usize,
options: CastOptions,
) -> PolarsResult<FixedSizeListArray> {
let offsets = list.offsets().buffer().iter();
let expected = (0..list.len()).map(|ix| O::from_as_usize(ix * size));

match offsets
.zip(expected)
.find(|(actual, expected)| *actual != expected)
{
Some(_) => polars_bail!(ComputeError:
"not all elements have the specified width {size}"
),
None => {
let sliced_values = list.values().sliced(
list.offsets().first().to_usize(),
list.offsets().range().to_usize(),
);
let new_values = cast(sliced_values.as_ref(), inner.data_type(), options)?;
FixedSizeListArray::try_new(
ArrowDataType::FixedSizeList(Box::new(inner.clone()), size),
new_values,
list.validity().cloned(),
)
.map_err(
|_| polars_err!(ComputeError: "not all elements have the specified width {size}"),
)
},
}
let null_cnt = list.null_count();
let new_values = if null_cnt == 0 {
let offsets = list.offsets().buffer().iter();
let expected = (0..list.len()).map(|ix| O::from_as_usize(ix * size));

match offsets
.zip(expected)
.find(|(actual, expected)| *actual != expected)
{
Some(_) => polars_bail!(ComputeError:
"not all elements have the specified width {size}"
),
None => {
let sliced_values = list.values().sliced(
list.offsets().first().to_usize(),
list.offsets().range().to_usize(),
);
cast(sliced_values.as_ref(), inner.data_type(), options)?
},
}
} else {
let offsets = list.offsets().as_slice();
// Check the lengths of each list are equal to the fixed size.
// SAFETY: we know the index is in bound.
let mut expected_offset = unsafe { *offsets.get_unchecked(0) } + O::from_as_usize(size);
for i in 1..=list.len() {
// SAFETY: we know the index is in bound.
let current_offset = unsafe { *offsets.get_unchecked(i) };
if list.is_null(i - 1) {
expected_offset = current_offset + O::from_as_usize(size);
} else {
polars_ensure!(current_offset == expected_offset, ComputeError:
"not all elements have the specified width {size}");
expected_offset += O::from_as_usize(size);
}
}

// Build take indices for the values. This is used to fill in the null slots.
let mut indices =
MutablePrimitiveArray::<O>::with_capacity(list.values().len() + null_cnt * size);
for i in 0..list.len() {
if list.is_null(i) {
indices.extend_constant(size, None)
} else {
// SAFETY: we know the index is in bound.
let current_offset = unsafe { *offsets.get_unchecked(i) };
for j in 0..size {
indices.push(Some(current_offset + O::from_as_usize(j)));
}
}
}
let take_values = crate::compute::take::take(list.values().as_ref(), &indices.into())?;

cast(take_values.as_ref(), inner.data_type(), options)?
};
FixedSizeListArray::try_new(
ArrowDataType::FixedSizeList(Box::new(inner.clone()), size),
new_values,
list.validity().cloned(),
)
.map_err(|_| polars_err!(ComputeError: "not all elements have the specified width {size}"))
}

/// Cast `array` to the provided data type and return a new [`Array`] with
Expand Down
33 changes: 30 additions & 3 deletions crates/polars-arrow/src/legacy/array/fixed_size_list.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
use polars_error::PolarsResult;

use crate::array::{ArrayRef, FixedSizeListArray};
use crate::array::{ArrayRef, FixedSizeListArray, NullArray};
use crate::bitmap::MutableBitmap;
use crate::datatypes::ArrowDataType;
use crate::legacy::array::{convert_inner_type, is_nested_null};
use crate::legacy::kernels::concatenate::concatenate_owned_unchecked;

#[derive(Default)]
Expand Down Expand Up @@ -34,6 +35,8 @@ impl AnonymousBuilder {
}

pub fn push_null(&mut self) {
self.arrays
.push(NullArray::new(ArrowDataType::Null, self.width).boxed());
match &mut self.validity {
Some(validity) => validity.push(false),
None => self.init_validity(),
Expand All @@ -48,8 +51,32 @@ impl AnonymousBuilder {
}

pub fn finish(self, inner_dtype: Option<&ArrowDataType>) -> PolarsResult<FixedSizeListArray> {
let values = concatenate_owned_unchecked(&self.arrays)?;
let inner_dtype = inner_dtype.unwrap_or_else(|| self.arrays[0].data_type());
let mut inner_dtype = inner_dtype.unwrap_or_else(|| self.arrays[0].data_type());

if is_nested_null(inner_dtype) {
for arr in &self.arrays {
if !is_nested_null(arr.data_type()) {
inner_dtype = arr.data_type();
break;
}
}
};

// convert nested null arrays to the correct dtype.
let arrays = self
.arrays
.iter()
.map(|arr| {
if is_nested_null(arr.data_type()) {
convert_inner_type(&**arr, inner_dtype)
} else {
arr.to_boxed()
}
})
.collect::<Vec<_>>();

let values = concatenate_owned_unchecked(&arrays)?;

let data_type = FixedSizeListArray::default_datatype(inner_dtype.clone(), self.width);
Ok(FixedSizeListArray::new(
data_type,
Expand Down
44 changes: 2 additions & 42 deletions crates/polars-arrow/src/legacy/array/list.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
use polars_error::PolarsResult;

use crate::array::{new_null_array, Array, ArrayRef, ListArray, NullArray, StructArray};
use crate::array::{new_null_array, Array, ArrayRef, ListArray, NullArray};
use crate::bitmap::MutableBitmap;
use crate::compute::concatenate;
use crate::datatypes::ArrowDataType;
use crate::legacy::array::is_nested_null;
use crate::legacy::kernels::concatenate::concatenate_owned_unchecked;
use crate::legacy::prelude::*;
use crate::offset::Offsets;
Expand Down Expand Up @@ -162,44 +163,3 @@ impl<'a> AnonymousBuilder<'a> {
))
}
}

fn is_nested_null(data_type: &ArrowDataType) -> bool {
match data_type {
ArrowDataType::Null => true,
ArrowDataType::LargeList(field) => is_nested_null(field.data_type()),
ArrowDataType::Struct(fields) => {
fields.iter().all(|field| is_nested_null(field.data_type()))
},
_ => false,
}
}

/// Cast null arrays to inner type and ensure that all offsets remain correct
pub fn convert_inner_type(array: &dyn Array, dtype: &ArrowDataType) -> Box<dyn Array> {
match dtype {
ArrowDataType::LargeList(field) => {
let array = array.as_any().downcast_ref::<LargeListArray>().unwrap();
let inner = array.values();
let new_values = convert_inner_type(inner.as_ref(), field.data_type());
let dtype = LargeListArray::default_datatype(new_values.data_type().clone());
LargeListArray::new(
dtype,
array.offsets().clone(),
new_values,
array.validity().cloned(),
)
.boxed()
},
ArrowDataType::Struct(fields) => {
let array = array.as_any().downcast_ref::<StructArray>().unwrap();
let inner = array.values();
let new_values = inner
.iter()
.zip(fields)
.map(|(arr, field)| convert_inner_type(arr.as_ref(), field.data_type()))
.collect::<Vec<_>>();
StructArray::new(dtype.clone(), new_values, array.validity().cloned()).boxed()
},
_ => new_null_array(dtype.clone(), array.len()),
}
}
57 changes: 56 additions & 1 deletion crates/polars-arrow/src/legacy/array/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
use crate::array::{Array, BinaryArray, BooleanArray, ListArray, PrimitiveArray, Utf8Array};
use crate::array::{
new_null_array, Array, BinaryArray, BooleanArray, FixedSizeListArray, ListArray,
PrimitiveArray, StructArray, Utf8Array,
};
use crate::bitmap::MutableBitmap;
use crate::datatypes::ArrowDataType;
use crate::legacy::utils::CustomIterTools;
Expand All @@ -16,6 +19,8 @@ pub mod utf8;

pub use slice::*;

use crate::legacy::prelude::LargeListArray;

macro_rules! iter_to_values {
($iterator:expr, $validity:expr, $offsets:expr, $length_so_far:expr) => {{
$iterator
Expand Down Expand Up @@ -206,3 +211,53 @@ pub trait PolarsArray: Array {
}

impl<A: Array + ?Sized> PolarsArray for A {}

fn is_nested_null(data_type: &ArrowDataType) -> bool {
match data_type {
ArrowDataType::Null => true,
ArrowDataType::LargeList(field) => is_nested_null(field.data_type()),
ArrowDataType::FixedSizeList(field, _) => is_nested_null(field.data_type()),
ArrowDataType::Struct(fields) => {
fields.iter().all(|field| is_nested_null(field.data_type()))
},
_ => false,
}
}

/// Cast null arrays to inner type and ensure that all offsets remain correct
pub fn convert_inner_type(array: &dyn Array, dtype: &ArrowDataType) -> Box<dyn Array> {
match dtype {
ArrowDataType::LargeList(field) => {
let array = array.as_any().downcast_ref::<LargeListArray>().unwrap();
let inner = array.values();
let new_values = convert_inner_type(inner.as_ref(), field.data_type());
let dtype = LargeListArray::default_datatype(new_values.data_type().clone());
LargeListArray::new(
dtype,
array.offsets().clone(),
new_values,
array.validity().cloned(),
)
.boxed()
},
ArrowDataType::FixedSizeList(field, width) => {
let array = array.as_any().downcast_ref::<FixedSizeListArray>().unwrap();
let inner = array.values();
let new_values = convert_inner_type(inner.as_ref(), field.data_type());
let dtype =
FixedSizeListArray::default_datatype(new_values.data_type().clone(), *width);
FixedSizeListArray::new(dtype, new_values, array.validity().cloned()).boxed()
},
ArrowDataType::Struct(fields) => {
let array = array.as_any().downcast_ref::<StructArray>().unwrap();
let inner = array.values();
let new_values = inner
.iter()
.zip(fields)
.map(|(arr, field)| convert_inner_type(arr.as_ref(), field.data_type()))
.collect::<Vec<_>>();
StructArray::new(dtype.clone(), new_values, array.validity().cloned()).boxed()
},
_ => new_null_array(dtype.clone(), array.len()),
}
}

0 comments on commit 4b1e68f

Please sign in to comment.