Skip to content

Commit

Permalink
update arrow: completely backed by Vec (#2114)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Dec 22, 2021
1 parent d6fe3b9 commit 57c8788
Show file tree
Hide file tree
Showing 53 changed files with 141 additions and 198 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ The Python bindings of `polars` have their own changelog.
* performance
- improve csv-parser performance by ~25%
* bug fix
- fix ub of alignedvec dealloc
- fix ub of AlignedVec dealloc
- various minor

### Polars 0.14.6
Expand Down
2 changes: 1 addition & 1 deletion nodejs-polars/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -749,7 +749,7 @@ pub fn take(cx: CallContext) -> JsResult<JsExternal> {
let df = params.get_external::<JsDataFrame>(&cx, "_df")?;
let indices = params.get::<JsObject>("indices")?;
let len = indices.get_array_length()?;
let indices: AlignedVec<u32> = (0..len)
let indices: Vec<u32> = (0..len)
.map(|v| {
let wv: WrappedValue = indices
.get_element_unchecked::<JsUnknown>(v)
Expand Down
3 changes: 1 addition & 2 deletions nodejs-polars/src/list_construction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,11 @@ use crate::conversion::prelude::*;
use crate::prelude::*;
use napi::{JsBoolean, JsNumber, JsObject, JsString, JsTypedArrayValue, JsUnknown};
use polars::chunked_array::ChunkedArray;
use polars::prelude::AlignedVec;

macro_rules! typed_to_chunked {
($arr:expr, $type:ty, $pl_type:ty) => {{
let v: &[$type] = $arr.as_ref();
let mut buffer = AlignedVec::<$type>::new();
let mut buffer = Vec::<$type>::new();
buffer.extend_from_slice(v);
ChunkedArray::<$pl_type>::new_from_aligned_vec("", buffer)
}};
Expand Down
2 changes: 1 addition & 1 deletion polars/polars-arrow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ description = "Arrow interfaces for Polars DataFrame library"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "f64339cca9379c17a93dfa9714e1dab04e7a0ee0", default-features = false }
arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "4bbbe62c7fb36d24d2ccdb3bf3130d97aac4c069", default-features = false }
# arrow = { package = "arrow2", git = "https://github.com/ritchie46/arrow2", default-features = false, features = ["compute"], branch = "offset_pub" }
# arrow = { package = "arrow2", version = "0.8", default-features = false }
num = "^0.4"
Expand Down
9 changes: 4 additions & 5 deletions polars/polars-arrow/src/array/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use arrow::array::{Array, ArrayRef, BooleanArray, ListArray, PrimitiveArray, Utf8Array};
use arrow::bitmap::MutableBitmap;
use arrow::buffer::MutableBuffer;
use arrow::datatypes::DataType;
use arrow::types::{NativeType, NaturalDataType};
use std::sync::Arc;
Expand Down Expand Up @@ -85,7 +84,7 @@ pub trait ListFromIter {
let (lower, _) = iterator.size_hint();

let mut validity = MutableBitmap::with_capacity(lower);
let mut offsets = MutableBuffer::<i64>::with_capacity(lower + 1);
let mut offsets = Vec::<i64>::with_capacity(lower + 1);
let mut length_so_far = 0i64;
offsets.push(length_so_far);

Expand All @@ -112,8 +111,8 @@ pub trait ListFromIter {
let iterator = iter.into_iter();
let (lower, _) = iterator.size_hint();

let mut validity = MutableBitmap::with_capacity(lower);
let mut offsets = MutableBuffer::<i64>::with_capacity(lower + 1);
let mut validity = Vec::with_capacity(lower);
let mut offsets = Vec::<i64>::with_capacity(lower + 1);
let mut length_so_far = 0i64;
offsets.push(length_so_far);

Expand Down Expand Up @@ -142,7 +141,7 @@ pub trait ListFromIter {
let (lower, _) = iterator.size_hint();

let mut validity = MutableBitmap::with_capacity(lower);
let mut offsets = MutableBuffer::<i64>::with_capacity(lower + 1);
let mut offsets = Vec::<i64>::with_capacity(lower + 1);
let mut length_so_far = 0i64;
offsets.push(length_so_far);
let values: Utf8Array<i64> = iterator
Expand Down
6 changes: 3 additions & 3 deletions polars/polars-arrow/src/kernels/list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use crate::kernels::take::take_unchecked;
use crate::trusted_len::PushUnchecked;
use crate::utils::CustomIterTools;
use arrow::array::{ArrayRef, ListArray, PrimitiveArray};
use arrow::buffer::{Buffer, MutableBuffer};
use arrow::buffer::Buffer;

/// Get the indices that would result in a get operation on the lists values.
/// for example, consider this list:
Expand Down Expand Up @@ -75,7 +75,7 @@ pub fn array_to_unit_list(array: ArrayRef) -> ListArray<i64> {
}
};

let offsets: Buffer<i64> = MutableBuffer::from_vec(offsets).into();
let offsets: Buffer<i64> = offsets.into();
let dtype = ListArray::<i64>::default_datatype(array.data_type().clone());
ListArray::<i64>::from_data(dtype, offsets, array, None)
}
Expand All @@ -90,7 +90,7 @@ mod test {

fn get_array() -> ListArray<i64> {
let values = Int32Array::from_slice(&[1, 2, 3, 4, 5, 6]);
let offsets = Buffer::from(&[0i64, 3, 5, 6]);
let offsets = Buffer::from(vec![0i64, 3, 5, 6]);

let dtype = ListArray::<i64>::default_datatype(DataType::Int32);
ListArray::<i64>::from_data(dtype, offsets, Arc::new(values), None)
Expand Down
5 changes: 2 additions & 3 deletions polars/polars-arrow/src/kernels/rolling/no_nulls.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
use super::*;
use crate::utils::CustomIterTools;
use arrow::array::{ArrayRef, PrimitiveArray};
use arrow::buffer::MutableBuffer;
use arrow::datatypes::DataType;
use arrow::types::NativeType;
use num::Float;
Expand Down Expand Up @@ -34,7 +33,7 @@ where

aggregator(&buf)
})
.collect_trusted::<MutableBuffer<f64>>();
.collect_trusted::<Vec<f64>>();

let validity = create_validity(min_periods, len as usize, window_size, det_offsets_fn);
Arc::new(PrimitiveArray::from_data(
Expand Down Expand Up @@ -64,7 +63,7 @@ where
let vals = unsafe { values.get_unchecked(start..end) };
aggregator(vals)
})
.collect_trusted::<MutableBuffer<K>>();
.collect_trusted::<Vec<K>>();

let validity = create_validity(min_periods, len as usize, window_size, det_offsets_fn);
Arc::new(PrimitiveArray::from_data(
Expand Down
7 changes: 3 additions & 4 deletions polars/polars-arrow/src/kernels/rolling/nulls.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ use super::*;
use crate::utils::CustomIterTools;
use arrow::array::{ArrayRef, PrimitiveArray};
use arrow::bitmap::utils::{count_zeros, get_bit_unchecked};
use arrow::buffer::MutableBuffer;
use arrow::types::NativeType;
use num::{Float, One, Zero};
use std::ops::AddAssign;
Expand Down Expand Up @@ -50,7 +49,7 @@ where
}
}
})
.collect_trusted::<MutableBuffer<K>>();
.collect_trusted::<Vec<K>>();

Arc::new(PrimitiveArray::from_data(
K::DATA_TYPE,
Expand Down Expand Up @@ -400,7 +399,7 @@ mod test {

#[test]
fn test_rolling_sum_nulls() {
let buf = Buffer::from([1.0, 2.0, 3.0, 4.0]);
let buf = Buffer::from(vec![1.0, 2.0, 3.0, 4.0]);
let arr = &PrimitiveArray::from_data(
DataType::Float64,
buf,
Expand Down Expand Up @@ -435,7 +434,7 @@ mod test {

#[test]
fn test_rolling_max_no_nulls() {
let buf = Buffer::from([1.0, 2.0, 3.0, 4.0]);
let buf = Buffer::from(vec![1.0, 2.0, 3.0, 4.0]);
let arr = &PrimitiveArray::from_data(
DataType::Float64,
buf,
Expand Down
12 changes: 6 additions & 6 deletions polars/polars-arrow/src/kernels/set.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ use crate::array::default_arrays::FromData;
use crate::error::{PolarsError, Result};
use crate::kernels::BinaryMaskedSliceIterator;
use crate::prelude::PolarsArray;
use crate::trusted_len::PushUnchecked;
use arrow::array::*;
use arrow::buffer::MutableBuffer;
use arrow::{datatypes::DataType, types::NativeType};
use std::ops::BitOr;

Expand All @@ -21,14 +21,14 @@ where
let validity = array.validity().unwrap();
let validity = BooleanArray::from_data_default(validity.clone(), None);

let mut av = MutableBuffer::with_capacity(array.len());
let mut av = Vec::with_capacity(array.len());
BinaryMaskedSliceIterator::new(&validity)
.into_iter()
.for_each(|(lower, upper, truthy)| {
if truthy {
av.extend_from_slice(&values[lower..upper])
} else {
av.extend_from_trusted_len_iter(std::iter::repeat(value).take(upper - lower))
av.extend_trusted_len(std::iter::repeat(value).take(upper - lower))
}
});

Expand All @@ -44,12 +44,12 @@ pub fn set_with_mask<T: NativeType>(
) -> PrimitiveArray<T> {
let values = array.values();

let mut buf = MutableBuffer::with_capacity(array.len());
let mut buf = Vec::with_capacity(array.len());
BinaryMaskedSliceIterator::new(mask)
.into_iter()
.for_each(|(lower, upper, truthy)| {
if truthy {
buf.extend_from_trusted_len_iter(std::iter::repeat(value).take(upper - lower))
buf.extend_trusted_len(std::iter::repeat(value).take(upper - lower))
} else {
buf.extend_from_slice(&values[lower..upper])
}
Expand All @@ -76,7 +76,7 @@ where
T: NativeType,
I: IntoIterator<Item = usize>,
{
let mut buf = MutableBuffer::with_capacity(array.len());
let mut buf = Vec::with_capacity(array.len());
buf.extend_from_slice(array.values().as_slice());
let mut_slice = buf.as_mut_slice();

Expand Down
14 changes: 7 additions & 7 deletions polars/polars-arrow/src/kernels/take.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use crate::utils::with_match_primitive_type;
use crate::{bit_util::unset_bit_raw, prelude::*, utils::CustomIterTools};
use arrow::array::*;
use arrow::bitmap::MutableBitmap;
use arrow::buffer::{Buffer, MutableBuffer};
use arrow::buffer::Buffer;
use arrow::datatypes::{DataType, PhysicalType};
use arrow::types::NativeType;
use std::sync::Arc;
Expand Down Expand Up @@ -46,7 +46,7 @@ pub unsafe fn take_primitive_unchecked<T: NativeType>(
let validity_values = arr.validity().expect("should have nulls");

// first take the values, these are always needed
let values: AlignedVec<T> = index_values
let values: Vec<T> = index_values
.iter()
.map(|idx| *array_values.get_unchecked(*idx as usize))
.collect_trusted();
Expand Down Expand Up @@ -364,7 +364,7 @@ pub unsafe fn take_utf8_unchecked(
) -> Arc<LargeStringArray> {
let data_len = indices.len();

let mut offset_buf = MutableBuffer::<i64>::from_len_zeroed(data_len + 1);
let mut offset_buf = vec![0; data_len + 1];
let offset_typed = offset_buf.as_mut_slice();

let mut length_so_far = 0;
Expand All @@ -382,7 +382,7 @@ pub unsafe fn take_utf8_unchecked(
};

// 16 bytes per string as default alloc
let mut values_buf = AlignedVec::<u8>::with_capacity(values_capacity);
let mut values_buf = Vec::<u8>::with_capacity(values_capacity);

// both 0 nulls
if !arr.has_validity() && !indices.has_validity() {
Expand Down Expand Up @@ -483,12 +483,12 @@ pub unsafe fn take_utf8_unchecked(
pub unsafe fn take_value_indices_from_list(
list: &ListArray<i64>,
indices: &UInt32Array,
) -> (UInt32Array, AlignedVec<i64>) {
) -> (UInt32Array, Vec<i64>) {
let offsets = list.offsets().as_slice();

let mut new_offsets = AlignedVec::with_capacity(indices.len());
let mut new_offsets = Vec::with_capacity(indices.len());
// will likely have at least indices.len values
let mut values = AlignedVec::with_capacity(indices.len());
let mut values = Vec::with_capacity(indices.len());
let mut current_offset = 0;
// add first offset
new_offsets.push(0);
Expand Down
1 change: 0 additions & 1 deletion polars/polars-arrow/src/prelude.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,5 @@ pub use crate::array::default_arrays::*;
pub use crate::array::*;
use arrow::array::{ListArray, Utf8Array};

pub type AlignedVec<T> = arrow::buffer::MutableBuffer<T>;
pub type LargeStringArray = Utf8Array<i64>;
pub type LargeListArray = ListArray<i64>;
47 changes: 8 additions & 39 deletions polars/polars-arrow/src/trusted_len/mod.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
mod boolean;
mod rev;

use crate::utils::{FromTrustedLenIterator, TrustMyLength};
use crate::utils::TrustMyLength;
use arrow::bitmap::utils::{BitmapIter, ZipValidity};
use arrow::buffer::MutableBuffer;
use arrow::types::NativeType;
pub use rev::FromIteratorReversed;
use std::slice::Iter;

Expand Down Expand Up @@ -63,15 +61,6 @@ unsafe impl<T, I: TrustedLen + Iterator<Item = T>> TrustedLen for ZipValidity<'_
unsafe impl TrustedLen for BitmapIter<'_> {}
unsafe impl<A: TrustedLen> TrustedLen for std::iter::StepBy<A> {}

impl<T: arrow::types::NativeType> FromTrustedLenIterator<T> for MutableBuffer<T> {
fn from_iter_trusted_length<I: IntoIterator<Item = T>>(iter: I) -> Self {
let iter = iter.into_iter();
// Safety:
// Guarded by trait system
unsafe { MutableBuffer::from_trusted_len_iter_unchecked(iter) }
}
}

pub trait PushUnchecked<T> {
/// Will push an item and not check if there is enough capacity
///
Expand All @@ -87,34 +76,8 @@ pub trait PushUnchecked<T> {

/// Extend the array with an iterator who's length can be trusted
fn extend_trusted_len<I: IntoIterator<Item = T> + TrustedLen>(&mut self, iter: I);
}

impl<T: NativeType> PushUnchecked<T> for MutableBuffer<T> {
unsafe fn push_unchecked(&mut self, value: T) {
let end = self.as_mut_ptr().add(self.len());
std::ptr::write(end, value);
self.set_len(self.len() + 1);
}

unsafe fn push_unchecked_no_len_set(&mut self, value: T) {
let end = self.as_mut_ptr().add(self.len());
std::ptr::write(end, value);
}

fn extend_trusted_len<I: IntoIterator<Item = T> + TrustedLen>(&mut self, iter: I) {
let iter = iter.into_iter();
let upper = iter.size_hint().1.expect("must have an upper bound");
self.reserve(upper);

unsafe {
let mut dst = self.as_mut_ptr().add(self.len());
for value in iter {
std::ptr::write(dst, value);
dst = dst.add(1)
}
self.set_len(self.len() + upper)
}
}
fn from_trusted_len_iter<I: IntoIterator<Item = T> + TrustedLen>(iter: I) -> Self;
}

impl<T> PushUnchecked<T> for Vec<T> {
Expand Down Expand Up @@ -146,4 +109,10 @@ impl<T> PushUnchecked<T> for Vec<T> {
self.set_len(self.len() + upper)
}
}

fn from_trusted_len_iter<I: IntoIterator<Item = T> + TrustedLen>(iter: I) -> Self {
let mut v = vec![];
v.extend_trusted_len(iter);
v
}
}
5 changes: 1 addition & 4 deletions polars/polars-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -127,9 +127,6 @@ docs-selection = [
[dependencies]
ahash = "0.7"
anyhow = "1.0"
# arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "e9a6c3ef7e1a328c298bd45e36ac2abf8ae44ebb", default-features = false, features = ["compute"] }
# arrow = { package = "arrow2", git = "https://github.com/ritchie46/arrow2", default-features = false, features = ["compute"], branch = "fn_to" }
# arrow = { package = "arrow2", version = "0.8", default-features = false, features = ["compute"] }

comfy-table = { version = "4.0", optional = true }
hashbrown = { version = "0.11", features = ["rayon"] }
Expand All @@ -156,7 +153,7 @@ unsafe_unwrap = "^0.1.0"
package = "arrow2"
git = "https://github.com/jorgecarleitao/arrow2"
# git = "https://github.com/ritchie46/arrow2"
rev = "f64339cca9379c17a93dfa9714e1dab04e7a0ee0"
rev = "4bbbe62c7fb36d24d2ccdb3bf3130d97aac4c069"
# branch = "offset_pub"
# version = "0.8"
default-features = false
Expand Down
6 changes: 3 additions & 3 deletions polars/polars-core/src/chunked_array/bitwise.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ where
.iter()
.zip(r_vals)
.map(|(l, r)| *l & *r)
.collect_trusted::<AlignedVec<_>>();
.collect_trusted::<Vec<_>>();

let arr =
PrimitiveArray::from_data(T::get_dtype().to_arrow(), av.into(), valididity);
Expand Down Expand Up @@ -58,7 +58,7 @@ where
.iter()
.zip(r_vals)
.map(|(l, r)| *l | *r)
.collect_trusted::<AlignedVec<_>>();
.collect_trusted::<Vec<_>>();

let arr =
PrimitiveArray::from_data(T::get_dtype().to_arrow(), av.into(), valididity);
Expand Down Expand Up @@ -91,7 +91,7 @@ where
.iter()
.zip(r_vals)
.map(|(l, r)| l.bitxor(*r))
.collect_trusted::<AlignedVec<_>>();
.collect_trusted::<Vec<_>>();

let arr =
PrimitiveArray::from_data(T::get_dtype().to_arrow(), av.into(), valididity);
Expand Down

0 comments on commit 57c8788

Please sign in to comment.