Skip to content

Commit

Permalink
More migrations.
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgecarleitao authored and ritchie46 committed Jun 23, 2021
1 parent 194a8f2 commit 4e26e8a
Show file tree
Hide file tree
Showing 32 changed files with 188 additions and 189 deletions.
2 changes: 1 addition & 1 deletion polars/polars-arrow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@ description = "Arrow interfaces for Polars DataFrame library"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "7401f4e32fdd4f9c4e21fb2782a3996d6a0277b4", default-features = false }
arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "7461b42b3e9e19ef2ff8b52d85e4915ebc44c1fd", default-features = false }
thiserror = "^1.0"
num = "^0.4"
3 changes: 1 addition & 2 deletions polars/polars-arrow/src/array.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
use crate::utils::CustomIterTools;
use arrow::array::{Array, ArrayRef, BooleanArray, ListArray, PrimitiveArray, Utf8Array};
use arrow::array::{ArrayRef, BooleanArray, ListArray, PrimitiveArray, Utf8Array};
use arrow::bitmap::MutableBitmap;
use arrow::buffer::MutableBuffer;
use arrow::datatypes::DataType;
use arrow::types::{NativeType, NaturalDataType};
use num::Num;
use std::sync::Arc;

pub trait ValueSize {
Expand Down
2 changes: 1 addition & 1 deletion polars/polars-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ dtype-u64 = []
parquet = ["arrow/io_parquet"]

[dependencies]
arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "7401f4e32fdd4f9c4e21fb2782a3996d6a0277b4", default-features = false }
arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "7461b42b3e9e19ef2ff8b52d85e4915ebc44c1fd", default-features = false }
#arrow = {version = "4.2", default-features = false }
#parquet = {version = "4.2", default-features = false, optional = true }
polars-arrow = {version = "0.14.2", path = "../polars-arrow"}
Expand Down
14 changes: 7 additions & 7 deletions polars/polars-core/src/chunked_array/builder/categorical.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ use arrow::array::*;
use std::marker::PhantomData;

pub enum RevMappingBuilder {
Global(PlHashMap<u32, u32>, Utf8Primitive<i64>, u128),
Local(Utf8Primitive<i64>),
Global(PlHashMap<u32, u32>, MutableUtf8Array<i64>, u128),
Local(MutableUtf8Array<i64>),
}

impl RevMappingBuilder {
Expand All @@ -26,10 +26,10 @@ impl RevMappingBuilder {
fn finish(self) -> RevMapping {
use RevMappingBuilder::*;
match self {
Local(mut b) => RevMapping::Local(b.to()),
Local(mut b) => RevMapping::Local(b.into()),
Global(mut map, mut b, uuid) => {
map.shrink_to_fit();
RevMapping::Global(map, b.to(), uuid)
RevMapping::Global(map, b.into(), uuid)
}
}
}
Expand Down Expand Up @@ -68,14 +68,14 @@ impl RevMapping {
}

pub struct CategoricalChunkedBuilder {
array_builder: Primitive<u32>,
array_builder: UInt32Vec,
field: Field,
reverse_mapping: RevMappingBuilder,
}

impl CategoricalChunkedBuilder {
pub fn new(name: &str, capacity: usize) -> Self {
let builder = Utf8Primitive::<i64>::with_capacity(capacity / 10);
let builder = MutableUtf8Array::<i64>::with_capacity(capacity / 10);
let reverse_mapping = if use_string_cache() {
let uuid = crate::STRING_CACHE.lock_map().uuid;
RevMappingBuilder::Global(PlHashMap::default(), builder, uuid)
Expand All @@ -84,7 +84,7 @@ impl CategoricalChunkedBuilder {
};

Self {
array_builder: Primitive::<u32>::with_capacity(capacity),
array_builder: UInt32Vec::with_capacity(capacity),
field: Field::new(name, DataType::Categorical),
reverse_mapping,
}
Expand Down
35 changes: 18 additions & 17 deletions polars/polars-core/src/chunked_array/builder/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ pub trait ChunkedBuilder<N, T> {
}

pub struct BooleanChunkedBuilder {
array_builder: BooleanPrimitive,
array_builder: MutableBooleanArray,
field: Field,
}

Expand Down Expand Up @@ -58,7 +58,7 @@ impl ChunkedBuilder<bool, BooleanType> for BooleanChunkedBuilder {
impl BooleanChunkedBuilder {
pub fn new(name: &str, capacity: usize) -> Self {
BooleanChunkedBuilder {
array_builder: BooleanPrimitive::with_capacity(capacity),
array_builder: MutableBooleanArray::with_capacity(capacity),
field: Field::new(name, DataType::Boolean),
}
}
Expand All @@ -69,7 +69,7 @@ where
T: PolarsPrimitiveType,
T::Native: Default,
{
array_builder: Primitive<T::Native>,
array_builder: MutablePrimitiveArray<T::Native>,
field: Field,
}

Expand All @@ -91,8 +91,7 @@ where
}

fn finish(mut self) -> ChunkedArray<T> {
let arr: PrimitiveArray<T::Native> = self.array_builder.to(T::get_dtype().to_arrow());
let arr = Arc::new(arr) as ArrayRef;
let arr = self.array_builder.into_arc();

ChunkedArray {
field: Arc::new(self.field),
Expand All @@ -108,15 +107,18 @@ where
T: PolarsPrimitiveType,
{
pub fn new(name: &str, capacity: usize) -> Self {
let array_builder = MutablePrimitiveArray::<T::Native>::with_capacity(capacity)
.to(T::get_dtype().to_arrow());

PrimitiveChunkedBuilder {
array_builder: Primitive::<T::Native>::with_capacity(capacity),
array_builder,
field: Field::new(name, T::get_dtype()),
}
}
}

pub struct Utf8ChunkedBuilder {
pub builder: Utf8Primitive<i64>,
pub builder: MutableUtf8Array<i64>,
pub capacity: usize,
field: Field,
}
Expand All @@ -130,7 +132,7 @@ impl Utf8ChunkedBuilder {
/// * `bytes_capacity` - Number of bytes needed to store the string values.
pub fn new(name: &str, capacity: usize, bytes_capacity: usize) -> Self {
Utf8ChunkedBuilder {
builder: Utf8Primitive::<i64>::with_capacities(capacity, bytes_capacity),
builder: MutableUtf8Array::<i64>::with_capacities(capacity, bytes_capacity),
capacity,
field: Field::new(name, DataType::Utf8),
}
Expand All @@ -145,7 +147,7 @@ impl Utf8ChunkedBuilder {
/// Appends a null slot into the builder
#[inline]
pub fn append_null(&mut self) {
self.builder.push(None);
self.builder.push::<&str>(None);
}

#[inline]
Expand All @@ -154,7 +156,7 @@ impl Utf8ChunkedBuilder {
}

pub fn finish(mut self) -> Utf8Chunked {
let arr = Arc::new(self.builder.to());
let arr = self.builder.into_arc();
ChunkedArray {
field: Arc::new(self.field),
chunks: vec![arr],
Expand Down Expand Up @@ -281,7 +283,7 @@ where
fn new_from_slice(name: &str, v: &[S]) -> Self {
let values_size = v.iter().fold(0, |acc, s| acc + s.as_ref().len());

let mut builder = Utf8Primitive::<i64>::with_capacities(values_size, v.len());
let mut builder = MutableUtf8Array::<i64>::with_capacities(values_size, v.len());
v.iter().for_each(|val| {
builder.push(Some(val.as_ref()));
});
Expand Down Expand Up @@ -343,7 +345,7 @@ where

macro_rules! finish_list_builder {
($self:ident) => {{
let arr = Arc::new($self.builder.finish());
let arr = $self.builder.into_arc();
ListChunked {
field: Arc::new($self.field.clone()),
chunks: vec![arr],
Expand All @@ -368,8 +370,7 @@ where
pub fn append_slice(&mut self, opt_v: Option<&[T::Native]>) {
match opt_v {
Some(v) => {
self.builder.values().append_slice(v);
self.builder.push(true);
self.builder.push(v);
}
None => {
self.builder.append(false).expect("should not fail");
Expand Down Expand Up @@ -431,9 +432,9 @@ where
}
}

type LargePrimitiveBuilder<T> = ListPrimitive<i64, Primitive<T>, T>;
type LargeListUtf8Builder = ListPrimitive<i64, Utf8Primitive<i64>, &'static str>;
type LargeListBooleanBuilder = ListPrimitive<i64, BooleanPrimitive, bool>;
type LargePrimitiveBuilder<T> = MutableListArray<i64, MutablePrimitiveArray<T>>;
type LargeListUtf8Builder = MutableListArray<i64, MutableUtf8Array<i64>>;
type LargeListBooleanBuilder = MutableListArray<i64, MutableBooleanArray>;

pub struct ListUtf8ChunkedBuilder {
builder: LargeListUtf8Builder,
Expand Down
7 changes: 6 additions & 1 deletion polars/polars-core/src/chunked_array/iterator/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,12 @@ where
type Item = Option<T::Native>;
type IntoIter = Box<dyn PolarsIterator<Item = Self::Item> + 'a>;
fn into_iter(self) -> Self::IntoIter {
Box::new(self.downcast_iter().flatten().trust_my_length(self.len()))
Box::new(
self.downcast_iter()
.flatten()
.map(|x| x.copied())
.trust_my_length(self.len()),
)
}
}

Expand Down
33 changes: 17 additions & 16 deletions polars/polars-core/src/chunked_array/kernels/take.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ pub(crate) unsafe fn take_primitive_iter_unchecked<
None
}
})
.collect::<Primitive<T::Native>>()
.collect::<PrimitiveArray<T::Native>>()
.to(T::get_dtype().to_arrow());

Arc::new(arr)
Expand All @@ -105,7 +105,7 @@ pub(crate) fn take_primitive_iter<T: PolarsPrimitiveType, I: IntoIterator<Item =
None
}
})
.collect::<Primitive<T::Native>>()
.collect::<PrimitiveArray<T::Native>>()
.to(T::get_dtype().to_arrow());

Arc::new(arr)
Expand All @@ -125,7 +125,7 @@ pub(crate) unsafe fn take_no_null_primitive_opt_iter_unchecked<
let iter = indices
.into_iter()
.map(|opt_idx| opt_idx.map(|idx| *array_values.get_unchecked(idx)));
let arr = Primitive::from_trusted_len_iter_unchecked(iter).to(T::get_dtype().to_arrow());
let arr = PrimitiveArray::from_trusted_len_iter_unchecked(iter).to(T::get_dtype().to_arrow());

Arc::new(arr)
}
Expand All @@ -150,7 +150,7 @@ pub(crate) unsafe fn take_primitive_opt_iter_unchecked<
}
})
});
let arr = Primitive::from_trusted_len_iter_unchecked(iter).to(T::get_dtype().to_arrow());
let arr = PrimitiveArray::from_trusted_len_iter_unchecked(iter).to(T::get_dtype().to_arrow());

Arc::new(arr)
}
Expand Down Expand Up @@ -347,7 +347,7 @@ pub(crate) fn take_utf8_iter<I: IntoIterator<Item = usize>>(

pub(crate) unsafe fn take_utf8(
arr: &LargeStringArray,
indices: &Int32Array,
indices: &UInt32Array,
) -> Arc<LargeStringArray> {
let data_len = indices.len();

Expand All @@ -363,7 +363,7 @@ pub(crate) unsafe fn take_utf8(
// Allocate 2.0 times the expected size.
// where expected size is the length of bytes multiplied by the factor (take_len / current_len)
let mut values_capacity = if arr.len() > 0 {
((arr.value_data().len() as f32 * 2.0) as usize) / arr.len() * indices.len() as usize
((arr.len() as f32 * 2.0) as usize) / arr.len() * indices.len() as usize
} else {
0
};
Expand Down Expand Up @@ -413,17 +413,17 @@ pub(crate) unsafe fn take_utf8(
});
validity = indices.validity().clone();
} else {
let mut builder = Utf8Primitive::with_capacities(data_len, length_so_far as usize);
let mut builder = MutableUtf8Array::with_capacities(data_len, length_so_far as usize);

if indices.null_count() == 0 {
(0..data_len).for_each(|idx| {
let index = indices.value_unchecked(idx) as usize;
if arr.is_valid(index) {
builder.push(if arr.is_valid(index) {
let s = arr.value_unchecked(index);
builder.push(Some(s));
Some(s)
} else {
builder.push(None);
}
None
});
});
} else {
(0..data_len).for_each(|idx| {
Expand All @@ -432,17 +432,18 @@ pub(crate) unsafe fn take_utf8(

if arr.is_valid(index) {
let s = arr.value_unchecked(index);
builder.append_value(s).unwrap();
builder.push(Some(s));
} else {
builder.append_null().unwrap();
builder.push_null();
}
} else {
builder.append_null().unwrap();
builder.push_null();
}
});
}

return Arc::new(builder.finish());
let array: Utf8Array<i64> = builder.into();
return Arc::new(array);
}

// Safety: all "values" are &str, and thus valid utf8
Expand All @@ -461,7 +462,7 @@ mod test {
fn test_utf8_kernel() {
let s = LargeStringArray::from(vec![Some("foo"), None, Some("bar")]);
unsafe {
let out = take_utf8(&s, &UInt32Array::from(vec![1, 2]));
let out = take_utf8(&s, &UInt32Array::from_slice(&[1, 2]));
assert!(out.is_null(0));
assert!(out.is_valid(1));
let out = take_utf8(&s, &UInt32Array::from(vec![None, Some(2)]));
Expand Down
13 changes: 7 additions & 6 deletions polars/polars-core/src/chunked_array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ use polars_arrow::prelude::ValueSize;
use std::convert::TryFrom;
use std::iter::{Copied, Map};
use std::marker::PhantomData;
use std::ops::Not;
use std::sync::Arc;

pub mod ops;
Expand Down Expand Up @@ -151,7 +150,7 @@ impl<T> ChunkedArray<T> {
/// Get the index of the first non null value in this ChunkedArray.
pub fn first_non_null(&self) -> Option<usize> {
let mut offset = 0;
for null_bitmap in self.null_bits() {
for (_, null_bitmap) in self.null_bits() {
if let Some(null_bitmap) = null_bitmap {
for (idx, is_valid) in null_bitmap.iter().enumerate() {
if is_valid {
Expand All @@ -167,8 +166,10 @@ impl<T> ChunkedArray<T> {
}

/// Get the buffer of bits representing null values
pub fn null_bits(&self) -> impl Iterator<Item = &Option<Bitmap>> + '_ {
self.chunks.iter().map(|arr| arr.validity())
pub fn null_bits(&self) -> impl Iterator<Item = (usize, &Option<Bitmap>)> + '_ {
self.chunks
.iter()
.map(|arr| (arr.null_count(), arr.validity()))
}

/// Shrink the capacity of this array to fit it's length.
Expand Down Expand Up @@ -1035,13 +1036,13 @@ pub(crate) mod test {
let before = arr
.chunks()
.iter()
.map(|arr| arr.get_buffer_memory_size())
.map(|arr| arrow::compute::aggregate::estimated_bytes_size(arr.as_ref()))
.sum::<usize>();
arr.shrink_to_fit();
let after = arr
.chunks()
.iter()
.map(|arr| arr.get_buffer_memory_size())
.map(|arr| arrow::compute::aggregate::estimated_bytes_size(arr.as_ref()))
.sum::<usize>();
assert!(before > after);
}
Expand Down

0 comments on commit 4e26e8a

Please sign in to comment.