Skip to content

Commit

Permalink
And more migrations (#827)
Browse files Browse the repository at this point in the history
* More migrations.

* more migrations

Co-authored-by: Jorge C. Leitao <jorgecarleitao@gmail.com>
  • Loading branch information
ritchie46 and jorgecarleitao committed Jun 16, 2021
1 parent 9f50fff commit dd198da
Show file tree
Hide file tree
Showing 30 changed files with 523 additions and 265 deletions.
2 changes: 1 addition & 1 deletion polars/polars-arrow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@ description = "Arrow interfaces for Polars DataFrame library"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "175dede04d34bd5a73ef6cfc27b84a388a1b35e8", default-features = false }
arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "7401f4e32fdd4f9c4e21fb2782a3996d6a0277b4", default-features = false }
thiserror = "^1.0"
num = "^0.4"
29 changes: 28 additions & 1 deletion polars/polars-arrow/src/array.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
use arrow::array::ListArray;
use arrow::{
array::{ArrayRef, ListArray, Utf8Array},
datatypes::DataType,
};

pub trait ValueSize {
/// Useful for a Utf8 or a List to get underlying value size.
Expand All @@ -11,3 +14,27 @@ impl ValueSize for ListArray<i64> {
self.values().len()
}
}

/// A large-offset (`i64`) string array's value size is the total number of
/// bytes in its underlying values buffer (`self.values().len()`).
impl ValueSize for Utf8Array<i64> {
    fn get_values_size(&self) -> usize {
        self.values().len()
    }
}

/// Dispatch [`ValueSize::get_values_size`] on a dynamically typed array.
///
/// Only the large-offset (`i64`) variants are handled here, mirroring the
/// concrete impls above (`Utf8Array<i64>` and `ListArray<i64>`).
impl ValueSize for ArrayRef {
    fn get_values_size(&self) -> usize {
        match self.data_type() {
            // The downcasts cannot fail: the data type uniquely determines
            // the concrete array implementation behind the trait object.
            DataType::LargeUtf8 => self
                .as_any()
                .downcast_ref::<Utf8Array<i64>>()
                .unwrap()
                .get_values_size(),
            DataType::LargeList(_) => self
                .as_any()
                .downcast_ref::<ListArray<i64>>()
                .unwrap()
                .get_values_size(),
            // Name the offending type in the panic message so the failure is
            // diagnosable without a debugger.
            dt => unimplemented!("get_values_size not implemented for {:?}", dt),
        }
    }
}
1 change: 1 addition & 0 deletions polars/polars-arrow/src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,4 @@ pub fn combine_validities(opt_l: Option<&Bitmap>, opt_r: Option<&Bitmap>) -> Opt
(None, None) => None,
}
}
// SAFETY: presumably `TrustMyLength` reports an exact `size_hint` by
// construction (the caller supplies the true length when wrapping the
// iterator) — TODO(review): confirm against its definition in this module.
unsafe impl<I, J> arrow::trusted_len::TrustedLen for TrustMyLength<I, J> where I: Iterator<Item = J> {}
2 changes: 1 addition & 1 deletion polars/polars-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ dtype-u64 = []
parquet = ["arrow/io_parquet"]

[dependencies]
arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "175dede04d34bd5a73ef6cfc27b84a388a1b35e8", default-features = false }
arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "7401f4e32fdd4f9c4e21fb2782a3996d6a0277b4", default-features = false }
#arrow = {version = "4.2", default-features = false }
#parquet = {version = "4.2", default-features = false, optional = true }
polars-arrow = {version = "0.14.2", path = "../polars-arrow"}
Expand Down
42 changes: 12 additions & 30 deletions polars/polars-core/src/chunked_array/builder/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use crate::{
prelude::*,
utils::{get_iter_capacity, NoNull},
};
use arrow::{array::*, bitmap::Bitmap, buffer::MutableBuffer};
use arrow::{array::*, bitmap::Bitmap};
use num::Num;
use std::borrow::Cow;
use std::iter::FromIterator;
Expand Down Expand Up @@ -91,7 +91,7 @@ where
}

fn finish(mut self) -> ChunkedArray<T> {
let arr: PrimitiveArray<T::Native> = self.array_builder.into();
let arr: PrimitiveArray<T::Native> = self.array_builder.to(T::get_dtype().to_arrow());
let arr = Arc::new(arr) as ArrayRef;

ChunkedArray {
Expand All @@ -109,7 +109,7 @@ where
{
pub fn new(name: &str, capacity: usize) -> Self {
PrimitiveChunkedBuilder {
array_builder: Primitive::<T>::with_capacity(capacity),
array_builder: Primitive::<T::Native>::with_capacity(capacity),
field: Field::new(name, T::get_dtype()),
}
}
Expand All @@ -130,7 +130,7 @@ impl Utf8ChunkedBuilder {
/// * `bytes_capacity` - Number of bytes needed to store the string values.
pub fn new(name: &str, capacity: usize, bytes_capacity: usize) -> Self {
Utf8ChunkedBuilder {
builder: Utf8Primitive::<i32>::with_capacity(bytes_capacity, capacity),
builder: Utf8Primitive::<i64>::with_capacities(capacity, bytes_capacity),
capacity,
field: Field::new(name, DataType::Utf8),
}
Expand All @@ -139,25 +139,22 @@ impl Utf8ChunkedBuilder {
/// Appends a value of type `T` into the builder
#[inline]
pub fn append_value<S: AsRef<str>>(&mut self, v: S) {
self.builder.append_value(v.as_ref()).unwrap();
self.builder.push(Some(v.as_ref()));
}

/// Appends a null slot into the builder
#[inline]
pub fn append_null(&mut self) {
self.builder.append_null().unwrap();
self.builder.push(None);
}

#[inline]
pub fn append_option<S: AsRef<str>>(&mut self, opt: Option<S>) {
match opt {
Some(s) => self.append_value(s.as_ref()),
None => self.append_null(),
}
self.builder.push(opt.map(|x| x.as_ref()));
}

pub fn finish(mut self) -> Utf8Chunked {
let arr = Arc::new(self.builder.finish());
let arr = Arc::new(self.builder.to());
ChunkedArray {
field: Arc::new(self.field),
chunks: vec![arr],
Expand Down Expand Up @@ -195,31 +192,16 @@ impl ChunkedBuilder<Cow<'_, str>, Utf8Type> for Utf8ChunkedBuilderCow {
}
}

/// Get the null count and the null bitmap of the arrow array
pub fn get_bitmap<T: Array + ?Sized>(arr: &T) -> (usize, Option<Bitmap>) {
    let data = arr.data();
    let null_count = data.null_count();
    // Clone the backing buffer of the validity bitmap, if one is present.
    let bitmap = data.null_bitmap().as_ref().map(|b| b.buffer_ref().clone());
    (null_count, bitmap)
}

// Used in polars/src/chunked_array/apply.rs:24 to collect from aligned vecs and null bitmaps
impl<T> FromIterator<(MutableBuffer<T::Native>, Option<Bitmap>)> for ChunkedArray<T>
impl<T> FromIterator<(AlignedVec<T::Native>, Option<Bitmap>)> for ChunkedArray<T>
where
T: PolarsNumericType,
{
fn from_iter<I: IntoIterator<Item = (MutableBuffer<T::Native>, Option<Bitmap>)>>(
iter: I,
) -> Self {
fn from_iter<I: IntoIterator<Item = (AlignedVec<T::Native>, Option<Bitmap>)>>(iter: I) -> Self {
let mut chunks = vec![];

for (values, opt_buffer) in iter {
let arr = values.into_primitive_array::<T>(opt_buffer);
chunks.push(Arc::new(arr) as ArrayRef)
chunks.push(to_array::<T>(values, opt_buffer))
}
ChunkedArray::new_from_chunks("from_iter", chunks)
}
Expand Down Expand Up @@ -301,7 +283,7 @@ where

let mut builder = Utf8Primitive::<i64>::with_capacities(values_size, v.len());
v.iter().for_each(|val| {
builder.append_value(val.as_ref()).unwrap();
builder.push(Some(val.as_ref()));
});

let field = Arc::new(Field::new(name, DataType::Utf8));
Expand Down
28 changes: 13 additions & 15 deletions polars/polars-core/src/chunked_array/comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use num::{Num, NumCast, ToPrimitive};
use std::ops::{BitAnd, BitOr, Not};
use std::sync::Arc;

type LargeStringArray = Utf8Array<u64>;
type LargeStringArray = Utf8Array<i64>;

impl<T> ChunkedArray<T>
where
Expand Down Expand Up @@ -453,13 +453,13 @@ impl NumComp for u64 {}
impl<T> ChunkedArray<T>
where
T: PolarsNumericType,
T::Native: NumCast,
T::Native: NumCast + NumComp,
{
fn primitive_compare_scalar<Rhs: NumComp + ToPrimitive>(
&self,
rhs: Rhs,
op: comparison::Operator,
) {
) -> BooleanChunked {
let rhs = NumCast::from(rhs).expect("could not cast to underlying chunkedarray type");
self.apply_kernel_cast(|arr| {
Arc::new(comparison::primitive_compare_scalar(arr, rhs, op).unwrap())
Expand All @@ -470,7 +470,7 @@ where
impl<T, Rhs> ChunkCompare<Rhs> for ChunkedArray<T>
where
T: PolarsNumericType,
T::Native: NumCast,
T::Native: NumCast + NumComp,
Rhs: NumComp + ToPrimitive,
{
fn eq_missing(&self, rhs: Rhs) -> BooleanChunked {
Expand Down Expand Up @@ -503,10 +503,8 @@ where
}

impl Utf8Chunked {
fn utf8_compare_scalar(&self, rhs: &str, op: comparison::Operator) {
self.apply_kernel_cast(|arr| {
Arc::new(comparison::utf8_compare_scalar(arr, rhs, op).unwrap())
})
fn utf8_compare_scalar(&self, rhs: &str, op: comparison::Operator) -> BooleanChunked {
self.apply_kernel_cast(|arr| Arc::new(comparison::utf8_compare_scalar(arr, rhs, op)))
}
}

Expand All @@ -516,26 +514,26 @@ impl ChunkCompare<&str> for Utf8Chunked {
}

fn eq(&self, rhs: &str) -> BooleanChunked {
self.utf8_compare_scalar(comparison::Operator::Eq)
self.utf8_compare_scalar(rhs, comparison::Operator::Eq)
}
fn neq(&self, rhs: &str) -> BooleanChunked {
self.utf8_compare_scalar(comparison::Operator::Neq)
self.utf8_compare_scalar(rhs, comparison::Operator::Neq)
}

fn gt(&self, rhs: &str) -> BooleanChunked {
self.utf8_compare_scalar(comparison::Operator::Gt)
self.utf8_compare_scalar(rhs, comparison::Operator::Gt)
}

fn gt_eq(&self, rhs: &str) -> BooleanChunked {
self.utf8_compare_scalar(comparison::Operator::GtEq)
self.utf8_compare_scalar(rhs, comparison::Operator::GtEq)
}

fn lt(&self, rhs: &str) -> BooleanChunked {
self.utf8_compare_scalar(comparison::Operator::Lt)
self.utf8_compare_scalar(rhs, comparison::Operator::Lt)
}

fn lt_eq(&self, rhs: &str) -> BooleanChunked {
self.utf8_compare_scalar(comparison::Operator::LtEq)
self.utf8_compare_scalar(rhs, comparison::Operator::LtEq)
}
}

Expand Down Expand Up @@ -686,7 +684,7 @@ impl Not for &BooleanChunked {
let chunks = self
.downcast_iter()
.map(|a| {
let arr = compute::boolean::not(a).expect("should not fail");
let arr = compute::boolean::not(a);
Arc::new(arr) as ArrayRef
})
.collect::<Vec<_>>();
Expand Down
22 changes: 9 additions & 13 deletions polars/polars-core/src/chunked_array/kernels/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ pub(crate) mod take_agg;
#[cfg_attr(docsrs, doc(cfg(feature = "temporal")))]
pub mod temporal;

use crate::datatypes::{DataType, PolarsFloatType, PolarsNumericType};
use crate::datatypes::{DataType, PolarsNumericType};
use arrow::array::{Array, ArrayRef, BooleanArray, PrimitiveArray};
use arrow::bitmap::Bitmap;
use arrow::types::NativeType;
Expand Down Expand Up @@ -45,10 +45,9 @@ where
Arc::new(array)
}

pub(crate) fn is_nan<T>(arr: &PrimitiveArray<T::Native>) -> ArrayRef
pub(crate) fn is_nan<T>(arr: &PrimitiveArray<T>) -> ArrayRef
where
T: PolarsFloatType,
T::Native: Float,
T: NativeType + Float,
{
let validity = arr.validity();

Expand All @@ -57,10 +56,9 @@ where
Arc::new(BooleanArray::from_data(values, arr.validity().clone()))
}

pub(crate) fn is_not_nan<T>(arr: &PrimitiveArray<T::Native>) -> ArrayRef
pub(crate) fn is_not_nan<T>(arr: &PrimitiveArray<T>) -> ArrayRef
where
T: PolarsFloatType,
T::Native: Float,
T: NativeType + Float,
{
let validity = arr.validity();

Expand All @@ -69,10 +67,9 @@ where
Arc::new(BooleanArray::from_data(values, arr.validity().clone()))
}

pub(crate) fn is_finite<T>(arr: &PrimitiveArray<T::Native>) -> ArrayRef
pub(crate) fn is_finite<T>(arr: &PrimitiveArray<T>) -> ArrayRef
where
T: PolarsFloatType,
T::Native: Float,
T: NativeType + Float,
{
let validity = arr.validity();

Expand All @@ -81,10 +78,9 @@ where
Arc::new(BooleanArray::from_data(values, arr.validity().clone()))
}

pub(crate) fn is_infinite<T>(arr: &PrimitiveArray<T::Native>) -> ArrayRef
pub(crate) fn is_infinite<T>(arr: &PrimitiveArray<T>) -> ArrayRef
where
T: PolarsFloatType,
T::Native: Float,
T: NativeType + Float,
{
let validity = arr.validity();

Expand Down
10 changes: 3 additions & 7 deletions polars/polars-core/src/chunked_array/kernels/strings.rs
Original file line number Diff line number Diff line change
@@ -1,17 +1,13 @@
use crate::prelude::Arc;
use arrow::array::{ArrayRef, UInt32Array, Utf8Array};
use arrow::array::{Array, ArrayRef, UInt32Array, Utf8Array};
use arrow::buffer::Buffer;
use arrow::datatypes::DataType;

pub(crate) fn string_lengths(array: &Utf8Array<i64>) -> ArrayRef {
let values = array
.offsets()
.iter()
.windows(2)
.map(|x| (x[1] - x[0]) as u32);
let values = array.offsets().windows(2).map(|x| (x[1] - x[0]) as u32);

let values = Buffer::from_trusted_len_iter(values);

UInt32Array::from_data(DataType::UInt32, values, array.validity().clone());
let array = UInt32Array::from_data(DataType::UInt32, values, array.validity().clone());
Arc::new(array)
}

0 comments on commit dd198da

Please sign in to comment.