Skip to content

Commit

Permalink
Split slice
Browse files Browse the repository at this point in the history
This allows for specialized impls.

Also fixes a bug in the Categorical -> unique fast path.
When a slice operation used offset == 0, the fast-path check
returned a false positive.
  • Loading branch information
ritchie46 committed Dec 2, 2021
1 parent 78f2de5 commit abcb75b
Show file tree
Hide file tree
Showing 7 changed files with 146 additions and 78 deletions.
11 changes: 8 additions & 3 deletions polars/polars-core/src/chunked_array/categorical/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,14 @@ impl CategoricalChunked {
}

pub(crate) fn can_fast_unique(&self) -> bool {
self.bit_settings & 1 << 4 != 0 && self.chunks.len() == 1 && {
let arr = self.downcast_iter().next().unwrap();
arr.values().offset() == 0
self.bit_settings & 1 << 3 != 0 && self.chunks.len() == 1
}

/// Turn the fast-unique flag on or off.
///
/// The flag lives in bit 3 of `bit_settings`: `can == true` sets that bit,
/// `can == false` clears it. All other bits are left untouched.
pub(crate) fn set_fast_unique(&mut self, can: bool) {
    const FAST_UNIQUE_MASK: u8 = 1u8 << 3;

    if can {
        self.bit_settings |= FAST_UNIQUE_MASK;
    } else {
        self.bit_settings &= !FAST_UNIQUE_MASK;
    }
}

Expand Down
69 changes: 1 addition & 68 deletions polars/polars-core/src/chunked_array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ use polars_arrow::prelude::*;

#[cfg(feature = "dtype-categorical")]
use crate::chunked_array::categorical::RevMapping;
use crate::utils::{slice_offsets, CustomIterTools};
use crate::utils::CustomIterTools;
use std::mem;

#[cfg(not(feature = "dtype-categorical"))]
Expand Down Expand Up @@ -264,16 +264,6 @@ impl<T> ChunkedArray<T> {
}
}

/// Total number of elements, obtained by adding up the length of every chunk.
pub fn len(&self) -> usize {
    self.chunks.iter().map(|arr| arr.len()).sum::<usize>()
}

/// Returns `true` when the ChunkedArray holds no elements, i.e. `len()` is zero.
pub fn is_empty(&self) -> bool {
    matches!(self.len(), 0)
}

/// Unique id representing the number of chunks
pub fn chunk_id(&self) -> ChunkIdIter {
self.chunks.iter().map(|chunk| chunk.len())
Expand All @@ -295,11 +285,6 @@ impl<T> ChunkedArray<T> {
self.chunks.iter().map(|arr| arr.null_count()).sum()
}

/// View of the first `num_elements` elements.
///
/// Delegates to `slice` with an offset of zero.
pub fn limit(&self, num_elements: usize) -> Self {
    let from_start: i64 = 0;
    self.slice(from_start, num_elements)
}

/// Append arrow array in place.
///
/// ```rust
Expand Down Expand Up @@ -342,41 +327,6 @@ impl<T> ChunkedArray<T> {
}
}

/// Slice the array. A new list of chunks is built; the underlying data slices are zero copy.
///
/// A negative `offset` is counted from the end of the array.
/// This method never errors: when `offset` or `length` is out of bounds,
/// the best matching slice is returned.
pub fn slice(&self, offset: i64, length: usize) -> Self {
    // `to_skip`: elements still to pass over before the window starts.
    // `to_take`: elements still to collect.
    let (mut to_skip, mut to_take) = slice_offsets(offset, length, self.len());
    let mut sliced = Vec::new();

    for chunk in &self.chunks {
        let chunk_len = chunk.len();

        // The whole chunk lies before the start of the window.
        if to_skip > 0 && to_skip >= chunk_len {
            to_skip -= chunk_len;
            continue;
        }

        // Take up to the end of this chunk, or less if the window ends inside it.
        let take_len = if to_take + to_skip > chunk_len {
            chunk_len - to_skip
        } else {
            to_take
        };

        sliced.push(chunk.slice(to_skip, take_len).into());
        to_take -= take_len;
        // Only the first chunk that contributes can start mid-chunk.
        to_skip = 0;

        if to_take == 0 {
            break;
        }
    }

    self.copy_with_chunks(sliced)
}

/// Get a mask of the null values.
pub fn is_null(&self) -> BooleanChunked {
if !self.has_validity() {
Expand Down Expand Up @@ -420,23 +370,6 @@ impl<T> ChunkedArray<T> {
self.field.data_type()
}

/// First elements of the ChunkedArray.
///
/// Takes `length` elements, or 10 when `length` is `None`; the count is capped
/// at the length of the array.
pub fn head(&self, length: Option<usize>) -> Self {
    let wanted = length.unwrap_or(10);
    self.slice(0, wanted.min(self.len()))
}

/// Last elements of the ChunkedArray.
///
/// Takes `length` elements, or 10 when `length` is `None`; the count is capped
/// at the length of the array. The slice is requested with a negative offset.
pub fn tail(&self, length: Option<usize>) -> Self {
    let take = length.unwrap_or(10).min(self.len());
    self.slice(-(take as i64), take)
}

/// Name of the ChunkedArray.
pub fn name(&self) -> &str {
self.field.name()
Expand Down
66 changes: 61 additions & 5 deletions polars/polars-core/src/chunked_array/ops/chunkops.rs
Original file line number Diff line number Diff line change
@@ -1,18 +1,48 @@
#[cfg(feature = "object")]
use crate::chunked_array::object::builder::ObjectChunkedBuilder;
use crate::prelude::*;
use crate::utils::slice_offsets;
#[cfg(feature = "object")]
use arrow::array::Array;
use arrow::compute::concatenate;
use itertools::Itertools;
#[cfg(feature = "dtype-categorical")]
use std::ops::Deref;

pub trait ChunkOps {
/// Aggregate to contiguous memory.
fn rechunk(&self) -> Self
where
Self: std::marker::Sized;
/// Build the chunk list for a slice of `chunks`.
///
/// `offset` and `slice_length` describe the requested window and `own_length` is the
/// combined length of `chunks`; the three are first normalised by `slice_offsets`.
/// Chunks that lie entirely before the window are passed over, and each remaining
/// chunk is sliced until the requested number of elements has been collected.
///
/// NOTE(review): if the offset check skips every chunk, the returned `Vec` is empty —
/// confirm callers accept a chunk list without any chunks.
#[inline]
fn slice(
    chunks: &[ArrayRef],
    offset: i64,
    slice_length: usize,
    own_length: usize,
) -> Vec<ArrayRef> {
    // `to_skip`: elements still to pass over before the window starts.
    // `to_take`: elements still to collect.
    let (mut to_skip, mut to_take) = slice_offsets(offset, slice_length, own_length);
    let mut sliced = Vec::with_capacity(1);

    for chunk in chunks {
        let chunk_len = chunk.len();

        // The whole chunk lies before the start of the window.
        if to_skip > 0 && to_skip >= chunk_len {
            to_skip -= chunk_len;
            continue;
        }

        // Take up to the end of this chunk, or less if the window ends inside it.
        let take_len = if to_take + to_skip > chunk_len {
            chunk_len - to_skip
        } else {
            to_take
        };

        sliced.push(chunk.slice(to_skip, take_len).into());
        to_take -= take_len;
        // Only the first chunk that contributes can start mid-chunk.
        to_skip = 0;

        if to_take == 0 {
            break;
        }
    }

    sliced
}

impl<T> ChunkOps for ChunkedArray<T>
Expand All @@ -31,6 +61,10 @@ where
ChunkedArray::new_from_chunks(self.name(), chunks)
}
}
/// Slice via the shared free function `slice`, then wrap the resulting chunks.
#[inline]
fn slice(&self, offset: i64, length: usize) -> Self {
    let own_length = self.len();
    let sliced_chunks = slice(&self.chunks, offset, length, own_length);
    self.copy_with_chunks(sliced_chunks)
}
}

impl ChunkOps for BooleanChunked {
Expand All @@ -46,6 +80,10 @@ impl ChunkOps for BooleanChunked {
ChunkedArray::new_from_chunks(self.name(), chunks)
}
}
/// Slice via the shared free function `slice`, then wrap the resulting chunks.
#[inline]
fn slice(&self, offset: i64, length: usize) -> Self {
    let own_length = self.len();
    let sliced_chunks = slice(&self.chunks, offset, length, own_length);
    self.copy_with_chunks(sliced_chunks)
}
}

impl ChunkOps for Utf8Chunked {
Expand All @@ -61,6 +99,10 @@ impl ChunkOps for Utf8Chunked {
ChunkedArray::new_from_chunks(self.name(), chunks)
}
}
/// Slice via the shared free function `slice`, then wrap the resulting chunks.
#[inline]
fn slice(&self, offset: i64, length: usize) -> Self {
    let own_length = self.len();
    let sliced_chunks = slice(&self.chunks, offset, length, own_length);
    self.copy_with_chunks(sliced_chunks)
}
}

#[cfg(feature = "dtype-categorical")]
Expand All @@ -74,6 +116,12 @@ impl ChunkOps for CategoricalChunked {
out.categorical_map = cat_map;
out
}
/// Slice via the shared free function `slice` and clear the fast-unique flag on the result.
#[inline]
fn slice(&self, offset: i64, length: usize) -> Self {
    let sliced_chunks = slice(&self.chunks, offset, length, self.len());
    let mut sliced = self.copy_with_chunks(sliced_chunks);
    // The flag is cleared unconditionally: a sliced copy never advertises the
    // `unique` fast path.
    sliced.set_fast_unique(false);
    sliced
}
}

impl ChunkOps for ListChunked {
Expand All @@ -93,6 +141,10 @@ impl ChunkOps for ListChunked {
ca
}
}
/// Slice via the shared free function `slice`, then wrap the resulting chunks.
#[inline]
fn slice(&self, offset: i64, length: usize) -> Self {
    let own_length = self.len();
    let sliced_chunks = slice(&self.chunks, offset, length, own_length);
    self.copy_with_chunks(sliced_chunks)
}
}

#[cfg(feature = "object")]
Expand Down Expand Up @@ -132,6 +184,10 @@ where
builder.finish()
}
}
/// Slice via the shared free function `slice`, then wrap the resulting chunks.
#[inline]
fn slice(&self, offset: i64, length: usize) -> Self {
    let own_length = self.len();
    let sliced_chunks = slice(&self.chunks, offset, length, own_length);
    self.copy_with_chunks(sliced_chunks)
}
}

#[cfg(test)]
Expand Down
13 changes: 13 additions & 0 deletions polars/polars-core/src/chunked_array/ops/len.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
use super::*;

impl<T> ChunkLen for ChunkedArray<T> {
    /// Combined length of all the chunks.
    ///
    /// With exactly one chunk its length is returned directly; otherwise the
    /// lengths of all chunks are added up.
    #[inline]
    fn len(&self) -> usize {
        if self.chunks.len() == 1 {
            // fast path: nothing to sum
            self.chunks[0].len()
        } else {
            self.chunks.iter().map(|arr| arr.len()).sum::<usize>()
        }
    }
}
58 changes: 58 additions & 0 deletions polars/polars-core/src/chunked_array/ops/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ pub mod full;
mod interpolate;
#[cfg(feature = "is_in")]
mod is_in;
mod len;
mod peaks;
#[cfg(feature = "repeat_by")]
mod repeat_by;
Expand Down Expand Up @@ -776,3 +777,60 @@ pub trait StrConcat {
/// * `delimiter` - A string that will act as delimiter between values.
fn str_concat(&self, delimiter: &str) -> Utf8Chunked;
}

/// Length queries for chunked containers.
pub trait ChunkLen {
    /// Number of elements in the ChunkedArray.
    fn len(&self) -> usize;

    /// Returns `true` when `len()` is zero.
    fn is_empty(&self) -> bool {
        matches!(self.len(), 0)
    }
}

/// Chunk-level operations: rechunking, slicing, and the `limit`/`head`/`tail`
/// helpers that are defined in terms of `slice`.
pub trait ChunkOps: ChunkLen {
    /// Aggregate to contiguous memory.
    fn rechunk(&self) -> Self
    where
        Self: Sized;

    /// Slice the array. A new list of chunks is built; the underlying data slices are zero copy.
    ///
    /// A negative `offset` is counted from the end of the array.
    /// This method never errors: when `offset` or `length` is out of bounds,
    /// the best matching slice is returned.
    fn slice(&self, offset: i64, length: usize) -> Self
    where
        Self: Sized;

    /// View of the first `num_elements` elements; delegates to `slice` with offset zero.
    fn limit(&self, num_elements: usize) -> Self
    where
        Self: Sized,
    {
        let from_start: i64 = 0;
        self.slice(from_start, num_elements)
    }

    /// First elements of the ChunkedArray.
    ///
    /// Takes `length` elements, or 10 when `length` is `None`; the count is capped
    /// at `len()`.
    fn head(&self, length: Option<usize>) -> Self
    where
        Self: Sized,
    {
        let wanted = length.unwrap_or(10);
        self.slice(0, wanted.min(self.len()))
    }

    /// Last elements of the ChunkedArray.
    ///
    /// Takes `length` elements, or 10 when `length` is `None`; the count is capped
    /// at `len()`. The slice is requested with a negative offset.
    fn tail(&self, length: Option<usize>) -> Self
    where
        Self: Sized,
    {
        let take = length.unwrap_or(10).min(self.len());
        self.slice(-(take as i64), take)
    }
}
2 changes: 1 addition & 1 deletion polars/polars-core/src/prelude.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ pub use crate::{
PrimitiveChunkedBuilder, Utf8ChunkedBuilder,
},
iterator::{IntoNoNullIterator, PolarsIterator},
ops::{aggregate::*, chunkops::ChunkOps, *},
ops::{aggregate::*, *},
ChunkedArray,
},
datatypes,
Expand Down
5 changes: 4 additions & 1 deletion polars/polars-core/src/utils/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,10 @@ macro_rules! split_array {
}

#[cfg(feature = "private")]
pub fn split_ca<T>(ca: &ChunkedArray<T>, n: usize) -> Result<Vec<ChunkedArray<T>>> {
pub fn split_ca<T>(ca: &ChunkedArray<T>, n: usize) -> Result<Vec<ChunkedArray<T>>>
where
ChunkedArray<T>: ChunkOps,
{
split_array!(ca, n, i64)
}

Expand Down

0 comments on commit abcb75b

Please sign in to comment.