remove chunk_id from ChunkedArray
ritchie46 committed Apr 30, 2021
1 parent fb14369 commit 47b9be9
Showing 11 changed files with 42 additions and 65 deletions.
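The gist of the change: `ChunkedArray` previously cached its chunk lengths in a `chunk_id: Vec<usize>` field that every constructor and mutation path had to keep in sync with `chunks`; after this commit the lengths are derived on demand from the chunks themselves. A minimal standalone sketch of that idea, with simplified stand-in types rather than the real polars structs:

```rust
use std::sync::Arc;

// Stand-in for arrow's `ArrayRef`; only `len()` matters for this sketch.
type ArrayRef = Arc<Vec<i32>>;

struct MiniChunkedArray {
    // Before this commit a second field, `chunk_id: Vec<usize>`, cached the
    // chunk lengths and had to be updated whenever `chunks` changed.
    chunks: Vec<ArrayRef>,
}

impl MiniChunkedArray {
    /// Chunk lengths computed on demand; they can never drift out of sync.
    fn chunk_id(&self) -> impl Iterator<Item = usize> + '_ {
        self.chunks.iter().map(|chunk| chunk.len())
    }
}

fn main() {
    let ca = MiniChunkedArray {
        chunks: vec![Arc::new(vec![1, 2, 3]), Arc::new(vec![4, 5])],
    };
    assert_eq!(ca.chunk_id().collect::<Vec<_>>(), vec![3, 2]);
}
```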
2 changes: 0 additions & 2 deletions polars/polars-core/src/chunked_array/builder/categorical.rs
@@ -152,11 +152,9 @@ impl CategoricalChunkedBuilder {

pub fn finish(mut self) -> ChunkedArray<CategoricalType> {
let arr = Arc::new(self.array_builder.finish());
let len = arr.len();
ChunkedArray {
field: Arc::new(self.field),
chunks: vec![arr],
chunk_id: vec![len],
phantom: PhantomData,
categorical_map: Some(Arc::new(self.reverse_mapping.finish())),
}
9 changes: 0 additions & 9 deletions polars/polars-core/src/chunked_array/builder/mod.rs
@@ -47,11 +47,9 @@ impl ChunkedBuilder<bool, BooleanType> for BooleanChunkedBuilder {
fn finish(mut self) -> BooleanChunked {
let arr = Arc::new(self.array_builder.finish());

let len = arr.len();
ChunkedArray {
field: Arc::new(self.field),
chunks: vec![arr],
chunk_id: vec![len],
phantom: PhantomData,
categorical_map: None,
}
@@ -96,11 +94,9 @@ where
fn finish(mut self) -> ChunkedArray<T> {
let arr = Arc::new(self.array_builder.finish());

let len = arr.len();
ChunkedArray {
field: Arc::new(self.field),
chunks: vec![arr],
chunk_id: vec![len],
phantom: PhantomData,
categorical_map: None,
}
@@ -162,11 +158,9 @@ impl Utf8ChunkedBuilder {

pub fn finish(mut self) -> Utf8Chunked {
let arr = Arc::new(self.builder.finish());
let len = arr.len();
ChunkedArray {
field: Arc::new(self.field),
chunks: vec![arr],
chunk_id: vec![len],
phantom: PhantomData,
categorical_map: None,
}
@@ -313,7 +307,6 @@ where
ChunkedArray {
field,
chunks: vec![Arc::new(builder.finish())],
chunk_id: vec![v.len()],
phantom: PhantomData,
categorical_map: None,
}
@@ -367,11 +360,9 @@ where
macro_rules! finish_list_builder {
($self:ident) => {{
let arr = Arc::new($self.builder.finish());
let len = arr.len();
ListChunked {
field: Arc::new($self.field.clone()),
chunks: vec![arr],
chunk_id: vec![len],
phantom: PhantomData,
categorical_map: None,
}
26 changes: 13 additions & 13 deletions polars/polars-core/src/chunked_array/comparison.rs
@@ -73,7 +73,7 @@ where
}
}
// same length
- else if self.chunk_id == rhs.chunk_id {
+ else if self.chunk_id().zip(rhs.chunk_id()).all(|(l, r)| l == r) {
// should not fail if arrays are equal
self.comparison(rhs, comparison::eq)
.expect("should not fail.")
@@ -92,7 +92,7 @@
}
}
// same length
- else if self.chunk_id == rhs.chunk_id {
+ else if self.chunk_id().zip(rhs.chunk_id()).all(|(l, r)| l == r) {
self.comparison(rhs, comparison::neq)
.expect("should not fail.")
} else {
@@ -110,7 +110,7 @@
}
}
// same length
- else if self.chunk_id == rhs.chunk_id {
+ else if self.chunk_id().zip(rhs.chunk_id()).all(|(l, r)| l == r) {
self.comparison(rhs, comparison::gt)
.expect("should not fail.")
} else {
@@ -128,7 +128,7 @@
}
}
// same length
- else if self.chunk_id == rhs.chunk_id {
+ else if self.chunk_id().zip(rhs.chunk_id()).all(|(l, r)| l == r) {
self.comparison(rhs, comparison::gt_eq)
.expect("should not fail.")
} else {
@@ -146,7 +146,7 @@
}
}
// same length
- else if self.chunk_id == rhs.chunk_id {
+ else if self.chunk_id().zip(rhs.chunk_id()).all(|(l, r)| l == r) {
self.comparison(rhs, comparison::lt)
.expect("should not fail.")
} else {
@@ -164,7 +164,7 @@
}
}
// same length
- else if self.chunk_id == rhs.chunk_id {
+ else if self.chunk_id().zip(rhs.chunk_id()).all(|(l, r)| l == r) {
self.comparison(rhs, comparison::lt_eq)
.expect("should not fail.")
} else {
@@ -322,7 +322,7 @@ impl ChunkCompare<&Utf8Chunked> for Utf8Chunked {
}
}
// same length
- else if self.chunk_id == rhs.chunk_id {
+ else if self.chunk_id().zip(rhs.chunk_id()).all(|(l, r)| l == r) {
self.comparison(rhs, eq_utf8).expect("should not fail")
} else {
apply_operand_on_chunkedarray_by_iter!(self, rhs, ==)
@@ -339,7 +339,7 @@ impl ChunkCompare<&Utf8Chunked> for Utf8Chunked {
}
}
// same length
- else if self.chunk_id == rhs.chunk_id {
+ else if self.chunk_id().zip(rhs.chunk_id()).all(|(l, r)| l == r) {
self.comparison(rhs, neq_utf8).expect("should not fail")
} else {
apply_operand_on_chunkedarray_by_iter!(self, rhs, !=)
@@ -356,7 +356,7 @@ impl ChunkCompare<&Utf8Chunked> for Utf8Chunked {
}
}
// same length
- else if self.chunk_id == rhs.chunk_id {
+ else if self.chunk_id().zip(rhs.chunk_id()).all(|(l, r)| l == r) {
self.comparison(rhs, gt_utf8).expect("should not fail")
} else {
apply_operand_on_chunkedarray_by_iter!(self, rhs, >)
@@ -373,7 +373,7 @@ impl ChunkCompare<&Utf8Chunked> for Utf8Chunked {
}
}
// same length
- else if self.chunk_id == rhs.chunk_id {
+ else if self.chunk_id().zip(rhs.chunk_id()).all(|(l, r)| l == r) {
self.comparison(rhs, gt_eq_utf8).expect("should not fail")
} else {
apply_operand_on_chunkedarray_by_iter!(self, rhs, >=)
@@ -390,7 +390,7 @@ impl ChunkCompare<&Utf8Chunked> for Utf8Chunked {
}
}
// same length
- else if self.chunk_id == rhs.chunk_id {
+ else if self.chunk_id().zip(rhs.chunk_id()).all(|(l, r)| l == r) {
self.comparison(rhs, lt_utf8).expect("should not fail")
} else {
apply_operand_on_chunkedarray_by_iter!(self, rhs, <)
@@ -407,7 +407,7 @@ impl ChunkCompare<&Utf8Chunked> for Utf8Chunked {
}
}
// same length
- else if self.chunk_id == rhs.chunk_id {
+ else if self.chunk_id().zip(rhs.chunk_id()).all(|(l, r)| l == r) {
self.comparison(rhs, lt_eq_utf8).expect("should not fail")
} else {
apply_operand_on_chunkedarray_by_iter!(self, rhs, <=)
@@ -588,7 +588,7 @@ impl BooleanChunked {

macro_rules! impl_bitwise_op {
($self:ident, $rhs:ident, $arrow_method:ident, $op:tt) => {{
- if $self.chunk_id == $rhs.chunk_id {
+ if $self.chunk_id().zip($rhs.chunk_id()).all(|(l, r)| l == r) {
let result = $self.bit_operation($rhs, compute::$arrow_method);
result.unwrap()
} else {
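Every comparison path above makes the same substitution: instead of comparing two cached `Vec<usize>` fields for equality, it zips the two chunk-length iterators and checks element-wise equality. A hedged sketch of that predicate on plain slices; the helper name is mine, not part of the crate:

```rust
/// True when both arrays are built from chunks of identical lengths, so the
/// chunk-wise comparison kernels can run without rechunking either side.
fn same_chunk_layout(lhs: &[usize], rhs: &[usize]) -> bool {
    // Mirrors `self.chunk_id().zip(rhs.chunk_id()).all(|(l, r)| l == r)`.
    // `zip` stops at the shorter side, so the surrounding code has already
    // checked that both arrays hold the same total number of values.
    lhs.iter().zip(rhs.iter()).all(|(l, r)| l == r)
}

fn main() {
    assert!(same_chunk_layout(&[3, 2], &[3, 2]));
    assert!(!same_chunk_layout(&[3, 2], &[2, 3]));
}
```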
42 changes: 14 additions & 28 deletions polars/polars-core/src/chunked_array/mod.rs
@@ -55,14 +55,7 @@ use polars_arrow::array::ValueSize
use std::mem;
use std::ops::{Deref, DerefMut};

- /// Get a 'hash' of the chunks in order to compare chunk sizes quickly.
- fn create_chunk_id(chunks: &[ArrayRef]) -> Vec<usize> {
- let mut chunk_id = Vec::with_capacity(chunks.len());
- for a in chunks {
- chunk_id.push(a.len())
- }
- chunk_id
- }
+ pub type ChunkIdIter<'a> = std::iter::Map<std::slice::Iter<'a, ArrayRef>, fn(&ArrayRef) -> usize>;

/// # ChunkedArray
///
@@ -156,8 +149,6 @@ fn create_chunk_id(chunks: &[ArrayRef]) -> Vec<usize> {
pub struct ChunkedArray<T> {
pub(crate) field: Arc<Field>,
pub(crate) chunks: Vec<ArrayRef>,
// chunk lengths
chunk_id: Vec<usize>,
phantom: PhantomData<T>,
/// maps categorical u32 indexes to String values
pub(crate) categorical_map: Option<Arc<RevMapping>>,
@@ -272,8 +263,8 @@ impl<T> ChunkedArray<T> {
}

/// An iterator over the chunk lengths; a cheap identity used to compare the chunk layout of two arrays.
- pub fn chunk_id(&self) -> &Vec<usize> {
- &self.chunk_id
+ pub fn chunk_id(&self) -> ChunkIdIter {
+ self.chunks.iter().map(|chunk| chunk.len())
}
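Since `chunk_id` (and `chunk_lengths` on the series implementations further down) now returns an iterator, that iterator needs a type that can be written in signatures; that is what the `ChunkIdIter` alias provides, using a plain `fn` pointer because a closure's anonymous type cannot be named. A small self-contained sketch of the same pattern; the `ArrayRef` stand-in is an assumption for the example:

```rust
use std::sync::Arc;

// Stand-in for arrow's `ArrayRef`.
type ArrayRef = Arc<Vec<i32>>;

// Same shape as the alias added in this commit: a concrete, nameable iterator
// type, built over a `fn` pointer instead of an unnameable closure type.
type ChunkIdIter<'a> =
    std::iter::Map<std::slice::Iter<'a, ArrayRef>, fn(&ArrayRef) -> usize>;

fn chunk_id(chunks: &[ArrayRef]) -> ChunkIdIter<'_> {
    // The non-capturing closure coerces to the `fn` pointer named in the alias.
    chunks.iter().map(|chunk| chunk.len())
}

fn main() {
    let chunks = vec![Arc::new(vec![1, 2]), Arc::new(vec![3])];
    assert_eq!(chunk_id(&chunks).collect::<Vec<_>>(), vec![2, 1]);
}
```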

/// A reference to the chunks
@@ -313,7 +304,6 @@ impl<T> ChunkedArray<T> {
));
}
if self.field.data_type() == other.data_type() {
self.chunk_id.push(other.len());
self.chunks.push(other);
Ok(())
} else {
@@ -330,11 +320,9 @@

/// Create a new ChunkedArray from self, where the chunks are replaced.
fn copy_with_chunks(&self, chunks: Vec<ArrayRef>) -> Self {
let chunk_id = create_chunk_id(&chunks);
ChunkedArray {
field: self.field.clone(),
chunks,
chunk_id,
phantom: PhantomData,
categorical_map: self.categorical_map.clone(),
}
@@ -459,10 +447,8 @@ impl<T> ChunkedArray<T> {
// replace an empty array
if self.chunks.len() == 1 && self.is_empty() {
self.chunks = other.chunks.clone();
self.chunk_id = create_chunk_id(&self.chunks);
} else {
self.chunks.extend_from_slice(&other.chunks);
self.chunk_id.extend_from_slice(&other.chunk_id);
}
}
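With the cached vector gone, mutation paths such as this append only have to touch `chunks`; there is no second piece of state to forget. A hedged restatement of that invariant on the same kind of simplified stand-in as above (the names are mine):

```rust
use std::sync::Arc;

type ArrayRef = Arc<Vec<i32>>;

struct MiniChunkedArray {
    chunks: Vec<ArrayRef>,
}

impl MiniChunkedArray {
    fn chunk_id(&self) -> impl Iterator<Item = usize> + '_ {
        self.chunks.iter().map(|c| c.len())
    }

    /// Appending only pushes onto `chunks`; the derived chunk id picks up the
    /// new chunk automatically instead of relying on manual bookkeeping.
    fn append_array(&mut self, other: ArrayRef) {
        self.chunks.push(other);
    }
}

fn main() {
    let mut ca = MiniChunkedArray {
        chunks: vec![Arc::new(vec![1, 2, 3])],
    };
    ca.append_array(Arc::new(vec![4, 5]));
    assert_eq!(ca.chunk_id().collect::<Vec<_>>(), vec![3, 2]);
}
```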

@@ -490,18 +476,23 @@
/// Should be used to match the chunk_id of another ChunkedArray.
/// # Panics
/// It is the caller's responsibility to ensure that this ChunkedArray has a single chunk.
- pub(crate) fn match_chunks(&self, chunk_id: &[usize]) -> Self {
+ pub(crate) fn match_chunks<I>(&self, chunk_id: I) -> Self
+ where
+ I: Iterator<Item = usize>,
+ {
debug_assert!(self.chunks.len() == 1);
// Takes a ChunkedArray containing a single chunk
let slice = |ca: &Self| {
let array = &ca.chunks[0];

- let mut chunks = Vec::with_capacity(chunk_id.len());
let mut offset = 0;
- for len in chunk_id {
- chunks.push(array.slice(offset, *len));
- offset += *len;
- }
+ let chunks = chunk_id
+ .map(|len| {
+ let out = array.slice(offset, len);
+ offset += len;
+ out
+ })
+ .collect();

Self::new_from_chunks(self.name(), chunks)
};
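`match_chunks` now consumes the target layout as an iterator of lengths and slices its single chunk piece by piece while keeping a running offset. A hedged sketch of the same bookkeeping on a plain slice; the helper name is mine and `to_vec` stands in for arrow's zero-copy `Array::slice`:

```rust
/// Split one contiguous buffer into pieces whose lengths follow `chunk_id`,
/// mirroring how `match_chunks` cuts its single chunk: slice at the current
/// offset first, then advance the offset by the piece length.
fn split_by_layout(values: &[i32], chunk_id: impl Iterator<Item = usize>) -> Vec<Vec<i32>> {
    let mut offset = 0;
    chunk_id
        .map(|len| {
            let piece = values[offset..offset + len].to_vec();
            offset += len;
            piece
        })
        .collect()
}

fn main() {
    let pieces = split_by_layout(&[1, 2, 3, 4, 5], [3usize, 2].into_iter());
    assert_eq!(pieces, vec![vec![1, 2, 3], vec![4, 5]]);
}
```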

@@ -531,11 +522,9 @@
T::get_dtype()
};
let field = Arc::new(Field::new(name, datatype));
let chunk_id = create_chunk_id(&chunks);
ChunkedArray {
field,
chunks,
chunk_id,
phantom: PhantomData,
categorical_map: None,
}
@@ -642,12 +631,10 @@
values: AlignedVec<T::Native>,
buffer: Option<Buffer>,
) -> Self {
let len = values.len();
let arr = Arc::new(values.into_primitive_array::<T>(buffer));
ChunkedArray {
field: Arc::new(Field::new(name, T::get_dtype())),
chunks: vec![arr],
chunk_id: vec![len],
phantom: PhantomData,
categorical_map: None,
}
@@ -816,7 +803,6 @@ impl<T> Clone for ChunkedArray<T> {
ChunkedArray {
field: self.field.clone(),
chunks: self.chunks.clone(),
chunk_id: self.chunk_id.clone(),
phantom: PhantomData,
categorical_map: self.categorical_map.clone(),
}
2 changes: 0 additions & 2 deletions polars/polars-core/src/chunked_array/object/builder.rs
@@ -76,7 +76,6 @@
ChunkedArray {
field: Arc::new(self.field),
chunks: vec![arr],
chunk_id: vec![len],
phantom: PhantomData,
categorical_map: None,
}
@@ -142,7 +141,6 @@
ObjectChunked {
field,
chunks: vec![arr],
chunk_id: vec![len],
phantom: PhantomData,
categorical_map: None,
}
1 change: 0 additions & 1 deletion polars/polars-core/src/chunked_array/upstream_traits.rs
@@ -18,7 +18,6 @@ impl<T> Default for ChunkedArray<T> {
ChunkedArray {
field: Arc::new(Field::new("default", DataType::Null)),
chunks: Default::default(),
chunk_id: Default::default(),
phantom: PhantomData,
categorical_map: None,
}
8 changes: 7 additions & 1 deletion polars/polars-core/src/frame/mod.rs
@@ -128,7 +128,13 @@ impl DataFrame {

/// Ensure all the chunks in the DataFrame are aligned.
pub fn rechunk(&mut self) -> &mut Self {
- if self.columns.iter().map(|s| s.chunk_lengths()).all_equal() {
+ // TODO: remove vec allocation
+ if self
+ .columns
+ .iter()
+ .map(|s| s.chunk_lengths().collect_vec())
+ .all_equal()
+ {
self
} else {
self.as_single_chunk()
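`rechunk` now rebuilds each column's chunk lengths from the iterator and compares them with itertools' `all_equal`; the `TODO` notes that collecting into a `Vec` per column is a temporary allocation. A minimal sketch of the check itself, assuming the `itertools` crate that the surrounding `collect_vec`/`all_equal` calls come from:

```rust
use itertools::Itertools;

/// True when every column reports exactly the same chunk layout, in which
/// case the DataFrame is already aligned and no rechunking is needed.
fn chunks_aligned(column_chunk_lengths: &[Vec<usize>]) -> bool {
    // Each inner Vec<usize> stands in for one column's collected
    // `chunk_lengths()`; `all_equal` compares neighbouring items.
    column_chunk_lengths.iter().all_equal()
}

fn main() {
    assert!(chunks_aligned(&[vec![3, 2], vec![3, 2]]));
    assert!(!chunks_aligned(&[vec![3, 2], vec![5]]));
}
```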
5 changes: 2 additions & 3 deletions polars/polars-core/src/series/implementations/dates.rs
@@ -10,8 +10,7 @@ use super::private
use super::IntoSeries;
use super::SeriesTrait;
use super::SeriesWrap;
- use crate::chunked_array::comparison::*;
- use crate::chunked_array::AsSinglePtr;
+ use crate::chunked_array::{comparison::*, AsSinglePtr, ChunkIdIter};
use crate::fmt::FmtList;
#[cfg(feature = "pivot")]
use crate::frame::groupby::pivot::*;
@@ -256,7 +255,7 @@ macro_rules! impl_dyn_series {
self.0.array_data()
}

- fn chunk_lengths(&self) -> &Vec<usize> {
+ fn chunk_lengths(&self) -> ChunkIdIter {
self.0.chunk_id()
}
fn name(&self) -> &str {
4 changes: 2 additions & 2 deletions polars/polars-core/src/series/implementations/mod.rs
@@ -15,7 +15,7 @@ use super::SeriesTrait
use crate::chunked_array::comparison::*;
use crate::chunked_array::{
ops::aggregate::{ChunkAggSeries, VarAggSeries},
- AsSinglePtr,
+ AsSinglePtr, ChunkIdIter,
};
use crate::fmt::FmtList;
#[cfg(feature = "pivot")]
@@ -202,7 +202,7 @@ macro_rules! impl_dyn_series {
self.0.array_data()
}

- fn chunk_lengths(&self) -> &Vec<usize> {
+ fn chunk_lengths(&self) -> ChunkIdIter {
self.0.chunk_id()
}
fn name(&self) -> &str {
Expand Down
