Skip to content

Commit

Permalink
List builder for arbitrary nested types (#2297)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Jan 7, 2022
1 parent 6f8737d commit f38e3e8
Show file tree
Hide file tree
Showing 7 changed files with 140 additions and 9 deletions.
2 changes: 1 addition & 1 deletion polars/polars-arrow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,5 @@ thiserror = "^1.0"

[features]
strings = []
compute = ["arrow/compute_cast"]
compute = ["arrow/compute_cast", "arrow/compute_concatenate"]
parquet = ["arrow/io_parquet", "arrow/io_parquet_compression"]
67 changes: 67 additions & 0 deletions polars/polars-arrow/src/array/list.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
use arrow::array::{Array, ListArray};
use arrow::bitmap::MutableBitmap;
use arrow::compute::concatenate;
use arrow::error::Result;

/// Builds a `ListArray<i64>` out of borrowed arrays without knowing their
/// concrete type up front.
///
/// Each pushed array becomes one list entry; the child values are only
/// concatenated when the builder is finished.
pub struct AnonymousBuilder<'a> {
    /// Borrowed arrays, one per non-null list entry, in push order.
    arrays: Vec<&'a dyn Array>,
    /// List offsets; seeded with a leading `0` and extended by one per entry.
    offsets: Vec<i64>,
    /// Validity of the list entries; stays `None` until the first null is pushed.
    validity: Option<MutableBitmap>,
    /// Running total of the lengths of all pushed arrays.
    size: i64,
}

impl<'a> AnonymousBuilder<'a> {
    /// Creates an empty builder with space reserved for `size` list entries.
    pub fn new(size: usize) -> Self {
        // Offsets hold one more element than there are entries: the leading 0.
        let mut offsets = Vec::with_capacity(size + 1);
        offsets.push(0i64);
        Self {
            arrays: Vec::with_capacity(size),
            offsets,
            validity: None,
            size: 0,
        }
    }

    /// Returns the most recently written offset.
    #[inline]
    fn last_offset(&self) -> i64 {
        // `offsets` is seeded with 0 in `new` and only ever grows, so it is never empty.
        *self.offsets.last().unwrap()
    }

    /// Appends `arr` as the next (valid) list entry.
    pub fn push(&mut self, arr: &'a dyn Array) {
        self.size += arr.len() as i64;
        self.offsets.push(self.size);
        self.arrays.push(arr);

        // A validity bitmap only exists once a null has been pushed.
        if let Some(validity) = &mut self.validity {
            validity.push(true)
        }
    }

    /// Appends a null list entry.
    ///
    /// The entry is zero-length: the previous offset is repeated.
    pub fn push_null(&mut self) {
        self.offsets.push(self.last_offset());
        match &mut self.validity {
            Some(validity) => validity.push(false),
            // First null: create the bitmap, which also records this entry as null.
            None => self.init_validity(),
        }
    }

    /// Creates the validity bitmap when the first null is pushed.
    ///
    /// Must be called after the null's offset has been appended: every entry
    /// before the last one is marked valid and the last one is marked null.
    fn init_validity(&mut self) {
        // Number of list entries so far, including the null that triggered this call.
        let len = self.offsets.len() - 1;

        let mut validity = MutableBitmap::with_capacity(self.offsets.capacity());
        validity.extend_constant(len, true);
        validity.set(len - 1, false);
        self.validity = Some(validity)
    }

    /// Concatenates all pushed arrays and assembles the final `ListArray<i64>`.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `concatenate`. This function does not
    /// index into `arrays`, so a builder that never received an array cannot
    /// cause an out-of-bounds panic here; the outcome is whatever
    /// `concatenate` does for an empty slice (expected to be an error —
    /// confirm against the pinned arrow version).
    pub fn finish(self) -> Result<ListArray<i64>> {
        // Concatenate first so that failures surface as `Err` before any
        // assumption is made about the contents of `arrays`.
        let values = concatenate::concatenate(&self.arrays)?;

        // Inner type comes from the first pushed array; the concatenated
        // values are only a fallback when nothing was pushed.
        let inner_dtype = match self.arrays.first() {
            Some(first) => first.data_type().clone(),
            None => values.data_type().clone(),
        };

        let dtype = ListArray::<i64>::default_datatype(inner_dtype);
        Ok(ListArray::<i64>::from_data(
            dtype,
            self.offsets.into(),
            values.into(),
            self.validity.map(|validity| validity.into()),
        ))
    }
}
1 change: 1 addition & 0 deletions polars/polars-arrow/src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use std::sync::Arc;
use crate::utils::CustomIterTools;

pub mod default_arrays;
pub mod list;

pub trait ValueSize {
/// Useful for a Utf8 or a List to get underlying value size.
Expand Down
39 changes: 38 additions & 1 deletion polars/polars-core/src/chunked_array/builder/list.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use super::*;
use polars_arrow::prelude::PolarsArray;
use polars_arrow::{array::list::AnonymousBuilder, prelude::*};

pub trait ListBuilderTrait {
fn append_opt_series(&mut self, opt_s: Option<&Series>);
Expand Down Expand Up @@ -319,3 +319,40 @@ pub fn get_list_builder(
get_bool_builder
)
}

/// Named wrapper around [`AnonymousBuilder`] that produces a `ListChunked`.
pub struct AnonymousListBuilder<'a> {
    /// Name given to the resulting `ListChunked`.
    name: String,
    /// Underlying builder that collects the borrowed arrays.
    builder: AnonymousBuilder<'a>,
}

impl<'a> AnonymousListBuilder<'a> {
    /// Creates a builder named `name` with room reserved for `capacity` entries.
    pub fn new(name: &str, capacity: usize) -> Self {
        let builder = AnonymousBuilder::new(capacity);
        Self {
            name: String::from(name),
            builder,
        }
    }

    /// Appends the series as a list entry, or a null entry when `opt_s` is `None`.
    pub fn append_opt_series(&mut self, opt_s: Option<&'a Series>) {
        if let Some(series) = opt_s {
            self.append_series(series)
        } else {
            self.append_null();
        }
    }

    /// Appends a null list entry.
    pub fn append_null(&mut self) {
        self.builder.push_null();
    }

    /// Appends the single chunk of `s` as the next list entry.
    ///
    /// # Panics
    ///
    /// Panics if `s` does not consist of exactly one chunk.
    pub fn append_series(&mut self, s: &'a Series) {
        let chunks = s.chunks();
        assert_eq!(chunks.len(), 1);
        self.builder.push(chunks[0].as_ref())
    }

    /// Consumes the builder and returns the collected entries as a `ListChunked`.
    ///
    /// # Panics
    ///
    /// Panics if the underlying builder fails to finish.
    pub fn finish(self) -> ListChunked {
        let Self { name, builder } = self;
        let list_arr = builder.finish().unwrap();
        ListChunked::new_from_chunks(&name, vec![Arc::new(list_arr)])
    }
}
2 changes: 1 addition & 1 deletion polars/polars-core/src/chunked_array/builder/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
mod boolean;
mod from;
mod list;
pub mod list;
mod primitive;
mod utf8;

Expand Down
24 changes: 18 additions & 6 deletions polars/polars-core/src/named_from.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::chunked_array::builder::get_list_builder;
use crate::chunked_array::builder::{get_list_builder, AnonymousListBuilder};
use crate::prelude::*;
use std::borrow::Cow;

Expand Down Expand Up @@ -58,14 +58,26 @@ impl_named_from!([Option<f64>], Float64Type, new_from_opt_slice);
impl<T: AsRef<[Series]>> NamedFrom<T, ListType> for Series {
fn new(name: &str, s: T) -> Self {
let series_slice = s.as_ref();
let values_cap = series_slice.iter().fold(0, |acc, s| acc + s.len());
let list_cap = series_slice.len();

let dt = series_slice[0].dtype();
let mut builder = get_list_builder(dt, values_cap, series_slice.len(), name);
for series in series_slice {
builder.append_series(series)

// inner type is also list so we need the anonymous builder
if matches!(dt, DataType::List(_)) {
let mut builder = AnonymousListBuilder::new(name, list_cap);
for s in series_slice {
builder.append_series(s)
}
builder.finish().into_series()
} else {
let values_cap = series_slice.iter().fold(0, |acc, s| acc + s.len());

let mut builder = get_list_builder(dt, values_cap, list_cap, name);
for series in series_slice {
builder.append_series(series)
}
builder.finish().into_series()
}
builder.finish().into_series()
}
}

Expand Down
14 changes: 14 additions & 0 deletions py-polars/polars/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,18 @@ def sequence_to_pyseries(
nested_value = _get_first_non_none(value)
nested_dtype = type(nested_value) if value is not None else float

# recursively call Series constructor
if nested_dtype == list:
return sequence_to_pyseries(
name=name,
values=[
sequence_to_pyseries(name, seq, dtype=None, strict=strict)
for seq in values
],
dtype=None,
strict=strict,
)

# logs will show a panic if we infer wrong dtype
# and it's hard to error from rust side
# to reduce the likelihood of this happening
Expand Down Expand Up @@ -184,6 +196,8 @@ def sequence_to_pyseries(

elif dtype_ == pli.Series:
return PySeries.new_series_list(name, [v.inner() for v in values], strict)
elif dtype_ == PySeries:
return PySeries.new_series_list(name, values, strict)

else:
constructor = py_type_to_constructor(dtype_)
Expand Down

0 comments on commit f38e3e8

Please sign in to comment.