Nested dict (#4131)
ritchie46 committed Jul 23, 2022
1 parent c4f3e6b commit d11c216
Showing 9 changed files with 144 additions and 83 deletions.
2 changes: 1 addition & 1 deletion polars/polars-arrow/Cargo.toml
@@ -11,7 +11,7 @@ description = "Arrow interfaces for Polars DataFrame library"
[dependencies]
# arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "98e49133b2e56e51e30335830485b3cf768eb5a2", features = ["compute_concatenate"], default-features = false }
# arrow = { package = "arrow2", path = "../../../arrow2", features = ["compute_concatenate"], default-features = false }
-arrow = { package = "arrow2", git = "https://github.com/ritchie46/arrow2", branch = "apply_validity", features = ["compute_concatenate"], default-features = false }
+arrow = { package = "arrow2", git = "https://github.com/ritchie46/arrow2", branch = "polars", features = ["compute_concatenate"], default-features = false }
# arrow = { package = "arrow2", version = "0.12", default-features = false, features = ["compute_concatenate"] }
hashbrown = "0.12"
num = "^0.4"
3 changes: 3 additions & 0 deletions polars/polars-arrow/src/kernels/concatenate.rs
@@ -10,6 +10,9 @@ pub fn concatenate_owned_unchecked(arrays: &[ArrayRef]) -> Result<ArrayRef> {
"concat requires input of at least one array".to_string(),
));
}
    if arrays.len() == 1 {
        return Ok(arrays[0].clone());
    }
    let mut arrays_ref = Vec::with_capacity(arrays.len());
    let mut lengths = Vec::with_capacity(arrays.len());
    let mut capacity = 0;
2 changes: 1 addition & 1 deletion polars/polars-core/Cargo.toml
@@ -177,7 +177,7 @@ package = "arrow2"
git = "https://github.com/ritchie46/arrow2"
# rev = "98e49133b2e56e51e30335830485b3cf768eb5a2"
# path = "../../../arrow2"
branch = "apply_validity"
branch = "polars"
# version = "0.12"
default-features = false
features = [
120 changes: 120 additions & 0 deletions polars/polars-core/src/chunked_array/from.rs
@@ -0,0 +1,120 @@
use super::*;

#[allow(clippy::ptr_arg)]
fn from_chunks_list_dtype(chunks: &mut Vec<ArrayRef>, dtype: DataType) -> DataType {
    // ensure we don't get List<null>
    let dtype = if let Some(arr) = chunks.get(0) {
        arr.data_type().into()
    } else {
        dtype
    };

    match dtype {
        #[cfg(feature = "dtype-categorical")]
        // Arrow dictionaries are not nested as dictionaries, but only by their keys, so we
        // must replace the list's values array with the keys and store the dictionary
        // values in the datatype. If a global string cache is set, we must also modify
        // the keys.
        DataType::List(inner) if *inner == DataType::Categorical(None) => {
            use polars_arrow::kernels::concatenate::concatenate_owned_unchecked;
            let array = concatenate_owned_unchecked(chunks).unwrap();
            let list_arr = array.as_any().downcast_ref::<ListArray<i64>>().unwrap();
            let values_arr = list_arr.values();
            let cat = unsafe {
                Series::try_from_arrow_unchecked(
                    "",
                    vec![values_arr.clone()],
                    values_arr.data_type(),
                )
                .unwrap()
            };

            // we nest only the physical representation;
            // the mapping is still in our rev-map
            let arrow_dtype = ListArray::<i64>::default_datatype(ArrowDataType::UInt32);
            let new_array = unsafe {
                ListArray::new_unchecked(
                    arrow_dtype,
                    list_arr.offsets().clone(),
                    cat.array_ref(0).clone(),
                    list_arr.validity().cloned(),
                )
            };
            chunks.clear();
            chunks.push(Box::new(new_array));
            DataType::List(Box::new(cat.dtype().clone()))
        }
        _ => dtype,
    }
}
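// For example (a hypothetical input, not taken from this diff): a chunk read as
// Arrow `LargeList<Dictionary<u32, LargeUtf8>>` leaves `from_chunks_list_dtype`
// as a `ListArray<i64>` over plain `UInt32` keys, while the logical dtype becomes
// roughly `DataType::List(Box::new(DataType::Categorical(rev_map)))`, with the
// string values held in the rev-map instead of nested in the array.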

impl<T> ChunkedArray<T>
where
    T: PolarsDataType,
{
    /// Create a new ChunkedArray from existing chunks.
    pub fn from_chunks(name: &str, mut chunks: Vec<ArrayRef>) -> Self {
        let dtype = match T::get_dtype() {
            dtype @ DataType::List(_) => from_chunks_list_dtype(&mut chunks, dtype),
            dt => dt,
        };
        let field = Arc::new(Field::new(name, dtype));
        let mut out = ChunkedArray {
            field,
            chunks,
            phantom: PhantomData,
            categorical_map: None,
            bit_settings: Default::default(),
            length: 0,
        };
        out.compute_len();
        out
    }
}

// A hack to save compiler bloat for null arrays
impl Int32Chunked {
    pub(crate) fn new_null(name: &str, len: usize) -> Self {
        let arr = arrow::array::new_null_array(ArrowDataType::Null, len);
        let field = Arc::new(Field::new(name, DataType::Null));
        let chunks = vec![arr as ArrayRef];
        let mut out = ChunkedArray {
            field,
            chunks,
            phantom: PhantomData,
            categorical_map: None,
            bit_settings: Default::default(),
            length: 0,
        };
        out.compute_len();
        out
    }
}

impl<T> ChunkedArray<T>
where
    T: PolarsNumericType,
{
    /// Create a new ChunkedArray by taking ownership of the Vec. This operation is zero copy.
    pub fn from_vec(name: &str, v: Vec<T::Native>) -> Self {
        let arr = to_array::<T>(v, None);
        Self::from_chunks(name, vec![arr])
    }

    /// Nullify values in slice with an existing null bitmap
    pub fn new_from_owned_with_null_bitmap(
        name: &str,
        values: Vec<T::Native>,
        buffer: Option<Bitmap>,
    ) -> Self {
        let arr = to_array::<T>(values, buffer);
        let mut out = ChunkedArray {
            field: Arc::new(Field::new(name, T::get_dtype())),
            chunks: vec![arr],
            phantom: PhantomData,
            categorical_map: None,
            ..Default::default()
        };
        out.compute_len();
        out
    }
}
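
For orientation, here is a minimal usage sketch of the constructors added above (hypothetical caller code, assuming the `polars-core` prelude of this release; not part of the commit):

```rust
use polars_core::prelude::*;

fn example() {
    // `from_vec` takes ownership of the Vec and reuses its buffer as the
    // Arrow array's backing storage, so no element data is copied.
    let ca = Int32Chunked::from_vec("a", vec![1, 2, 3]);
    assert_eq!(ca.len(), 3);

    // `from_chunks` wraps pre-built Arrow arrays; for List chunks it routes
    // through `from_chunks_list_dtype` to rewrite nested dictionary dtypes.
    let ca2 = Int32Chunked::from_chunks("a", ca.chunks().clone());
    assert_eq!(ca2.len(), 3);
}
```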
79 changes: 1 addition & 78 deletions polars/polars-core/src/chunked_array/mod.rs
@@ -20,6 +20,7 @@ mod ndarray;
mod bitwise;
#[cfg(feature = "object")]
mod drop;
mod from;
pub(crate) mod list;
pub(crate) mod logical;
#[cfg(feature = "object")]
@@ -457,84 +458,6 @@
    }
}

-impl<T> ChunkedArray<T>
-where
-    T: PolarsDataType,
-{
-    /// Create a new ChunkedArray from existing chunks.
-    pub fn from_chunks(name: &str, chunks: Vec<ArrayRef>) -> Self {
-        // prevent List<Null> if the inner list type is known.
-        let datatype = if matches!(T::get_dtype(), DataType::List(_)) {
-            if let Some(arr) = chunks.get(0) {
-                arr.data_type().into()
-            } else {
-                T::get_dtype()
-            }
-        } else {
-            T::get_dtype()
-        };
-        let field = Arc::new(Field::new(name, datatype));
-        let mut out = ChunkedArray {
-            field,
-            chunks,
-            phantom: PhantomData,
-            categorical_map: None,
-            bit_settings: Default::default(),
-            length: 0,
-        };
-        out.compute_len();
-        out
-    }
-}
-
-// A hack to save compiler bloat for null arrays
-impl Int32Chunked {
-    pub(crate) fn new_null(name: &str, len: usize) -> Self {
-        let arr = arrow::array::new_null_array(ArrowDataType::Null, len);
-        let field = Arc::new(Field::new(name, DataType::Null));
-        let chunks = vec![arr as ArrayRef];
-        let mut out = ChunkedArray {
-            field,
-            chunks,
-            phantom: PhantomData,
-            categorical_map: None,
-            bit_settings: Default::default(),
-            length: 0,
-        };
-        out.compute_len();
-        out
-    }
-}
-
-impl<T> ChunkedArray<T>
-where
-    T: PolarsNumericType,
-{
-    /// Create a new ChunkedArray by taking ownership of the Vec. This operation is zero copy.
-    pub fn from_vec(name: &str, v: Vec<T::Native>) -> Self {
-        let arr = to_array::<T>(v, None);
-        Self::from_chunks(name, vec![arr])
-    }
-
-    /// Nullify values in slice with an existing null bitmap
-    pub fn new_from_owned_with_null_bitmap(
-        name: &str,
-        values: Vec<T::Native>,
-        buffer: Option<Bitmap>,
-    ) -> Self {
-        let arr = to_array::<T>(values, buffer);
-        let mut out = ChunkedArray {
-            field: Arc::new(Field::new(name, T::get_dtype())),
-            chunks: vec![arr],
-            phantom: PhantomData,
-            categorical_map: None,
-            ..Default::default()
-        };
-        out.compute_len();
-        out
-    }
-}

pub(crate) trait AsSinglePtr {
    /// Rechunk and return a ptr to the start of the array
    fn as_single_ptr(&mut self) -> Result<usize> {
2 changes: 1 addition & 1 deletion polars/polars-io/Cargo.toml
@@ -38,7 +38,7 @@ private = ["polars-time/private"]
ahash = "0.7"
anyhow = "1.0"
# arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "98e49133b2e56e51e30335830485b3cf768eb5a2", default-features = false }
-arrow = { package = "arrow2", git = "https://github.com/ritchie46/arrow2", branch = "apply_validity", default-features = false }
+arrow = { package = "arrow2", git = "https://github.com/ritchie46/arrow2", branch = "polars", default-features = false }
# arrow = { package = "arrow2", version = "0.12", default-features = false }
# arrow = { package = "arrow2", path = "../../../arrow2", default-features = false }
csv-core = { version = "0.1.10", optional = true }
2 changes: 1 addition & 1 deletion py-polars/Cargo.lock

Some generated files are not rendered by default.

1 change: 0 additions & 1 deletion py-polars/src/set.rs
@@ -17,7 +17,6 @@ pub(crate) fn set_at_idx(mut s: Series, idx: &Series, values: &Series) -> Result
    let idx = idx.values().as_slice();

    let values = values.to_physical_repr().cast(&s.dtype().to_physical())?;
-    use std::sync::Arc;

    // do not shadow, otherwise s is not dropped immediately
    // and we want to have mutable access
16 changes: 16 additions & 0 deletions py-polars/tests/io/test_parquet.py
@@ -204,3 +204,19 @@ def recursive_logical_type() -> None:
    read = pl.read_parquet(f, use_pyarrow=True)
    assert read.dtypes == [pl.Int64, pl.List(pl.Categorical)]
    assert read.shape == (2, 2)


def test_nested_dictionary() -> None:
    with pl.StringCache():
        df = (
            pl.DataFrame({"str": ["A", "B", "A", "B", "C"], "group": [1, 1, 2, 1, 2]})
            .with_column(pl.col("str").cast(pl.Categorical))
            .groupby("group")
            .agg([pl.col("str").list().alias("cat_list")])
        )
        f = io.BytesIO()
        df.write_parquet(f)
        f.seek(0)

        read_df = pl.read_parquet(f)
        assert df.frame_equal(read_df)
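        # Hypothetical extra check, not part of the original commit: the
        # categorical list dtype also survives the Parquet round trip.
        assert read_df["cat_list"].dtype == pl.List(pl.Categorical)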
