Skip to content

Commit

Permalink
fix cat append: closes #1457
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Sep 30, 2021
1 parent ab36afb commit 0af94e1
Show file tree
Hide file tree
Showing 8 changed files with 203 additions and 62 deletions.
34 changes: 0 additions & 34 deletions polars/polars-core/src/chunked_array/builder/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -653,38 +653,4 @@ mod test {
let ca = builder.finish();
dbg!(ca);
}

#[test]
#[cfg(feature = "dtype-categorical")]
fn test_categorical_builder() {
use crate::{reset_string_cache, toggle_string_cache};
let _lock = crate::SINGLE_LOCK.lock();
for b in &[false, true] {
reset_string_cache();
toggle_string_cache(*b);

// Use 2 builders to check if the global string cache
// does not interfere with the index mapping
let mut builder1 = CategoricalChunkedBuilder::new("foo", 10);
let mut builder2 = CategoricalChunkedBuilder::new("foo", 10);
builder1.from_iter(vec![None, Some("hello"), Some("vietnam")]);
builder2.from_iter(vec![Some("hello"), None, Some("world")].into_iter());

let ca = builder1.finish();
let v = AnyValue::Null;
assert_eq!(ca.get_any_value(0), v);
let v = AnyValue::Utf8("hello");
assert_eq!(ca.get_any_value(1), v);
let v = AnyValue::Utf8("vietnam");
assert_eq!(ca.get_any_value(2), v);

let ca = builder2.finish();
let v = AnyValue::Utf8("hello");
assert_eq!(ca.get_any_value(0), v);
let v = AnyValue::Null;
assert_eq!(ca.get_any_value(1), v);
let v = AnyValue::Utf8("world");
assert_eq!(ca.get_any_value(2), v);
}
}
}
40 changes: 40 additions & 0 deletions polars/polars-core/src/chunked_array/categorical/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@ use arrow::array::*;
use std::marker::PhantomData;

pub enum RevMappingBuilder {
/// Hashmap: maps the indexes from the global cache/categorical array to indexes in the local Utf8Array
/// Utf8Array: caches the string values
Global(PlHashMap<u32, u32>, MutableUtf8Array<i64>, u128),
/// Utf8Array: caches the string values
Local(MutableUtf8Array<i64>),
}

Expand Down Expand Up @@ -36,7 +39,10 @@ impl RevMappingBuilder {
}

pub enum RevMapping {
/// Hashmap: maps the indexes from the global cache/categorical array to indexes in the local Utf8Array
/// Utf8Array: caches the string values
Global(PlHashMap<u32, u32>, Utf8Array<i64>, u128),
/// Utf8Array: caches the string values
Local(Utf8Array<i64>),
}

Expand Down Expand Up @@ -179,6 +185,7 @@ impl CategoricalChunkedBuilder {

#[cfg(test)]
mod test {
use crate::chunked_array::categorical::CategoricalChunkedBuilder;
use crate::prelude::*;
use crate::{reset_string_cache, toggle_string_cache, SINGLE_LOCK};

Expand Down Expand Up @@ -216,4 +223,37 @@ mod test {

Ok(())
}

#[test]
fn test_categorical_builder() {
use crate::{reset_string_cache, toggle_string_cache};
let _lock = crate::SINGLE_LOCK.lock();
for b in &[false, true] {
reset_string_cache();
toggle_string_cache(*b);

// Use 2 builders to check if the global string cache
// does not interfere with the index mapping
let mut builder1 = CategoricalChunkedBuilder::new("foo", 10);
let mut builder2 = CategoricalChunkedBuilder::new("foo", 10);
builder1.from_iter(vec![None, Some("hello"), Some("vietnam")]);
builder2.from_iter(vec![Some("hello"), None, Some("world")].into_iter());

let ca = builder1.finish();
let v = AnyValue::Null;
assert_eq!(ca.get_any_value(0), v);
let v = AnyValue::Utf8("hello");
assert_eq!(ca.get_any_value(1), v);
let v = AnyValue::Utf8("vietnam");
assert_eq!(ca.get_any_value(2), v);

let ca = builder2.finish();
let v = AnyValue::Utf8("hello");
assert_eq!(ca.get_any_value(0), v);
let v = AnyValue::Null;
assert_eq!(ca.get_any_value(1), v);
let v = AnyValue::Utf8("world");
assert_eq!(ca.get_any_value(2), v);
}
}
}
67 changes: 67 additions & 0 deletions polars/polars-core/src/chunked_array/categorical/merge.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
use super::*;
use arrow::bitmap::MutableBitmap;
use arrow::buffer::MutableBuffer;
use std::sync::Arc;

impl CategoricalChunked {
pub(crate) fn merge_categorical_map(&self, other: &Self) -> Arc<RevMapping> {
match (
self.categorical_map.as_deref(),
other.categorical_map.as_deref(),
) {
(
Some(RevMapping::Global(l_map, l_slots, l_id)),
Some(RevMapping::Global(r_map, r_slots, r_id)),
) => {
if l_id != r_id {
panic!("The two categorical arrays are not created under the same global string cache. They cannot be merged")
}
let mut new_map = (*l_map).clone();

let mut offset_buf = MutableBuffer::new();
offset_buf.extend_from_slice(l_slots.offsets().as_slice());

let mut values_buf = MutableBuffer::new();
values_buf.extend_from_slice(l_slots.values().as_slice());

let validity_buf = if let Some(validity) = l_slots.validity() {
let mut validity_buf = MutableBitmap::new();
let (b, offset, len) = validity.as_slice();
validity_buf.extend_from_slice(b, offset, len);
Some(validity_buf)
} else {
None
};

// Safety
// all offsets are valid and the u8 data is valid utf8
let mut new_slots = unsafe {
MutableUtf8Array::from_data_unchecked(
DataType::Utf8.to_arrow(),
offset_buf,
values_buf,
validity_buf,
)
};

for (cat, idx) in r_map.iter() {
new_map.entry(*cat).or_insert_with(|| {
// Safety
// within bounds
let str_val = unsafe { r_slots.value_unchecked(*idx as usize) };
let new_idx = new_slots.len() as u32;
new_slots.push(Some(str_val));

new_idx
});
}
let new_rev = RevMapping::Global(new_map, new_slots.into(), *l_id);
Arc::new(new_rev)
}
_ => {
// pass for now. Still need to do some checks for local maps that are equal
self.categorical_map.as_ref().unwrap().clone()
}
}
}
}
21 changes: 21 additions & 0 deletions polars/polars-core/src/chunked_array/categorical/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ use crate::prelude::*;
use arrow::array::DictionaryArray;
use arrow::compute::cast::cast;
mod builder;
mod merge;

pub use builder::*;

impl From<&CategoricalChunked> for DictionaryArray<u32> {
Expand Down Expand Up @@ -80,4 +82,23 @@ mod test {

Ok(())
}

#[test]
fn test_append_categorical() {
let _lock = SINGLE_LOCK.lock();
reset_string_cache();
toggle_string_cache(true);

let mut s1 = Series::new("1", vec!["a", "b", "c"])
.cast::<CategoricalType>()
.unwrap();
let s2 = Series::new("2", vec!["a", "x", "y"])
.cast::<CategoricalType>()
.unwrap();
let appended = s1.append(&s2).unwrap();
assert_eq!(appended.str_value(0), "\"a\"");
assert_eq!(appended.str_value(1), "\"b\"");
assert_eq!(appended.str_value(4), "\"x\"");
assert_eq!(appended.str_value(5), "\"y\"");
}
}
26 changes: 0 additions & 26 deletions polars/polars-core/src/chunked_array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -442,32 +442,6 @@ impl<T> ChunkedArray<T> {
self.slice(-(len as i64), len)
}

/// Append in place.
pub fn append(&mut self, other: &Self)
where
Self: std::marker::Sized,
{
#[cfg(feature = "dtype-categorical")]
if let (Some(rev_map_l), Some(rev_map_r)) = (
self.categorical_map.as_ref(),
other.categorical_map.as_ref(),
) {
// first assertion checks if the global string cache is equal,
// the second checks if we append a slice from this array to self
if !rev_map_l.same_src(rev_map_r) && !Arc::ptr_eq(rev_map_l, rev_map_r) {
panic!("Appending categorical data can only be done if they are made under the same global string cache. \
Consider using a global string cache.")
}
}

// replace an empty array
if self.chunks.len() == 1 && self.is_empty() {
self.chunks = other.chunks.clone();
} else {
self.chunks.extend_from_slice(&other.chunks);
}
}

/// Name of the ChunkedArray.
pub fn name(&self) -> &str {
self.field.name()
Expand Down
70 changes: 70 additions & 0 deletions polars/polars-core/src/chunked_array/ops/append.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
use crate::prelude::*;

fn new_chunks(chunks: &mut Vec<ArrayRef>, other: &[ArrayRef], len: usize) {
// replace an empty array
if chunks.len() == 1 && len == 0 {
*chunks = other.to_owned();
} else {
chunks.extend_from_slice(other);
}
}

impl<T> ChunkedArray<T>
where
T: PolarsNumericType,
{
/// Append in place.
pub fn append(&mut self, other: &Self) {
let len = self.len();
new_chunks(&mut self.chunks, &other.chunks, len);
}
}

impl BooleanChunked {
pub fn append(&mut self, other: &Self) {
let len = self.len();
new_chunks(&mut self.chunks, &other.chunks, len);
}
}
impl Utf8Chunked {
pub fn append(&mut self, other: &Self) {
let len = self.len();
new_chunks(&mut self.chunks, &other.chunks, len);
}
}

impl ListChunked {
pub fn append(&mut self, other: &Self) {
let len = self.len();
new_chunks(&mut self.chunks, &other.chunks, len);
}
}
#[cfg(feature = "object")]
impl<T: PolarsObject> ObjectChunked<T> {
pub fn append(&mut self, other: &Self) {
let len = self.len();
new_chunks(&mut self.chunks, &other.chunks, len);
}
}
#[cfg(feature = "dtype-categorical")]
impl CategoricalChunked {
pub fn append(&mut self, other: &Self) {
if let (Some(rev_map_l), Some(rev_map_r)) = (
self.categorical_map.as_ref(),
other.categorical_map.as_ref(),
) {
// first assertion checks if the global string cache is equal,
// the second checks if we append a slice from this array to self
if !rev_map_l.same_src(rev_map_r) && !Arc::ptr_eq(rev_map_l, rev_map_r) {
panic!("Appending categorical data can only be done if they are made under the same global string cache. \
Consider using a global string cache.")
}

let new_rev_map = self.merge_categorical_map(other);
self.categorical_map = Some(new_rev_map);
}

let len = self.len();
new_chunks(&mut self.chunks, &other.chunks, len);
}
}
1 change: 1 addition & 0 deletions polars/polars-core/src/chunked_array/ops/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use arrow::buffer::Buffer;

pub(crate) mod aggregate;
mod any_value;
mod append;
mod apply;
mod bit_repr;
pub(crate) mod chunkops;
Expand Down
6 changes: 4 additions & 2 deletions polars/polars-core/src/series/implementations/utf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,10 @@ impl private::PrivateSeries for SeriesWrap<Utf8Chunked> {
}

fn str_value(&self, index: usize) -> Cow<str> {
// get AnyValue
Cow::Owned(format!("{}", self.get(index)))
match self.0.get(index) {
Some(s) => Cow::Borrowed(s),
None => Cow::Borrowed("null"),
}
}
}

Expand Down

0 comments on commit 0af94e1

Please sign in to comment.