Skip to content

Commit

Permalink
make cat merge fallible and loossen restrictions on categorical appen…
Browse files Browse the repository at this point in the history
…ds (#3491)
  • Loading branch information
ritchie46 committed May 25, 2022
1 parent e0182d9 commit c752b69
Show file tree
Hide file tree
Showing 6 changed files with 20 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use arrow::bitmap::MutableBitmap;
use std::sync::Arc;

impl CategoricalChunked {
pub(crate) fn merge_categorical_map(&self, other: &Self) -> Arc<RevMapping> {
pub(crate) fn merge_categorical_map(&self, other: &Self) -> Result<Arc<RevMapping>> {
match (
&**self.get_rev_map(),
&**other.get_rev_map()
Expand All @@ -13,7 +13,7 @@ impl CategoricalChunked {
RevMapping::Global(r_map, r_slots, r_id),
) => {
if l_id != r_id {
panic!("The two categorical arrays are not created under the same global string cache. They cannot be merged")
return Err(PolarsError::ComputeError("The two categorical arrays are not created under the same global string cache. They cannot be merged".into()))
}
let mut new_map = (*l_map).clone();

Expand Down Expand Up @@ -52,20 +52,20 @@ impl CategoricalChunked {
});
}
let new_rev = RevMapping::Global(new_map, new_slots.into(), *l_id);
Arc::new(new_rev)
Ok(Arc::new(new_rev))
}
(RevMapping::Local(arr_l), RevMapping::Local(arr_r)) => {
// they are from the same source, just clone
if std::ptr::eq(arr_l, arr_r) {
return self.get_rev_map().clone()
return Ok(self.get_rev_map().clone())
}

let arr = arrow::compute::concatenate::concatenate(&[arr_l, arr_r]).unwrap();
let arr = arr.as_any().downcast_ref::<Utf8Array<i64>>().unwrap().clone();

Arc::new(RevMapping::Local(arr))
Ok(Arc::new(RevMapping::Local(arr)))
}
_ => panic!("cannot combine categorical under a global string cache with a non cached categorical")
_ => Err(PolarsError::ComputeError("cannot combine categorical under a global string cache with a non cached categorical".into()))
}
}
}
Expand All @@ -89,7 +89,7 @@ mod test {
builder2.drain_iter(vec![Some("hello"), None, Some("world"), Some("bar")].into_iter());
let ca1 = builder1.finish();
let ca2 = builder2.finish();
let rev_map = ca1.merge_categorical_map(&ca2);
let rev_map = ca1.merge_categorical_map(&ca2).unwrap();

let mut ca = UInt32Chunked::new("", &[0, 1, 2, 3]);
ca.categorical_map = Some(rev_map);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,7 @@ use crate::chunked_array::ops::append::new_chunks;

impl CategoricalChunked {
pub fn append(&mut self, other: &Self) -> Result<()> {
let (rev_map_l, rev_map_r) = (self.get_rev_map(), other.get_rev_map());
// first assertion checks if the global string cache is equal,
// the second checks if we append a slice from this array to self
if !rev_map_l.same_src(rev_map_r) && !Arc::ptr_eq(rev_map_l, rev_map_r) {
return Err(PolarsError::ComputeError(
"Appending categorical data can only be done if they are made under the same global string cache. \
Consider using a global string cache.".into()
));
}

let new_rev_map = self.merge_categorical_map(other);
let new_rev_map = self.merge_categorical_map(other)?;
self.set_rev_map(new_rev_map, false);

let len = self.len();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ impl CategoricalChunked {
}
_ => self.logical().zip_with(mask, other.logical())?,
};
let new_state = self.merge_categorical_map(other);
let new_state = self.merge_categorical_map(other)?;
Ok(CategoricalChunked::from_cats_and_rev_map(cats, new_state))
}
}
2 changes: 1 addition & 1 deletion polars/polars-core/src/frame/hash_join/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -925,7 +925,7 @@ impl DataFrame {
#[cfg(feature = "dtype-categorical")]
DataType::Categorical(_) => {
let ca_left = s_left.categorical().unwrap();
let new_rev_map = ca_left.merge_categorical_map(s_right.categorical().unwrap());
let new_rev_map = ca_left.merge_categorical_map(s_right.categorical().unwrap())?;
let logical = s.u32().unwrap().clone();
CategoricalChunked::from_cats_and_rev_map(logical, new_rev_map).into_series()
}
Expand Down
5 changes: 3 additions & 2 deletions polars/polars-core/src/series/implementations/categorical.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,8 @@ impl private::PrivateSeries for SeriesWrap<CategoricalChunked> {
) -> Series {
let new_rev_map = self
.0
.merge_categorical_map(right_column.categorical().unwrap());
.merge_categorical_map(right_column.categorical().unwrap())
.unwrap();
let left = self.0.logical();
let right = right_column
.categorical()
Expand Down Expand Up @@ -210,7 +211,7 @@ impl SeriesTrait for SeriesWrap<CategoricalChunked> {
if self.0.dtype() == other.dtype() {
let other = other.categorical()?;
self.0.logical_mut().extend(other.logical());
let new_rev_map = self.0.merge_categorical_map(other);
let new_rev_map = self.0.merge_categorical_map(other)?;
self.0.set_rev_map(new_rev_map, false);
Ok(())
} else {
Expand Down
7 changes: 7 additions & 0 deletions py-polars/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,10 @@ def test_comp_categorical_lit_dtype() -> None:
.otherwise(pl.col("column"))
.alias("column")
).dtypes == [pl.Categorical, pl.Int32]


def test_categorical_describe_3487() -> None:
# test if we don't err
df = pl.DataFrame({"cats": ["a", "b"]})
df = df.with_column(pl.col("cats").cast(pl.Categorical))
df.describe()

0 comments on commit c752b69

Please sign in to comment.