Skip to content

Commit

Permalink
fix(rust, python): include single null value in global cat builder (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Oct 18, 2022
1 parent 8bc7596 commit a11f542
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -305,18 +305,14 @@ impl<'a> CategoricalChunkedBuilder<'a> {
let mut global_to_local = PlHashMap::with_capacity(local_to_global.len());

let compute_cats = || {
if local_to_global.is_empty() {
Default::default()
} else {
keys.into_iter()
.map(|opt_k| {
opt_k.map(|cat| {
debug_assert!((*cat as usize) < local_to_global.len());
*unsafe { local_to_global.get_unchecked(*cat as usize) }
})
keys.into_iter()
.map(|opt_k| {
opt_k.map(|cat| {
debug_assert!((*cat as usize) < local_to_global.len());
*unsafe { local_to_global.get_unchecked(*cat as usize) }
})
.collect::<UInt32Vec>()
}
})
.collect::<UInt32Vec>()
};

let (_, cats) = POOL.join(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ pub(crate) fn merge_categorical_map(
match (&**left, &**right) {
(RevMapping::Global(l_map, l_slots, l_id), RevMapping::Global(r_map, r_slots, r_id)) => {
if l_id != r_id {
return Err(PolarsError::ComputeError("The two categorical arrays are not created under the same global string cache. They cannot be merged".into()));
return Err(PolarsError::ComputeError("The two categorical arrays are not created under the same global string cache. They cannot be merged. Hint: set a global StringCache.".into()));
}
let mut new_map = (*l_map).clone();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@ use crate::prelude::PlHashMap;
pub(crate) static USE_STRING_CACHE: AtomicU32 = AtomicU32::new(0);

/// RAII for the string cache
pub struct IUseStringCache {}
pub struct IUseStringCache {
    // Private, unit-typed field that carries no data. Its only job is to stop
    // the struct from being built directly with a struct literal by code that
    // cannot name this field. `dead_code` is allowed because the field exists
    // solely for that purpose and is not meant to be read.
    #[allow(dead_code)]
    private_zst: (),
}

impl Default for IUseStringCache {
fn default() -> Self {
Expand All @@ -29,7 +33,7 @@ impl IUseStringCache {
/// Hold the StringCache
pub fn new() -> IUseStringCache {
toggle_string_cache(true);
IUseStringCache {}
IUseStringCache { private_zst: () }
}
}

Expand Down
3 changes: 2 additions & 1 deletion polars/polars-core/src/frame/hash_join/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,8 @@ pub(crate) fn check_categorical_src(l: &DataType, r: &DataType) -> PolarsResult<
match (l, r) {
(DataType::Categorical(Some(l)), DataType::Categorical(Some(r))) => {
if !l.same_src(r) {
return Err(PolarsError::ComputeError("joins/or comparisons on categorical dtypes can only happen if they are created under the same global string cache".into()));
return Err(PolarsError::ComputeError("Joins/or comparisons on categorical dtypes can only happen if they are created under the same global string cache.\
Hint: set a global StringCache".into()));
}
Ok(())
}
Expand Down
20 changes: 20 additions & 0 deletions py-polars/tests/unit/io/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -872,3 +872,23 @@ def test_batched_csv_reader(foods_csv: str) -> None:
"fats_g": [10.0],
"sugars_g": [1],
}


def test_csv_single_categorical_null() -> None:
    """Round-trip a one-row frame whose "y" column is a single null through CSV.

    The frame is written to an in-memory buffer and read back with "y"
    requested as Categorical; both the resulting dtypes and the values must
    match what was written.
    """
    expected = {"x": ["A"], "y": [None], "z": ["A"]}

    buffer = io.BytesIO()
    source = pl.DataFrame(expected)
    source.write_csv(buffer)
    # Rewind so the reader starts at the first byte of the CSV payload.
    buffer.seek(0)

    result = pl.read_csv(buffer, dtypes={"y": pl.Categorical})

    assert result.dtypes == [pl.Utf8, pl.Categorical, pl.Utf8]
    assert result.to_dict(False) == expected

0 comments on commit a11f542

Please sign in to comment.