Skip to content

Commit

Permalink
fix(python): fix stringcache. latest refactor introduced a hashing er…
Browse files Browse the repository at this point in the history
…ror (#6056)
  • Loading branch information
ritchie46 committed Jan 5, 2023
1 parent 4bd67da commit f6ee650
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@ use once_cell::sync::Lazy;
use polars_utils::HashSingle;
use smartstring::{LazyCompact, SmartString};

use crate::datatypes::PlIdHashMap;
use crate::frame::groupby::hashing::HASHMAP_INIT_SIZE;
use crate::prelude::PlHashMap;
use crate::prelude::InitHashMaps;

/// We use atomic reference counting
/// to determine how many threads use the string cache
Expand Down Expand Up @@ -100,7 +101,7 @@ impl Hash for Key {
}

/// Inner state of the string cache; `StringCache` wraps this type in an `RwLock`.
pub(crate) struct SCacheInner {
// NOTE(review): this listing is a diff rendered without +/- markers. The two `map`
// lines below look like the removed (`PlHashMap`) and added (`PlIdHashMap`) versions
// of the same field — confirm against the commit before treating both as live code.
// The map stores `Key`s only; its value type is `()`.
map: PlHashMap<Key, ()>,
map: PlIdHashMap<Key, ()>,
// Derived from the current system time in `Default::default`.
pub(crate) uuid: u128,
// presumably holds the cached strings alongside their hashes — TODO confirm
// against the definition of `StrHashGlobal`.
payloads: Vec<StrHashGlobal>,
}
Expand Down Expand Up @@ -146,18 +147,15 @@ impl SCacheInner {

/// Hashes `s` and delegates to `insert_from_hash`, returning the `u32` it produces.
///
/// The hash that is passed on comes from `StringCache::get_hash_builder()`.
#[inline]
pub(crate) fn insert(&mut self, s: &str) -> u32 {
// NOTE(review): this listing is a diff rendered without +/- markers. The next line
// looks like the pre-change version; as written it is shadowed by the second
// `let h`, which is the value actually used — confirm against the commit.
let h = self.map.hasher().hash_single(s);
let h = StringCache::get_hash_builder().hash_single(s);
self.insert_from_hash(h, s)
}
}

impl Default for SCacheInner {
fn default() -> Self {
Self {
map: PlHashMap::with_capacity_and_hasher(
HASHMAP_INIT_SIZE,
StringCache::get_hash_builder(),
),
map: PlIdHashMap::with_capacity(HASHMAP_INIT_SIZE),
uuid: SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap()
Expand All @@ -177,6 +175,7 @@ pub(crate) struct StringCache(pub(crate) RwLock<SCacheInner>);
impl StringCache {
/// Returns the hash builder used by the global `StringCache`.
///
/// The seed is fixed, so the resulting hashes are predictable. This lets local
/// builders reproduce the same hashes in case of contention.
#[inline]
pub(crate) fn get_hash_builder() -> RandomState {
    // Always the same seed: every caller must derive identical hashes.
    let fixed_seed = 0;
    RandomState::with_seed(fixed_seed)
}
Expand Down
13 changes: 13 additions & 0 deletions py-polars/tests/slow/test_categorical.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import polars as pl


def test_stringcache() -> None:
    """Check that filtering a large categorical column works under the string cache."""
    with pl.StringCache():
        # Create a reasonably sized column so that the categorical map
        # is reallocated.
        values = pl.arange(0, 1500, eager=True)
        as_categorical = pl.col("cats").cast(pl.Utf8).cast(pl.Categorical)
        df = pl.DataFrame({"cats": values}).select([as_categorical])

        matches = df.filter(pl.col("cats").is_in(["1", "2"]))
        assert matches.to_dict(False) == {"cats": ["1", "2"]}

0 comments on commit f6ee650

Please sign in to comment.