Skip to content

Commit

Permalink
refactor(rust): ensure reverse indices exist in global string cache (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Dec 31, 2022
1 parent 4703ca3 commit f8f9a7c
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 73 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use polars_utils::HashSingle;
use crate::datatypes::PlHashMap;
use crate::frame::groupby::hashing::HASHMAP_INIT_SIZE;
use crate::prelude::*;
use crate::{using_string_cache, StrHashGlobal, StringCache, POOL};
use crate::{using_string_cache, StringCache, POOL};

pub enum RevMappingBuilder {
/// Hashmap: maps the indexes from the global cache/categorical array to indexes in the local Utf8Array
Expand Down Expand Up @@ -272,31 +272,15 @@ impl<'a> CategoricalChunkedBuilder<'a> {
{
let cache = &mut crate::STRING_CACHE.lock_map();
id = cache.uuid;
let global_mapping = &mut cache.map;
let hb = global_mapping.hasher().clone();

for s in values.values_iter() {
let h = hb.hash_single(s);
let mut global_idx = global_mapping.len() as u32;
// Note that we don't create the StrHashGlobal to search the key in the hashmap
// as StrHashGlobal may allocate a string
let entry = global_mapping
.raw_entry_mut()
.from_hash(h, |val| (val.hash == h) && val.str == s);

match entry {
RawEntryMut::Occupied(entry) => global_idx = *entry.get(),
RawEntryMut::Vacant(entry) => {
// only just now we allocate the string
let key = StrHashGlobal::new(s.into(), h);
entry.insert_with_hasher(h, key, global_idx, |s| s.hash);
}
}
let global_idx = cache.insert(s);

// safety:
// we allocated enough
unsafe { local_to_global.push_unchecked(global_idx) }
}
if global_mapping.len() > u32::MAX as usize {
if cache.len() > u32::MAX as usize {
panic!("not more than {} categories supported", u32::MAX)
};
}
Expand Down Expand Up @@ -355,29 +339,14 @@ impl<'a> CategoricalChunkedBuilder<'a> {
{
let cache = &mut crate::STRING_CACHE.lock_map();
id = cache.uuid;
let global_mapping = &mut cache.map;

for (s, h) in values.values_iter().zip(hashes.into_iter()) {
let mut global_idx = global_mapping.len() as u32;
// Note that we don't create the StrHashGlobal to search the key in the hashmap
// as StrHashGlobal may allocate a string
let entry = global_mapping
.raw_entry_mut()
.from_hash(h, |val| (val.hash == h) && val.str == s);

match entry {
RawEntryMut::Occupied(entry) => global_idx = *entry.get(),
RawEntryMut::Vacant(entry) => {
// only just now we allocate the string
let key = StrHashGlobal::new(s.into(), h);
entry.insert_with_hasher(h, key, global_idx, |s| s.hash);
}
}
let global_idx = cache.insert_from_hash(h, s);
// safety:
// we allocated enough
unsafe { local_to_global.push_unchecked(global_idx) }
}
if global_mapping.len() > u32::MAX as usize {
if cache.len() > u32::MAX as usize {
panic!("not more than {} categories supported", u32::MAX)
};
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
use std::borrow::Borrow;
use std::hash::{Hash, Hasher};
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::{Mutex, MutexGuard};
use std::time::{SystemTime, UNIX_EPOCH};

use ahash::RandomState;
use hashbrown::hash_map::RawEntryMut;
use once_cell::sync::Lazy;
use polars_utils::HashSingle;
use smartstring::{LazyCompact, SmartString};

use crate::frame::groupby::hashing::HASHMAP_INIT_SIZE;
Expand Down Expand Up @@ -77,9 +78,72 @@ pub fn using_string_cache() -> bool {
USE_STRING_CACHE.load(Ordering::Acquire) > 0
}

/// Key stored in the string cache's hash map: the hash of a string together with the
/// index of that string in the linear `payloads` buffer.
#[derive(Copy, Clone)]
struct Key {
/// Hash of the string this key stands for.
pub(super) hash: u64,
/// Position of the string inside the `payloads` vector.
pub(super) idx: u32,
}

impl Key {
#[inline]
pub(super) fn new(hash: u64, idx: u32) -> Self {
Self { hash, idx }
}
}

impl Hash for Key {
#[inline]
fn hash<H: Hasher>(&self, state: &mut H) {
state.write_u64(self.hash)
}
}

/// Inner state of the string cache: a hash map used for lookups plus the buffer holding the
/// cached strings.
pub(crate) struct SCacheInner {
// NOTE(review): `map` is declared twice in this view, which cannot compile as written. This
// looks like a rendered diff without +/- markers; presumably the `StrHashGlobal -> u32` line
// is the removed declaration and the `Key -> ()` line is its replacement. Confirm against
// the commit.
pub(crate) map: PlHashMap<StrHashGlobal, u32>,
map: PlHashMap<Key, ()>,
/// Identifier of this cache instance, read by the categorical builders as `cache.uuid`.
pub(crate) uuid: u128,
/// The cached strings; a `Key`'s `idx` is a position in this vector.
payloads: Vec<StrHashGlobal>,
}

impl SCacheInner {
    /// Number of entries currently held in the lookup map.
    pub(crate) fn len(&self) -> usize {
        self.map.len()
    }

    /// Returns the global index of `s`, registering the string first if it is not cached yet.
    ///
    /// `h` is the precomputed hash of `s`. It is used both to probe the map and as a cheap
    /// pre-check before comparing strings, so the same string must always be passed with the
    /// same hash; otherwise the lookup misses and the string is stored a second time.
    ///
    /// The index is produced with a plain `as u32` cast of the payload count; the callers
    /// visible alongside this code check `len()` against `u32::MAX` after their insert loops.
    #[inline]
    pub(crate) fn insert_from_hash(&mut self, h: u64, s: &str) -> u32 {
        // Borrow the two fields separately: the lookup closure reads `payloads` while the
        // raw entry keeps `map` mutably borrowed.
        let Self { map, payloads, .. } = self;
        // Index the string receives if it turns out to be new.
        let next_idx = payloads.len() as u32;

        // Note that we don't create the StrHashGlobal to search the key in the hashmap
        // as StrHashGlobal may allocate a string
        let entry = map.raw_entry_mut().from_hash(h, |key| {
            (key.hash == h) && {
                let pos = key.idx as usize;
                // SAFETY: a key is only ever created in the vacant branch below with
                // `idx == payloads.len()` immediately before the matching `push`, and
                // nothing in this impl removes payloads, so `pos < payloads.len()`.
                let value = unsafe { payloads.get_unchecked(pos) };
                s == value.as_str()
            }
        });

        match entry {
            RawEntryMut::Occupied(entry) => entry.key().idx,
            RawEntryMut::Vacant(entry) => {
                // Hand hashbrown a hasher that returns the stored hash. With
                // `insert_hashed_nocheck` a table resize re-hashes existing keys through the
                // map's own `BuildHasher`, which does not reproduce `key.hash`; entries would
                // then sit in slots that later `from_hash(h, ..)` lookups never probe.
                entry.insert_with_hasher(h, Key::new(h, next_idx), (), |key| key.hash);

                // only just now we allocate the string
                payloads.push(s.into());
                next_idx
            }
        }
    }

    /// Hashes `s` with the map's hasher and delegates to [`Self::insert_from_hash`].
    #[inline]
    pub(crate) fn insert(&mut self, s: &str) -> u32 {
        let h = self.map.hasher().hash_single(s);
        self.insert_from_hash(h, s)
    }
}

impl Default for SCacheInner {
Expand All @@ -93,6 +157,7 @@ impl Default for SCacheInner {
.duration_since(UNIX_EPOCH)
.unwrap()
.as_nanos(),
payloads: Vec::with_capacity(HASHMAP_INIT_SIZE),
}
}
}
Expand Down Expand Up @@ -129,34 +194,4 @@ impl Default for StringCache {

pub(crate) static STRING_CACHE: Lazy<StringCache> = Lazy::new(Default::default);

/// An owned string bundled with its precomputed hash.
#[derive(Eq, Clone)]
pub struct StrHashGlobal {
/// The string itself.
pub(crate) str: SmartString<LazyCompact>,
/// Precomputed hash of `str`; this is what the `Hash` impl writes to the hasher.
pub(crate) hash: u64,
}

impl Hash for StrHashGlobal {
    /// Writes the cached `hash` into `state` instead of hashing the string bytes again.
    fn hash<H>(&self, state: &mut H)
    where
        H: Hasher,
    {
        state.write_u64(self.hash);
    }
}

impl StrHashGlobal {
pub(crate) fn new(s: SmartString<LazyCompact>, hash: u64) -> Self {
Self { str: s, hash }
}
}

impl PartialEq for StrHashGlobal {
    /// Two values are equal when both their cached hashes and their strings match.
    fn eq(&self, other: &Self) -> bool {
        // Different strings can end up in the same hashtable slot (e.g. slot = hash % n_slots)
        // or even share a hash, so the strings are compared once the cheap hash check passes.
        if self.hash != other.hash {
            return false;
        }
        self.str == other.str
    }
}

impl Borrow<str> for StrHashGlobal {
    /// Exposes the wrapped string as a plain `&str`.
    ///
    /// NOTE(review): `Borrow` expects the borrowed form to hash like the owner, but the `Hash`
    /// impl of this type writes the cached `hash` field rather than the string bytes. Confirm
    /// that no map lookup relies on hashing a bare `&str` key.
    fn borrow(&self) -> &str {
        let text: &str = self.str.as_str();
        text
    }
}
/// Owned string type held in the string cache's `payloads` buffer.
type StrHashGlobal = SmartString<LazyCompact>;
6 changes: 3 additions & 3 deletions py-polars/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit f8f9a7c

Please sign in to comment.