Skip to content

Commit

Permalink
Improve performance of categorical casting (#3724)
Browse files Browse the repository at this point in the history
* cast to categorical (non-global) 35% faster

* refactor stringcache

* improve global cache
  • Loading branch information
ritchie46 committed Jun 18, 2022
1 parent 99f5f0f commit 3419ad7
Show file tree
Hide file tree
Showing 7 changed files with 192 additions and 107 deletions.
3 changes: 2 additions & 1 deletion polars/polars-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ dtype-i8 = []
dtype-i16 = []
dtype-u8 = []
dtype-u16 = []
dtype-categorical = []
dtype-categorical = ["smartstring"]
dtype-struct = []

parquet = ["arrow/io_parquet"]
Expand Down Expand Up @@ -167,6 +167,7 @@ regex = { version = "1.5", optional = true }
# activate if you want serde support for Series and DataFrames
serde = { version = "1", features = ["derive"], optional = true }
serde_json = { version = "1", optional = true }
smartstring = { version = "1", optional = true }
thiserror = "^1.0"

[dependencies.arrow]
Expand Down
75 changes: 60 additions & 15 deletions polars/polars-core/src/chunked_array/logical/categorical/builder.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
use crate::frame::groupby::hashing::HASHMAP_INIT_SIZE;
use crate::prelude::*;
use crate::{datatypes::PlHashMap, use_string_cache};
use crate::{datatypes::PlHashMap, use_string_cache, StrHashGlobal};
use ahash::CallHasher;
use arrow::array::*;
use hashbrown::hash_map::RawEntryMut;
use std::hash::{Hash, Hasher};

pub enum RevMappingBuilder {
/// Hashmap: maps the indexes from the global cache/categorical array to indexes in the local Utf8Array
Expand Down Expand Up @@ -121,6 +125,32 @@ impl RevMapping {
}
}

#[derive(Eq, Copy, Clone)]
pub struct StrHashLocal<'a> {
str: &'a str,
hash: u64,
}

impl<'a> Hash for StrHashLocal<'a> {
fn hash<H: Hasher>(&self, state: &mut H) {
state.write_u64(self.hash)
}
}

impl<'a> StrHashLocal<'a> {
pub(crate) fn new(s: &'a str, hash: u64) -> Self {
Self { str: s, hash }
}
}

impl<'a> PartialEq for StrHashLocal<'a> {
fn eq(&self, other: &Self) -> bool {
// can be collisions in the hashtable even though the hashes are equal
// e.g. hashtable hash = hash % n_slots
(self.hash == other.hash) && (self.str == other.str)
}
}

pub struct CategoricalChunkedBuilder {
array_builder: UInt32Vec,
name: String,
Expand Down Expand Up @@ -152,18 +182,28 @@ impl CategoricalChunkedBuilder {
{
if use_string_cache() {
let mut cache = crate::STRING_CACHE.lock_map();
let mapping = &mut cache.map;
let hb = mapping.hasher().clone();

for opt_s in i {
match opt_s {
Some(s) => {
let idx = match cache.map.get(s) {
Some(idx) => *idx,
None => {
let idx = cache.map.len() as u32;
cache.map.insert(s.to_string(), idx);
idx
let h = str::get_hash(s, &hb);
let mut idx = mapping.len() as u32;
// Note that we don't create the StrHashGlobal to search the key in the hashmap
// as StrHashGlobal may allocate a string
let entry = mapping
.raw_entry_mut()
.from_hash(h, |val| (val.hash == h) && val.str == s);

match entry {
RawEntryMut::Occupied(entry) => idx = *entry.get(),
RawEntryMut::Vacant(entry) => {
// only just now we allocate the string
let key = StrHashGlobal::new(s.into(), h);
entry.insert_with_hasher(h, key, idx, |s| s.hash);
}
};
}
// we still need to check if the idx is already stored in our map
self.reverse_mapping.insert(idx, s);
self.array_builder.push(Some(idx));
Expand All @@ -174,17 +214,22 @@ impl CategoricalChunkedBuilder {
}
}
} else {
let mut mapping = PlHashMap::new();
let mut mapping = PlHashMap::with_capacity(HASHMAP_INIT_SIZE);
let hb = mapping.hasher().clone();
for opt_s in i {
match opt_s {
Some(s) => {
let idx = match mapping.get(s) {
Some(idx) => *idx,
None => {
let idx = mapping.len() as u32;
let h = str::get_hash(s, &hb);
let key = StrHashLocal::new(s, h);
let mut idx = mapping.len() as u32;

let entry = mapping.raw_entry_mut().from_key_hashed_nocheck(h, &key);

match entry {
RawEntryMut::Occupied(entry) => idx = *entry.get(),
RawEntryMut::Vacant(entry) => {
entry.insert_with_hasher(h, key, idx, |s| s.hash);
self.reverse_mapping.insert(idx, s);
mapping.insert(s, idx);
idx
}
};
self.array_builder.push(Some(idx));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ mod builder;
mod from;
mod merge;
mod ops;
pub mod stringcache;

use super::*;
use crate::prelude::*;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
use crate::prelude::PlHashMap;
use once_cell::sync::Lazy;
use smartstring::{LazyCompact, SmartString};
use std::borrow::Borrow;
use std::hash::{Hash, Hasher};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Mutex, MutexGuard};
use std::time::{SystemTime, UNIX_EPOCH};

pub(crate) static USE_STRING_CACHE: AtomicBool = AtomicBool::new(false);

pub fn with_string_cache<F: FnOnce() -> T, T>(func: F) -> T {
toggle_string_cache(true);
let out = func();
toggle_string_cache(false);
out
}

/// Use a global string cache for the Categorical Types.
///
/// This is used to cache the string categories locally.
/// This allows join operations on categorical types.
pub fn toggle_string_cache(toggle: bool) {
USE_STRING_CACHE.store(toggle, Ordering::Release);

if !toggle {
STRING_CACHE.clear()
}
}

/// Reset the global string cache used for the Categorical Types.
pub fn reset_string_cache() {
STRING_CACHE.clear()
}

/// Check if string cache is set.
pub(crate) fn use_string_cache() -> bool {
USE_STRING_CACHE.load(Ordering::Acquire)
}

pub(crate) struct SCacheInner {
pub(crate) map: PlHashMap<StrHashGlobal, u32>,
pub(crate) uuid: u128,
}

impl Default for SCacheInner {
fn default() -> Self {
Self {
map: Default::default(),
uuid: SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_nanos(),
}
}
}

/// Used by categorical data that need to share global categories.
/// In *eager* you need to specifically toggle global string cache to have a global effect.
/// In *lazy* it is toggled on at the start of a computation run and turned of (deleted) when a
/// result is produced.
pub(crate) struct StringCache(pub(crate) Mutex<SCacheInner>);

impl StringCache {
pub(crate) fn lock_map(&self) -> MutexGuard<SCacheInner> {
self.0.lock().unwrap()
}

pub(crate) fn clear(&self) {
let mut lock = self.lock_map();
*lock = Default::default();
}
}

impl Default for StringCache {
fn default() -> Self {
StringCache(Mutex::new(Default::default()))
}
}

pub(crate) static STRING_CACHE: Lazy<StringCache> = Lazy::new(Default::default);

#[derive(Eq, Clone)]
pub struct StrHashGlobal {
pub(crate) str: SmartString<LazyCompact>,
pub(crate) hash: u64,
}

impl<'a> Hash for StrHashGlobal {
fn hash<H: Hasher>(&self, state: &mut H) {
state.write_u64(self.hash)
}
}

impl StrHashGlobal {
pub(crate) fn new(s: SmartString<LazyCompact>, hash: u64) -> Self {
Self { str: s, hash }
}
}

impl PartialEq for StrHashGlobal {
fn eq(&self, other: &Self) -> bool {
// can be collisions in the hashtable even though the hashes are equal
// e.g. hashtable hash = hash % n_slots
(self.hash == other.hash) && (self.str == other.str)
}
}

impl Borrow<str> for StrHashGlobal {
fn borrow(&self) -> &str {
self.str.as_str()
}
}
93 changes: 3 additions & 90 deletions polars/polars-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,14 @@ mod tests;
pub(crate) mod vector_hasher;
use once_cell::sync::Lazy;

#[cfg(any(feature = "dtype-categorical", feature = "object"))]
#[cfg(feature = "object")]
use std::time::{SystemTime, UNIX_EPOCH};

#[cfg(feature = "dtype-categorical")]
use ahash::AHashMap;
use rayon::{ThreadPool, ThreadPoolBuilder};
#[cfg(feature = "dtype-categorical")]
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Mutex;

#[cfg(feature = "dtype-categorical")]
use std::sync::MutexGuard;
pub use crate::chunked_array::logical::categorical::stringcache::*;

#[cfg(feature = "object")]
pub(crate) static PROCESS_ID: Lazy<u128> = Lazy::new(|| {
Expand All @@ -60,89 +57,5 @@ pub static POOL: Lazy<ThreadPool> = Lazy::new(|| {
.expect("could not spawn threads")
});

#[cfg(feature = "dtype-categorical")]
struct SCacheInner {
map: AHashMap<String, u32>,
uuid: u128,
}

#[cfg(feature = "dtype-categorical")]
impl Default for SCacheInner {
fn default() -> Self {
Self {
map: Default::default(),
uuid: SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_nanos(),
}
}
}

/// Used by categorical data that need to share global categories.
/// In *eager* you need to specifically toggle global string cache to have a global effect.
/// In *lazy* it is toggled on at the start of a computation run and turned of (deleted) when a
/// result is produced.
#[cfg(feature = "dtype-categorical")]
pub(crate) struct StringCache(pub(crate) Mutex<SCacheInner>);

#[cfg(feature = "dtype-categorical")]
impl StringCache {
pub(crate) fn lock_map(&self) -> MutexGuard<SCacheInner> {
self.0.lock().unwrap()
}

pub(crate) fn clear(&self) {
let mut lock = self.lock_map();
*lock = Default::default();
}
}

#[cfg(feature = "dtype-categorical")]
impl Default for StringCache {
fn default() -> Self {
StringCache(Mutex::new(Default::default()))
}
}

#[cfg(feature = "dtype-categorical")]
pub(crate) static USE_STRING_CACHE: AtomicBool = AtomicBool::new(false);

#[cfg(feature = "dtype-categorical")]
pub(crate) static STRING_CACHE: Lazy<StringCache> = Lazy::new(Default::default);

// utility for the tests to ensure a single thread can execute
pub static SINGLE_LOCK: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));

#[cfg(feature = "dtype-categorical")]
pub fn with_string_cache<F: FnOnce() -> T, T>(func: F) -> T {
toggle_string_cache(true);
let out = func();
toggle_string_cache(false);
out
}

/// Use a global string cache for the Categorical Types.
///
/// This is used to cache the string categories locally.
/// This allows join operations on categorical types.
#[cfg(feature = "dtype-categorical")]
pub fn toggle_string_cache(toggle: bool) {
USE_STRING_CACHE.store(toggle, Ordering::Release);

if !toggle {
STRING_CACHE.clear()
}
}

/// Reset the global string cache used for the Categorical Types.
#[cfg(feature = "dtype-categorical")]
pub fn reset_string_cache() {
STRING_CACHE.clear()
}

/// Check if string cache is set.
#[cfg(feature = "dtype-categorical")]
pub(crate) fn use_string_cache() -> bool {
USE_STRING_CACHE.load(Ordering::Acquire)
}
2 changes: 1 addition & 1 deletion polars/polars-core/src/vector_hasher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ impl<'a> StrHash<'a> {

impl<'a> PartialEq for StrHash<'a> {
fn eq(&self, other: &Self) -> bool {
self.str == other.str
(self.hash == other.hash) && (self.str == other.str)
}
}

Expand Down
12 changes: 12 additions & 0 deletions py-polars/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 3419ad7

Please sign in to comment.