-
-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Improve performance of categorical casting (#3724)
* cast to categorical (non-global) 35% faster
* refactor stringcache
* improve global cache
- Loading branch information
Showing
7 changed files
with
192 additions
and
107 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,7 @@ mod builder; | |
mod from; | ||
mod merge; | ||
mod ops; | ||
pub mod stringcache; | ||
|
||
use super::*; | ||
use crate::prelude::*; | ||
|
113 changes: 113 additions & 0 deletions
113
polars/polars-core/src/chunked_array/logical/categorical/stringcache.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
use crate::prelude::PlHashMap; | ||
use once_cell::sync::Lazy; | ||
use smartstring::{LazyCompact, SmartString}; | ||
use std::borrow::Borrow; | ||
use std::hash::{Hash, Hasher}; | ||
use std::sync::atomic::{AtomicBool, Ordering}; | ||
use std::sync::{Mutex, MutexGuard}; | ||
use std::time::{SystemTime, UNIX_EPOCH}; | ||
|
||
/// Process-wide flag recording whether the global string cache is switched on.
///
/// Starts out `false`; it is written by `toggle_string_cache` and read by
/// `use_string_cache`.
pub(crate) static USE_STRING_CACHE: AtomicBool = AtomicBool::new(false);
|
||
pub fn with_string_cache<F: FnOnce() -> T, T>(func: F) -> T { | ||
toggle_string_cache(true); | ||
let out = func(); | ||
toggle_string_cache(false); | ||
out | ||
} | ||
|
||
/// Use a global string cache for the Categorical Types. | ||
/// | ||
/// This is used to cache the string categories locally. | ||
/// This allows join operations on categorical types. | ||
pub fn toggle_string_cache(toggle: bool) { | ||
USE_STRING_CACHE.store(toggle, Ordering::Release); | ||
|
||
if !toggle { | ||
STRING_CACHE.clear() | ||
} | ||
} | ||
|
||
/// Reset the global string cache used for the Categorical Types. | ||
pub fn reset_string_cache() { | ||
STRING_CACHE.clear() | ||
} | ||
|
||
/// Check if string cache is set. | ||
pub(crate) fn use_string_cache() -> bool { | ||
USE_STRING_CACHE.load(Ordering::Acquire) | ||
} | ||
|
||
pub(crate) struct SCacheInner { | ||
pub(crate) map: PlHashMap<StrHashGlobal, u32>, | ||
pub(crate) uuid: u128, | ||
} | ||
|
||
impl Default for SCacheInner { | ||
fn default() -> Self { | ||
Self { | ||
map: Default::default(), | ||
uuid: SystemTime::now() | ||
.duration_since(UNIX_EPOCH) | ||
.unwrap() | ||
.as_nanos(), | ||
} | ||
} | ||
} | ||
|
||
/// Used by categorical data that need to share global categories. | ||
/// In *eager* you need to specifically toggle global string cache to have a global effect. | ||
/// In *lazy* it is toggled on at the start of a computation run and turned of (deleted) when a | ||
/// result is produced. | ||
pub(crate) struct StringCache(pub(crate) Mutex<SCacheInner>); | ||
|
||
impl StringCache { | ||
pub(crate) fn lock_map(&self) -> MutexGuard<SCacheInner> { | ||
self.0.lock().unwrap() | ||
} | ||
|
||
pub(crate) fn clear(&self) { | ||
let mut lock = self.lock_map(); | ||
*lock = Default::default(); | ||
} | ||
} | ||
|
||
impl Default for StringCache { | ||
fn default() -> Self { | ||
StringCache(Mutex::new(Default::default())) | ||
} | ||
} | ||
|
||
pub(crate) static STRING_CACHE: Lazy<StringCache> = Lazy::new(Default::default); | ||
|
||
#[derive(Eq, Clone)] | ||
pub struct StrHashGlobal { | ||
pub(crate) str: SmartString<LazyCompact>, | ||
pub(crate) hash: u64, | ||
} | ||
|
||
impl<'a> Hash for StrHashGlobal { | ||
fn hash<H: Hasher>(&self, state: &mut H) { | ||
state.write_u64(self.hash) | ||
} | ||
} | ||
|
||
impl StrHashGlobal { | ||
pub(crate) fn new(s: SmartString<LazyCompact>, hash: u64) -> Self { | ||
Self { str: s, hash } | ||
} | ||
} | ||
|
||
impl PartialEq for StrHashGlobal { | ||
fn eq(&self, other: &Self) -> bool { | ||
// can be collisions in the hashtable even though the hashes are equal | ||
// e.g. hashtable hash = hash % n_slots | ||
(self.hash == other.hash) && (self.str == other.str) | ||
} | ||
} | ||
|
||
impl Borrow<str> for StrHashGlobal { | ||
fn borrow(&self) -> &str { | ||
self.str.as_str() | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.