Skip to content

Commit

Permalink
feat[rust,python]: support nested StringCache scope, enabling clean c…
Browse files Browse the repository at this point in the history
…omposition of frame pipeline units (#4665)
  • Loading branch information
alexander-beedie committed Sep 1, 2022
1 parent 298b978 commit 6f872e6
Show file tree
Hide file tree
Showing 9 changed files with 72 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use polars_arrow::trusted_len::PushUnchecked;

use crate::frame::groupby::hashing::HASHMAP_INIT_SIZE;
use crate::prelude::*;
use crate::{datatypes::PlHashMap, use_string_cache, StrHashGlobal, StringCache, POOL};
use crate::{datatypes::PlHashMap, using_string_cache, StrHashGlobal, StringCache, POOL};

pub enum RevMappingBuilder {
/// Hashmap: maps the indexes from the global cache/categorical array to indexes in the local Utf8Array
Expand Down Expand Up @@ -58,7 +58,7 @@ impl Default for RevMapping {
fn default() -> Self {
let slice: &[Option<&str>] = &[];
let cats = Utf8Array::<i64>::from(slice);
if use_string_cache() {
if using_string_cache() {
let cache = &mut crate::STRING_CACHE.lock_map();
let id = cache.uuid;
RevMapping::Global(Default::default(), cats, id)
Expand Down Expand Up @@ -388,7 +388,7 @@ impl CategoricalChunkedBuilder {
where
I: IntoIterator<Item = Option<&'a str>>,
{
if use_string_cache() {
if using_string_cache() {
self.build_global_map_contention(i)
} else {
let _ = self.build_local_map(i, false);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use arrow::datatypes::IntegerType;
use polars_arrow::compute::cast::cast;

use super::*;
use crate::use_string_cache;
use crate::using_string_cache;

impl From<&CategoricalChunked> for DictionaryArray<u32> {
fn from(ca: &CategoricalChunked) -> Self {
Expand Down Expand Up @@ -91,7 +91,7 @@ impl CategoricalChunked {
keys: &PrimitiveArray<u32>,
values: &Utf8Array<i64>,
) -> Self {
if use_string_cache() {
if using_string_cache() {
let mut builder = CategoricalChunkedBuilder::new(name, keys.len());
builder.global_map_from_local(keys, values.clone());
builder.finish()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ pub fn with_string_cache<F: FnOnce() -> T, T>(func: F) -> T {
/// This allows join operations on categorical types.
pub fn toggle_string_cache(toggle: bool) {
USE_STRING_CACHE.store(toggle, Ordering::Release);

if !toggle {
STRING_CACHE.clear()
}
Expand All @@ -38,7 +37,7 @@ pub fn reset_string_cache() {
}

/// Check if string cache is set.
pub(crate) fn use_string_cache() -> bool {
pub fn using_string_cache() -> bool {
USE_STRING_CACHE.load(Ordering::Acquire)
}

Expand Down
12 changes: 6 additions & 6 deletions polars/polars-lazy/src/frame/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -687,9 +687,9 @@ impl LazyFrame {
pub fn collect(self) -> Result<DataFrame> {
let file_caching = self.opt_state.file_caching;
#[cfg(feature = "dtype-categorical")]
let use_string_cache = self.opt_state.global_string_cache;
let using_string_cache = self.opt_state.global_string_cache;
#[cfg(feature = "dtype-categorical")]
if use_string_cache {
if using_string_cache {
eprint!("global string cache in combination with LazyFrames is deprecated; please set the global string cache globally.")
}
let mut expr_arena = Arena::with_capacity(256);
Expand All @@ -698,8 +698,8 @@ impl LazyFrame {

// if string cache was already set, we skip this and global settings are respected
#[cfg(feature = "dtype-categorical")]
if use_string_cache {
toggle_string_cache(use_string_cache);
if using_string_cache {
toggle_string_cache(using_string_cache);
}

let finger_prints = if file_caching {
Expand Down Expand Up @@ -729,8 +729,8 @@ impl LazyFrame {
state.file_cache.assert_empty();
}
#[cfg(feature = "dtype-categorical")]
if use_string_cache {
toggle_string_cache(!use_string_cache);
if using_string_cache {
toggle_string_cache(!using_string_cache);
}
out
}
Expand Down
4 changes: 2 additions & 2 deletions polars/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -344,9 +344,9 @@ pub mod prelude;

pub use polars_core::apply_method_all_arrow_series;
pub use polars_core::df;
#[cfg(feature = "dtype-categorical")]
pub use polars_core::toggle_string_cache;
pub use polars_core::{chunked_array, datatypes, doc, error, frame, functions, series, testing};
#[cfg(feature = "dtype-categorical")]
pub use polars_core::{toggle_string_cache, using_string_cache};
#[cfg(feature = "polars-io")]
pub use polars_io as io;
#[cfg(feature = "lazy")]
Expand Down
3 changes: 2 additions & 1 deletion py-polars/polars/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def version() -> str:
scan_parquet,
)
from polars.show_versions import show_versions
from polars.string_cache import StringCache, toggle_string_cache
from polars.string_cache import StringCache, toggle_string_cache, using_string_cache
from polars.utils import threadpool_size

__all__ = [
Expand Down Expand Up @@ -191,6 +191,7 @@ def version() -> str:
# polars.stringcache
"StringCache",
"toggle_string_cache",
"using_string_cache",
# polars.config
"Config",
# polars.internal.when
Expand Down
22 changes: 17 additions & 5 deletions py-polars/polars/string_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
from types import TracebackType

try:
from polars.polars import toggle_string_cache as pytoggle_string_cache
from polars.polars import toggle_string_cache as _toggle_string_cache
from polars.polars import using_string_cache as _using_string_cache

_DOCUMENTING = False
except ImportError:
Expand All @@ -15,7 +16,8 @@ class StringCache:
Context manager that allows data sources to share the same categorical features.
This will temporarily cache the string categories until the context manager is
finished.
finished. If StringCaches are nested, the global cache will only be invalidated
when the outermost context exits.
Examples
--------
Expand Down Expand Up @@ -63,7 +65,9 @@ def __init__(self) -> None:
pass

def __enter__(self) -> StringCache:
pytoggle_string_cache(True)
self._already_enabled = _using_string_cache()
if not self._already_enabled:
_toggle_string_cache(True)
return self

def __exit__(
Expand All @@ -72,7 +76,10 @@ def __exit__(
exc_val: BaseException | None,
exc_tb: TracebackType | None,
) -> None:
pytoggle_string_cache(False)
# note: if global string cache was already enabled
# on __enter__, do NOT reset it on __exit__
if not self._already_enabled:
_toggle_string_cache(False)


def toggle_string_cache(toggle: bool) -> None:
Expand All @@ -83,4 +90,9 @@ def toggle_string_cache(toggle: bool) -> None:
are equal.
"""
pytoggle_string_cache(toggle)
_toggle_string_cache(toggle)


def using_string_cache() -> bool:
"""Return the current state of the global string cache (enabled/disabled)."""
return _using_string_cache()
6 changes: 6 additions & 0 deletions py-polars/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,11 @@ fn toggle_string_cache(toggle: bool) {
polars::toggle_string_cache(toggle)
}

#[pyfunction]
fn using_string_cache() -> bool {
polars::using_string_cache()
}

#[pyfunction]
fn concat_str(s: Vec<dsl::PyExpr>, sep: &str) -> dsl::PyExpr {
let s = s.into_iter().map(|e| e.inner).collect::<Vec<_>>();
Expand Down Expand Up @@ -505,6 +510,7 @@ fn polars(py: Python, m: &PyModule) -> PyResult<()> {
m.add_wrapped(wrap_pyfunction!(version)).unwrap();
m.add_wrapped(wrap_pyfunction!(toggle_string_cache))
.unwrap();
m.add_wrapped(wrap_pyfunction!(using_string_cache)).unwrap();
m.add_wrapped(wrap_pyfunction!(concat_str)).unwrap();
m.add_wrapped(wrap_pyfunction!(concat_lst)).unwrap();
m.add_wrapped(wrap_pyfunction!(concat_df)).unwrap();
Expand Down
33 changes: 33 additions & 0 deletions py-polars/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,3 +225,36 @@ def test_merge_lit_under_global_cache_4491() -> None:
.then(pl.col("label"))
.otherwise(pl.lit(None, pl.Categorical))
).to_dict(False) == {"label": [None, "bar"], "value": [3, 9]}


def test_nested_cache_composition() -> None:
# very artificial example/test, but validates the behaviour
# of ested StringCache scopes, which we want to play well
# with each other when composing more complex pipelines.

assert pl.using_string_cache() is False

# function representing a composable stage of a pipeline; it implements
# an inner scope for the case where it is called by itself, but when
# called as part of a larger series of ops it should not invalidate
# the string cache (the outermost scope should be respected).
def create_lazy(data: dict) -> pl.LazyFrame: # type: ignore[type-arg]
with pl.StringCache():
df = pl.DataFrame({"a": ["foo", "bar", "ham"], "b": [1, 2, 3]})
lf = df.with_column(pl.col("a").cast(pl.Categorical)).lazy()

# confirm that scope-exit does NOT invalidate the
# cache yet, as an outer context is still active
assert pl.using_string_cache() is True
return lf

# this outer scope should be respected
with pl.StringCache():
lf1 = create_lazy({"a": ["foo", "bar", "ham"], "b": [1, 2, 3]})
lf2 = create_lazy({"a": ["spam", "foo", "eggs"], "c": [3, 2, 2]})

res = lf1.join(lf2, on="a", how="inner").collect().rows()
assert sorted(res) == [("bar", 2, 2), ("foo", 1, 1), ("ham", 3, 3)]

# no other scope active; NOW we expect the cache to have been invalidated
assert pl.using_string_cache() is False

0 comments on commit 6f872e6

Please sign in to comment.