In [None]:
import polars as pl
from polars.exceptions import StringCacheMismatchError

Enum
- An ordered categorical data type whose categories (strings) are known in advance.

In [None]:
# Enum dtype: the full list of allowed categories is declared first,
# then a Series is created with that dtype.
bears_enum = pl.Enum(["Polar", "Panda", "Brown"])
bears = pl.Series(
    ["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=bears_enum
)
print(bears)

In [None]:
# Filtering that relies on the ordering of the Enum categories:
# the levels are declared from least to most severe, and the filter
# keeps the rows whose level is greater than "debug".
log_levels = pl.Enum(["debug", "info", "warning", "error"])

logs = pl.DataFrame(
    {
        "level": ["debug", "info", "debug", "error"],
        "message": [
            "process id: 525",
            "Service started correctly",
            "startup time: 67ms",
            "Cannot connect to DB!",
        ],
    },
    # Store the "level" column as the Enum declared above.
    schema_overrides={"level": log_levels},
)

# `.gt(...)` is the method form of the `>` operator.
non_debug_logs = logs.filter(pl.col("level").gt("debug"))
print(non_debug_logs)

Categorical

In [None]:
# Categorical data: unlike the Enum example, no category list is
# supplied up front; only the dtype is requested.
bears_cat = pl.Series(
    ["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=pl.Categorical
)
print(bears_cat)

In [None]:
# Plain string series (no Categorical dtype) used in the second comparison.
bears_str = pl.Series(["Panda", "Brown", "Brown", "Polar", "Polar"])

# Lexical comparison: the Categorical series against a single string ...
print(bears_cat < "Cat")

# ... and element-wise equality against the plain string series.
print(bears_cat == bears_str)

In [None]:
# Comparing two Categorical series that were built independently.
# This is the failing case: the mismatch error is caught and only the
# first line of its message is printed.
bears_cat2 = pl.Series(
    ["Panda", "Brown", "Brown", "Polar", "Polar"], dtype=pl.Categorical
)

try:
    print(bears_cat == bears_cat2)
except StringCacheMismatchError as err:
    exc_str = str(err).splitlines()[0]
    print("StringCacheMismatchError:", exc_str)


# Passing case: both series are rebuilt inside a single StringCache
# context; the comparison itself runs after the context has been left.
with pl.StringCache():
    bears_cat = pl.Series(
        ["Polar", "Panda", "Brown", "Brown", "Polar"],
        dtype=pl.Categorical,
    )
    bears_cat2 = pl.Series(
        ["Panda", "Brown", "Brown", "Polar", "Polar"],
        dtype=pl.Categorical,
    )

print(bears_cat == bears_cat2)