Skip to content

Commit

Permalink
feat(rust, python): impl hex and base64 for binary (#5892)
Browse files Browse the repository at this point in the history
  • Loading branch information
ozgrakkurt committed Dec 26, 2022
1 parent ab3b2c2 commit 3dccb4a
Show file tree
Hide file tree
Showing 26 changed files with 579 additions and 87 deletions.
1 change: 1 addition & 0 deletions polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ mode = ["polars-core/mode", "polars-lazy/mode"]
take_opt_iter = ["polars-core/take_opt_iter"]
extract_jsonpath = ["polars-core/strings", "polars-ops/extract_jsonpath", "polars-ops/strings"]
string_encoding = ["polars-core/string_encoding", "polars-core/strings"]
binary_encoding = ["polars-core/binary_encoding", "dtype-binary"]
groupby_list = ["polars-core/groupby_list"]
lazy_regex = ["polars-lazy/regex"]
cum_agg = ["polars-core/cum_agg", "polars-core/cum_agg"]
Expand Down
3 changes: 3 additions & 0 deletions polars/polars-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ string_encoding = ["base64", "hex"]
# support for ObjectChunked<T> (downcastable Series of any type)
object = ["serde_json"]

# extra utilities for BinaryChunked
binary_encoding = ["base64", "hex", "dtype-binary"]

fmt = ["comfy-table/tty"]
fmt_no_tty = ["comfy-table"]

Expand Down
33 changes: 33 additions & 0 deletions polars/polars-core/src/chunked_array/binary/encoding.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
use {base64, hex};

use crate::prelude::*;

impl BinaryChunked {
    /// Decode every value from its hexadecimal representation.
    ///
    /// # Errors
    /// Returns a `ComputeError` carrying the decoder's message when a value
    /// is not valid hex.
    pub fn hex_decode(&self) -> PolarsResult<BinaryChunked> {
        self.try_apply(|encoded| match hex::decode(encoded) {
            Ok(decoded) => Ok(decoded.into()),
            Err(err) => Err(PolarsError::ComputeError(err.to_string().into())),
        })
    }

    /// Encode every value as hexadecimal text and return it as a `Utf8` series.
    pub fn hex_encode(&self) -> Series {
        let encoded = self.apply(|raw| hex::encode(raw).into_bytes().into());
        // The encoder produced these bytes from a `String`, so the unchecked
        // cast to Utf8 is expected to succeed.
        encoded.cast_unchecked(&DataType::Utf8).unwrap()
    }

    /// Decode every value from its base64 representation.
    ///
    /// # Errors
    /// Returns a `ComputeError` carrying the decoder's message when a value
    /// is not valid base64.
    pub fn base64_decode(&self) -> PolarsResult<BinaryChunked> {
        self.try_apply(|encoded| match base64::decode(encoded) {
            Ok(decoded) => Ok(decoded.into()),
            Err(err) => Err(PolarsError::ComputeError(err.to_string().into())),
        })
    }

    /// Encode every value as base64 text and return it as a `Utf8` series.
    pub fn base64_encode(&self) -> Series {
        let encoded = self.apply(|raw| base64::encode(raw).into_bytes().into());
        // The encoder produced these bytes from a `String`, so the unchecked
        // cast to Utf8 is expected to succeed.
        encoded.cast_unchecked(&DataType::Utf8).unwrap()
    }
}
2 changes: 2 additions & 0 deletions polars/polars-core/src/chunked_array/binary/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#[cfg(feature = "binary_encoding")]
mod encoding;
2 changes: 2 additions & 0 deletions polars/polars-core/src/chunked_array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ pub mod kernels;
#[cfg(feature = "ndarray")]
mod ndarray;

#[cfg(feature = "dtype-binary")]
pub mod binary;
mod bitwise;
#[cfg(feature = "object")]
mod drop;
Expand Down
65 changes: 31 additions & 34 deletions polars/polars-core/src/chunked_array/strings/encoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,50 +3,47 @@ use {base64, hex};
use crate::prelude::*;

impl Utf8Chunked {
#[cfg(feature = "string_encoding")]
pub fn hex_decode(&self, strict: Option<bool>) -> PolarsResult<Utf8Chunked> {
let ca = self.apply_on_opt(|e| {
e.and_then(|s| {
hex::decode(s)
// Safety
// We already know that it is a valid utf8.
.map(|bytes| Some(unsafe { String::from_utf8_unchecked(bytes) }.into()))
.unwrap_or(None)
})
});
#[cfg(not(feature = "binary_encoding"))]
pub fn hex_decode(&self) -> PolarsResult<Utf8Chunked> {
self.try_apply(|s| {
let bytes =
hex::decode(s).map_err(|e| PolarsError::ComputeError(e.to_string().into()))?;
let s = String::from_utf8(bytes)
.map_err(|e| PolarsError::ComputeError(e.to_string().into()))?;
Ok(s.into())
})
}

if strict.unwrap_or(false) && (ca.null_count() != self.null_count()) {
Err(PolarsError::ComputeError("Unable to decode inputs".into()))
} else {
Ok(ca)
}
#[cfg(feature = "binary_encoding")]
pub fn hex_decode(&self) -> PolarsResult<BinaryChunked> {
self.cast_unchecked(&DataType::Binary)?
.binary()?
.hex_decode()
}
#[cfg(feature = "string_encoding")]

#[must_use]
pub fn hex_encode(&self) -> Utf8Chunked {
self.apply(|s| hex::encode(s).into())
}

#[cfg(feature = "string_encoding")]
pub fn base64_decode(&self, strict: Option<bool>) -> PolarsResult<Utf8Chunked> {
let ca = self.apply_on_opt(|e| {
e.and_then(|s| {
base64::decode(s)
// Safety
// We already know that it is a valid utf8.
.map(|bytes| Some(unsafe { String::from_utf8_unchecked(bytes) }.into()))
.unwrap_or(None)
})
});
#[cfg(not(feature = "binary_encoding"))]
pub fn base64_decode(&self) -> PolarsResult<Utf8Chunked> {
self.try_apply(|s| {
let bytes =
base64::decode(s).map_err(|e| PolarsError::ComputeError(e.to_string().into()))?;
let s = String::from_utf8(bytes)
.map_err(|e| PolarsError::ComputeError(e.to_string().into()))?;
Ok(s.into())
})
}

if strict.unwrap_or(false) && (ca.null_count() != self.null_count()) {
Err(PolarsError::ComputeError("Unable to decode inputs".into()))
} else {
Ok(ca)
}
#[cfg(feature = "binary_encoding")]
pub fn base64_decode(&self) -> PolarsResult<BinaryChunked> {
self.cast_unchecked(&DataType::Binary)?
.binary()?
.base64_decode()
}

#[cfg(feature = "string_encoding")]
#[must_use]
pub fn base64_encode(&self) -> Utf8Chunked {
self.apply(|s| base64::encode(s).into())
Expand Down
25 changes: 25 additions & 0 deletions polars/polars-lazy/polars-plan/src/dsl/binary.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
use super::function_expr::BinaryFunction;
use super::*;
/// Specialized expressions for [`Series`] of [`DataType::Binary`].
pub struct BinaryNameSpace(pub(crate) Expr);

impl BinaryNameSpace {
    /// Build an expression that checks whether each binary value contains
    /// the literal byte sequence `pat`.
    pub fn contains_literal<S: AsRef<[u8]>>(self, pat: S) -> Expr {
        let function = BinaryFunction::Contains {
            pat: pat.as_ref().to_vec(),
            literal: true,
        };
        self.0.map_private(function.into())
    }

    /// Build an expression that checks whether each binary value ends with
    /// the byte sequence `sub`.
    pub fn ends_with<S: AsRef<[u8]>>(self, sub: S) -> Expr {
        let function = BinaryFunction::EndsWith(sub.as_ref().to_vec());
        self.0.map_private(function.into())
    }

    /// Build an expression that checks whether each binary value starts with
    /// the byte sequence `sub`.
    pub fn starts_with<S: AsRef<[u8]>>(self, sub: S) -> Expr {
        let function = BinaryFunction::StartsWith(sub.as_ref().to_vec());
        self.0.map_private(function.into())
    }
}
49 changes: 49 additions & 0 deletions polars/polars-lazy/polars-plan/src/dsl/function_expr/binary.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

use super::*;

/// Functions that can be applied to binary expressions.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Clone, PartialEq, Debug, Eq, Hash)]
pub enum BinaryFunction {
    /// Check whether a value contains `pat`; `literal` selects the
    /// literal-matching implementation.
    Contains { pat: Vec<u8>, literal: bool },
    /// Check whether a value starts with the given bytes.
    StartsWith(Vec<u8>),
    /// Check whether a value ends with the given bytes.
    EndsWith(Vec<u8>),
}

impl Display for BinaryFunction {
    /// Write the function name prefixed with the binary namespace,
    /// e.g. `bin.contains`.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let s = match self {
            BinaryFunction::Contains { .. } => "contains",
            BinaryFunction::StartsWith(_) => "starts_with",
            BinaryFunction::EndsWith(_) => "ends_with",
        };

        // These are binary functions, not string functions: a `str.` prefix
        // would make them indistinguishable from `StringFunction` in output.
        write!(f, "bin.{s}")
    }
}

/// Check each binary value of `s` for the pattern `pat`.
///
/// `literal` chooses between `contains_literal` and `contains` on the
/// underlying binary array. Fails if `s` is not a binary series or if the
/// selected check itself returns an error.
pub(super) fn contains(s: &Series, pat: &[u8], literal: bool) -> PolarsResult<Series> {
    let ca = s.binary()?;
    match literal {
        true => Ok(ca.contains_literal(pat)?.into_series()),
        false => Ok(ca.contains(pat)?.into_series()),
    }
}

/// Check whether each binary value of `s` ends with `sub`.
///
/// Fails if `s` cannot be viewed as a binary series.
pub(super) fn ends_with(s: &Series, sub: &[u8]) -> PolarsResult<Series> {
    s.binary().map(|ca| ca.ends_with(sub).into_series())
}
/// Check whether each binary value of `s` starts with `sub`.
///
/// Fails if `s` cannot be viewed as a binary series.
pub(super) fn starts_with(s: &Series, sub: &[u8]) -> PolarsResult<Series> {
    s.binary().map(|ca| ca.starts_with(sub).into_series())
}

impl From<BinaryFunction> for FunctionExpr {
fn from(b: BinaryFunction) -> Self {
FunctionExpr::BinaryExpr(b)
}
}
28 changes: 28 additions & 0 deletions polars/polars-lazy/polars-plan/src/dsl/function_expr/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#[cfg(feature = "arg_where")]
mod arg_where;
#[cfg(feature = "dtype-binary")]
mod binary;
#[cfg(feature = "round_series")]
mod clip;
#[cfg(feature = "temporal")]
Expand Down Expand Up @@ -38,6 +40,8 @@ use polars_core::prelude::*;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

#[cfg(feature = "dtype-binary")]
pub(crate) use self::binary::BinaryFunction;
#[cfg(feature = "temporal")]
pub(super) use self::datetime::TemporalFunction;
pub(super) use self::nan::NanFunction;
Expand All @@ -64,6 +68,8 @@ pub enum FunctionExpr {
SearchSorted,
#[cfg(feature = "strings")]
StringExpr(StringFunction),
#[cfg(feature = "dtype-binary")]
BinaryExpr(BinaryFunction),
#[cfg(feature = "temporal")]
TemporalExpr(TemporalFunction),
#[cfg(feature = "date_offset")]
Expand Down Expand Up @@ -130,6 +136,8 @@ impl Display for FunctionExpr {
SearchSorted => "search_sorted",
#[cfg(feature = "strings")]
StringExpr(s) => return write!(f, "{s}"),
#[cfg(feature = "dtype-binary")]
BinaryExpr(b) => return write!(f, "{b}"),
#[cfg(feature = "temporal")]
TemporalExpr(fun) => return write!(f, "{fun}"),
#[cfg(feature = "date_offset")]
Expand Down Expand Up @@ -277,6 +285,8 @@ impl From<FunctionExpr> for SpecialEq<Arc<dyn SeriesUdf>> {
}
#[cfg(feature = "strings")]
StringExpr(s) => s.into(),
#[cfg(feature = "dtype-binary")]
BinaryExpr(s) => s.into(),
#[cfg(feature = "temporal")]
TemporalExpr(func) => func.into(),

Expand Down Expand Up @@ -403,6 +413,24 @@ impl From<StringFunction> for SpecialEq<Arc<dyn SeriesUdf>> {
}
}

#[cfg(feature = "dtype-binary")]
impl From<BinaryFunction> for SpecialEq<Arc<dyn SeriesUdf>> {
    /// Turn a binary function description into the series UDF that runs it,
    /// dispatching each variant to its implementation in the `binary` module.
    fn from(func: BinaryFunction) -> Self {
        match func {
            BinaryFunction::Contains { pat, literal } => map!(binary::contains, &pat, literal),
            BinaryFunction::EndsWith(sub) => map!(binary::ends_with, &sub),
            BinaryFunction::StartsWith(sub) => map!(binary::starts_with, &sub),
        }
    }
}

#[cfg(feature = "temporal")]
impl From<TemporalFunction> for SpecialEq<Arc<dyn SeriesUdf>> {
fn from(func: TemporalFunction) -> Self {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,13 @@ impl FunctionExpr {
}
}
}
#[cfg(feature = "dtype-binary")]
BinaryExpr(s) => {
use BinaryFunction::*;
match s {
Contains { .. } | EndsWith(_) | StartsWith(_) => with_dtype(DataType::Boolean),
}
}
#[cfg(feature = "temporal")]
TemporalExpr(fun) => {
use TemporalFunction::*;
Expand Down
7 changes: 7 additions & 0 deletions polars/polars-lazy/polars-plan/src/dsl/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
pub mod cat;
#[cfg(feature = "dtype-categorical")]
pub use cat::*;
#[cfg(feature = "dtype-binary")]
pub mod binary;
#[cfg(feature = "temporal")]
mod dt;
mod expr;
Expand Down Expand Up @@ -2244,6 +2246,11 @@ impl Expr {
string::StringNameSpace(self)
}

/// Wrap this expression in a [`binary::BinaryNameSpace`], which exposes the
/// binary-specific expression methods.
#[cfg(feature = "dtype-binary")]
pub fn binary(self) -> binary::BinaryNameSpace {
binary::BinaryNameSpace(self)
}

#[cfg(feature = "temporal")]
pub fn dt(self) -> dt::DateLikeNameSpace {
dt::DateLikeNameSpace(self)
Expand Down
3 changes: 2 additions & 1 deletion polars/polars-ops/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ description = "More operations on polars data structures"
[dependencies]
arrow.workspace = true
jsonpath_lib = { version = "0.3.0", optional = true, git = "https://github.com/ritchie46/jsonpath", branch = "improve_compiled" }
memchr = { version = "2", optional = true }
polars-arrow = { version = "0.26.1", path = "../polars-arrow", default-features = false }
polars-core = { version = "0.26.1", path = "../polars-core", features = ["private"], default-features = false }
polars-utils = { version = "0.26.1", path = "../polars-utils", default-features = false }
Expand All @@ -26,7 +27,7 @@ dtype-datetime = ["polars-core/dtype-datetime", "polars-core/temporal"]
dtype-time = ["polars-core/dtype-time", "polars-core/temporal"]
dtype-duration = ["polars-core/dtype-duration", "polars-core/temporal"]
dtype-struct = ["polars-core/dtype-struct", "polars-core/temporal"]
dtype-binary = ["polars-core/dtype-binary", "polars-core/dtype-binary"]
dtype-binary = ["polars-core/dtype-binary", "polars-core/dtype-binary", "memchr"]
dtype-u8 = ["polars-core/dtype-u8"]
dtype-u16 = ["polars-core/dtype-u16"]
dtype-i8 = ["polars-core/dtype-i8"]
Expand Down
15 changes: 15 additions & 0 deletions polars/polars-ops/src/chunked_array/binary/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#[cfg_attr(docsrs, doc(cfg(feature = "dtype-binary")))]
mod namespace;

pub use namespace::*;
use polars_core::prelude::*;

/// Types that can be borrowed as a [`BinaryChunked`].
pub trait AsBinary {
/// Borrow `self` as a [`BinaryChunked`].
fn as_binary(&self) -> &BinaryChunked;
}

impl AsBinary for BinaryChunked {
// `self` is already a `BinaryChunked`, so it is returned as-is.
fn as_binary(&self) -> &BinaryChunked {
self
}
}

0 comments on commit 3dccb4a

Please sign in to comment.