Skip to content

Commit

Permalink
str.ends_with/ str.starts_with (#3770)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Jun 22, 2022
1 parent 996587a commit 3e797f5
Show file tree
Hide file tree
Showing 10 changed files with 179 additions and 20 deletions.
38 changes: 38 additions & 0 deletions polars/polars-lazy/src/dsl/function_expr/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ mod arg_where;
#[cfg(feature = "is_in")]
mod is_in;
mod pow;
#[cfg(feature = "strings")]
mod strings;

use super::*;
use polars_core::prelude::*;
Expand All @@ -20,6 +22,15 @@ pub enum FunctionExpr {
IsIn,
#[cfg(feature = "arg_where")]
ArgWhere,
#[cfg(feature = "strings")]
StringContains {
pat: String,
literal: bool,
},
#[cfg(feature = "strings")]
StringStartsWith(String),
#[cfg(feature = "strings")]
StringEndsWith(String),
}

impl FunctionExpr {
Expand Down Expand Up @@ -52,6 +63,10 @@ impl FunctionExpr {
IsIn => with_dtype(DataType::Boolean),
#[cfg(feature = "arg_where")]
ArgWhere => with_dtype(IDX_DTYPE),
#[cfg(feature = "strings")]
StringContains { .. } | StringEndsWith(_) | StringStartsWith(_) => {
with_dtype(DataType::Boolean)
}
}
}
}
Expand All @@ -62,6 +77,17 @@ macro_rules! wrap {
};
}

// Builds a `NoEq`-wrapped, `Arc`-allocated closure that applies `$func` to the
// first `Series` of the input slice, forwarding the extra `$args` expressions.
// The closure is `move`, so anything the argument expressions mention is
// captured by value.
macro_rules! map_with_args {
    ($func:path, $($args:expr),*) => {{
        let udf = move |columns: &mut [Series]| $func(&columns[0], $($args),*);

        NoEq::new(Arc::new(udf))
    }};
}

impl From<FunctionExpr> for NoEq<Arc<dyn SeriesUdf>> {
fn from(func: FunctionExpr) -> Self {
use FunctionExpr::*;
Expand Down Expand Up @@ -92,6 +118,18 @@ impl From<FunctionExpr> for NoEq<Arc<dyn SeriesUdf>> {
ArgWhere => {
wrap!(arg_where::arg_where)
}
#[cfg(feature = "strings")]
StringContains { pat, literal } => {
map_with_args!(strings::contains, &pat, literal)
}
#[cfg(feature = "strings")]
StringEndsWith(sub) => {
map_with_args!(strings::ends_with, &sub)
}
#[cfg(feature = "strings")]
StringStartsWith(sub) => {
map_with_args!(strings::starts_with, &sub)
}
}
}
}
19 changes: 19 additions & 0 deletions polars/polars-lazy/src/dsl/function_expr/strings.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
use super::*;

/// Checks the strings of `s` for `pat`.
///
/// With `literal` set, `pat` is matched through `contains_literal`; otherwise
/// it goes through `contains`. Any error from the `utf8` downcast or from the
/// match itself is returned to the caller.
pub(super) fn contains(s: &Series, pat: &str, literal: bool) -> Result<Series> {
    let utf8 = s.utf8()?;
    let mask = if literal {
        utf8.contains_literal(pat)
    } else {
        utf8.contains(pat)
    };
    mask.map(|mask| mask.into_series())
}

/// Checks whether the strings of `s` end with `sub`.
///
/// Returns the error from the `utf8` downcast if it fails.
pub(super) fn ends_with(s: &Series, sub: &str) -> Result<Series> {
    s.utf8().map(|utf8| utf8.ends_with(sub).into_series())
}
/// Checks whether the strings of `s` start with `sub`.
///
/// Returns the error from the `utf8` downcast if it fails.
pub(super) fn starts_with(s: &Series, sub: &str) -> Result<Series> {
    s.utf8().map(|utf8| utf8.starts_with(sub).into_series())
}
32 changes: 32 additions & 0 deletions polars/polars-lazy/src/dsl/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,38 @@ use polars_time::prelude::*;
pub struct StringNameSpace(pub(crate) Expr);

impl StringNameSpace {
/// Check whether a string value contains `pat` as a literal substring.
pub fn contains_literal(self, pat: String) -> Expr {
    let function = FunctionExpr::StringContains { pat, literal: true };
    self.0.map_private(function, "str.contains_literal")
}

/// Check whether a string value contains a match for the regex `pat`.
pub fn contains(self, pat: String) -> Expr {
    let function = FunctionExpr::StringContains {
        pat,
        literal: false,
    };
    self.0.map_private(function, "str.contains")
}

/// Check whether a string value ends with the `sub` string.
pub fn ends_with(self, sub: String) -> Expr {
    let function = FunctionExpr::StringEndsWith(sub);
    self.0.map_private(function, "str.ends_with")
}

/// Check whether a string value starts with the `sub` string.
pub fn starts_with(self, sub: String) -> Expr {
    let function = FunctionExpr::StringStartsWith(sub);
    self.0.map_private(function, "str.starts_with")
}

/// Extract a regex pattern from a string value.
pub fn extract(self, pat: &str, group_index: usize) -> Expr {
let pat = pat.to_string();
let function = move |s: Series| {
Expand Down
18 changes: 18 additions & 0 deletions polars/polars-ops/src/chunked_array/strings/namespace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,24 @@ pub trait Utf8NameSpaceImpl: AsUtf8 {
Ok(out)
}

/// Check whether each string ends with the substring `sub`.
///
/// `None` entries are mapped to `None`; the result keeps the name of the
/// input array.
fn ends_with(&self, sub: &str) -> BooleanChunked {
    let ca = self.as_utf8();
    let mut mask: BooleanChunked = ca
        .into_iter()
        .map(|opt_s| opt_s.map(|s| s.ends_with(sub)))
        .collect();
    mask.rename(ca.name());
    mask
}

/// Check whether each string starts with the substring `sub`.
///
/// `None` entries are mapped to `None`; the result keeps the name of the
/// input array.
fn starts_with(&self, sub: &str) -> BooleanChunked {
    let ca = self.as_utf8();
    let mut mask: BooleanChunked = ca
        .into_iter()
        .map(|opt_s| opt_s.map(|s| s.starts_with(sub)))
        .collect();
    mask.rename(ca.name());
    mask
}

/// Replace the leftmost (sub)string by a regex pattern
fn replace(&self, pat: &str, val: &str) -> Result<Utf8Chunked> {
let ca = self.as_utf8();
Expand Down
3 changes: 2 additions & 1 deletion py-polars/docs/source/reference/expression.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ These functions can be used as expression and sometimes also in eager contexts.
median
n_unique
first
last
head
tail
lit
Expand Down Expand Up @@ -317,6 +316,8 @@ The following methods are available under the `Expr.str` attribute.
ExprStringNameSpace.ljust
ExprStringNameSpace.rjust
ExprStringNameSpace.contains
ExprStringNameSpace.starts_with
ExprStringNameSpace.ends_with
ExprStringNameSpace.json_path_match
ExprStringNameSpace.extract
ExprStringNameSpace.extract_all
Expand Down
2 changes: 2 additions & 0 deletions py-polars/docs/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,8 @@ The following methods are available under the `Series.str` attribute.
StringNameSpace.lengths
StringNameSpace.concat
StringNameSpace.contains
StringNameSpace.starts_with
StringNameSpace.ends_with
StringNameSpace.json_path_match
StringNameSpace.extract
StringNameSpace.extract_all
Expand Down
22 changes: 22 additions & 0 deletions py-polars/polars/internals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -4467,6 +4467,28 @@ def contains(self, pattern: str, literal: bool = False) -> Expr:
"""
return wrap_expr(self._pyexpr.str_contains(pattern, literal))

def ends_with(self, sub: str) -> Expr:
    """
    Check if string values end with a substring.

    Parameters
    ----------
    sub
        Suffix
    """
    pyexpr = self._pyexpr.str_ends_with(sub)
    return wrap_expr(pyexpr)

def starts_with(self, sub: str) -> Expr:
    """
    Check if string values start with a substring.

    Parameters
    ----------
    sub
        Prefix
    """
    pyexpr = self._pyexpr.str_starts_with(sub)
    return wrap_expr(pyexpr)

def json_path_match(self, json_path: str) -> Expr:
"""
Extract the first match of json string with provided JSONPath expression.
Expand Down
24 changes: 24 additions & 0 deletions py-polars/polars/internals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3990,6 +3990,30 @@ def contains(self, pattern: str, literal: bool = False) -> Series:
"""
return wrap_s(self._s.str_contains(pattern, literal))

def ends_with(self, sub: str) -> Series:
    """
    Check if string values end with a substring.

    Parameters
    ----------
    sub
        Suffix
    """
    # Evaluate through the expression API: wrap the series in a one-column
    # frame, select the predicate on that column, and unwrap the result.
    series = wrap_s(self._s)
    frame = series.to_frame()
    predicate = pli.col(series.name).str.ends_with(sub)
    return frame.select(predicate).to_series()

def starts_with(self, sub: str) -> Series:
    """
    Check if string values start with a substring.

    Parameters
    ----------
    sub
        Prefix
    """
    # Evaluate through the expression API: wrap the series in a one-column
    # frame, select the predicate on that column, and unwrap the result.
    series = wrap_s(self._s)
    frame = series.to_frame()
    predicate = pli.col(series.name).str.starts_with(sub)
    return frame.select(predicate).to_series()

def decode(self, encoding: str, strict: bool = False) -> Series:
"""
Decodes a value using the provided encoding.
Expand Down
32 changes: 13 additions & 19 deletions py-polars/src/lazy/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -586,26 +586,20 @@ impl PyExpr {
}

pub fn str_contains(&self, pat: String, literal: Option<bool>) -> PyExpr {
let function = move |s: Series| {
let ca = s.utf8()?;
let match_type = |p: &str| {
if literal.unwrap_or(false) {
ca.contains_literal(p)
} else {
ca.contains(p)
}
};
match match_type(&pat) {
Ok(ca) => Ok(ca.into_series()),
Err(e) => Err(PolarsError::ComputeError(format!("{:?}", e).into())),
}
};
self.clone()
.inner
.map(function, GetOutput::from_type(DataType::Boolean))
.with_fmt("str.contains")
.into()
match literal {
Some(true) => self.inner.clone().str().contains_literal(pat).into(),
_ => self.inner.clone().str().contains(pat).into(),
}
}

/// Builds the `str.ends_with` expression for `sub` on a clone of the inner
/// expression.
pub fn str_ends_with(&self, sub: String) -> PyExpr {
    let expr = self.inner.clone();
    expr.str().ends_with(sub).into()
}

/// Builds the `str.starts_with` expression for `sub` on a clone of the inner
/// expression.
pub fn str_starts_with(&self, sub: String) -> PyExpr {
    let expr = self.inner.clone();
    expr.str().starts_with(sub).into()
}

pub fn str_hex_encode(&self) -> PyExpr {
self.clone()
.inner
Expand Down
9 changes: 9 additions & 0 deletions py-polars/tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,3 +134,12 @@ def test_format_empty_df() -> None:
)
assert df.shape == (0, 1)
assert df.dtypes == [pl.Utf8]


def test_starts_ends_with() -> None:
    # One suffix match ("lollypop") and one prefix match ("hamburger").
    df = pl.DataFrame({"a": ["hamburger", "nuts", "lollypop"]})
    out = df.select(
        [
            pl.col("a").str.ends_with("pop").alias("pop"),
            pl.col("a").str.starts_with("ham").alias("ham"),
        ]
    )
    expected = {"pop": [False, False, True], "ham": [True, False, False]}
    assert out.to_dict(False) == expected

0 comments on commit 3e797f5

Please sign in to comment.