Skip to content

Commit

Permalink
Add flag to allow str.contains to search for string literals (#3711) (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie committed Jun 18, 2022
1 parent 3419ad7 commit 7a9d7ac
Show file tree
Hide file tree
Showing 6 changed files with 87 additions and 16 deletions.
27 changes: 23 additions & 4 deletions polars/polars-ops/src/chunked_array/strings/namespace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,31 @@ pub trait Utf8NameSpaceImpl: AsUtf8 {
ca.apply(f)
}

/// Check if strings contain a regex pattern.
///
/// If `pat` is made up only of alphanumeric characters, whitespace and
/// underscores, it holds no regex metacharacters, so a plain substring search
/// gives the same answer as a regex match. In that case the call is forwarded
/// to `contains_literal` and no regex is compiled.
///
/// # Errors
///
/// Returns an error when `pat` cannot be compiled as a regex, or when the
/// literal search it delegates to fails.
fn contains(&self, pat: &str) -> Result<BooleanChunked> {
    let is_plain_literal = pat
        .chars()
        .all(|c| c.is_alphanumeric() || c.is_whitespace() || c == '_');
    if is_plain_literal {
        return self.contains_literal(pat);
    }

    // Only the regex path needs the underlying array, so fetch it here
    // rather than before the fast-path check.
    let ca = self.as_utf8();
    let reg = Regex::new(pat)?;
    let f = |s| reg.is_match(s);
    // When the array reports no validity, iterate the values directly;
    // otherwise map over `Option`s so missing values stay missing.
    let mut out: BooleanChunked = if !ca.has_validity() {
        ca.into_no_null_iter().map(f).collect()
    } else {
        ca.into_iter().map(|opt_s| opt_s.map(f)).collect()
    };
    // Give the result the same name as the input array.
    out.rename(ca.name());
    Ok(out)
}

let reg = Regex::new(pat)?;
let f = |s| reg.is_match(s);
/// Check if strings contain a given literal
fn contains_literal(&self, lit: &str) -> Result<BooleanChunked> {
let ca = self.as_utf8();
let f = |s: &str| s.contains(lit);
let mut out: BooleanChunked = if !ca.has_validity() {
ca.into_no_null_iter().map(f).collect()
} else {
Expand Down
11 changes: 7 additions & 4 deletions py-polars/polars/internals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -4450,16 +4450,19 @@ def rjust(self, width: int, fillchar: str = " ") -> Expr:
"""
return wrap_expr(self._pyexpr.str_rjust(width, fillchar))

def contains(self, pattern: str, literal: bool = False) -> Expr:
    """
    Check if string contains a substring that matches a regex.

    Parameters
    ----------
    pattern
        A valid regex pattern, or the exact substring to search for when
        ``literal`` is True.
    literal
        Treat pattern as a literal string.

    """
    # Both arguments are forwarded unchanged to the underlying expression.
    return wrap_expr(self._pyexpr.str_contains(pattern, literal))

def json_path_match(self, json_path: str) -> Expr:
"""
Expand Down
10 changes: 6 additions & 4 deletions py-polars/polars/internals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3973,24 +3973,26 @@ def concat(self, delimiter: str = "-") -> "Series":
s = wrap_s(self._s)
return s.to_frame().select(pli.col(s.name).str.concat(delimiter)).to_series()

def contains(self, pattern: str, literal: bool = False) -> Series:
    """
    Check if strings in Series contain a substring that matches a regex.

    Parameters
    ----------
    pattern
        A valid regex pattern, or the exact substring to search for when
        ``literal`` is True.
    literal
        Treat pattern as a literal string.

    Returns
    -------
    Boolean mask

    """
    # Both arguments are forwarded unchanged to the underlying series.
    return wrap_s(self._s.str_contains(pattern, literal))

def decode(self, encoding: str, strict: bool = False) -> Series:
"""
Decodes a value using the provided encoding
Decodes a value using the provided encoding.
Parameters
----------
Expand Down
11 changes: 9 additions & 2 deletions py-polars/src/lazy/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -585,10 +585,17 @@ impl PyExpr {
self.clone().inner.str().rjust(width, fillchar).into()
}

pub fn str_contains(&self, pat: String) -> PyExpr {
pub fn str_contains(&self, pat: String, literal: Option<bool>) -> PyExpr {
let function = move |s: Series| {
let ca = s.utf8()?;
match ca.contains(&pat) {
let match_type = |p: &str| {
if literal.unwrap_or(false) {
ca.contains_literal(p)
} else {
ca.contains(p)
}
};
match match_type(&pat) {
Ok(ca) => Ok(ca.into_series()),
Err(e) => Err(PolarsError::ComputeError(format!("{:?}", e).into())),
}
Expand Down
10 changes: 8 additions & 2 deletions py-polars/src/series.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1107,9 +1107,15 @@ impl PySeries {
Ok(PySeries::new(s))
}

pub fn str_contains(&self, pat: &str) -> PyResult<Self> {
pub fn str_contains(&self, pat: &str, literal: Option<bool>) -> PyResult<Self> {
let ca = self.series.utf8().map_err(PyPolarsErr::from)?;
let s = ca.contains(pat).map_err(PyPolarsErr::from)?.into_series();
let s = if literal.unwrap_or(false) {
ca.contains_literal(pat)
} else {
ca.contains(pat)
}
.map_err(PyPolarsErr::from)?
.into_series();
Ok(s.into())
}

Expand Down
34 changes: 34 additions & 0 deletions py-polars/tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,40 @@ def test_auto_explode() -> None:
assert grouped.dtype == pl.Utf8


def test_contains() -> None:
    """Exercise ``str.contains`` in regex and literal mode via Series, select and filter."""
    frame = pl.DataFrame(
        data=[(1, "some * * text"), (2, "(with) special\n * chars"), (3, "**etc...?$")],
        columns=["idx", "text"],
    )
    # Each case: (pattern, treat pattern as literal, expected mask over the rows).
    cases = (
        (r"\* \*", False, [True, False, False]),
        (r"* *", True, [True, False, False]),
        (r"^\(", False, [False, True, False]),
        (r"^\(", True, [False, False, False]),
        (r"(", True, [False, True, False]),
        (r"e", False, [True, True, True]),
        (r"e", True, [True, True, True]),
        (r"^\S+$", False, [False, False, True]),
        (r"\?\$", False, [False, False, True]),
        (r"?$", True, [False, False, True]),
    )
    for pat, literal_flag, want in cases:
        # Series namespace
        from_series = frame["text"].str.contains(pat, literal=literal_flag).to_list()
        assert want == from_series

        # Expression evaluated through select
        selected = frame.select(pl.col("text").str.contains(pat, literal=literal_flag))
        assert want == selected["text"].to_list()

        # Expression used as a filter: one surviving row per True in the mask
        kept = frame.filter(pl.col("text").str.contains(pat, literal=literal_flag))
        assert sum(want) == len(kept)


def test_null_comparisons() -> None:
s = pl.Series("s", [None, "str", "a"])
assert (s.shift() == s).null_count() == 0
Expand Down

0 comments on commit 7a9d7ac

Please sign in to comment.