Skip to content

Commit

Permalink
str().extract_all / str().count_match (#3507)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed May 26, 2022
1 parent afb4df7 commit c8981ac
Show file tree
Hide file tree
Showing 13 changed files with 340 additions and 87 deletions.
2 changes: 1 addition & 1 deletion polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ lazy = ["polars-core/lazy", "polars-lazy", "polars-lazy/compile"]
# parallel = ["polars-core/parallel"]

# extra utilities for Utf8Chunked
strings = ["polars-core/strings", "polars-lazy/strings"]
strings = ["polars-core/strings", "polars-lazy/strings", "polars-ops/strings"]

# support for ObjectChunked<T> (downcastable Series of any type)
object = ["polars-core/object", "polars-lazy/object"]
Expand Down
2 changes: 0 additions & 2 deletions polars/polars-core/src/chunked_array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,6 @@ pub mod object;
#[cfg(feature = "random")]
#[cfg_attr(docsrs, doc(cfg(feature = "random")))]
mod random;
#[cfg(feature = "strings")]
#[cfg_attr(docsrs, doc(cfg(feature = "strings")))]
pub mod strings;
#[cfg(any(
feature = "temporal",
Expand Down
86 changes: 2 additions & 84 deletions polars/polars-core/src/chunked_array/strings/mod.rs
Original file line number Diff line number Diff line change
@@ -1,86 +1,4 @@
#[cfg(feature = "extract_jsonpath")]
mod json_path;

#[cfg(feature = "string_encoding")]
mod encoding;

use crate::prelude::*;
use arrow::compute::substring::substring;
use polars_arrow::kernels::string::*;
use regex::Regex;
use std::borrow::Cow;

/// Return the text of capture group `group_index` from the first match of `reg` in `input`.
///
/// Yields `None` when the regex does not match or the requested group did not participate
/// in the match. The returned value borrows from `input`.
fn f_regex_extract<'a>(reg: &Regex, input: &'a str, group_index: usize) -> Option<Cow<'a, str>> {
    let captures = reg.captures(input)?;
    let group = captures.get(group_index)?;
    Some(Cow::Borrowed(group.as_str()))
}

impl Utf8Chunked {
/// Get the length of the string values.
pub fn str_lengths(&self) -> UInt32Chunked {
self.apply_kernel_cast(&string_lengths)
}

/// Check if strings contain a regex pattern
pub fn contains(&self, pat: &str) -> Result<BooleanChunked> {
let reg = Regex::new(pat)?;
let f = |s| reg.is_match(s);
let mut ca: BooleanChunked = if !self.has_validity() {
self.into_no_null_iter().map(f).collect()
} else {
self.into_iter().map(|opt_s| opt_s.map(f)).collect()
};
ca.rename(self.name());
Ok(ca)
}

/// Replace the leftmost (sub)string by a regex pattern
pub fn replace(&self, pat: &str, val: &str) -> Result<Utf8Chunked> {
let reg = Regex::new(pat)?;
let f = |s| reg.replace(s, val);
Ok(self.apply(f))
}

/// Replace all (sub)strings by a regex pattern
pub fn replace_all(&self, pat: &str, val: &str) -> Result<Utf8Chunked> {
let reg = Regex::new(pat)?;
let f = |s| reg.replace_all(s, val);
Ok(self.apply(f))
}

/// Extract the nth capture group from pattern
pub fn extract(&self, pat: &str, group_index: usize) -> Result<Utf8Chunked> {
let reg = Regex::new(pat)?;
Ok(self.apply_on_opt(|e| e.and_then(|input| f_regex_extract(&reg, input, group_index))))
}

/// Modify the strings to their lowercase equivalent
#[must_use]
pub fn to_lowercase(&self) -> Utf8Chunked {
self.apply(|s| str::to_lowercase(s).into())
}

/// Modify the strings to their uppercase equivalent
#[must_use]
pub fn to_uppercase(&self) -> Utf8Chunked {
self.apply(|s| str::to_uppercase(s).into())
}

/// Concat with the values from a second Utf8Chunked
#[must_use]
pub fn concat(&self, other: &Utf8Chunked) -> Self {
self + other
}

/// Slice the string values
/// Determines a substring starting from `start` and with optional length `length` of each of the elements in `array`.
/// `start` can be negative, in which case the start counts from the end of the string.
pub fn str_slice(&self, start: i64, length: Option<u64>) -> Result<Self> {
let chunks = self
.downcast_iter()
.map(|c| Ok(substring(c, start, &length)?.into()))
.collect::<arrow::error::Result<_>>()?;

Ok(Self::from_chunks(self.name(), chunks))
}
}
#[cfg(feature = "extract_jsonpath")]
mod json_path;
2 changes: 2 additions & 0 deletions polars/polars-core/src/export.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@ pub use num;
pub use once_cell;
#[cfg(feature = "private")]
pub use rayon;
#[cfg(feature = "private")]
pub use regex;
28 changes: 28 additions & 0 deletions polars/polars-lazy/src/dsl/string.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use super::*;
use polars_arrow::array::ValueSize;
use polars_arrow::export::arrow::array::{MutableArray, MutableUtf8Array};
use polars_ops::prelude::Utf8NameSpaceImpl;
use polars_time::prelude::*;

/// Specialized expressions for [`Series`] of [`DataType::Utf8`].
Expand All @@ -18,6 +19,33 @@ impl StringNameSpace {
.with_fmt("str.extract")
}

/// Collect every successive non-overlapping match of `pat` in each string into a list.
///
/// The expression's declared output type is `List(Utf8)`.
pub fn extract_all(self, pat: &str) -> Expr {
    let pattern = pat.to_owned();
    let output_type = GetOutput::from_type(DataType::List(Box::new(DataType::Utf8)));
    let function = move |s: Series| {
        s.utf8()
            .and_then(|ca| ca.extract_all(&pattern))
            .map(|out| out.into_series())
    };
    self.0
        .map(function, output_type)
        .with_fmt("str.extract_all")
}

/// Count all successive non-overlapping regex matches of `pat` in each string.
///
/// The expression's declared output type is `UInt32`.
pub fn count_match(self, pat: &str) -> Expr {
    let pat = pat.to_string();
    let function = move |s: Series| {
        let ca = s.utf8()?;
        ca.count_match(&pat).map(|ca| ca.into_series())
    };
    self.0
        .map(function, GetOutput::from_type(DataType::UInt32))
        // Label the expression with its own name; it was previously mislabelled
        // as "str.extract_all" (copy-paste from the method above).
        .with_fmt("str.count_match")
}

#[cfg(feature = "temporal")]
pub fn strptime(self, options: StrpTimeOptions) -> Expr {
let out_type = options.date_dtype.clone();
Expand Down
1 change: 1 addition & 0 deletions polars/polars-ops/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,4 @@ to_dummies = []
list_to_struct = ["polars-core/dtype-struct", "list"]
list = []
diff = []
strings = []
2 changes: 2 additions & 0 deletions polars/polars-ops/src/chunked_array/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
mod list;
mod strings;
#[cfg(feature = "to_dummies")]
mod to_dummies;

Expand All @@ -11,3 +12,4 @@ use polars_core::prelude::*;
pub use to_dummies::*;

pub use list::*;
pub use strings::*;
18 changes: 18 additions & 0 deletions polars/polars-ops/src/chunked_array/strings/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#[cfg(feature = "strings")]
#[cfg_attr(docsrs, doc(cfg(feature = "strings")))]
mod namespace;

#[cfg(feature = "strings")]
pub use namespace::*;

use polars_core::prelude::*;

/// Gives access to a borrowed [`Utf8Chunked`] view of the implementing type.
pub trait AsUtf8 {
    /// Borrow `self` as a [`Utf8Chunked`].
    fn as_utf8(&self) -> &Utf8Chunked;
}

// A `Utf8Chunked` is already the requested type, so it simply hands out itself.
impl AsUtf8 for Utf8Chunked {
    fn as_utf8(&self) -> &Utf8Chunked {
        self
    }
}
131 changes: 131 additions & 0 deletions polars/polars-ops/src/chunked_array/strings/namespace.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
use super::*;

use polars_arrow::{
export::arrow::{self, compute::substring::substring},
kernels::string::*,
};
use polars_core::export::regex::Regex;
use std::borrow::Cow;

/// Return the text of capture group `group_index` from the first match of `reg` in `input`.
///
/// Yields `None` when the regex does not match or the requested group did not participate
/// in the match. The returned value borrows from `input`.
fn f_regex_extract<'a>(reg: &Regex, input: &'a str, group_index: usize) -> Option<Cow<'a, str>> {
    let captures = reg.captures(input)?;
    let group = captures.get(group_index)?;
    Some(Cow::Borrowed(group.as_str()))
}

pub trait Utf8NameSpaceImpl: AsUtf8 {
/// Get the length of the string values.
fn str_lengths(&self) -> UInt32Chunked {
let ca = self.as_utf8();
ca.apply_kernel_cast(&string_lengths)
}

/// Check if strings contain a regex pattern
fn contains(&self, pat: &str) -> Result<BooleanChunked> {
let ca = self.as_utf8();

let reg = Regex::new(pat)?;
let f = |s| reg.is_match(s);
let mut out: BooleanChunked = if !ca.has_validity() {
ca.into_no_null_iter().map(f).collect()
} else {
ca.into_iter().map(|opt_s| opt_s.map(f)).collect()
};
out.rename(ca.name());
Ok(out)
}

/// Replace the leftmost (sub)string by a regex pattern
fn replace(&self, pat: &str, val: &str) -> Result<Utf8Chunked> {
let ca = self.as_utf8();
let reg = Regex::new(pat)?;
let f = |s| reg.replace(s, val);
Ok(ca.apply(f))
}

/// Replace all (sub)strings by a regex pattern
fn replace_all(&self, pat: &str, val: &str) -> Result<Utf8Chunked> {
let ca = self.as_utf8();
let reg = Regex::new(pat)?;
let f = |s| reg.replace_all(s, val);
Ok(ca.apply(f))
}

/// Extract the nth capture group from pattern
fn extract(&self, pat: &str, group_index: usize) -> Result<Utf8Chunked> {
let ca = self.as_utf8();
let reg = Regex::new(pat)?;
Ok(ca.apply_on_opt(|e| e.and_then(|input| f_regex_extract(&reg, input, group_index))))
}

/// Extract each successive non-overlapping regex match in an individual string as an array
fn extract_all(&self, pat: &str) -> Result<ListChunked> {
let ca = self.as_utf8();
let reg = Regex::new(pat)?;

let mut builder = ListUtf8ChunkedBuilder::new(ca.name(), ca.len(), ca.get_values_size());

for opt_s in ca.into_iter() {
match opt_s {
None => builder.append_null(),
Some(s) => {
let mut iter = reg.find_iter(s).map(|m| m.as_str()).peekable();
if iter.peek().is_some() {
builder.append_values_iter(iter);
} else {
builder.append_null()
}
}
}
}
Ok(builder.finish())
}

/// Count all successive non-overlapping regex matches.
fn count_match(&self, pat: &str) -> Result<UInt32Chunked> {
let ca = self.as_utf8();
let reg = Regex::new(pat)?;

let mut out: UInt32Chunked = ca
.into_iter()
.map(|opt_s| opt_s.map(|s| reg.find_iter(s).count() as u32))
.collect();
out.rename(ca.name());
Ok(out)
}

/// Modify the strings to their lowercase equivalent
#[must_use]
fn to_lowercase(&self) -> Utf8Chunked {
let ca = self.as_utf8();
ca.apply(|s| str::to_lowercase(s).into())
}

/// Modify the strings to their uppercase equivalent
#[must_use]
fn to_uppercase(&self) -> Utf8Chunked {
let ca = self.as_utf8();
ca.apply(|s| str::to_uppercase(s).into())
}

/// Concat with the values from a second Utf8Chunked
#[must_use]
fn concat(&self, other: &Utf8Chunked) -> Utf8Chunked {
let ca = self.as_utf8();
ca + other
}

/// Slice the string values
/// Determines a substring starting from `start` and with optional length `length` of each of the elements in `array`.
/// `start` can be negative, in which case the start counts from the end of the string.
fn str_slice(&self, start: i64, length: Option<u64>) -> Result<Utf8Chunked> {
let ca = self.as_utf8();
let chunks = ca
.downcast_iter()
.map(|c| Ok(substring(c, start, &length)?.into()))
.collect::<arrow::error::Result<_>>()?;

Ok(Utf8Chunked::from_chunks(ca.name(), chunks))
}
}

// Every method of `Utf8NameSpaceImpl` has a default body, so this impl block stays empty.
impl Utf8NameSpaceImpl for Utf8Chunked {}
70 changes: 70 additions & 0 deletions py-polars/polars/internals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3975,6 +3975,76 @@ def extract(self, pattern: str, group_index: int = 1) -> Expr:
"""
return wrap_expr(self._pyexpr.str_extract(pattern, group_index))

def extract_all(self, pattern: str) -> Expr:
    r"""
    Extract each successive non-overlapping regex match in an individual string as an array.

    Parameters
    ----------
    pattern
        A valid regex pattern.

    Returns
    -------
    List[Utf8] array. Contains null if the original value is null or the regex matches nothing.

    Examples
    --------
    >>> df = pl.DataFrame({"foo": ["123 bla 45 asd", "xyz 678 910t"]})
    >>> df.select(
    ...     [
    ...         pl.col("foo").str.extract_all(r"(\d+)").alias("extracted_nrs"),
    ...     ]
    ... )
    shape: (2, 1)
    ┌────────────────┐
    │ extracted_nrs  │
    │ ---            │
    │ list[str]      │
    ╞════════════════╡
    │ ["123", "45"]  │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ ["678", "910"] │
    └────────────────┘

    """
    return wrap_expr(self._pyexpr.str_extract_all(pattern))

def count_match(self, pattern: str) -> Expr:
    r"""
    Count all successive non-overlapping regex matches.

    Parameters
    ----------
    pattern
        A valid regex pattern.

    Returns
    -------
    UInt32 array. Contains null if the original value is null; a string in which
    the regex matches nothing yields 0.

    Examples
    --------
    >>> df = pl.DataFrame({"foo": ["123 bla 45 asd", "xyz 678 910t"]})
    >>> df.select(
    ...     [
    ...         pl.col("foo").str.count_match(r"\d").alias("count_digits"),
    ...     ]
    ... )
    shape: (2, 1)
    ┌──────────────┐
    │ count_digits │
    │ ---          │
    │ u32          │
    ╞══════════════╡
    │ 5            │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 6            │
    └──────────────┘

    """
    return wrap_expr(self._pyexpr.count_match(pattern))

def split(self, by: str, inclusive: bool = False) -> Expr:
"""
Split the string by a substring.
Expand Down

0 comments on commit c8981ac

Please sign in to comment.