Skip to content

Commit

Permalink
Add extract_regex to StringNameSpace (#1567)
Browse files Browse the repository at this point in the history
  • Loading branch information
potter420 committed Oct 20, 2021
1 parent 1a6db29 commit de97f27
Show file tree
Hide file tree
Showing 8 changed files with 141 additions and 0 deletions.
13 changes: 13 additions & 0 deletions polars/polars-core/src/chunked_array/strings/mod.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
#[cfg(feature = "extract_jsonpath")]
mod json_path;
use std::borrow::Cow;

use crate::prelude::*;
use arrow::compute::substring::substring;
use polars_arrow::kernels::string::*;
use regex::Regex;

/// Run `reg` against `input` and return the text of capture group `group_index`.
///
/// Yields `None` when the pattern does not match `input`, or when the
/// requested group is absent from the match. The returned value always
/// borrows from `input`; nothing is allocated.
fn f_regex_extract<'a>(reg: &Regex, input: &'a str, group_index: usize) -> Option<Cow<'a, str>> {
    let captures = reg.captures(input)?;
    let group = captures.get(group_index)?;
    Some(Cow::Borrowed(group.as_str()))
}

impl Utf8Chunked {
/// Get the length of the string values.
pub fn str_lengths(&self) -> UInt32Chunked {
Expand Down Expand Up @@ -38,6 +45,12 @@ impl Utf8Chunked {
Ok(self.apply(f))
}

/// Extract the capture group at `group_index` of the regex `pat` from every value.
///
/// Null inputs stay null, and values for which `f_regex_extract` finds no
/// such group become null.
///
/// # Errors
///
/// Returns an error when `pat` fails to compile as a regex.
pub fn extract(&self, pat: &str, group_index: usize) -> Result<Utf8Chunked> {
    let reg = Regex::new(pat)?;
    let extracted = self.apply_on_opt(|opt_val| match opt_val {
        Some(input) => f_regex_extract(&reg, input, group_index),
        None => None,
    });
    Ok(extracted)
}

/// Modify the strings to their lowercase equivalent
pub fn to_lowercase(&self) -> Utf8Chunked {
self.apply(|s| str::to_lowercase(s).into())
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/expression.rst
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,7 @@ The following methods are available under the `Expr.str` attribute.
ExprStringNameSpace.to_lowercase
ExprStringNameSpace.contains
ExprStringNameSpace.json_path_match
ExprStringNameSpace.extract
ExprStringNameSpace.replace
ExprStringNameSpace.replace_all
ExprStringNameSpace.slice
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ The following methods are available under the `Series.str` attribute.
StringNameSpace.lengths
StringNameSpace.contains
StringNameSpace.json_path_match
StringNameSpace.extract
StringNameSpace.replace
StringNameSpace.replace_all
StringNameSpace.to_lowercase
Expand Down
44 changes: 44 additions & 0 deletions py-polars/polars/eager/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2878,6 +2878,50 @@ def json_path_match(self, json_path: str) -> Series:
"""
return wrap_s(self._s.str_json_path_match(json_path))

def extract(self, pattern: str, group_index: int = 1) -> Series:
    r"""
    Extract the target capture group from the provided pattern.

    Parameters
    ----------
    pattern
        A valid regex pattern.
    group_index
        Index of the targeted capture group.
        Group 0 means the whole pattern; the first capture group is at index 1.
        Defaults to the first capture group.

    Returns
    -------
    Utf8 array. Contains null if the original value is null or the regex captures nothing.

    Examples
    --------
    >>> df = pl.DataFrame({
    ...     'a': [
    ...         'http://vote.com/ballon_dor?candidate=messi&ref=polars',
    ...         'http://vote.com/ballon_dor?candidat=jorginho&ref=polars',
    ...         'http://vote.com/ballon_dor?candidate=ronaldo&ref=polars'
    ...     ]})
    >>> df.select([
    ...     pl.col('a').str.extract(r'candidate=(\w+)', 1)
    ... ])
    shape: (3, 1)
    ┌─────────┐
    │ a       │
    │ ---     │
    │ str     │
    ╞═════════╡
    │ messi   │
    ├╌╌╌╌╌╌╌╌╌┤
    │ null    │
    ├╌╌╌╌╌╌╌╌╌┤
    │ ronaldo │
    └─────────┘
    """
    # The docstring is a raw string so that the regex `\w` in the example is
    # not parsed as an (invalid) string escape sequence.
    return wrap_s(self._s.str_extract(pattern, group_index))

def replace(self, pattern: str, value: str) -> Series:
"""
Replace first regex match with a string value.
Expand Down
44 changes: 44 additions & 0 deletions py-polars/polars/lazy/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -1892,6 +1892,50 @@ def json_path_match(self, json_path: str) -> Expr:
"""
return wrap_expr(self._pyexpr.str_json_path_match(json_path))

def extract(self, pattern: str, group_index: int = 1) -> Expr:
    r"""
    Extract the target capture group from the provided pattern.

    Parameters
    ----------
    pattern
        A valid regex pattern.
    group_index
        Index of the targeted capture group.
        Group 0 means the whole pattern; the first capture group is at index 1.
        Defaults to the first capture group.

    Returns
    -------
    Utf8 array. Contains null if the original value is null or the regex captures nothing.

    Examples
    --------
    >>> df = pl.DataFrame({
    ...     'a': [
    ...         'http://vote.com/ballon_dor?candidate=messi&ref=polars',
    ...         'http://vote.com/ballon_dor?candidat=jorginho&ref=polars',
    ...         'http://vote.com/ballon_dor?candidate=ronaldo&ref=polars'
    ...     ]})
    >>> df.select([
    ...     pl.col('a').str.extract(r'candidate=(\w+)', 1)
    ... ])
    shape: (3, 1)
    ┌─────────┐
    │ a       │
    │ ---     │
    │ str     │
    ╞═════════╡
    │ messi   │
    ├╌╌╌╌╌╌╌╌╌┤
    │ null    │
    ├╌╌╌╌╌╌╌╌╌┤
    │ ronaldo │
    └─────────┘
    """
    # The docstring is a raw string so that the regex `\w` in the example is
    # not parsed as an (invalid) string escape sequence.
    return wrap_expr(self._pyexpr.str_extract(pattern, group_index))

def replace(self, pattern: str, value: str) -> Expr:
"""
Replace first regex match with a string value.
Expand Down
14 changes: 14 additions & 0 deletions py-polars/src/lazy/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,20 @@ impl PyExpr {
.into()
}

/// Build an expression that extracts capture group `group_index` of the
/// regex `pat` from a Utf8 column.
///
/// The mapped closure casts the input series to Utf8, delegates to
/// `extract`, and reports any failure from it as a
/// `PolarsError::ComputeError` carrying the debug rendering of the error.
pub fn str_extract(&self, pat: String, group_index: usize) -> PyExpr {
    let function = move |s: Series| {
        let ca = s.utf8()?;
        match ca.extract(&pat, group_index) {
            Ok(ca) => Ok(ca.into_series()),
            Err(e) => Err(PolarsError::ComputeError(format!("{:?}", e).into())),
        }
    };
    self.clone()
        .inner
        // `extract` yields a Utf8Chunked, so the declared output dtype must be
        // Utf8 (it was previously declared as Boolean, which did not match the
        // series the closure actually returns).
        .map(function, GetOutput::from_type(DataType::Utf8))
        .into()
}

pub fn strftime(&self, fmt: String) -> PyExpr {
let function = move |s: Series| s.strftime(&fmt);
self.clone()
Expand Down
9 changes: 9 additions & 0 deletions py-polars/src/series.rs
Original file line number Diff line number Diff line change
Expand Up @@ -937,6 +937,15 @@ impl PySeries {
Ok(s.into())
}

/// Extract capture group `group_index` of the regex `pat` from this series.
///
/// The series is first viewed as Utf8; a failure there, or an error returned
/// by `extract`, is converted through `PyPolarsEr` and propagated.
pub fn str_extract(&self, pat: &str, group_index: usize) -> PyResult<Self> {
    let utf8 = self.series.utf8().map_err(PyPolarsEr::from)?;
    let extracted = utf8.extract(pat, group_index).map_err(PyPolarsEr::from)?;
    Ok(extracted.into_series().into())
}

pub fn str_replace(&self, pat: &str, val: &str) -> PyResult<Self> {
let ca = self.series.utf8().map_err(PyPolarsEr::from)?;
let s = ca
Expand Down
15 changes: 15 additions & 0 deletions py-polars/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,21 @@ def test_jsonpath_single():
]


def test_extract_regex():
    # The middle URL misspells the key as "candidat", so the pattern must not
    # match it and the result for that row is expected to be null.
    s = pl.Series(
        [
            "http://vote.com/ballon_dor?candidate=messi&ref=polars",
            "http://vote.com/ballon_dor?candidat=jorginho&ref=polars",
            "http://vote.com/ballon_dor?candidate=ronaldo&ref=polars",
        ]
    )
    # Raw string: `\w` is a regex class, not a (invalid) Python string escape.
    assert s.str.extract(r"candidate=(\w+)", 1).to_list() == [
        "messi",
        None,
        "ronaldo",
    ]


def test_rank_dispatch():
s = pl.Series("a", [1, 2, 3, 2, 2, 3, 0])

Expand Down

0 comments on commit de97f27

Please sign in to comment.