-
-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
str().extract_all / str().count_match (#3507)
- Loading branch information
Showing
13 changed files
with
340 additions
and
87 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,86 +1,4 @@ | ||
#[cfg(feature = "extract_jsonpath")] | ||
mod json_path; | ||
|
||
#[cfg(feature = "string_encoding")] | ||
mod encoding; | ||
|
||
use crate::prelude::*; | ||
use arrow::compute::substring::substring; | ||
use polars_arrow::kernels::string::*; | ||
use regex::Regex; | ||
use std::borrow::Cow; | ||
|
||
fn f_regex_extract<'a>(reg: &Regex, input: &'a str, group_index: usize) -> Option<Cow<'a, str>> { | ||
reg.captures(input) | ||
.and_then(|cap| cap.get(group_index).map(|m| Cow::Borrowed(m.as_str()))) | ||
} | ||
|
||
impl Utf8Chunked { | ||
/// Get the length of the string values. | ||
pub fn str_lengths(&self) -> UInt32Chunked { | ||
self.apply_kernel_cast(&string_lengths) | ||
} | ||
|
||
/// Check if strings contain a regex pattern | ||
pub fn contains(&self, pat: &str) -> Result<BooleanChunked> { | ||
let reg = Regex::new(pat)?; | ||
let f = |s| reg.is_match(s); | ||
let mut ca: BooleanChunked = if !self.has_validity() { | ||
self.into_no_null_iter().map(f).collect() | ||
} else { | ||
self.into_iter().map(|opt_s| opt_s.map(f)).collect() | ||
}; | ||
ca.rename(self.name()); | ||
Ok(ca) | ||
} | ||
|
||
/// Replace the leftmost (sub)string by a regex pattern | ||
pub fn replace(&self, pat: &str, val: &str) -> Result<Utf8Chunked> { | ||
let reg = Regex::new(pat)?; | ||
let f = |s| reg.replace(s, val); | ||
Ok(self.apply(f)) | ||
} | ||
|
||
/// Replace all (sub)strings by a regex pattern | ||
pub fn replace_all(&self, pat: &str, val: &str) -> Result<Utf8Chunked> { | ||
let reg = Regex::new(pat)?; | ||
let f = |s| reg.replace_all(s, val); | ||
Ok(self.apply(f)) | ||
} | ||
|
||
/// Extract the nth capture group from pattern | ||
pub fn extract(&self, pat: &str, group_index: usize) -> Result<Utf8Chunked> { | ||
let reg = Regex::new(pat)?; | ||
Ok(self.apply_on_opt(|e| e.and_then(|input| f_regex_extract(®, input, group_index)))) | ||
} | ||
|
||
/// Modify the strings to their lowercase equivalent | ||
#[must_use] | ||
pub fn to_lowercase(&self) -> Utf8Chunked { | ||
self.apply(|s| str::to_lowercase(s).into()) | ||
} | ||
|
||
/// Modify the strings to their uppercase equivalent | ||
#[must_use] | ||
pub fn to_uppercase(&self) -> Utf8Chunked { | ||
self.apply(|s| str::to_uppercase(s).into()) | ||
} | ||
|
||
/// Concat with the values from a second Utf8Chunked | ||
#[must_use] | ||
pub fn concat(&self, other: &Utf8Chunked) -> Self { | ||
self + other | ||
} | ||
|
||
/// Slice the string values | ||
/// Determines a substring starting from `start` and with optional length `length` of each of the elements in `array`. | ||
/// `start` can be negative, in which case the start counts from the end of the string. | ||
pub fn str_slice(&self, start: i64, length: Option<u64>) -> Result<Self> { | ||
let chunks = self | ||
.downcast_iter() | ||
.map(|c| Ok(substring(c, start, &length)?.into())) | ||
.collect::<arrow::error::Result<_>>()?; | ||
|
||
Ok(Self::from_chunks(self.name(), chunks)) | ||
} | ||
} | ||
#[cfg(feature = "extract_jsonpath")] | ||
mod json_path; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,3 +26,4 @@ to_dummies = [] | |
list_to_struct = ["polars-core/dtype-struct", "list"] | ||
list = [] | ||
diff = [] | ||
strings = [] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
#[cfg(feature = "strings")] | ||
#[cfg_attr(docsrs, doc(cfg(feature = "strings")))] | ||
mod namespace; | ||
|
||
#[cfg(feature = "strings")] | ||
pub use namespace::*; | ||
|
||
use polars_core::prelude::*; | ||
|
||
pub trait AsUtf8 { | ||
fn as_utf8(&self) -> &Utf8Chunked; | ||
} | ||
|
||
impl AsUtf8 for Utf8Chunked { | ||
fn as_utf8(&self) -> &Utf8Chunked { | ||
self | ||
} | ||
} |
131 changes: 131 additions & 0 deletions
131
polars/polars-ops/src/chunked_array/strings/namespace.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
use super::*; | ||
|
||
use polars_arrow::{ | ||
export::arrow::{self, compute::substring::substring}, | ||
kernels::string::*, | ||
}; | ||
use polars_core::export::regex::Regex; | ||
use std::borrow::Cow; | ||
|
||
fn f_regex_extract<'a>(reg: &Regex, input: &'a str, group_index: usize) -> Option<Cow<'a, str>> { | ||
reg.captures(input) | ||
.and_then(|cap| cap.get(group_index).map(|m| Cow::Borrowed(m.as_str()))) | ||
} | ||
|
||
pub trait Utf8NameSpaceImpl: AsUtf8 { | ||
/// Get the length of the string values. | ||
fn str_lengths(&self) -> UInt32Chunked { | ||
let ca = self.as_utf8(); | ||
ca.apply_kernel_cast(&string_lengths) | ||
} | ||
|
||
/// Check if strings contain a regex pattern | ||
fn contains(&self, pat: &str) -> Result<BooleanChunked> { | ||
let ca = self.as_utf8(); | ||
|
||
let reg = Regex::new(pat)?; | ||
let f = |s| reg.is_match(s); | ||
let mut out: BooleanChunked = if !ca.has_validity() { | ||
ca.into_no_null_iter().map(f).collect() | ||
} else { | ||
ca.into_iter().map(|opt_s| opt_s.map(f)).collect() | ||
}; | ||
out.rename(ca.name()); | ||
Ok(out) | ||
} | ||
|
||
/// Replace the leftmost (sub)string by a regex pattern | ||
fn replace(&self, pat: &str, val: &str) -> Result<Utf8Chunked> { | ||
let ca = self.as_utf8(); | ||
let reg = Regex::new(pat)?; | ||
let f = |s| reg.replace(s, val); | ||
Ok(ca.apply(f)) | ||
} | ||
|
||
/// Replace all (sub)strings by a regex pattern | ||
fn replace_all(&self, pat: &str, val: &str) -> Result<Utf8Chunked> { | ||
let ca = self.as_utf8(); | ||
let reg = Regex::new(pat)?; | ||
let f = |s| reg.replace_all(s, val); | ||
Ok(ca.apply(f)) | ||
} | ||
|
||
/// Extract the nth capture group from pattern | ||
fn extract(&self, pat: &str, group_index: usize) -> Result<Utf8Chunked> { | ||
let ca = self.as_utf8(); | ||
let reg = Regex::new(pat)?; | ||
Ok(ca.apply_on_opt(|e| e.and_then(|input| f_regex_extract(®, input, group_index)))) | ||
} | ||
|
||
/// Extract each successive non-overlapping regex match in an individual string as an array | ||
fn extract_all(&self, pat: &str) -> Result<ListChunked> { | ||
let ca = self.as_utf8(); | ||
let reg = Regex::new(pat)?; | ||
|
||
let mut builder = ListUtf8ChunkedBuilder::new(ca.name(), ca.len(), ca.get_values_size()); | ||
|
||
for opt_s in ca.into_iter() { | ||
match opt_s { | ||
None => builder.append_null(), | ||
Some(s) => { | ||
let mut iter = reg.find_iter(s).map(|m| m.as_str()).peekable(); | ||
if iter.peek().is_some() { | ||
builder.append_values_iter(iter); | ||
} else { | ||
builder.append_null() | ||
} | ||
} | ||
} | ||
} | ||
Ok(builder.finish()) | ||
} | ||
|
||
/// Count all successive non-overlapping regex matches. | ||
fn count_match(&self, pat: &str) -> Result<UInt32Chunked> { | ||
let ca = self.as_utf8(); | ||
let reg = Regex::new(pat)?; | ||
|
||
let mut out: UInt32Chunked = ca | ||
.into_iter() | ||
.map(|opt_s| opt_s.map(|s| reg.find_iter(s).count() as u32)) | ||
.collect(); | ||
out.rename(ca.name()); | ||
Ok(out) | ||
} | ||
|
||
/// Modify the strings to their lowercase equivalent | ||
#[must_use] | ||
fn to_lowercase(&self) -> Utf8Chunked { | ||
let ca = self.as_utf8(); | ||
ca.apply(|s| str::to_lowercase(s).into()) | ||
} | ||
|
||
/// Modify the strings to their uppercase equivalent | ||
#[must_use] | ||
fn to_uppercase(&self) -> Utf8Chunked { | ||
let ca = self.as_utf8(); | ||
ca.apply(|s| str::to_uppercase(s).into()) | ||
} | ||
|
||
/// Concat with the values from a second Utf8Chunked | ||
#[must_use] | ||
fn concat(&self, other: &Utf8Chunked) -> Utf8Chunked { | ||
let ca = self.as_utf8(); | ||
ca + other | ||
} | ||
|
||
/// Slice the string values | ||
/// Determines a substring starting from `start` and with optional length `length` of each of the elements in `array`. | ||
/// `start` can be negative, in which case the start counts from the end of the string. | ||
fn str_slice(&self, start: i64, length: Option<u64>) -> Result<Utf8Chunked> { | ||
let ca = self.as_utf8(); | ||
let chunks = ca | ||
.downcast_iter() | ||
.map(|c| Ok(substring(c, start, &length)?.into())) | ||
.collect::<arrow::error::Result<_>>()?; | ||
|
||
Ok(Utf8Chunked::from_chunks(ca.name(), chunks)) | ||
} | ||
} | ||
|
||
impl Utf8NameSpaceImpl for Utf8Chunked {} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.