str().extract_all / str().count_match (#3507)

pola-rs · May 26, 2022 · c8981ac · c8981ac
1 parent afb4df7
commit c8981ac
Show file tree

Hide file tree

Showing 13 changed files with 340 additions and 87 deletions.
diff --git a/polars/Cargo.toml b/polars/Cargo.toml
@@ -34,7 +34,7 @@ lazy = ["polars-core/lazy", "polars-lazy", "polars-lazy/compile"]
 # parallel = ["polars-core/parallel"]
 
 # extra utilities for Utf8Chunked
-strings = ["polars-core/strings", "polars-lazy/strings"]
+strings = ["polars-core/strings", "polars-lazy/strings", "polars-ops/strings"]
 
 # support for ObjectChunked<T> (downcastable Series of any type)
 object = ["polars-core/object", "polars-lazy/object"]

diff --git a/polars/polars-core/src/chunked_array/mod.rs b/polars/polars-core/src/chunked_array/mod.rs
@@ -29,8 +29,6 @@ pub mod object;
 #[cfg(feature = "random")]
 #[cfg_attr(docsrs, doc(cfg(feature = "random")))]
 mod random;
-#[cfg(feature = "strings")]
-#[cfg_attr(docsrs, doc(cfg(feature = "strings")))]
 pub mod strings;
 #[cfg(any(
     feature = "temporal",

diff --git a/polars/polars-core/src/chunked_array/strings/mod.rs b/polars/polars-core/src/chunked_array/strings/mod.rs
@@ -1,86 +1,4 @@
-#[cfg(feature = "extract_jsonpath")]
-mod json_path;
-
 #[cfg(feature = "string_encoding")]
 mod encoding;
-
-use crate::prelude::*;
-use arrow::compute::substring::substring;
-use polars_arrow::kernels::string::*;
-use regex::Regex;
-use std::borrow::Cow;
-
-fn f_regex_extract<'a>(reg: &Regex, input: &'a str, group_index: usize) -> Option<Cow<'a, str>> {
-    reg.captures(input)
-        .and_then(|cap| cap.get(group_index).map(|m| Cow::Borrowed(m.as_str())))
-}
-
-impl Utf8Chunked {
-    /// Get the length of the string values.
-    pub fn str_lengths(&self) -> UInt32Chunked {
-        self.apply_kernel_cast(&string_lengths)
-    }
-
-    /// Check if strings contain a regex pattern
-    pub fn contains(&self, pat: &str) -> Result<BooleanChunked> {
-        let reg = Regex::new(pat)?;
-        let f = |s| reg.is_match(s);
-        let mut ca: BooleanChunked = if !self.has_validity() {
-            self.into_no_null_iter().map(f).collect()
-        } else {
-            self.into_iter().map(|opt_s| opt_s.map(f)).collect()
-        };
-        ca.rename(self.name());
-        Ok(ca)
-    }
-
-    /// Replace the leftmost (sub)string by a regex pattern
-    pub fn replace(&self, pat: &str, val: &str) -> Result<Utf8Chunked> {
-        let reg = Regex::new(pat)?;
-        let f = |s| reg.replace(s, val);
-        Ok(self.apply(f))
-    }
-
-    /// Replace all (sub)strings by a regex pattern
-    pub fn replace_all(&self, pat: &str, val: &str) -> Result<Utf8Chunked> {
-        let reg = Regex::new(pat)?;
-        let f = |s| reg.replace_all(s, val);
-        Ok(self.apply(f))
-    }
-
-    /// Extract the nth capture group from pattern
-    pub fn extract(&self, pat: &str, group_index: usize) -> Result<Utf8Chunked> {
-        let reg = Regex::new(pat)?;
-        Ok(self.apply_on_opt(|e| e.and_then(|input| f_regex_extract(&reg, input, group_index))))
-    }
-
-    /// Modify the strings to their lowercase equivalent
-    #[must_use]
-    pub fn to_lowercase(&self) -> Utf8Chunked {
-        self.apply(|s| str::to_lowercase(s).into())
-    }
-
-    /// Modify the strings to their uppercase equivalent
-    #[must_use]
-    pub fn to_uppercase(&self) -> Utf8Chunked {
-        self.apply(|s| str::to_uppercase(s).into())
-    }
-
-    /// Concat with the values from a second Utf8Chunked
-    #[must_use]
-    pub fn concat(&self, other: &Utf8Chunked) -> Self {
-        self + other
-    }
-
-    /// Slice the string values
-    /// Determines a substring starting from `start` and with optional length `length` of each of the elements in `array`.
-    /// `start` can be negative, in which case the start counts from the end of the string.
-    pub fn str_slice(&self, start: i64, length: Option<u64>) -> Result<Self> {
-        let chunks = self
-            .downcast_iter()
-            .map(|c| Ok(substring(c, start, &length)?.into()))
-            .collect::<arrow::error::Result<_>>()?;
-
-        Ok(Self::from_chunks(self.name(), chunks))
-    }
-}
+#[cfg(feature = "extract_jsonpath")]
+mod json_path;
diff --git a/polars/polars-core/src/export.rs b/polars/polars-core/src/export.rs
@@ -8,3 +8,5 @@ pub use num;
 pub use once_cell;
 #[cfg(feature = "private")]
 pub use rayon;
+#[cfg(feature = "private")]
+pub use regex;
diff --git a/polars/polars-lazy/src/dsl/string.rs b/polars/polars-lazy/src/dsl/string.rs
@@ -1,6 +1,7 @@
 use super::*;
 use polars_arrow::array::ValueSize;
 use polars_arrow::export::arrow::array::{MutableArray, MutableUtf8Array};
+use polars_ops::prelude::Utf8NameSpaceImpl;
 use polars_time::prelude::*;
 
 /// Specialized expressions for [`Series`] of [`DataType::Utf8`].
@@ -18,6 +19,33 @@ impl StringNameSpace {
             .with_fmt("str.extract")
     }
 
+    /// Collect every successive non-overlapping match of `pat` inside each string into a list.
+    ///
+    /// The resulting expression declares `List(Utf8)` as its output type.
+    pub fn extract_all(self, pat: &str) -> Expr {
+        let pattern = pat.to_owned();
+        let output_type = GetOutput::from_type(DataType::List(Box::new(DataType::Utf8)));
+        // The closure owns its copy of the pattern so it can outlive `pat`.
+        let apply_extract = move |series: Series| {
+            series
+                .utf8()?
+                .extract_all(&pattern)
+                .map(|matches| matches.into_series())
+        };
+        self.0
+            .map(apply_extract, output_type)
+            .with_fmt("str.extract_all")
+    }
+
+    /// Count all successive non-overlapping regex matches of `pat` in each string.
+    ///
+    /// The resulting expression declares `UInt32` as its output type.
+    pub fn count_match(self, pat: &str) -> Expr {
+        let pat = pat.to_string();
+        // The closure owns its copy of the pattern so it can outlive the `&str` argument.
+        let function = move |s: Series| {
+            let ca = s.utf8()?;
+            ca.count_match(&pat).map(|ca| ca.into_series())
+        };
+        self.0
+            .map(function, GetOutput::from_type(DataType::UInt32))
+            // Label this expression with its own name; it previously reused
+            // "str.extract_all" (copy-paste slip), mislabelling the expression.
+            .with_fmt("str.count_match")
+    }
+
     #[cfg(feature = "temporal")]
     pub fn strptime(self, options: StrpTimeOptions) -> Expr {
         let out_type = options.date_dtype.clone();

diff --git a/polars/polars-ops/Cargo.toml b/polars/polars-ops/Cargo.toml
@@ -26,3 +26,4 @@ to_dummies = []
 list_to_struct = ["polars-core/dtype-struct", "list"]
 list = []
 diff = []
+strings = []
diff --git a/polars/polars-ops/src/chunked_array/mod.rs b/polars/polars-ops/src/chunked_array/mod.rs
@@ -1,4 +1,5 @@
 mod list;
+mod strings;
 #[cfg(feature = "to_dummies")]
 mod to_dummies;
 
@@ -11,3 +12,4 @@ use polars_core::prelude::*;
 pub use to_dummies::*;
 
 pub use list::*;
+pub use strings::*;
diff --git a/polars/polars-ops/src/chunked_array/strings/mod.rs b/polars/polars-ops/src/chunked_array/strings/mod.rs
@@ -0,0 +1,18 @@
+#[cfg(feature = "strings")]
+#[cfg_attr(docsrs, doc(cfg(feature = "strings")))]
+mod namespace;
+
+#[cfg(feature = "strings")]
+pub use namespace::*;
+
+use polars_core::prelude::*;
+
+/// Gives access to a type's [`Utf8Chunked`] representation.
+pub trait AsUtf8 {
+    /// Borrow `self` as a [`Utf8Chunked`].
+    fn as_utf8(&self) -> &Utf8Chunked;
+}
+
+impl AsUtf8 for Utf8Chunked {
+    // A `Utf8Chunked` already is the requested type, so it is returned as-is.
+    fn as_utf8(&self) -> &Utf8Chunked {
+        self
+    }
+}
diff --git a/polars/polars-ops/src/chunked_array/strings/namespace.rs b/polars/polars-ops/src/chunked_array/strings/namespace.rs
@@ -0,0 +1,131 @@
+use super::*;
+
+use polars_arrow::{
+    export::arrow::{self, compute::substring::substring},
+    kernels::string::*,
+};
+use polars_core::export::regex::Regex;
+use std::borrow::Cow;
+
+/// Return capture group `group_index` of `reg` applied to `input`, borrowed from `input`.
+///
+/// Yields `None` when `reg.captures` finds nothing or the requested group is absent.
+fn f_regex_extract<'a>(reg: &Regex, input: &'a str, group_index: usize) -> Option<Cow<'a, str>> {
+    let captures = reg.captures(input)?;
+    let group = captures.get(group_index)?;
+    Some(Cow::Borrowed(group.as_str()))
+}
+
+/// String operations provided for every type that can be viewed as a [`Utf8Chunked`].
+pub trait Utf8NameSpaceImpl: AsUtf8 {
+    /// Get the length of the string values.
+    fn str_lengths(&self) -> UInt32Chunked {
+        self.as_utf8().apply_kernel_cast(&string_lengths)
+    }
+
+    /// Check if strings contain a regex pattern.
+    ///
+    /// Returns an error when `pat` is not a valid regex. The output keeps the input's name.
+    fn contains(&self, pat: &str) -> Result<BooleanChunked> {
+        let strings = self.as_utf8();
+        let regex = Regex::new(pat)?;
+
+        let mut mask: BooleanChunked = if strings.has_validity() {
+            // Nulls may be present: propagate them through the `Option`.
+            strings
+                .into_iter()
+                .map(|opt_s| opt_s.map(|s| regex.is_match(s)))
+                .collect()
+        } else {
+            strings
+                .into_no_null_iter()
+                .map(|s| regex.is_match(s))
+                .collect()
+        };
+        mask.rename(strings.name());
+        Ok(mask)
+    }
+
+    /// Replace the leftmost (sub)string by a regex pattern.
+    ///
+    /// Returns an error when `pat` is not a valid regex.
+    fn replace(&self, pat: &str, val: &str) -> Result<Utf8Chunked> {
+        let regex = Regex::new(pat)?;
+        Ok(self.as_utf8().apply(|s| regex.replace(s, val)))
+    }
+
+    /// Replace all (sub)strings by a regex pattern.
+    ///
+    /// Returns an error when `pat` is not a valid regex.
+    fn replace_all(&self, pat: &str, val: &str) -> Result<Utf8Chunked> {
+        let regex = Regex::new(pat)?;
+        Ok(self.as_utf8().apply(|s| regex.replace_all(s, val)))
+    }
+
+    /// Extract the nth capture group from pattern.
+    ///
+    /// Null inputs stay null; values for which the group cannot be extracted become null.
+    fn extract(&self, pat: &str, group_index: usize) -> Result<Utf8Chunked> {
+        let regex = Regex::new(pat)?;
+        let extracted = self
+            .as_utf8()
+            .apply_on_opt(|opt_s| f_regex_extract(&regex, opt_s?, group_index));
+        Ok(extracted)
+    }
+
+    /// Extract each successive non-overlapping regex match in an individual string as an array.
+    ///
+    /// A row is null when the input value is null or when the pattern yields no match.
+    fn extract_all(&self, pat: &str) -> Result<ListChunked> {
+        let strings = self.as_utf8();
+        let regex = Regex::new(pat)?;
+
+        let mut builder =
+            ListUtf8ChunkedBuilder::new(strings.name(), strings.len(), strings.get_values_size());
+
+        for opt_s in strings.into_iter() {
+            if let Some(s) = opt_s {
+                // Peek so that a string without any match is recorded as null, not as a list.
+                let mut matches = regex.find_iter(s).map(|m| m.as_str()).peekable();
+                if matches.peek().is_some() {
+                    builder.append_values_iter(matches);
+                    continue;
+                }
+            }
+            builder.append_null();
+        }
+        Ok(builder.finish())
+    }
+
+    /// Count all successive non-overlapping regex matches.
+    ///
+    /// Null inputs stay null. The output keeps the input's name.
+    fn count_match(&self, pat: &str) -> Result<UInt32Chunked> {
+        let strings = self.as_utf8();
+        let regex = Regex::new(pat)?;
+
+        let mut counts = strings
+            .into_iter()
+            .map(|opt_s| opt_s.map(|s| regex.find_iter(s).count() as u32))
+            .collect::<UInt32Chunked>();
+        counts.rename(strings.name());
+        Ok(counts)
+    }
+
+    /// Modify the strings to their lowercase equivalent.
+    #[must_use]
+    fn to_lowercase(&self) -> Utf8Chunked {
+        self.as_utf8().apply(|s| Cow::Owned(s.to_lowercase()))
+    }
+
+    /// Modify the strings to their uppercase equivalent.
+    #[must_use]
+    fn to_uppercase(&self) -> Utf8Chunked {
+        self.as_utf8().apply(|s| Cow::Owned(s.to_uppercase()))
+    }
+
+    /// Concat with the values from a second Utf8Chunked.
+    #[must_use]
+    fn concat(&self, other: &Utf8Chunked) -> Utf8Chunked {
+        self.as_utf8() + other
+    }
+
+    /// Slice the string values.
+    /// Determines a substring starting from `start` and with optional length `length` of each of the elements in `array`.
+    /// `start` can be negative, in which case the start counts from the end of the string.
+    fn str_slice(&self, start: i64, length: Option<u64>) -> Result<Utf8Chunked> {
+        let strings = self.as_utf8();
+        // Slice chunk by chunk; the first failing chunk aborts the whole operation.
+        let sliced = strings
+            .downcast_iter()
+            .map(|chunk| substring(chunk, start, &length).map(|arr| arr.into()))
+            .collect::<arrow::error::Result<_>>()?;
+
+        Ok(Utf8Chunked::from_chunks(strings.name(), sliced))
+    }
+}
+
+impl Utf8NameSpaceImpl for Utf8Chunked {}
diff --git a/py-polars/polars/internals/expr.py b/py-polars/polars/internals/expr.py
@@ -3975,6 +3975,76 @@ def extract(self, pattern: str, group_index: int = 1) -> Expr:
         """
         return wrap_expr(self._pyexpr.str_extract(pattern, group_index))
 
+    def extract_all(self, pattern: str) -> Expr:
+        r"""
+        Extract each successive non-overlapping regex match in an individual string as an array
+
+        Parameters
+        ----------
+        pattern
+            A valid regex pattern
+
+        Returns
+        -------
+        List[Utf8] array. Contains null if the original value is null or the regex matches nothing.
+
+        Examples
+        --------
+        >>> df = pl.DataFrame({"foo": ["123 bla 45 asd", "xyz 678 910t"]})
+        >>> df.select(
+        ...     [
+        ...         pl.col("foo").str.extract_all(r"(\d+)").alias("extracted_nrs"),
+        ...     ]
+        ... )
+        shape: (2, 1)
+        ┌────────────────┐
+        │ extracted_nrs  │
+        │ ---            │
+        │ list[str]      │
+        ╞════════════════╡
+        │ ["123", "45"]  │
+        ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+        │ ["678", "910"] │
+        └────────────────┘
+
+        """
+        return wrap_expr(self._pyexpr.str_extract_all(pattern))
+
+    def count_match(self, pattern: str) -> Expr:
+        r"""
+        Count all successive non-overlapping regex matches.
+
+        Parameters
+        ----------
+        pattern
+            A valid regex pattern
+
+        Returns
+        -------
+        UInt32 array. Contains null if the original value is null; a value without any match counts as 0.
+
+        Examples
+        --------
+        >>> df = pl.DataFrame({"foo": ["123 bla 45 asd", "xyz 678 910t"]})
+        >>> df.select(
+        ...     [
+        ...         pl.col("foo").str.count_match(r"\d").alias("count_digits"),
+        ...     ]
+        ... )
+        shape: (2, 1)
+        ┌──────────────┐
+        │ count_digits │
+        │ ---          │
+        │ u32          │
+        ╞══════════════╡
+        │ 5            │
+        ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+        │ 6            │
+        └──────────────┘
+
+        """
+        return wrap_expr(self._pyexpr.count_match(pattern))
+
     def split(self, by: str, inclusive: bool = False) -> Expr:
         """
         Split the string by a substring.