feat[rust, python]: Add splitn expression (#4373)

This splits a string value into at most `n` parts and keeps the remainder of the string intact.
pola-rs · Aug 25, 2022 · 30d6270 · 30d6270
1 parent 40e4e77
commit 30d6270
Show file tree

Hide file tree

Showing 7 changed files with 280 additions and 54 deletions.
diff --git a/polars/polars-lazy/src/dsl/string.rs b/polars/polars-lazy/src/dsl/string.rs
@@ -129,8 +129,7 @@ impl StringNameSpace {
         }
     }
 
-    /// Split the string by a substring.
-    // Split exactly `n` times by a given substring. The resulting dtype is `List<Utf8>`.
+    /// Split the string by a substring. The resulting dtype is `List<Utf8>`.
     pub fn split(self, by: &str) -> Expr {
         let by = by.to_string();
 
@@ -155,8 +154,33 @@ impl StringNameSpace {
             .with_fmt("str.split")
     }
 
+    /// Split the string by a substring and keep the substring. The resulting dtype is `List<Utf8>`.
+    pub fn split_inclusive(self, by: &str) -> Expr {
+        let by = by.to_string();
+
+        let function = move |s: Series| {
+            let ca = s.utf8()?;
+
+            let mut builder = ListUtf8ChunkedBuilder::new(s.name(), s.len(), ca.get_values_size());
+            ca.into_iter().for_each(|opt_s| match opt_s {
+                None => builder.append_null(),
+                Some(s) => {
+                    let iter = s.split_inclusive(&by);
+                    builder.append_values_iter(iter);
+                }
+            });
+            Ok(builder.finish().into_series())
+        };
+        self.0
+            .map(
+                function,
+                GetOutput::from_type(DataType::List(Box::new(DataType::Utf8))),
+            )
+            .with_fmt("str.split_inclusive")
+    }
+
     #[cfg(feature = "dtype-struct")]
-    // Split exactly `n` times by a given substring. The resulting dtype is [`DataType::Struct`].
+    /// Split exactly `n` times by a given substring. The resulting dtype is [`DataType::Struct`].
     pub fn split_exact(self, by: &str, n: usize) -> Expr {
         let by = by.to_string();
 
@@ -207,8 +231,8 @@ impl StringNameSpace {
     }
 
     #[cfg(feature = "dtype-struct")]
-    // Split exactly `n` times by a given substring and keep the substring.
-    // The resulting dtype is [`DataType::Struct`].
+    /// Split exactly `n` times by a given substring and keep the substring.
+    /// The resulting dtype is [`DataType::Struct`].
     pub fn split_exact_inclusive(self, by: &str, n: usize) -> Expr {
         let by = by.to_string();
 
@@ -258,30 +282,56 @@ impl StringNameSpace {
             .with_fmt("str.split_exact")
     }
 
-    /// Split the string by a substring and keep the substring.
-    /// Split exactly `n` times by a given substring. The resulting dtype is `List<Utf8>`.
-    pub fn split_inclusive(self, by: &str) -> Expr {
+    #[cfg(feature = "dtype-struct")]
+    /// Split by a given substring into exactly `n` fields (null when missing); further splits
+    /// are left intact in the last field. The resulting dtype is [`DataType::Struct`].
+    pub fn splitn(self, by: &str, n: usize) -> Expr {
         let by = by.to_string();
 
         let function = move |s: Series| {
             let ca = s.utf8()?;
 
-            let mut builder = ListUtf8ChunkedBuilder::new(s.name(), s.len(), ca.get_values_size());
+            let mut arrs = (0..n)
+                .map(|_| MutableUtf8Array::<i64>::with_capacity(ca.len()))
+                .collect::<Vec<_>>();
+
             ca.into_iter().for_each(|opt_s| match opt_s {
-                None => builder.append_null(),
+                None => {
+                    for arr in &mut arrs {
+                        arr.push_null()
+                    }
+                }
                 Some(s) => {
-                    let iter = s.split_inclusive(&by);
-                    builder.append_values_iter(iter);
+                    let mut arr_iter = arrs.iter_mut();
+                    let split_iter = s.splitn(n, &by);
+                    (split_iter)
+                        .zip(&mut arr_iter)
+                        .for_each(|(splitted, arr)| arr.push(Some(splitted)));
+                    // fill the remaining with null
+                    for arr in arr_iter {
+                        arr.push_null()
+                    }
                 }
             });
-            Ok(builder.finish().into_series())
+            let fields = arrs
+                .into_iter()
+                .enumerate()
+                .map(|(i, mut arr)| {
+                    Series::try_from((format!("field_{i}").as_str(), arr.as_box())).unwrap()
+                })
+                .collect::<Vec<_>>();
+            Ok(StructChunked::new(ca.name(), &fields)?.into_series())
         };
         self.0
             .map(
                 function,
-                GetOutput::from_type(DataType::List(Box::new(DataType::Utf8))),
+                GetOutput::from_type(DataType::Struct(
+                    (0..n)
+                        .map(|i| Field::new(&format!("field_{i}"), DataType::Utf8))
+                        .collect(),
+                )),
             )
-            .with_fmt("str.split_inclusive")
+            .with_fmt("str.splitn")
     }
 
     #[cfg(feature = "regex")]

diff --git a/py-polars/docs/source/reference/expression.rst b/py-polars/docs/source/reference/expression.rst
@@ -326,6 +326,7 @@ The following methods are available under the `Expr.str` attribute.
     ExprStringNameSpace.slice
     ExprStringNameSpace.split
     ExprStringNameSpace.split_exact
+    ExprStringNameSpace.splitn
     ExprStringNameSpace.starts_with
     ExprStringNameSpace.strip
     ExprStringNameSpace.strptime

diff --git a/py-polars/docs/source/reference/series.rst b/py-polars/docs/source/reference/series.rst
@@ -281,6 +281,7 @@ The following methods are available under the `Series.str` attribute.
     StringNameSpace.slice
     StringNameSpace.split
     StringNameSpace.split_exact
+    StringNameSpace.splitn
     StringNameSpace.starts_with
     StringNameSpace.strip
     StringNameSpace.strptime

diff --git a/py-polars/polars/internals/expr/string.py b/py-polars/polars/internals/expr/string.py
@@ -809,7 +809,8 @@ def split(self, by: str, inclusive: bool = False) -> pli.Expr:
 
     def split_exact(self, by: str, n: int, inclusive: bool = False) -> pli.Expr:
         """
-        Split the string by a substring into a struct of ``n`` fields.
+        Split the string by a substring into a struct of ``n+1`` fields using
+        ``n`` splits.
 
         If it cannot make ``n`` splits, the remaining field elements will be null.
 
@@ -824,12 +825,11 @@ def split_exact(self, by: str, n: int, inclusive: bool = False) -> pli.Expr:
 
         Examples
         --------
-        >>> (
-        ...     pl.DataFrame({"x": ["a_1", None, "c", "d_4"]}).select(
-        ...         [
-        ...             pl.col("x").str.split_exact("_", 1).alias("fields"),
-        ...         ]
-        ...     )
+        >>> df = pl.DataFrame({"x": ["a_1", None, "c", "d_4"]})
+        >>> df.select(
+        ...     [
+        ...         pl.col("x").str.split_exact("_", 1).alias("fields"),
+        ...     ]
         ... )
         shape: (4, 1)
         ┌─────────────┐
@@ -850,7 +850,7 @@ def split_exact(self, by: str, n: int, inclusive: bool = False) -> pli.Expr:
         Split string values in column x in exactly 2 parts and assign
         each part to a new column.
 
-        >>> pl.DataFrame({"x": ["a_1", None, "c", "d_4"]}).with_columns(
+        >>> df.with_columns(
         ...     [
         ...         pl.col("x")
         ...         .str.split_exact("_", 1)
@@ -882,6 +882,73 @@ def split_exact(self, by: str, n: int, inclusive: bool = False) -> pli.Expr:
             return pli.wrap_expr(self._pyexpr.str_split_exact_inclusive(by, n))
         return pli.wrap_expr(self._pyexpr.str_split_exact(by, n))
 
+    def splitn(self, by: str, n: int) -> pli.Expr:
+        """
+        Split the string by a substring, restricted to returning at most ``n`` items.
+
+        If the number of possible splits is less than ``n-1``, the remaining field
+        elements will be null. If the number of possible splits is ``n-1`` or greater,
+        the last (nth) substring will contain the remainder of the string.
+
+        Parameters
+        ----------
+        by
+            Substring to split by.
+        n
+            Max number of items to return.
+
+        Examples
+        --------
+        >>> df = pl.DataFrame({"s": ["foo bar", None, "foo-bar", "foo bar baz"]})
+        >>> df.select(pl.col("s").str.splitn(" ", 2).alias("fields"))
+        shape: (4, 1)
+        ┌───────────────────┐
+        │ fields            │
+        │ ---               │
+        │ struct[2]         │
+        ╞═══════════════════╡
+        │ {"foo","bar"}     │
+        ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+        │ {null,null}       │
+        ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+        │ {"foo-bar",null}  │
+        ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+        │ {"foo","bar baz"} │
+        └───────────────────┘
+
+        Split string values in column s into at most 2 parts and assign
+        each part to a new column.
+
+        >>> df.with_columns(
+        ...     [
+        ...         pl.col("s")
+        ...         .str.splitn(" ", 2)
+        ...         .struct.rename_fields(["first_part", "second_part"])
+        ...         .alias("fields"),
+        ...     ]
+        ... ).unnest("fields")
+        shape: (4, 3)
+        ┌─────────────┬────────────┬─────────────┐
+        │ s           ┆ first_part ┆ second_part │
+        │ ---         ┆ ---        ┆ ---         │
+        │ str         ┆ str        ┆ str         │
+        ╞═════════════╪════════════╪═════════════╡
+        │ foo bar     ┆ foo        ┆ bar         │
+        ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+        │ null        ┆ null       ┆ null        │
+        ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+        │ foo-bar     ┆ foo-bar    ┆ null        │
+        ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+        │ foo bar baz ┆ foo        ┆ bar baz     │
+        └─────────────┴────────────┴─────────────┘
+
+        Returns
+        -------
+        Struct of Utf8 type
+
+        """
+        return pli.wrap_expr(self._pyexpr.str_splitn(by, n))
+
     def replace(
         self, pattern: str | pli.Expr, value: str | pli.Expr, literal: bool = False
     ) -> pli.Expr:

diff --git a/py-polars/polars/internals/series/string.py b/py-polars/polars/internals/series/string.py
@@ -471,7 +471,8 @@ def split(self, by: str, inclusive: bool = False) -> pli.Series:
 
     def split_exact(self, by: str, n: int, inclusive: bool = False) -> pli.Series:
         """
-        Split the string by a substring into a struct of ``n`` fields.
+        Split the string by a substring into a struct of ``n+1`` fields using
+        ``n`` splits.
 
         If it cannot make ``n`` splits, the remaining field elements will be null.
 
@@ -487,37 +488,27 @@ def split_exact(self, by: str, n: int, inclusive: bool = False) -> pli.Series:
         Examples
         --------
         >>> df = pl.DataFrame({"x": ["a_1", None, "c", "d_4"]})
-        >>> df.select(
-        ...     [
-        ...         pl.col("x").str.split_exact("_", 1).alias("fields"),
-        ...     ]
+        >>> df["x"].str.split_exact("_", 1).alias("fields")
+        shape: (4,)
+        Series: 'fields' [struct[2]]
+        [
+                {"a","1"}
+                {null,null}
+                {"c",null}
+                {"d","4"}
+        ]
+
+        Split string values in column x in exactly 2 parts and assign
+        each part to a new column.
+
+        >>> (
+        ...     df["x"]
+        ...     .str.split_exact("_", 1)
+        ...     .struct.rename_fields(["first_part", "second_part"])
+        ...     .alias("fields")
+        ...     .to_frame()
+        ...     .unnest("fields")
         ... )
-        shape: (4, 1)
-        ┌─────────────┐
-        │ fields      │
-        │ ---         │
-        │ struct[2]   │
-        ╞═════════════╡
-        │ {"a","1"}   │
-        ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
-        │ {null,null} │
-        ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
-        │ {"c",null}  │
-        ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
-        │ {"d","4"}   │
-        └─────────────┘
-
-        Split column in ``n`` fields, give them a proper name in the struct and add them
-        as columns.
-
-        >>> df.select(
-        ...     [
-        ...         pl.col("x")
-        ...         .str.split_exact("_", 1)
-        ...         .struct.rename_fields(["first_part", "second_part"])
-        ...         .alias("fields"),
-        ...     ]
-        ... ).unnest("fields")
         shape: (4, 2)
         ┌────────────┬─────────────┐
         │ first_part ┆ second_part │
@@ -545,6 +536,68 @@ def split_exact(self, by: str, n: int, inclusive: bool = False) -> pli.Series:
             .to_series()
         )
 
+    def splitn(self, by: str, n: int) -> pli.Series:
+        """
+        Split the string by a substring, restricted to returning at most ``n`` items.
+
+        If the number of possible splits is less than ``n-1``, the remaining field
+        elements will be null. If the number of possible splits is ``n-1`` or greater,
+        the last (nth) substring will contain the remainder of the string.
+
+        Parameters
+        ----------
+        by
+            Substring to split by.
+        n
+            Max number of items to return.
+
+        Examples
+        --------
+        >>> df = pl.DataFrame({"s": ["foo bar", None, "foo-bar", "foo bar baz"]})
+        >>> df["s"].str.splitn(" ", 2).alias("fields")
+        shape: (4,)
+        Series: 'fields' [struct[2]]
+        [
+                {"foo","bar"}
+                {null,null}
+                {"foo-bar",null}
+                {"foo","bar baz"}
+        ]
+
+        Split string values in column s into at most 2 parts and assign
+        each part to a new column.
+
+        >>> (
+        ...     df["s"]
+        ...     .str.splitn(" ", 2)
+        ...     .struct.rename_fields(["first_part", "second_part"])
+        ...     .alias("fields")
+        ...     .to_frame()
+        ...     .unnest("fields")
+        ... )
+        shape: (4, 2)
+        ┌────────────┬─────────────┐
+        │ first_part ┆ second_part │
+        │ ---        ┆ ---         │
+        │ str        ┆ str         │
+        ╞════════════╪═════════════╡
+        │ foo        ┆ bar         │
+        ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+        │ null       ┆ null        │
+        ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+        │ foo-bar    ┆ null        │
+        ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+        │ foo        ┆ bar baz     │
+        └────────────┴─────────────┘
+
+        Returns
+        -------
+        Struct of Utf8 type
+
+        """
+        s = pli.wrap_s(self._s)
+        return s.to_frame().select(pli.col(s.name).str.splitn(by, n)).to_series()
+
     def replace(self, pattern: str, value: str, literal: bool = False) -> pli.Series:
         r"""
         Replace first matching regex/literal substring with a new string value.

diff --git a/py-polars/src/lazy/dsl.rs b/py-polars/src/lazy/dsl.rs
@@ -726,6 +726,10 @@ impl PyExpr {
         self.inner.clone().str().split_exact_inclusive(by, n).into()
     }
 
+    pub fn str_splitn(&self, by: &str, n: usize) -> PyExpr {
+        self.inner.clone().str().splitn(by, n).into()
+    }
+
     pub fn arr_lengths(&self) -> PyExpr {
         self.inner.clone().arr().lengths().into()
     }