Skip to content

Commit

Permalink
feat[rust, python]: Add splitn expression (#4373)
Browse files Browse the repository at this point in the history
This splits a string value a fixed number of times and keeps remainder intact.
  • Loading branch information
physinet committed Aug 25, 2022
1 parent 40e4e77 commit 30d6270
Show file tree
Hide file tree
Showing 7 changed files with 280 additions and 54 deletions.
80 changes: 65 additions & 15 deletions polars/polars-lazy/src/dsl/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,7 @@ impl StringNameSpace {
}
}

/// Split the string by a substring. The resulting dtype is `List<Utf8>`.
pub fn split(self, by: &str) -> Expr {
let by = by.to_string();

Expand All @@ -155,8 +154,33 @@ impl StringNameSpace {
.with_fmt("str.split")
}

/// Split every string on `by`, keeping the matched separator attached to the end of the
/// piece that precedes it. Null inputs produce a null list entry.
/// The resulting dtype is `List<Utf8>`.
pub fn split_inclusive(self, by: &str) -> Expr {
    let separator = by.to_owned();
    let output_type = GetOutput::from_type(DataType::List(Box::new(DataType::Utf8)));

    let split_rows = move |series: Series| {
        let utf8 = series.utf8()?;
        let mut list_builder =
            ListUtf8ChunkedBuilder::new(series.name(), series.len(), utf8.get_values_size());

        // One list entry per input row: the pieces of the string, or null for a null row.
        for maybe_value in utf8.into_iter() {
            if let Some(value) = maybe_value {
                list_builder.append_values_iter(value.split_inclusive(&separator));
            } else {
                list_builder.append_null();
            }
        }
        Ok(list_builder.finish().into_series())
    };

    self.0
        .map(split_rows, output_type)
        .with_fmt("str.split_inclusive")
}

#[cfg(feature = "dtype-struct")]
/// Split exactly `n` times by a given substring. The resulting dtype is [`DataType::Struct`].
pub fn split_exact(self, by: &str, n: usize) -> Expr {
let by = by.to_string();

Expand Down Expand Up @@ -207,8 +231,8 @@ impl StringNameSpace {
}

#[cfg(feature = "dtype-struct")]
/// Split exactly `n` times by a given substring and keep the substring.
/// The resulting dtype is [`DataType::Struct`].
pub fn split_exact_inclusive(self, by: &str, n: usize) -> Expr {
let by = by.to_string();

Expand Down Expand Up @@ -258,30 +282,56 @@ impl StringNameSpace {
.with_fmt("str.split_exact")
}

#[cfg(feature = "dtype-struct")]
/// Split by a given substring, returning exactly `n` items. If there are more possible splits,
/// keeps the remainder of the string intact. The resulting dtype is [`DataType::Struct`].
///
/// The struct has `n` `Utf8` fields named `field_0` .. `field_{n-1}`. A null input row yields
/// null in every field; a row that splits into fewer than `n` pieces has its trailing fields
/// set to null.
// NOTE(review): with `n == 0` no fields are produced; confirm how `StructChunked::new`
// handles an empty field list.
pub fn splitn(self, by: &str, n: usize) -> Expr {
    let by = by.to_string();

    let function = move |s: Series| {
        let ca = s.utf8()?;

        // One growable string array per output field.
        let mut arrs = (0..n)
            .map(|_| MutableUtf8Array::<i64>::with_capacity(ca.len()))
            .collect::<Vec<_>>();

        ca.into_iter().for_each(|opt_s| match opt_s {
            None => {
                for arr in &mut arrs {
                    arr.push_null()
                }
            }
            Some(s) => {
                let mut arr_iter = arrs.iter_mut();
                let split_iter = s.splitn(n, &by);
                // Zip against `&mut arr_iter` so the arrays that received no piece are still
                // left in `arr_iter` afterwards.
                (split_iter)
                    .zip(&mut arr_iter)
                    .for_each(|(splitted, arr)| arr.push(Some(splitted)));
                // fill the remaining with null
                for arr in arr_iter {
                    arr.push_null()
                }
            }
        });
        let fields = arrs
            .into_iter()
            .enumerate()
            .map(|(i, mut arr)| {
                Series::try_from((format!("field_{i}").as_str(), arr.as_box())).unwrap()
            })
            .collect::<Vec<_>>();
        Ok(StructChunked::new(ca.name(), &fields)?.into_series())
    };
    self.0
        .map(
            function,
            GetOutput::from_type(DataType::Struct(
                (0..n)
                    .map(|i| Field::new(&format!("field_{i}"), DataType::Utf8))
                    .collect(),
            )),
        )
        .with_fmt("str.splitn")
}

#[cfg(feature = "regex")]
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/expression.rst
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,7 @@ The following methods are available under the `Expr.str` attribute.
ExprStringNameSpace.slice
ExprStringNameSpace.split
ExprStringNameSpace.split_exact
ExprStringNameSpace.splitn
ExprStringNameSpace.starts_with
ExprStringNameSpace.strip
ExprStringNameSpace.strptime
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,7 @@ The following methods are available under the `Series.str` attribute.
StringNameSpace.slice
StringNameSpace.split
StringNameSpace.split_exact
StringNameSpace.splitn
StringNameSpace.starts_with
StringNameSpace.strip
StringNameSpace.strptime
Expand Down
83 changes: 75 additions & 8 deletions py-polars/polars/internals/expr/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -809,7 +809,8 @@ def split(self, by: str, inclusive: bool = False) -> pli.Expr:

def split_exact(self, by: str, n: int, inclusive: bool = False) -> pli.Expr:
"""
Split the string by a substring into a struct of ``n`` fields.
Split the string by a substring into a struct of ``n+1`` fields using
``n`` splits.
If it cannot make ``n`` splits, the remaining field elements will be null.
Expand All @@ -824,12 +825,11 @@ def split_exact(self, by: str, n: int, inclusive: bool = False) -> pli.Expr:
Examples
--------
>>> (
... pl.DataFrame({"x": ["a_1", None, "c", "d_4"]}).select(
... [
... pl.col("x").str.split_exact("_", 1).alias("fields"),
... ]
... )
>>> df = pl.DataFrame({"x": ["a_1", None, "c", "d_4"]})
>>> df.select(
... [
... pl.col("x").str.split_exact("_", 1).alias("fields"),
... ]
... )
shape: (4, 1)
┌─────────────┐
Expand All @@ -850,7 +850,7 @@ def split_exact(self, by: str, n: int, inclusive: bool = False) -> pli.Expr:
Split string values in column x in exactly 2 parts and assign
each part to a new column.
>>> pl.DataFrame({"x": ["a_1", None, "c", "d_4"]}).with_columns(
>>> df.with_columns(
... [
... pl.col("x")
... .str.split_exact("_", 1)
Expand Down Expand Up @@ -882,6 +882,73 @@ def split_exact(self, by: str, n: int, inclusive: bool = False) -> pli.Expr:
return pli.wrap_expr(self._pyexpr.str_split_exact_inclusive(by, n))
return pli.wrap_expr(self._pyexpr.str_split_exact(by, n))

def splitn(self, by: str, n: int) -> pli.Expr:
    """
    Split the string by a substring, producing at most ``n`` items per value.

    If the number of possible splits is less than ``n-1``, the remaining field
    elements will be null. If the number of possible splits is ``n-1`` or greater,
    the last (nth) substring will contain the remainder of the string.

    Parameters
    ----------
    by
        Substring to split by.
    n
        Max number of items to return.

    Returns
    -------
    Struct of Utf8 type

    Examples
    --------
    >>> df = pl.DataFrame({"s": ["foo bar", None, "foo-bar", "foo bar baz"]})
    >>> df.select(pl.col("s").str.splitn(" ", 2).alias("fields"))
    shape: (4, 1)
    ┌───────────────────┐
    │ fields            │
    │ ---               │
    │ struct[2]         │
    ╞═══════════════════╡
    │ {"foo","bar"}     │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ {null,null}       │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ {"foo-bar",null}  │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ {"foo","bar baz"} │
    └───────────────────┘

    Split string values in column s into at most 2 parts and assign
    each part to a new column.

    >>> df.with_columns(
    ...     [
    ...         pl.col("s")
    ...         .str.splitn(" ", 2)
    ...         .struct.rename_fields(["first_part", "second_part"])
    ...         .alias("fields"),
    ...     ]
    ... ).unnest("fields")
    shape: (4, 3)
    ┌─────────────┬────────────┬─────────────┐
    │ s           ┆ first_part ┆ second_part │
    │ ---         ┆ ---        ┆ ---         │
    │ str         ┆ str        ┆ str         │
    ╞═════════════╪════════════╪═════════════╡
    │ foo bar     ┆ foo        ┆ bar         │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ null        ┆ null       ┆ null        │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ foo-bar     ┆ foo-bar    ┆ null        │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ foo bar baz ┆ foo        ┆ bar baz     │
    └─────────────┴────────────┴─────────────┘

    """
    # Delegate to the Rust-side expression and wrap the result for the Python API.
    split_pyexpr = self._pyexpr.str_splitn(by, n)
    return pli.wrap_expr(split_pyexpr)

def replace(
self, pattern: str | pli.Expr, value: str | pli.Expr, literal: bool = False
) -> pli.Expr:
Expand Down
115 changes: 84 additions & 31 deletions py-polars/polars/internals/series/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,8 @@ def split(self, by: str, inclusive: bool = False) -> pli.Series:

def split_exact(self, by: str, n: int, inclusive: bool = False) -> pli.Series:
"""
Split the string by a substring into a struct of ``n`` fields.
Split the string by a substring into a struct of ``n+1`` fields using
``n`` splits.
If it cannot make ``n`` splits, the remaining field elements will be null.
Expand All @@ -487,37 +488,27 @@ def split_exact(self, by: str, n: int, inclusive: bool = False) -> pli.Series:
Examples
--------
>>> df = pl.DataFrame({"x": ["a_1", None, "c", "d_4"]})
>>> df.select(
... [
... pl.col("x").str.split_exact("_", 1).alias("fields"),
... ]
>>> df["x"].str.split_exact("_", 1).alias("fields")
shape: (4,)
Series: 'fields' [struct[2]]
[
{"a","1"}
{null,null}
{"c",null}
{"d","4"}
]
Split string values in column x in exactly 2 parts and assign
each part to a new column.
>>> (
... df["x"]
... .str.split_exact("_", 1)
... .struct.rename_fields(["first_part", "second_part"])
... .alias("fields")
... .to_frame()
... .unnest("fields")
... )
shape: (4, 1)
┌─────────────┐
│ fields │
│ --- │
│ struct[2] │
╞═════════════╡
│ {"a","1"} │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ {null,null} │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ {"c",null} │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ {"d","4"} │
└─────────────┘
Split column in ``n`` fields, give them a proper name in the struct and add them
as columns.
>>> df.select(
... [
... pl.col("x")
... .str.split_exact("_", 1)
... .struct.rename_fields(["first_part", "second_part"])
... .alias("fields"),
... ]
... ).unnest("fields")
shape: (4, 2)
┌────────────┬─────────────┐
│ first_part ┆ second_part │
Expand Down Expand Up @@ -545,6 +536,68 @@ def split_exact(self, by: str, n: int, inclusive: bool = False) -> pli.Series:
.to_series()
)

def splitn(self, by: str, n: int) -> pli.Series:
    """
    Split the string by a substring, producing at most ``n`` items per value.

    If the number of possible splits is less than ``n-1``, the remaining field
    elements will be null. If the number of possible splits is ``n-1`` or greater,
    the last (nth) substring will contain the remainder of the string.

    Parameters
    ----------
    by
        Substring to split by.
    n
        Max number of items to return.

    Returns
    -------
    Struct of Utf8 type

    Examples
    --------
    >>> df = pl.DataFrame({"s": ["foo bar", None, "foo-bar", "foo bar baz"]})
    >>> df["s"].str.splitn(" ", 2).alias("fields")
    shape: (4,)
    Series: 'fields' [struct[2]]
    [
            {"foo","bar"}
            {null,null}
            {"foo-bar",null}
            {"foo","bar baz"}
    ]

    Split string values in column s into at most 2 parts and assign
    each part to a new column.

    >>> (
    ...     df["s"]
    ...     .str.splitn(" ", 2)
    ...     .struct.rename_fields(["first_part", "second_part"])
    ...     .alias("fields")
    ...     .to_frame()
    ...     .unnest("fields")
    ... )
    shape: (4, 2)
    ┌────────────┬─────────────┐
    │ first_part ┆ second_part │
    │ ---        ┆ ---         │
    │ str        ┆ str         │
    ╞════════════╪═════════════╡
    │ foo        ┆ bar         │
    ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ null       ┆ null        │
    ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ foo-bar    ┆ null        │
    ├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ foo        ┆ bar baz     │
    └────────────┴─────────────┘

    """
    # Route through a one-column frame so the expression implementation is reused.
    series = pli.wrap_s(self._s)
    frame = series.to_frame()
    split_expr = pli.col(series.name).str.splitn(by, n)
    return frame.select(split_expr).to_series()

def replace(self, pattern: str, value: str, literal: bool = False) -> pli.Series:
r"""
Replace first matching regex/literal substring with a new string value.
Expand Down
4 changes: 4 additions & 0 deletions py-polars/src/lazy/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -726,6 +726,10 @@ impl PyExpr {
self.inner.clone().str().split_exact_inclusive(by, n).into()
}

/// Apply the string namespace's `splitn(by, n)` to a clone of the wrapped expression and
/// convert the result back into a `PyExpr`.
pub fn str_splitn(&self, by: &str, n: usize) -> PyExpr {
    let expr = self.inner.clone();
    expr.str().splitn(by, n).into()
}

/// Apply the `arr` namespace's `lengths()` to a clone of the wrapped expression and
/// convert the result back into a `PyExpr`.
pub fn arr_lengths(&self) -> PyExpr {
    let expr = self.inner.clone();
    expr.arr().lengths().into()
}
Expand Down

0 comments on commit 30d6270

Please sign in to comment.