Skip to content

Commit

Permalink
split exact expression (#2872)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Mar 11, 2022
1 parent 0038bd2 commit 10948a5
Show file tree
Hide file tree
Showing 7 changed files with 227 additions and 0 deletions.
99 changes: 99 additions & 0 deletions polars/polars-lazy/src/dsl/string.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use super::*;
use polars_arrow::array::ValueSize;
use polars_arrow::export::arrow::array::{MutableArray, MutableUtf8Array};

/// Specialized expressions for [`Series`] of [`DataType::Utf8`].
///
/// Newtype around the [`Expr`] that the string operations are applied to; the
/// wrapped expression is only reachable from within this crate.
pub struct StringNameSpace(pub(crate) Expr);
Expand Down Expand Up @@ -109,6 +110,104 @@ impl StringNameSpace {
.with_fmt("str.split")
}

/// Split every string by the substring `by` into a struct of exactly `n` Utf8 fields.
///
/// The fields are named `field_0`, `field_1`, ... up to `n` fields. A null input
/// value produces null in every field. When a string yields fewer than `n` parts
/// the remaining fields are null; parts beyond the `n`-th are dropped.
///
/// # Errors
///
/// The mapped function returns an error when the input is not a Utf8 series or
/// when the field series / struct cannot be constructed.
pub fn split_exact(self, by: &str, n: usize) -> Expr {
    let by = by.to_string();

    // Declared output type: a struct with `n` Utf8 fields.
    let output_type = GetOutput::from_type(DataType::Struct(
        (0..n)
            .map(|i| Field::new(&format!("field_{i}"), DataType::Utf8))
            .collect(),
    ));

    let function = move |s: Series| {
        let ca = s.utf8()?;

        // One builder per output field, each sized for one value per input row.
        let mut builders = Vec::with_capacity(n);
        for _ in 0..n {
            builders.push(MutableUtf8Array::<i64>::with_capacity(ca.len()));
        }

        for opt_s in ca.into_iter() {
            match opt_s {
                Some(s) => {
                    // Hand one part to each builder in order; once the parts run
                    // out the remaining builders receive null.
                    let mut parts = s.split(&by);
                    for builder in builders.iter_mut() {
                        match parts.next() {
                            Some(part) => builder.push(Some(part)),
                            None => builder.push_null(),
                        }
                    }
                }
                None => {
                    for builder in builders.iter_mut() {
                        builder.push_null()
                    }
                }
            }
        }

        // Propagate conversion failures instead of panicking inside the closure.
        let mut fields = Vec::with_capacity(n);
        for (i, builder) in builders.into_iter().enumerate() {
            let name = format!("field_{i}");
            fields.push(Series::try_from((name.as_str(), builder.into_arc()))?);
        }
        Ok(StructChunked::new(ca.name(), &fields)?.into_series())
    };

    self.0.map(function, output_type).with_fmt("str.split_exact")
}

/// Split every string by the substring `by` into a struct of exactly `n` Utf8
/// fields, keeping the matched separator at the end of each part.
///
/// The fields are named `field_0`, `field_1`, ... up to `n` fields. A null input
/// value produces null in every field. When a string yields fewer than `n` parts
/// the remaining fields are null; parts beyond the `n`-th are dropped.
///
/// # Errors
///
/// The mapped function returns an error when the input is not a Utf8 series or
/// when the field series / struct cannot be constructed.
pub fn split_exact_inclusive(self, by: &str, n: usize) -> Expr {
    let by = by.to_string();

    // Declared output type: a struct with `n` Utf8 fields.
    let output_type = GetOutput::from_type(DataType::Struct(
        (0..n)
            .map(|i| Field::new(&format!("field_{i}"), DataType::Utf8))
            .collect(),
    ));

    let function = move |s: Series| {
        let ca = s.utf8()?;

        // One builder per output field, each sized for one value per input row.
        let mut builders = Vec::with_capacity(n);
        for _ in 0..n {
            builders.push(MutableUtf8Array::<i64>::with_capacity(ca.len()));
        }

        for opt_s in ca.into_iter() {
            match opt_s {
                Some(s) => {
                    // Hand one part to each builder in order; once the parts run
                    // out the remaining builders receive null.
                    let mut parts = s.split_inclusive(&by);
                    for builder in builders.iter_mut() {
                        match parts.next() {
                            Some(part) => builder.push(Some(part)),
                            None => builder.push_null(),
                        }
                    }
                }
                None => {
                    for builder in builders.iter_mut() {
                        builder.push_null()
                    }
                }
            }
        }

        // Propagate conversion failures instead of panicking inside the closure.
        let mut fields = Vec::with_capacity(n);
        for (i, builder) in builders.into_iter().enumerate() {
            let name = format!("field_{i}");
            fields.push(Series::try_from((name.as_str(), builder.into_arc()))?);
        }
        Ok(StructChunked::new(ca.name(), &fields)?.into_series())
    };

    // Use a distinct label so the inclusive variant can be told apart from
    // `str.split_exact` when the expression is formatted.
    self.0
        .map(function, output_type)
        .with_fmt("str.split_exact_inclusive")
}

/// Split the string by a substring.
pub fn split_inclusive(self, by: &str) -> Expr {
let by = by.to_string();
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/expression.rst
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,7 @@ The following methods are available under the `Expr.str` attribute.
ExprStringNameSpace.json_path_match
ExprStringNameSpace.extract
ExprStringNameSpace.split
ExprStringNameSpace.split_exact
ExprStringNameSpace.replace
ExprStringNameSpace.replace_all
ExprStringNameSpace.slice
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,7 @@ The following methods are available under the `Series.str` attribute.
StringNameSpace.json_path_match
StringNameSpace.extract
StringNameSpace.split
StringNameSpace.split_exact
StringNameSpace.replace
StringNameSpace.replace_all
StringNameSpace.to_lowercase
Expand Down
46 changes: 46 additions & 0 deletions py-polars/polars/internals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3587,6 +3587,52 @@ def split(self, by: str, inclusive: bool = False) -> Expr:
return wrap_expr(self._pyexpr.str_split_inclusive(by))
return wrap_expr(self._pyexpr.str_split(by))

def split_exact(self, by: str, n: int, inclusive: bool = False) -> Expr:
    """
    Split the string by a substring into a struct of `n` fields.

    The return type will be of type Struct<Utf8>. If a string yields fewer than
    `n` parts, the remaining field elements will be null.

    Parameters
    ----------
    by
        Substring to split on.
    n
        Number of fields in the resulting struct.
    inclusive
        Include the split character/string in the results.

    Examples
    --------
    >>> (
    ...     pl.DataFrame({"x": ["a_1", None, "c", "d_4"]}).select(
    ...         [
    ...             pl.col("x").str.split_exact("_", 2).alias("fields"),
    ...         ]
    ...     )
    ... )
    shape: (4, 1)
    ┌───────────────────────────────────────────┐
    │ fields │
    │ --- │
    │ struct[2]{'field_0': str, 'field_1': str} │
    ╞═══════════════════════════════════════════╡
    │ {"a","1"} │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ {null,null} │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ {"c",null} │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ {"d","4"} │
    └───────────────────────────────────────────┘
    """
    # Pick the native splitter once, then make a single wrapped call.
    splitter = (
        self._pyexpr.str_split_exact_inclusive
        if inclusive
        else self._pyexpr.str_split_exact
    )
    return wrap_expr(splitter(by, n))

def replace(self, pattern: str, value: str) -> Expr:
"""
Replace first regex match with a string value.
Expand Down
49 changes: 49 additions & 0 deletions py-polars/polars/internals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3868,6 +3868,55 @@ def split(self, by: str, inclusive: bool = False) -> Series:
s = wrap_s(self._s)
return s.to_frame().select(pli.col(s.name).str.split(by, inclusive)).to_series()

def split_exact(self, by: str, n: int, inclusive: bool = False) -> Series:
    """
    Split the string by a substring into a struct of `n` fields.

    The return type will be of type Struct<Utf8>. If a string yields fewer than
    `n` parts, the remaining field elements will be null.

    Parameters
    ----------
    by
        Substring to split on.
    n
        Number of fields in the resulting struct.
    inclusive
        Include the split character/string in the results.

    Examples
    --------
    >>> (
    ...     pl.DataFrame({"x": ["a_1", None, "c", "d_4"]}).select(
    ...         [
    ...             pl.col("x").str.split_exact("_", 2).alias("fields"),
    ...         ]
    ...     )
    ... )
    shape: (4, 1)
    ┌───────────────────────────────────────────┐
    │ fields │
    │ --- │
    │ struct[2]{'field_0': str, 'field_1': str} │
    ╞═══════════════════════════════════════════╡
    │ {"a","1"} │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ {null,null} │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ {"c",null} │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ {"d","4"} │
    └───────────────────────────────────────────┘
    """
    # Delegate to the expression implementation via a one-column frame.
    series = wrap_s(self._s)
    frame = series.to_frame()
    expr = pli.col(series.name).str.split_exact(by, n, inclusive)
    return frame.select(expr).to_series()

def replace(self, pattern: str, value: str) -> Series:
"""
Replace first regex match with a string value.
Expand Down
7 changes: 7 additions & 0 deletions py-polars/src/lazy/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,13 @@ impl PyExpr {
self.inner.clone().str().split_inclusive(by).into()
}

/// Apply `str().split_exact(by, n)` to a clone of the wrapped expression and
/// convert the result back into a `PyExpr`.
pub fn str_split_exact(&self, by: &str, n: usize) -> PyExpr {
    let expr = self.inner.clone();
    expr.str().split_exact(by, n).into()
}
/// Apply `str().split_exact_inclusive(by, n)` to a clone of the wrapped
/// expression and convert the result back into a `PyExpr`.
pub fn str_split_exact_inclusive(&self, by: &str, n: usize) -> PyExpr {
    let expr = self.inner.clone();
    expr.str().split_exact_inclusive(by, n).into()
}

/// Apply `arr().lengths()` to a clone of the wrapped expression and convert the
/// result back into a `PyExpr`.
pub fn arr_lengths(&self) -> PyExpr {
    let expr = self.inner.clone();
    expr.arr().lengths().into()
}
Expand Down
24 changes: 24 additions & 0 deletions py-polars/tests/test_exprs.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,3 +122,27 @@ def test_wildcard_expansion() -> None:
assert df.select(
pl.concat_str(pl.all()).str.to_lowercase()
).to_series().to_list() == ["xs", "yo", "zs"]


def test_split_exact() -> None:
    """Check `str.split_exact` on expressions and Series, with and without `inclusive`."""
    df = pl.DataFrame({"x": ["a_a", None, "b", "c_c"]})

    # Non-inclusive: asking for more fields than there are parts pads with nulls.
    out = df.select([pl.col("x").str.split_exact("_", 3, inclusive=False)]).unnest("x")
    expected = pl.DataFrame(
        {
            "field_0": ["a", None, "b", "c"],
            "field_1": ["a", None, None, "c"],
            "field_2": [None, None, None, None],
        }
    )
    assert out.frame_equal(expected)

    # Inclusive: the separator stays attached to the part preceding it.
    out = df.select([pl.col("x").str.split_exact("_", 2, inclusive=True)]).unnest("x")
    expected = pl.DataFrame(
        {"field_0": ["a_", None, "b", "c_"], "field_1": ["a", None, None, "c"]}
    )
    assert out.frame_equal(expected)

    # Series namespace: cover both the default and the inclusive code path.
    assert df["x"].str.split_exact("_", 1).dtype == pl.Struct
    assert df["x"].str.split_exact("_", 1, inclusive=True).dtype == pl.Struct

0 comments on commit 10948a5

Please sign in to comment.