Skip to content

Commit

Permalink
add split_inclusive
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Feb 4, 2022
1 parent 43d097d commit c865dcb
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 3 deletions.
25 changes: 25 additions & 0 deletions polars/polars-lazy/src/dsl/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,4 +107,29 @@ impl StringNameSpace {
)
.with_fmt("str.split")
}

/// Split every string on `by`, keeping the matched separator attached to the
/// end of each piece (same semantics as [`str::split_inclusive`]).
///
/// The resulting expression has dtype `List<Utf8>`; a null input value
/// becomes a null entry in the output list column.
pub fn split_inclusive(self, by: &str) -> Expr {
    // Own the separator so the closure does not borrow from the caller.
    let by = by.to_string();

    let function = move |s: Series| {
        let ca = s.utf8()?;
        let mut builder =
            ListUtf8ChunkedBuilder::new(s.name(), s.len(), ca.get_values_size());

        for opt_value in ca.into_iter() {
            if let Some(value) = opt_value {
                // One list entry per input string, made of its inclusive pieces.
                builder.append_values_iter(value.split_inclusive(&by));
            } else {
                builder.append_null();
            }
        }

        Ok(builder.finish().into_series())
    };

    let output_type = GetOutput::from_type(DataType::List(Box::new(DataType::Utf8)));
    self.0
        .map(function, output_type)
        .with_fmt("str.split_inclusive")
}
}
6 changes: 5 additions & 1 deletion py-polars/polars/internals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -2739,7 +2739,7 @@ def extract(self, pattern: str, group_index: int = 1) -> Expr:
"""
return wrap_expr(self._pyexpr.str_extract(pattern, group_index))

def split(self, by: str) -> Expr:
def split(self, by: str, inclusive: bool = False) -> Expr:
"""
Split the string by a substring.
The return type will be of type List<Utf8>
Expand All @@ -2748,7 +2748,11 @@ def split(self, by: str) -> Expr:
----------
by
substring
inclusive
Include the split character/string in the results
"""
if inclusive:
return wrap_expr(self._pyexpr.str_split_inclusive(by))
return wrap_expr(self._pyexpr.str_split(by))

def replace(self, pattern: str, value: str) -> Expr:
Expand Down
6 changes: 4 additions & 2 deletions py-polars/polars/internals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3662,7 +3662,7 @@ def extract(self, pattern: str, group_index: int = 1) -> Series:
"""
return wrap_s(self._s.str_extract(pattern, group_index))

def split(self, by: str) -> Series:
def split(self, by: str, inclusive: bool = False) -> Series:
"""
Split the string by a substring.
The return type will be of type List<Utf8>
Expand All @@ -3671,9 +3671,11 @@ def split(self, by: str) -> Series:
----------
by
substring
inclusive
Include the split character/string in the results
"""
s = wrap_s(self._s)
return s.to_frame().select(pli.col(s.name).str.split(by)).to_series()
return s.to_frame().select(pli.col(s.name).str.split(by, inclusive)).to_series()

def replace(self, pattern: str, value: str) -> Series:
"""
Expand Down
3 changes: 3 additions & 0 deletions py-polars/src/lazy/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -582,6 +582,9 @@ impl PyExpr {
/// Return a new expression that splits each string by the substring `by`.
///
/// Clones the wrapped expression and forwards to its `str().split(by)`,
/// converting the result back into a `PyExpr`.
pub fn str_split(&self, by: &str) -> PyExpr {
self.inner.clone().str().split(by).into()
}
/// Return a new expression that splits each string by the substring `by`,
/// including the split string in the resulting pieces.
///
/// Clones the wrapped expression and forwards to its
/// `str().split_inclusive(by)`, converting the result back into a `PyExpr`.
pub fn str_split_inclusive(&self, by: &str) -> PyExpr {
self.inner.clone().str().split_inclusive(by).into()
}

pub fn arr_lengths(&self) -> PyExpr {
let function = |s: Series| {
Expand Down
8 changes: 8 additions & 0 deletions py-polars/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1466,3 +1466,11 @@ def test_str_split() -> None:
assert out[0].to_list() == ["a", " b"]
assert out[1].to_list() == ["a"]
assert out[2].to_list() == ["ab", "c", "de"]

for out in [
a.str.split(",", inclusive=True),
pl.select(pl.lit(a).str.split(",", inclusive=True)).to_series(),
]:
assert out[0].to_list() == ["a,", " b"]
assert out[1].to_list() == ["a"]
assert out[2].to_list() == ["ab,", "c,", "de"]

0 comments on commit c865dcb

Please sign in to comment.