Skip to content

Commit

Permalink
str.split (#2534)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Feb 3, 2022
1 parent 21f287a commit 36274ed
Show file tree
Hide file tree
Showing 14 changed files with 101 additions and 26 deletions.
4 changes: 2 additions & 2 deletions polars/polars-arrow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ description = "Arrow interfaces for Polars DataFrame library"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "086c54d00aee7b26ba3c8d0c5c683c531e2c75c1", default-features = false }
# arrow = { package = "arrow2", git = "https://github.com/ritchie46/arrow2", default-features = false, features = ["compute"], branch = "offset_pub" }
# arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "086c54d00aee7b26ba3c8d0c5c683c531e2c75c1", default-features = false }
arrow = { package = "arrow2", git = "https://github.com/ritchie46/arrow2", branch = "from_iter_variants2", default-features = false }
# arrow = { package = "arrow2", version = "0.9", default-features = false, features = ["compute_concatenate"] }
hashbrown = "0.12"
num = "^0.4"
Expand Down
11 changes: 7 additions & 4 deletions polars/polars-arrow/src/kernels/take.rs
Original file line number Diff line number Diff line change
Expand Up @@ -233,11 +233,14 @@ pub unsafe fn take_no_null_bool_iter_unchecked<I: IntoIterator<Item = usize>>(

let iter = indices.into_iter().map(|idx| {
debug_assert!(idx < values.len());
Some(values.get_bit_unchecked(idx))
values.get_bit_unchecked(idx)
});

// TODO: use values_iter. Need to add unchecked version for arrow
Arc::new(BooleanArray::from_trusted_len_iter_unchecked(iter))
let mutable = MutableBitmap::from_trusted_len_iter_unchecked(iter);
Arc::new(BooleanArray::from_data(
DataType::Boolean,
mutable.into(),
None,
))
}

/// Take kernel for single chunk and an iterator as index.
Expand Down
8 changes: 4 additions & 4 deletions polars/polars-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -160,10 +160,10 @@ unsafe_unwrap = "^0.1.0"

[dependencies.arrow]
package = "arrow2"
git = "https://github.com/jorgecarleitao/arrow2"
# git = "https://github.com/ritchie46/arrow2"
rev = "086c54d00aee7b26ba3c8d0c5c683c531e2c75c1"
# branch = "offset_pub"
# git = "https://github.com/jorgecarleitao/arrow2"
git = "https://github.com/ritchie46/arrow2"
# rev = "54797de5e4860cebc4eb73ad1890457cd1a658eb"
branch = "from_iter_variants2"
# version = "0.9"
default-features = false
features = [
Expand Down
18 changes: 14 additions & 4 deletions polars/polars-core/src/chunked_array/builder/list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -197,8 +197,10 @@ impl ListUtf8ChunkedBuilder {
}
}

#[inline]
pub fn append_iter<'a, I: Iterator<Item = Option<&'a str>> + TrustedLen>(&mut self, iter: I) {
pub fn append_trusted_len_iter<'a, I: Iterator<Item = Option<&'a str>> + TrustedLen>(
&mut self,
iter: I,
) {
let values = self.builder.mut_values();

if iter.size_hint().0 == 0 {
Expand All @@ -210,7 +212,16 @@ impl ListUtf8ChunkedBuilder {
self.builder.try_push_valid().unwrap();
}

#[inline]
/// Append a single list entry built from the non-null strings yielded by `iter`.
///
/// The strings are added to the inner values array with `extend_values`, after
/// which `try_push_valid` is called on the list builder to finish the entry.
/// `fast_explode` is cleared whenever the iterator's lower size bound is zero.
///
/// # Panics
///
/// Panics if `try_push_valid` returns an error.
pub fn append_values_iter<'a, I: Iterator<Item = &'a str>>(&mut self, iter: I) {
    // A lower bound of zero means the entry may turn out to be empty.
    let (lower_bound, _) = iter.size_hint();
    if lower_bound == 0 {
        self.fast_explode = false;
    }

    self.builder.mut_values().extend_values(iter);
    self.builder.try_push_valid().unwrap();
}

pub(crate) fn append(&mut self, ca: &Utf8Chunked) {
let value_builder = self.builder.mut_values();
value_builder.try_extend(ca).unwrap();
Expand All @@ -234,7 +245,6 @@ impl ListBuilderTrait for ListUtf8ChunkedBuilder {
self.builder.push_null();
}

#[inline]
fn append_series(&mut self, s: &Series) {
if s.is_empty() {
self.fast_explode = false;
Expand Down
4 changes: 2 additions & 2 deletions polars/polars-io/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ private = []
[dependencies]
ahash = "0.7"
anyhow = "1.0"
arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "086c54d00aee7b26ba3c8d0c5c683c531e2c75c1", default-features = false }
# arrow = { package = "arrow2", git = "https://github.com/ritchie46/arrow2", default-features = false, features = ["compute"], branch = "offset_pub" }
# arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "086c54d00aee7b26ba3c8d0c5c683c531e2c75c1", default-features = false }
arrow = { package = "arrow2", git = "https://github.com/ritchie46/arrow2", branch = "from_iter_variants2", default-features = false }
# arrow = { package = "arrow2", version = "0.9", default-features = false }
csv-core = { version = "0.1.10", optional = true }
dirs = "4.0"
Expand Down
35 changes: 29 additions & 6 deletions polars/polars-lazy/src/dsl/string.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use super::*;
use polars_arrow::array::ValueSize;

pub struct StringNameSpace(pub(crate) Expr);

Expand All @@ -7,10 +8,7 @@ impl StringNameSpace {
let pat = pat.to_string();
let function = move |s: Series| {
let ca = s.utf8()?;
match ca.extract(&pat, group_index) {
Ok(ca) => Ok(ca.into_series()),
Err(e) => Err(PolarsError::ComputeError(format!("{:?}", e).into())),
}
ca.extract(&pat, group_index).map(|ca| ca.into_series())
};
self.0
.map(function, GetOutput::from_type(DataType::Utf8))
Expand Down Expand Up @@ -59,7 +57,7 @@ impl StringNameSpace {
};
self.0
.map(function, GetOutput::from_type(out_type))
.with_fmt("strptime")
.with_fmt("str.strptime")
}

#[cfg(feature = "concat_str")]
Expand All @@ -80,8 +78,33 @@ impl StringNameSpace {
collect_groups: ApplyOptions::ApplyGroups,
input_wildcard_expansion: false,
auto_explode: true,
fmt_str: "str_concat",
fmt_str: "str.concat",
},
}
}

/// Split the string by a substring.
///
/// Every non-null string is split on `by` and the pieces are stored as one list
/// entry; a null string produces a null list entry. The declared output type is
/// `List<Utf8>`.
pub fn split(self, by: &str) -> Expr {
    let by = by.to_string();
    let output_type = GetOutput::from_type(DataType::List(Box::new(DataType::Utf8)));

    let function = move |s: Series| {
        let ca = s.utf8()?;
        let mut builder = ListUtf8ChunkedBuilder::new(s.name(), s.len(), ca.get_values_size());

        for opt_val in ca.into_iter() {
            if let Some(val) = opt_val {
                builder.append_values_iter(val.split(&by));
            } else {
                builder.append_null();
            }
        }
        Ok(builder.finish().into_series())
    };

    self.0.map(function, output_type).with_fmt("str.split")
}
}
7 changes: 4 additions & 3 deletions py-polars/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions py-polars/docs/source/reference/expression.rst
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,7 @@ The following methods are available under the `Expr.str` attribute.
ExprStringNameSpace.contains
ExprStringNameSpace.json_path_match
ExprStringNameSpace.extract
ExprStringNameSpace.split
ExprStringNameSpace.replace
ExprStringNameSpace.replace_all
ExprStringNameSpace.slice
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ The following methods are available under the `Series.str` attribute.
StringNameSpace.contains
StringNameSpace.json_path_match
StringNameSpace.extract
StringNameSpace.split
StringNameSpace.replace
StringNameSpace.replace_all
StringNameSpace.to_lowercase
Expand Down
12 changes: 12 additions & 0 deletions py-polars/polars/internals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -2739,6 +2739,18 @@ def extract(self, pattern: str, group_index: int = 1) -> Expr:
"""
return wrap_expr(self._pyexpr.str_extract(pattern, group_index))

def split(self, by: str) -> Expr:
    """
    Split the string by a substring.

    The return type will be of type List<Utf8>.

    Parameters
    ----------
    by
        Substring to split on.
    """
    split_expr = self._pyexpr.str_split(by)
    return wrap_expr(split_expr)

def replace(self, pattern: str, value: str) -> Expr:
"""
Replace first regex match with a string value.
Expand Down
13 changes: 13 additions & 0 deletions py-polars/polars/internals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3662,6 +3662,19 @@ def extract(self, pattern: str, group_index: int = 1) -> Series:
"""
return wrap_s(self._s.str_extract(pattern, group_index))

def split(self, by: str) -> Series:
    """
    Split the string by a substring.

    The return type will be of type List<Utf8>.

    Parameters
    ----------
    by
        Substring to split on.
    """
    s = wrap_s(self._s)
    # Go through the expression API: wrap the series in a frame, apply the
    # expression to its column by name, and convert the result back to a series.
    frame = s.to_frame()
    split_expr = pli.col(s.name).str.split(by)
    return frame.select(split_expr).to_series()

def replace(self, pattern: str, value: str) -> Series:
"""
Replace first regex match with a string value.
Expand Down
3 changes: 3 additions & 0 deletions py-polars/src/lazy/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -579,6 +579,9 @@ impl PyExpr {
pub fn strftime(&self, fmt: &str) -> PyExpr {
    // Clone the wrapped expression, apply `dt().strftime(fmt)` and wrap it again.
    let formatted = self.inner.clone().dt().strftime(fmt);
    formatted.into()
}
pub fn str_split(&self, by: &str) -> PyExpr {
    // Clone the wrapped expression, apply `str().split(by)` and wrap it again.
    let split_expr = self.inner.clone().str().split(by);
    split_expr.into()
}

pub fn arr_lengths(&self) -> PyExpr {
let function = |s: Series| {
Expand Down
2 changes: 1 addition & 1 deletion py-polars/src/list_construction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ pub fn py_seq_to_list(name: &str, seq: &PyAny, dtype: &PyAny) -> PyResult<Series
})
.trust_my_length(len)
};
builder.append_iter(iter)
builder.append_trusted_len_iter(iter)
}
builder.finish().into_series()
}
Expand Down
8 changes: 8 additions & 0 deletions py-polars/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1458,3 +1458,11 @@ def test_duration_extract_times() -> None:
def test_mean_overflow() -> None:
    # 2**17 int16 values of 255 sum to far more than an int16 can hold;
    # the mean must still come out exact.
    n_values = 1 << 17
    arr = np.full(n_values, 255, dtype="int16")
    assert arr.mean() == 255.0


def test_str_split() -> None:
    a = pl.Series("a", ["a, b", "a", "ab,c,de"])
    expected = [["a", " b"], ["a"], ["ab", "c", "de"]]

    # Exercise both the Series namespace and the expression namespace.
    via_series = a.str.split(",")
    via_expr = pl.select(pl.lit(a).str.split(",")).to_series()
    for out in (via_series, via_expr):
        for idx, want in enumerate(expected):
            assert out[idx].to_list() == want

0 comments on commit 36274ed

Please sign in to comment.