Skip to content

Commit

Permalink
add vertical string concat; closes #1490
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Oct 8, 2021
1 parent 163457f commit 08f2a8d
Show file tree
Hide file tree
Showing 10 changed files with 180 additions and 8 deletions.
70 changes: 70 additions & 0 deletions polars/polars-core/src/chunked_array/ops/concat_str.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
use super::StrConcat;
use crate::prelude::*;
use arrow::buffer::MutableBuffer;
use polars_arrow::array::default_arrays::FromDataUtf8;
use std::fmt::{Display, Write};

/// Append the textual form of `value` to `buf`.
///
/// A present value is written through its `Display` implementation; a missing
/// value is written as the literal text `null`.
///
/// # Panics
///
/// Panics if the `Display` implementation of `T` reports a formatting error.
fn fmt_and_write<T: Display>(value: Option<T>, buf: &mut String) {
    if let Some(inner) = value {
        write!(buf, "{}", inner).unwrap();
    } else {
        buf.push_str("null");
    }
}

/// Join every value produced by `iter` into one string, separated by `delimiter`.
///
/// Missing values are rendered by `fmt_and_write` (as the text `null`). The
/// result is a `Utf8Chunked` named `name`, built from a single chunk whose
/// offsets describe exactly one string; that string is empty when `iter`
/// yields nothing.
fn str_concat_impl<I, T>(iter: I, delimiter: &str, name: &str) -> Utf8Chunked
where
    I: Iterator<Item = Option<T>>,
    T: Display,
{
    // Rough pre-allocation guess: five bytes per element reported by the iterator.
    let mut joined = String::with_capacity(iter.size_hint().0 * 5);

    for (idx, item) in iter.enumerate() {
        // The delimiter goes between values, never in front of the first one.
        if idx != 0 {
            joined.push_str(delimiter);
        }
        fmt_and_write(item, &mut joined);
    }
    joined.shrink_to_fit();

    let values = MutableBuffer::from_vec(joined.into_bytes());
    // A single string spanning the whole value buffer: offsets are [start, end].
    let offsets = MutableBuffer::from_vec(vec![0, values.len() as i64]);
    // SAFETY: `values` holds the bytes of a `String`, so it is valid UTF-8, and
    // the two offsets are ascending and end at the buffer length.
    // NOTE(review): presumably these are the invariants the unchecked
    // constructor expects -- confirm against its documentation.
    let arr =
        unsafe { Utf8Array::from_data_unchecked_default(offsets.into(), values.into(), None) };
    Utf8Chunked::new_from_chunks(name, vec![Arc::new(arr)])
}

impl<T> StrConcat for ChunkedArray<T>
where
    T: PolarsNumericType,
    T::Native: Display,
{
    /// Join all values of this numeric array into one `delimiter`-separated
    /// string; the returned array keeps the name of `self`.
    fn str_concat(&self, delimiter: &str) -> Utf8Chunked {
        str_concat_impl(self.into_iter(), delimiter, self.name())
    }
}

impl StrConcat for Utf8Chunked {
    /// Join all strings of this array into one `delimiter`-separated string;
    /// the returned array keeps the name of `self`.
    fn str_concat(&self, delimiter: &str) -> Utf8Chunked {
        str_concat_impl(self.into_iter(), delimiter, self.name())
    }
}

#[cfg(test)]
mod test {
    use super::*;

    /// Values are joined with the delimiter and the null is written as `null`.
    #[test]
    fn test_str_concat() {
        let ca = Int32Chunked::new_from_opt_slice("foo", &[Some(1), None, Some(3)]);
        let joined = ca.str_concat("-");

        assert_eq!(joined.get(0), Some("1-null-3"));
    }
}
15 changes: 15 additions & 0 deletions polars/polars-core/src/chunked_array/ops/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ mod apply;
mod bit_repr;
pub(crate) mod chunkops;
pub(crate) mod compare_inner;
#[cfg(feature = "concat_str")]
mod concat_str;
#[cfg(feature = "cum_agg")]
mod cum_agg;
pub(crate) mod downcast;
Expand Down Expand Up @@ -722,6 +724,7 @@ pub trait RepeatBy {

#[cfg(feature = "is_first")]
#[cfg_attr(docsrs, doc(cfg(feature = "is_first")))]
/// Mask the first unique values as `true`
pub trait IsFirst<T: PolarsDataType> {
fn is_first(&self) -> Result<BooleanChunked> {
Err(PolarsError::InvalidOperation(
Expand All @@ -732,10 +735,22 @@ pub trait IsFirst<T: PolarsDataType> {

#[cfg(feature = "is_first")]
#[cfg_attr(docsrs, doc(cfg(feature = "is_first")))]
/// Mask the last unique values as `true`
pub trait IsLast<T: PolarsDataType> {
    /// Default implementation: always returns an `InvalidOperation` error
    /// whose message names the dtype `T`.
    fn is_last(&self) -> Result<BooleanChunked> {
        let message = format!("operation not supported by {:?}", T::get_dtype());
        Err(PolarsError::InvalidOperation(message.into()))
    }
}

#[cfg(feature = "concat_str")]
#[cfg_attr(docsrs, doc(cfg(feature = "concat_str")))]
/// Concat the values into a string array.
///
/// The values of one array are joined "vertically" into a delimited string.
pub trait StrConcat {
/// Concat the values into a string array.
///
/// The implementations in `concat_str.rs` write null values as the text
/// `null` and return an array that holds a single string.
///
/// # Arguments
///
/// * `delimiter` - A string that will act as delimiter between values.
fn str_concat(&self, delimiter: &str) -> Utf8Chunked;
}
5 changes: 5 additions & 0 deletions polars/polars-core/src/series/implementations/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -774,6 +774,11 @@ macro_rules! impl_dyn_series {
fn mode(&self) -> Result<Series> {
Ok(self.0.mode()?.into_series())
}

#[cfg(feature = "concat_str")]
// Forward to the `str_concat` of the wrapped array (`self.0`).
fn str_concat(&self, delimiter: &str) -> Utf8Chunked {
self.0.str_concat(delimiter)
}
}
};
}
Expand Down
5 changes: 5 additions & 0 deletions polars/polars-core/src/series/implementations/utf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -443,4 +443,9 @@ impl SeriesTrait for SeriesWrap<Utf8Chunked> {
fn mode(&self) -> Result<Series> {
Ok(self.0.mode()?.into_series())
}

#[cfg(feature = "concat_str")]
// Forward to the `str_concat` of the wrapped `Utf8Chunked` (`self.0`).
fn str_concat(&self, delimiter: &str) -> Utf8Chunked {
self.0.str_concat(delimiter)
}
}
34 changes: 26 additions & 8 deletions polars/polars-core/src/series/series_trait.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,14 @@ macro_rules! invalid_operation {
))
};
}
// Expands to a `panic!` whose message names the dtype returned by
// `$s._dtype()`. Used as the body of default `SeriesTrait` methods for
// operations that a dtype does not support.
macro_rules! invalid_operation_panic {
($s:expr) => {
panic!(
"this operation is not implemented/valid for this dtype: {:?}",
$s._dtype()
)
};
}

pub(crate) mod private {
use super::*;
Expand Down Expand Up @@ -793,35 +801,35 @@ pub trait SeriesTrait:

/// Get the sum of the Series as a new Series of length 1.
///
/// # Panics
///
/// This default implementation always panics, reporting the dtype of the Series.
fn sum_as_series(&self) -> Series {
invalid_operation_panic!(self)
}
/// Get the max of the Series as a new Series of length 1.
///
/// # Panics
///
/// This default implementation always panics, reporting the dtype of the Series.
fn max_as_series(&self) -> Series {
invalid_operation_panic!(self)
}
/// Get the min of the Series as a new Series of length 1.
///
/// # Panics
///
/// This default implementation always panics, reporting the dtype of the Series.
fn min_as_series(&self) -> Series {
invalid_operation_panic!(self)
}
/// Get the mean of the Series as a new Series of length 1.
///
/// # Panics
///
/// This default implementation always panics, reporting the dtype of the Series.
fn mean_as_series(&self) -> Series {
invalid_operation_panic!(self)
}
/// Get the median of the Series as a new Series of length 1.
///
/// # Panics
///
/// This default implementation always panics, reporting the dtype of the Series.
fn median_as_series(&self) -> Series {
invalid_operation_panic!(self)
}
/// Get the variance of the Series as a new Series of length 1.
///
/// # Panics
///
/// This default implementation always panics, reporting the dtype of the Series.
fn var_as_series(&self) -> Series {
invalid_operation_panic!(self)
}
/// Get the standard deviation of the Series as a new Series of length 1.
///
/// # Panics
///
/// This default implementation always panics, reporting the dtype of the Series.
fn std_as_series(&self) -> Series {
invalid_operation_panic!(self)
}
/// Get the quantile of the ChunkedArray as a new Series of length 1.
///
/// # Panics
///
/// This default implementation always panics, reporting the dtype of the Series.
// NOTE(review): the return type is `Result<Series>`, yet an unsupported dtype
// panics here instead of producing an `Err` -- confirm this is intended.
fn quantile_as_series(&self, _quantile: f64) -> Result<Series> {
invalid_operation_panic!(self)
}

fn fmt_list(&self) -> String {
Expand Down Expand Up @@ -1056,11 +1064,21 @@ pub trait SeriesTrait:
}

#[cfg(feature = "rolling_window")]
#[cfg_attr(docsrs, doc(cfg(feature = "rolling_window")))]
/// Apply a custom function over a rolling/ moving window of the array.
/// This has quite some dynamic dispatch, so prefer rolling_min, max, mean, sum over this.
///
/// # Panics
///
/// This default implementation always panics even though the return type is a
/// `Result`; both arguments are ignored.
fn rolling_apply(&self, _window_size: usize, _f: &dyn Fn(&Series) -> Series) -> Result<Series> {
panic!("rolling apply not implemented for this dtype. Only implemented for numeric data.")
}
#[cfg(feature = "concat_str")]
#[cfg_attr(docsrs, doc(cfg(feature = "concat_str")))]
/// Concat the values into a string array.
/// # Arguments
///
/// * `delimiter` - A string that will act as delimiter between values.
///
/// # Panics
///
/// This default implementation always panics, reporting the dtype of the Series.
fn str_concat(&self, _delimiter: &str) -> Utf8Chunked {
invalid_operation_panic!(self)
}
}

impl<'a> (dyn SeriesTrait + 'a) {
Expand Down
12 changes: 12 additions & 0 deletions polars/polars-lazy/src/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1608,6 +1608,18 @@ impl Expr {
GetOutput::from_type(DataType::Float64),
)
}
#[cfg(feature = "concat_str")]
/// Concat the values into a string array.
///
/// The expression is evaluated by calling `str_concat` on the input Series;
/// its output dtype is declared as `Utf8`.
///
/// # Arguments
///
/// * `delimiter` - A string that will act as delimiter between values.
pub fn str_concat(self, delimiter: &str) -> Expr {
    // Own the delimiter so the `move` closure does not borrow from the caller.
    let sep = String::from(delimiter);
    let output_type = GetOutput::from_type(DataType::Utf8);
    self.apply(
        move |series| Ok(series.str_concat(&sep).into_series()),
        output_type,
    )
}
}

/// Create a Column Expression based on a column name.
Expand Down
16 changes: 16 additions & 0 deletions py-polars/polars/eager/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2734,6 +2734,22 @@ def clip(self, min_val: Union[int, float], max_val: Union[int, float]) -> "Serie
pl.col(self.name).clip(min_val, max_val) # type: ignore
)[self.name]

def str_concat(self, delimiter: str = "-") -> "Series":  # type: ignore
    """
    Vertically concat the values in the Series to a single string value.

    Parameters
    ----------
    delimiter
        String placed between consecutive values.

    Returns
    -------
    Series of dtype Utf8

    Examples
    --------
    >>> assert pl.Series([1, None, 2]).str_concat("-")[0] == "1-null-2"
    """
    # The expression API exposes this operation as `str_concat`; run it on a
    # one-column frame and pull the resulting column back out by name.
    return self.to_frame().select(
        pl.col(self.name).str_concat(delimiter)  # type: ignore
    )[self.name]


class StringNameSpace:
"""
Expand Down
22 changes: 22 additions & 0 deletions py-polars/polars/lazy/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -1675,6 +1675,28 @@ def clip(self, min_val: Union[int, float], max_val: Union[int, float]) -> "Expr"
.otherwise(self)
).keep_name()

def str_concat(self, delimiter: str = "-") -> "Expr":  # type: ignore
    """
    Vertically concat the values of the column to a single string value.

    Parameters
    ----------
    delimiter
        String placed between consecutive values.

    Returns
    -------
    Expr that evaluates to a column of dtype Utf8

    Examples
    --------
    >>> df = pl.DataFrame({"foo": [1, None, 2]})
    >>> df.select(col("foo").str_concat("-"))
    shape: (1, 1)
    ┌──────────┐
    │ foo      │
    │ ---      │
    │ str      │
    ╞══════════╡
    │ 1-null-2 │
    └──────────┘
    """
    concatenated = self._pyexpr.str_concat(delimiter)
    return wrap_expr(concatenated)


class ExprListNameSpace:
"""
Expand Down
3 changes: 3 additions & 0 deletions py-polars/src/lazy/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -954,6 +954,9 @@ impl PyExpr {
fn kurtosis(&self, fisher: bool, bias: bool) -> Self {
self.inner.clone().kurtosis(fisher, bias).into()
}
// Build the `str_concat` expression on a clone of the wrapped expression and
// convert the result back into the Python-facing wrapper type.
fn str_concat(&self, delimiter: &str) -> Self {
    let expr = self.inner.clone().str_concat(delimiter);
    expr.into()
}
}

impl From<dsl::Expr> for PyExpr {
Expand Down
6 changes: 6 additions & 0 deletions py-polars/tests/test_lazy.py
Original file line number Diff line number Diff line change
Expand Up @@ -604,3 +604,9 @@ def test_join_suffix():
assert out.columns == ["a", "b", "c", "b_bar", "c_bar"]
out = df_left.lazy().join(df_right.lazy(), on="a", suffix="_bar").collect()
assert out.columns == ["a", "b", "c", "b_bar", "c_bar"]


def test_str_concat():
    """Values are joined with the delimiter and the null is written as ``null``."""
    frame = pl.DataFrame({"foo": [1, None, 2]})
    joined = frame.select(pl.col("foo").str_concat("-"))
    assert joined[0, 0] == "1-null-2"

0 comments on commit 08f2a8d

Please sign in to comment.