Skip to content

Commit

Permalink
python: pl.format str format function
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Oct 27, 2021
1 parent a850b06 commit af0028e
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 9 deletions.
44 changes: 37 additions & 7 deletions polars/polars-core/src/functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ use crate::chunked_array::ops::sort::prepare_argsort;
use crate::prelude::*;
use arrow::compute;
use arrow::types::simd::Simd;
#[cfg(feature = "concat_str")]
use itertools::Itertools;
use num::{Float, NumCast};
#[cfg(feature = "concat_str")]
use polars_arrow::prelude::ValueSize;
Expand Down Expand Up @@ -66,6 +64,24 @@ pub fn argsort_by(by: &[Series], reverse: &[bool]) -> Result<UInt32Chunked> {
first.argsort_multiple(&by, &reverse)
}

/// Utility that makes it possible to also pass literals to the `concat_str` function.
///
/// Each input of `concat_str` is wrapped in one of the two variants so that
/// unit-length inputs can be combined with full-length columns.
#[cfg(feature = "concat_str")]
enum IterBroadCast<'a> {
/// A regular input: a boxed iterator over the optional string values of a column.
Column(Box<dyn PolarsIterator<Item = Option<&'a str>> + 'a>),
/// A unit-length input: the single optional string value that is repeated for every row.
Value(Option<&'a str>),
}

#[cfg(feature = "concat_str")]
impl<'a> IterBroadCast<'a> {
    /// Produce the value for the next row.
    ///
    /// * `Column` forwards to the wrapped iterator, so `None` is returned once
    ///   that iterator is exhausted.
    /// * `Value` hands back a copy of the stored value on every call and
    ///   therefore never returns `None` on the outer level.
    fn next(&mut self) -> Option<Option<&'a str>> {
        match self {
            Self::Column(column) => column.next(),
            Self::Value(literal) => Some(*literal),
        }
    }
}

/// Casts all series to string data and will concat them in linear time.
/// The concatenated strings are separated by a `delimiter`.
/// If no `delimiter` is needed, an empty &str should be passed as argument.
Expand All @@ -92,12 +108,19 @@ pub fn concat_str(s: &[Series], delimiter: &str) -> Result<Utf8Chunked> {
})
.collect::<Result<Vec<_>>>()?;

if !s.iter().map(|s| s.len()).all_equal() {
if !s.iter().all(|s| s.len() == 1 || s.len() == len) {
return Err(PolarsError::ValueError(
"all series in concat_str function should have equal length".into(),
"all series in concat_str function should have equal length or unit length".into(),
));
}
let mut iters = cas.iter().map(|ca| ca.into_iter()).collect::<Vec<_>>();
let mut iters = cas
.iter()
.map(|ca| match ca.len() {
1 => IterBroadCast::Value(ca.get(0)),
_ => IterBroadCast::Column(ca.into_iter()),
})
.collect::<Vec<_>>();

let bytes_cap = cas.iter().map(|ca| ca.get_values_size()).sum();
let mut builder = Utf8ChunkedBuilder::new(s[0].name(), len, bytes_cap);

Expand Down Expand Up @@ -150,7 +173,14 @@ mod test {
let a = Series::new("a", &["foo", "bar"]);
let b = Series::new("b", &["spam", "ham"]);

let a = concat_str(&[a, b], "_").unwrap();
assert_eq!(Vec::from(&a), &[Some("foo_spam"), Some("bar_ham")]);
let out = concat_str(&[a.clone(), b.clone()], "_").unwrap();
assert_eq!(Vec::from(&out), &[Some("foo_spam"), Some("bar_ham")]);

let c = Series::new("b", &["literal"]);
let out = concat_str(&[a, b, c], "_").unwrap();
assert_eq!(
Vec::from(&out),
&[Some("foo_spam_literal"), Some("bar_ham_literal")]
);
}
}
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/expression.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ These functions can be used as expression and sometimes also in eager contexts.
argsort_by
concat_str
concat_list
format
when
exclude

Expand Down
51 changes: 51 additions & 0 deletions py-polars/polars/lazy/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
"concat_list",
"collect_all",
"exclude",
"format",
]


Expand Down Expand Up @@ -778,6 +779,56 @@ def concat_str(exprs: tp.List["pl.Expr"], sep: str = "") -> "pl.Expr":
return pl.lazy.expr.wrap_expr(_concat_str(exprs, sep))


def format(fstring: str, *args: Union["pl.Expr", str]) -> "pl.Expr":
    """
    Build a string expression from a template with ``{}`` placeholders.

    The template is cut at every ``{}``; the text fragments become literals and
    the arguments are inserted between them, after which everything is joined
    with `concat_str` using an empty separator.

    Parameters
    ----------
    fstring
        A string with placeholders.
        For example: "hello_{}" or "{}_world"
    args
        Expression(s) that fill the placeholders, in order of appearance.
        Arguments are converted with ``str_to_lit=False`` (in the example below
        the string "b" refers to column "b").

    Raises
    ------
    ValueError
        If the number of ``{}`` placeholders differs from the number of arguments.

    Examples
    --------
    >>> df = pl.DataFrame({"a": ["a", "b", "c"], "b": [1, 2, 3]})
    >>> df.select([
    ...     pl.format("foo_{}_bar_{}", pl.col("a"), "b").alias("fmt")
    ... ])
    shape: (3, 1)
    ┌─────────────┐
    │ fmt         │
    │ ---         │
    │ str         │
    ╞═════════════╡
    │ foo_a_bar_1 │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ foo_b_bar_2 │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ foo_c_bar_3 │
    └─────────────┘
    """
    if fstring.count("{}") != len(args):
        raise ValueError("number of placeholders should equal the number of arguments")

    # The count check above guarantees exactly one trailing fragment per argument.
    leading, *trailing = fstring.split("{}")

    # Empty text fragments are skipped rather than emitted as empty literals.
    exprs = [pl.lit(leading)] if leading else []
    for argument, fragment in zip(args, trailing):
        exprs.append(pl.lazy.expr_to_lit_or_expr(argument, str_to_lit=False))
        if fragment:
            exprs.append(pl.lit(fragment))

    return concat_str(exprs, sep="")


def concat_list(exprs: tp.List["pl.Expr"]) -> "pl.Expr":
"""
Concat the arrays in a Series dtype List in linear time.
Expand Down
8 changes: 6 additions & 2 deletions py-polars/tests/test_lazy.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,10 @@ def test_concat_str():
out = df[[pl.concat_str(["a", "b"], sep="-")]]
assert out["a"] == ["a-1", "a-2", "a-3"]

out = df.select([pl.format("foo_{}_bar_{}", pl.col("a"), "b").alias("fmt")])

assert out["fmt"].to_list() == ["foo_a_bar_1", "foo_b_bar_2", "foo_c_bar_3"]


def test_fold_filter():
df = pl.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]})
Expand All @@ -240,7 +244,7 @@ def test_fold_filter():
pl.fold(
acc=pl.lit(True),
f=lambda a, b: a & b,
exprs=[col(c) > 1 for c in df.columns],
exprs=[pl.col(c) > 1 for c in df.columns],
)
)

Expand All @@ -250,7 +254,7 @@ def test_fold_filter():
pl.fold(
acc=pl.lit(True),
f=lambda a, b: a | b,
exprs=[col(c) > 1 for c in df.columns],
exprs=[pl.col(c) > 1 for c in df.columns],
)
)

Expand Down

0 comments on commit af0028e

Please sign in to comment.