Skip to content

Commit

Permalink
Extended with_columns to allow **kwargs style named expressions (#3917)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie committed Jul 8, 2022
1 parent 368badc commit 08ea76c
Show file tree
Hide file tree
Showing 6 changed files with 169 additions and 21 deletions.
6 changes: 4 additions & 2 deletions py-polars/polars/cfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ class Config:
Configure polars
"""

# class-local boolean flags can be used for options that don't have
# a Rust component (so no need to register environment variables).
with_columns_kwargs: bool = False

@classmethod
def set_utf8_tables(cls) -> type[Config]:
"""
Expand Down Expand Up @@ -52,7 +56,6 @@ def set_tbl_rows(cls, n: int) -> type[Config]:
n
number of rows to print
"""

os.environ["POLARS_FMT_MAX_ROWS"] = str(n)
return cls

Expand Down Expand Up @@ -93,7 +96,6 @@ def set_tbl_cols(cls, n: int) -> type[Config]:
└─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┘
"""

os.environ["POLARS_FMT_MAX_COLS"] = str(n)
return cls

Expand Down
36 changes: 30 additions & 6 deletions py-polars/polars/internals/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4887,7 +4887,8 @@ def select(

def with_columns(
self: DF,
exprs: pli.Expr | pli.Series | list[pli.Expr | pli.Series],
exprs: pli.Expr | pli.Series | Sequence[pli.Expr | pli.Series] | None = None,
**named_exprs: pli.Expr | pli.Series,
) -> DF:
"""
Add or overwrite multiple columns in a DataFrame.
Expand All @@ -4896,10 +4897,11 @@ def with_columns(
----------
exprs
List of Expressions that evaluate to columns.
**named_exprs
Named column Expressions, provided as kwargs.
Examples
--------
>>> df = pl.DataFrame(
... {
... "a": [1, 2, 3, 4],
Expand Down Expand Up @@ -4928,13 +4930,35 @@ def with_columns(
├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 4 ┆ 13.0 ┆ true ┆ 16.0 ┆ 6.5 ┆ false │
└─────┴──────┴───────┴──────┴──────┴───────┘
"""
if not isinstance(exprs, list):
...
>>> # Support for kwarg expressions is considered EXPERIMENTAL.
>>> # Currently requires opt-in via `pl.Config` boolean flag:
>>>
>>> pl.Config.with_columns_kwargs = True
>>> df.with_columns(
... d=pl.col("a") * pl.col("b"),
... e=pl.col("c").is_not(),
... )
shape: (4, 5)
┌─────┬──────┬───────┬──────┬───────┐
│ a ┆ b ┆ c ┆ d ┆ e │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │
╞═════╪══════╪═══════╪══════╪═══════╡
│ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │
├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │
├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │
├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │
└─────┴──────┴───────┴──────┴───────┘
"""
if exprs is not None and not isinstance(exprs, Sequence):
exprs = [exprs]
return (
self.lazy()
.with_columns(exprs)
.with_columns(exprs, **named_exprs)
.collect(no_optimization=True, string_cache=False)
)

Expand Down
82 changes: 76 additions & 6 deletions py-polars/polars/internals/lazy_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@


from polars import internals as pli
from polars.cfg import Config
from polars.datatypes import DataType, py_type_to_dtype
from polars.utils import (
_in_notebook,
Expand Down Expand Up @@ -1440,7 +1441,8 @@ def join(

def with_columns(
    self: LDF,
    exprs: pli.Expr | pli.Series | Sequence[pli.Expr | pli.Series] | None = None,
    **named_exprs: pli.Expr | pli.Series,
) -> LDF:
    """
    Add or overwrite multiple columns in a DataFrame.

    Parameters
    ----------
    exprs
        A single Expression/Series, or a sequence of Expressions/Series,
        that evaluate to columns.
    **named_exprs
        Named column Expressions, provided as kwargs. Each value is aliased
        to its keyword name; plain strings are wrapped as literals first.

    Raises
    ------
    RuntimeError
        If ``**named_exprs`` are given without opting in via the
        ``pl.Config.with_columns_kwargs`` flag.
    ValueError
        If neither ``exprs`` nor ``**named_exprs`` is given, or if an item
        is neither an Expr nor a Series.

    Examples
    --------
    >>> ldf = pl.DataFrame(
    ...     {
    ...         "a": [1, 2, 3, 4],
    ...         "b": [0.5, 4, 10, 13],
    ...         "c": [True, True, False, True],
    ...     }
    ... ).lazy()
    >>> ldf.with_columns(
    ...     [
    ...         (pl.col("a") ** 2).alias("a^2"),
    ...         (pl.col("b") / 2).alias("b/2"),
    ...         (pl.col("c").is_not()).alias("not c"),
    ...     ]
    ... ).collect()
    shape: (4, 6)
    ┌─────┬──────┬───────┬──────┬──────┬───────┐
    │ a   ┆ b    ┆ c     ┆ a^2  ┆ b/2  ┆ not c │
    │ --- ┆ ---  ┆ ---   ┆ ---  ┆ ---  ┆ ---   │
    │ i64 ┆ f64  ┆ bool  ┆ f64  ┆ f64  ┆ bool  │
    ╞═════╪══════╪═══════╪══════╪══════╪═══════╡
    │ 1   ┆ 0.5  ┆ true  ┆ 1.0  ┆ 0.25 ┆ false │
    ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
    │ 2   ┆ 4.0  ┆ true  ┆ 4.0  ┆ 2.0  ┆ false │
    ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
    │ 3   ┆ 10.0 ┆ false ┆ 9.0  ┆ 5.0  ┆ true  │
    ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
    │ 4   ┆ 13.0 ┆ true  ┆ 16.0 ┆ 6.5  ┆ false │
    └─────┴──────┴───────┴──────┴──────┴───────┘
    ...
    >>> # Support for kwarg expressions is considered EXPERIMENTAL.
    >>> # Currently requires opt-in via `pl.Config` boolean flag:
    >>>
    >>> pl.Config.with_columns_kwargs = True
    >>> ldf.with_columns(
    ...     d=pl.col("a") * pl.col("b"),
    ...     e=pl.col("c").is_not(),
    ... ).collect()
    shape: (4, 5)
    ┌─────┬──────┬───────┬──────┬───────┐
    │ a   ┆ b    ┆ c     ┆ d    ┆ e     │
    │ --- ┆ ---  ┆ ---   ┆ ---  ┆ ---   │
    │ i64 ┆ f64  ┆ bool  ┆ f64  ┆ bool  │
    ╞═════╪══════╪═══════╪══════╪═══════╡
    │ 1   ┆ 0.5  ┆ true  ┆ 0.5  ┆ false │
    ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
    │ 2   ┆ 4.0  ┆ true  ┆ 8.0  ┆ false │
    ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
    │ 3   ┆ 10.0 ┆ false ┆ 30.0 ┆ true  │
    ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
    │ 4   ┆ 13.0 ┆ true  ┆ 52.0 ┆ false │
    └─────┴──────┴───────┴──────┴───────┘
    """
    if named_exprs and not Config.with_columns_kwargs:
        # The opt-in is a plain class attribute on Config (there is no setter
        # method), so the message must point users at the attribute itself.
        raise RuntimeError(
            "**kwargs support is experimental; requires opt-in via `pl.Config.with_columns_kwargs = True`"
        )
    if exprs is None and not named_exprs:
        raise ValueError("Expected at least one of 'exprs' or **named_exprs")

    # Normalise the positional argument into a fresh list so that the caller's
    # sequence is never mutated by the `extend` below.
    columns: list[pli.Expr | pli.Series]
    if exprs is None:
        columns = []
    elif isinstance(exprs, (pli.Expr, pli.Series)):
        # A lone Series must be wrapped, not iterated: `list(series)` would
        # explode it into its scalar values.
        columns = [exprs]
    else:
        columns = list(exprs)

    # Keyword expressions take their column name from the keyword itself.
    columns.extend(
        (pli.lit(expr).alias(name) if isinstance(expr, str) else expr.alias(name))
        for name, expr in named_exprs.items()
    )

    pyexprs = []
    for e in columns:
        if isinstance(e, pli.Expr):
            pyexprs.append(e._pyexpr)
        elif isinstance(e, pli.Series):
            # Series are lifted into literal expressions.
            pyexprs.append(pli.lit(e)._pyexpr)
        else:
            raise ValueError(f"Expected an expression, got {e}")

    return self._from_pyldf(self._ldf.with_columns(pyexprs))

Expand Down
11 changes: 6 additions & 5 deletions py-polars/polars/internals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1031,16 +1031,17 @@ def name(self) -> str:

def alias(self, name: str) -> Series:
"""
Rename the Series
Returns a copy of the Series with a new alias/name.
Parameters
----------
name
New name
Returns
-------
New name.
Examples
--------
>>> srs = pl.Series("x", [1, 2, 3])
>>> new_aliased_srs = srs.alias("y")
"""
s = self.clone()
s._s.rename(name)
Expand Down
4 changes: 2 additions & 2 deletions py-polars/polars/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,7 +381,7 @@ def __post_init__(self) -> None:
self.null_probability < 0 or self.null_probability > 1
):
raise InvalidArgument(
f"null_probability should be between 0.0 and 1.0; found {self.null_probability}"
f"null_probability should be between 0.0 and 1.0, or None; found {self.null_probability}"
)
if self.dtype is None and not self.strategy:
self.dtype = random.choice(strategy_dtypes)
Expand Down Expand Up @@ -660,7 +660,7 @@ def dataframes(
max_cols : int, optional
if not passing an exact size, can set a maximum value here (defaults to MAX_COLS).
size : int, optional
if set, will create a Series of exactly this size (and ignore min/max len params).
if set, will create a DataFrame of exactly this size (and ignore min/max len params).
min_size : int, optional
if not passing an exact size, set the minimum number of rows in the DataFrame.
max_size : int, optional
Expand Down
51 changes: 51 additions & 0 deletions py-polars/tests/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -2375,3 +2375,54 @@ def test_selection_regex_and_multicol() -> None:
"b": [25, 36, 49, 64],
"c": [81, 100, 121, 144],
}


def test_with_columns() -> None:
    """Check `DataFrame.with_columns` with list, **kwargs and mixed inputs."""
    df = pl.DataFrame(
        {
            "a": [1, 2, 3, 4],
            "b": [0.5, 4, 10, 13],
            "c": [True, True, False, True],
        }
    )
    srs_named = pl.Series("f", [3, 2, 1, 0])
    srs_unnamed = pl.Series(values=[3, 2, 1, 0])

    expected = pl.DataFrame(
        {
            "a": [1, 2, 3, 4],
            "b": [0.5, 4, 10, 13],
            "c": [True, True, False, True],
            "d": [0.5, 8.0, 30.0, 52.0],
            "e": [False, False, True, False],
            "f": [3, 2, 1, 0],
        }
    )

    # as exprs list
    dx = df.with_columns(
        [(pl.col("a") * pl.col("b")).alias("d"), ~pl.col("c").alias("e"), srs_named]
    )
    assert_frame_equal(dx, expected)

    # The kwargs flag is process-global state: remember its value and restore
    # it afterwards so this test cannot influence tests that run later.
    original_flag = pl.Config.with_columns_kwargs
    try:
        # without opt-in, **kwargs usage must be rejected
        pl.Config.with_columns_kwargs = False
        with pytest.raises(RuntimeError):
            _ = df.with_columns(d=pl.col("a") * pl.col("b"))

        # as **kwargs (experimental feature: requires opt-in)
        pl.Config.with_columns_kwargs = True

        dx = df.with_columns(
            d=pl.col("a") * pl.col("b"),
            e=~pl.col("c"),
            f=srs_unnamed,
        )
        assert_frame_equal(dx, expected)

        # mixed
        dx = df.with_columns(
            [(pl.col("a") * pl.col("b")).alias("d")],
            e=~pl.col("c"),
            f=srs_unnamed,
        )
        assert_frame_equal(dx, expected)
    finally:
        pl.Config.with_columns_kwargs = original_flag

    # at least one of exprs/**named_exprs required
    with pytest.raises(ValueError):
        _ = df.with_columns()

0 comments on commit 08ea76c

Please sign in to comment.