Skip to content

Commit

Permalink
feat(python): Add name parameter to GroupBy.len method (#15235)
Browse files Browse the repository at this point in the history
Co-authored-by: Stijn de Gooijer <stijndegooijer@gmail.com>
  • Loading branch information
alexander-beedie and stinodego committed Mar 23, 2024
1 parent 07538dd commit 5febd51
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 37 deletions.
13 changes: 8 additions & 5 deletions py-polars/polars/_utils/construction/dataframe.py
Expand Up @@ -196,13 +196,16 @@ def _parse_schema_overrides(
# determine column names from schema
if isinstance(schema, Mapping):
column_names: list[str] = list(schema)
# coerce schema to list[str | tuple[str, PolarsDataType | PythonDataType | None]]
schema = list(schema.items())
else:
column_names = [
(col or f"column_{i}") if isinstance(col, str) else col[0]
for i, col in enumerate(schema)
]
column_names = []
for i, col in enumerate(schema):
if isinstance(col, str):
unnamed = not col and col not in schema_overrides
col = f"column_{i}" if unnamed else col
else:
col = col[0]
column_names.append(col)

# determine column dtypes from schema and lookup_names
lookup: dict[str, str] | None = (
Expand Down
41 changes: 27 additions & 14 deletions py-polars/polars/dataframe/group_by.py
Expand Up @@ -448,30 +448,43 @@ def all(self) -> DataFrame:
"""
return self.agg(F.all())

def len(self) -> DataFrame:
def len(self, name: str | None = None) -> DataFrame:
"""
Return the number of rows in each group.
Parameters
----------
name
Assign a name to the resulting column; if unset, defaults to "len".
Examples
--------
>>> df = pl.DataFrame(
... {
... "a": ["apple", "apple", "orange"],
... "b": [1, None, 2],
... }
... )
>>> df.group_by("a").len() # doctest: +SKIP
>>> df = pl.DataFrame({"a": ["Apple", "Apple", "Orange"], "b": [1, None, 2]})
>>> df.group_by("a").len() # doctest: +IGNORE_RESULT
shape: (2, 2)
┌────────┬─────┐
│ a ┆ len │
│ --- ┆ --- │
│ str ┆ u32 │
╞════════╪═════╡
│ apple ┆ 2 │
│ orange ┆ 1 │
│ Apple ┆ 2 │
│ Orange ┆ 1 │
└────────┴─────┘
>>> df.group_by("a").len(name="n") # doctest: +IGNORE_RESULT
shape: (2, 2)
┌────────┬─────┐
│ a ┆ n │
│ --- ┆ --- │
│ str ┆ u32 │
╞════════╪═════╡
│ Apple ┆ 2 │
│ Orange ┆ 1 │
└────────┴─────┘
"""
return self.agg(F.len())
len_expr = F.len()
if name is not None:
len_expr = len_expr.alias(name)
return self.agg(len_expr)

@deprecate_renamed_function("len", version="0.20.5")
def count(self) -> DataFrame:
Expand All @@ -487,7 +500,7 @@ def count(self) -> DataFrame:
--------
>>> df = pl.DataFrame(
... {
... "a": ["apple", "apple", "orange"],
... "a": ["Apple", "Apple", "Orange"],
... "b": [1, None, 2],
... }
... )
Expand All @@ -498,8 +511,8 @@ def count(self) -> DataFrame:
│ --- ┆ --- │
│ str ┆ u32 │
╞════════╪═══════╡
│ apple ┆ 2 │
│ orange ┆ 1 │
│ Apple ┆ 2 │
│ Orange ┆ 1 │
└────────┴───────┘
"""
return self.agg(F.len().alias("count"))
Expand Down
41 changes: 26 additions & 15 deletions py-polars/polars/lazyframe/group_by.py
Expand Up @@ -333,32 +333,43 @@ def all(self) -> LazyFrame:
"""
return self.agg(F.all())

def len(self) -> LazyFrame:
def len(self, name: str | None = None) -> LazyFrame:
"""
Return the number of rows in each group.
Rows containing null values count towards the total.
Parameters
----------
name
Assign a name to the resulting column; if unset, defaults to "len".
Examples
--------
>>> lf = pl.LazyFrame(
... {
... "a": ["apple", "apple", "orange"],
... "b": [1, None, 2],
... }
... )
>>> lf.group_by("a").len().collect() # doctest: +SKIP
>>> lf = pl.LazyFrame({"a": ["Apple", "Apple", "Orange"], "b": [1, None, 2]})
>>> lf.group_by("a").len().collect() # doctest: +IGNORE_RESULT
shape: (2, 2)
┌────────┬─────┐
│ a ┆ len │
│ --- ┆ --- │
│ str ┆ u32 │
╞════════╪═════╡
│ apple ┆ 2 │
│ orange ┆ 1 │
│ Apple ┆ 2 │
│ Orange ┆ 1 │
└────────┴─────┘
>>> lf.group_by("a").len(name="n").collect() # doctest: +IGNORE_RESULT
shape: (2, 2)
┌────────┬─────┐
│ a ┆ n │
│ --- ┆ --- │
│ str ┆ u32 │
╞════════╪═════╡
│ Apple ┆ 2 │
│ Orange ┆ 1 │
└────────┴─────┘
"""
return self.agg(F.len())
len_expr = F.len()
if name is not None:
len_expr = len_expr.alias(name)
return self.agg(len_expr)

@deprecate_renamed_function("len", version="0.20.5")
def count(self) -> LazyFrame:
Expand All @@ -374,7 +385,7 @@ def count(self) -> LazyFrame:
--------
>>> lf = pl.LazyFrame(
... {
... "a": ["apple", "apple", "orange"],
... "a": ["Apple", "Apple", "Orange"],
... "b": [1, None, 2],
... }
... )
Expand All @@ -385,8 +396,8 @@ def count(self) -> LazyFrame:
│ --- ┆ --- │
│ str ┆ u32 │
╞════════╪═══════╡
│ apple ┆ 2 │
│ orange ┆ 1 │
│ Apple ┆ 2 │
│ Orange ┆ 1 │
└────────┴───────┘
"""
return self.agg(F.len().alias("count"))
Expand Down
13 changes: 10 additions & 3 deletions py-polars/tests/unit/dataframe/test_df.py
Expand Up @@ -1647,14 +1647,21 @@ def __repr__(self) -> str:
assert sys.getrefcount(foos[0]) == base_count


def test_group_by_order_dispatch() -> None:
@pytest.mark.parametrize("name", [None, "n", ""])
def test_group_by_order_dispatch(name: str | None) -> None:
df = pl.DataFrame({"x": list("bab"), "y": range(3)})
lf = df.lazy()

result = df.group_by("x", maintain_order=True).len()
result = df.group_by("x", maintain_order=True).len(name=name)
lazy_result = lf.group_by("x").len(name=name).sort(by="x", descending=True)

name = "len" if name is None else name
expected = pl.DataFrame(
{"x": ["b", "a"], "len": [2, 1]}, schema_overrides={"len": pl.UInt32}
data={"x": ["b", "a"], name: [2, 1]},
schema_overrides={name: pl.UInt32},
)
assert_frame_equal(result, expected)
assert_frame_equal(lazy_result.collect(), expected)

result = df.group_by("x", maintain_order=True).all()
expected = pl.DataFrame({"x": ["b", "a"], "y": [[0, 2], [1]]})
Expand Down

0 comments on commit 5febd51

Please sign in to comment.