feat(python): Add name parameter to GroupBy.len method (#15235)

Co-authored-by: Stijn de Gooijer <stijndegooijer@gmail.com>
pola-rs · Mar 23, 2024 · 5febd51
1 parent 07538dd
commit 5febd51
Show file tree

Hide file tree

Showing 4 changed files with 71 additions and 37 deletions.
diff --git a/py-polars/polars/_utils/construction/dataframe.py b/py-polars/polars/_utils/construction/dataframe.py
@@ -196,13 +196,16 @@ def _parse_schema_overrides(
     # determine column names from schema
     if isinstance(schema, Mapping):
         column_names: list[str] = list(schema)
-        # coerce schema to list[str | tuple[str, PolarsDataType | PythonDataType | None]
         schema = list(schema.items())
     else:
-        column_names = [
-            (col or f"column_{i}") if isinstance(col, str) else col[0]
-            for i, col in enumerate(schema)
-        ]
+        column_names = []
+        for i, col in enumerate(schema):
+            if isinstance(col, str):
+                unnamed = not col and col not in schema_overrides
+                col = f"column_{i}" if unnamed else col
+            else:
+                col = col[0]
+            column_names.append(col)
 
     # determine column dtypes from schema and lookup_names
     lookup: dict[str, str] | None = (

diff --git a/py-polars/polars/dataframe/group_by.py b/py-polars/polars/dataframe/group_by.py
@@ -448,30 +448,43 @@ def all(self) -> DataFrame:
         """
         return self.agg(F.all())
 
-    def len(self) -> DataFrame:
+    def len(self, name: str | None = None) -> DataFrame:
         """
         Return the number of rows in each group.
 
+        Parameters
+        ----------
+        name
+            Assign a name to the resulting column; if unset, defaults to "len".
+
         Examples
         --------
-        >>> df = pl.DataFrame(
-        ...     {
-        ...         "a": ["apple", "apple", "orange"],
-        ...         "b": [1, None, 2],
-        ...     }
-        ... )
-        >>> df.group_by("a").len()  # doctest: +SKIP
+        >>> df = pl.DataFrame({"a": ["Apple", "Apple", "Orange"], "b": [1, None, 2]})
+        >>> df.group_by("a").len()  # doctest: +IGNORE_RESULT
         shape: (2, 2)
         ┌────────┬─────┐
         │ a      ┆ len │
         │ ---    ┆ --- │
         │ str    ┆ u32 │
         ╞════════╪═════╡
-        │ apple  ┆ 2   │
-        │ orange ┆ 1   │
+        │ Apple  ┆ 2   │
+        │ Orange ┆ 1   │
+        └────────┴─────┘
+        >>> df.group_by("a").len(name="n")  # doctest: +IGNORE_RESULT
+        shape: (2, 2)
+        ┌────────┬─────┐
+        │ a      ┆ n   │
+        │ ---    ┆ --- │
+        │ str    ┆ u32 │
+        ╞════════╪═════╡
+        │ Apple  ┆ 2   │
+        │ Orange ┆ 1   │
         └────────┴─────┘
         """
-        return self.agg(F.len())
+        len_expr = F.len()
+        if name is not None:
+            len_expr = len_expr.alias(name)
+        return self.agg(len_expr)
 
     @deprecate_renamed_function("len", version="0.20.5")
     def count(self) -> DataFrame:
@@ -487,7 +500,7 @@ def count(self) -> DataFrame:
         --------
         >>> df = pl.DataFrame(
         ...     {
-        ...         "a": ["apple", "apple", "orange"],
+        ...         "a": ["Apple", "Apple", "Orange"],
         ...         "b": [1, None, 2],
         ...     }
         ... )
@@ -498,8 +511,8 @@ def count(self) -> DataFrame:
         │ ---    ┆ ---   │
         │ str    ┆ u32   │
         ╞════════╪═══════╡
-        │ apple  ┆ 2     │
-        │ orange ┆ 1     │
+        │ Apple  ┆ 2     │
+        │ Orange ┆ 1     │
         └────────┴───────┘
         """
         return self.agg(F.len().alias("count"))

diff --git a/py-polars/polars/lazyframe/group_by.py b/py-polars/polars/lazyframe/group_by.py
@@ -333,32 +333,43 @@ def all(self) -> LazyFrame:
         """
         return self.agg(F.all())
 
-    def len(self) -> LazyFrame:
+    def len(self, name: str | None = None) -> LazyFrame:
         """
         Return the number of rows in each group.
 
-        Rows containing null values count towards the total.
+        Parameters
+        ----------
+        name
+            Assign a name to the resulting column; if unset, defaults to "len".
 
         Examples
         --------
-        >>> lf = pl.LazyFrame(
-        ...     {
-        ...         "a": ["apple", "apple", "orange"],
-        ...         "b": [1, None, 2],
-        ...     }
-        ... )
-        >>> lf.group_by("a").len().collect()  # doctest: +SKIP
+        >>> lf = pl.LazyFrame({"a": ["Apple", "Apple", "Orange"], "b": [1, None, 2]})
+        >>> lf.group_by("a").len().collect()  # doctest: +IGNORE_RESULT
         shape: (2, 2)
         ┌────────┬─────┐
         │ a      ┆ len │
         │ ---    ┆ --- │
         │ str    ┆ u32 │
         ╞════════╪═════╡
-        │ apple  ┆ 2   │
-        │ orange ┆ 1   │
+        │ Apple  ┆ 2   │
+        │ Orange ┆ 1   │
+        └────────┴─────┘
+        >>> lf.group_by("a").len(name="n").collect()  # doctest: +IGNORE_RESULT
+        shape: (2, 2)
+        ┌────────┬─────┐
+        │ a      ┆ n   │
+        │ ---    ┆ --- │
+        │ str    ┆ u32 │
+        ╞════════╪═════╡
+        │ Apple  ┆ 2   │
+        │ Orange ┆ 1   │
         └────────┴─────┘
         """
-        return self.agg(F.len())
+        len_expr = F.len()
+        if name is not None:
+            len_expr = len_expr.alias(name)
+        return self.agg(len_expr)
 
     @deprecate_renamed_function("len", version="0.20.5")
     def count(self) -> LazyFrame:
@@ -374,7 +385,7 @@ def count(self) -> LazyFrame:
         --------
         >>> lf = pl.LazyFrame(
         ...     {
-        ...         "a": ["apple", "apple", "orange"],
+        ...         "a": ["Apple", "Apple", "Orange"],
         ...         "b": [1, None, 2],
         ...     }
         ... )
@@ -385,8 +396,8 @@ def count(self) -> LazyFrame:
         │ ---    ┆ ---   │
         │ str    ┆ u32   │
         ╞════════╪═══════╡
-        │ apple  ┆ 2     │
-        │ orange ┆ 1     │
+        │ Apple  ┆ 2     │
+        │ Orange ┆ 1     │
         └────────┴───────┘
         """
         return self.agg(F.len().alias("count"))

diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py
@@ -1647,14 +1647,21 @@ def __repr__(self) -> str:
     assert sys.getrefcount(foos[0]) == base_count
 
 
-def test_group_by_order_dispatch() -> None:
+@pytest.mark.parametrize("name", [None, "n", ""])
+def test_group_by_order_dispatch(name: str | None) -> None:
     df = pl.DataFrame({"x": list("bab"), "y": range(3)})
+    lf = df.lazy()
 
-    result = df.group_by("x", maintain_order=True).len()
+    result = df.group_by("x", maintain_order=True).len(name=name)
+    lazy_result = lf.group_by("x").len(name=name).sort(by="x", descending=True)
+
+    name = "len" if name is None else name
     expected = pl.DataFrame(
-        {"x": ["b", "a"], "len": [2, 1]}, schema_overrides={"len": pl.UInt32}
+        data={"x": ["b", "a"], name: [2, 1]},
+        schema_overrides={name: pl.UInt32},
     )
     assert_frame_equal(result, expected)
+    assert_frame_equal(lazy_result.collect(), expected)
 
     result = df.group_by("x", maintain_order=True).all()
     expected = pl.DataFrame({"x": ["b", "a"], "y": [[0, 2], [1]]})