Skip to content

Commit

Permalink
feat: Allow designation of a custom name for the value_counts "count" column (#16434)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie committed May 23, 2024
1 parent 30a5534 commit d5f9c3b
Show file tree
Hide file tree
Showing 14 changed files with 105 additions and 49 deletions.
10 changes: 5 additions & 5 deletions crates/polars-ops/src/series/ops/various.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,21 @@ use crate::series::ops::SeriesSealed;
pub trait SeriesMethods: SeriesSealed {
/// Create a [`DataFrame`] with the unique `values` of this [`Series`] and a column `"counts"`
/// with dtype [`IdxType`]
fn value_counts(&self, sort: bool, parallel: bool) -> PolarsResult<DataFrame> {
fn value_counts(&self, sort: bool, parallel: bool, name: String) -> PolarsResult<DataFrame> {
let s = self.as_series();
polars_ensure!(
s.name() != "count",
Duplicate: "using `value_counts` on a column named 'count' would lead to duplicate column names"
s.name() != name,
Duplicate: "using `value_counts` on a column/series named '{}' would lead to duplicate column names; change `name` to fix", name,
);
// we need to sort here as well in case of `maintain_order` because duplicates behavior is undefined
let groups = s.group_tuples(parallel, sort)?;
let values = unsafe { s.agg_first(&groups) };
let counts = groups.group_count().with_name("count");
let counts = groups.group_count().with_name(name.as_str());
let cols = vec![values, counts.into_series()];
let df = unsafe { DataFrame::new_no_checks(cols) };
if sort {
df.sort(
["count"],
[name],
SortMultipleOptions::default()
.with_order_descending(true)
.with_multithreaded(parallel),
Expand Down
9 changes: 7 additions & 2 deletions crates/polars-plan/src/dsl/function_expr/dispatch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,13 @@ pub(super) fn replace_time_zone(
}

#[cfg(feature = "dtype-struct")]
pub(super) fn value_counts(s: &Series, sort: bool, parallel: bool) -> PolarsResult<Series> {
s.value_counts(sort, parallel)
pub(super) fn value_counts(
s: &Series,
sort: bool,
parallel: bool,
name: String,
) -> PolarsResult<Series> {
s.value_counts(sort, parallel, name)
.map(|df| df.into_struct(s.name()).into_series())
}

Expand Down
14 changes: 12 additions & 2 deletions crates/polars-plan/src/dsl/function_expr/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,7 @@ pub enum FunctionExpr {
ValueCounts {
sort: bool,
parallel: bool,
name: String,
},
#[cfg(feature = "unique_counts")]
UniqueCounts,
Expand Down Expand Up @@ -463,9 +464,14 @@ impl Hash for FunctionExpr {
#[cfg(feature = "cum_agg")]
CumMax { reverse } => reverse.hash(state),
#[cfg(feature = "dtype-struct")]
ValueCounts { sort, parallel } => {
ValueCounts {
sort,
parallel,
name,
} => {
sort.hash(state);
parallel.hash(state);
name.hash(state);
},
#[cfg(feature = "unique_counts")]
UniqueCounts => {},
Expand Down Expand Up @@ -999,7 +1005,11 @@ impl From<FunctionExpr> for SpecialEq<Arc<dyn SeriesUdf>> {
#[cfg(feature = "cum_agg")]
CumMax { reverse } => map!(cum::cum_max, reverse),
#[cfg(feature = "dtype-struct")]
ValueCounts { sort, parallel } => map!(dispatch::value_counts, sort, parallel),
ValueCounts {
sort,
parallel,
name,
} => map!(dispatch::value_counts, sort, parallel, name.clone()),
#[cfg(feature = "unique_counts")]
UniqueCounts => map!(dispatch::unique_counts),
Reverse => map!(dispatch::reverse),
Expand Down
8 changes: 6 additions & 2 deletions crates/polars-plan/src/dsl/function_expr/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,14 @@ impl FunctionExpr {
#[cfg(feature = "top_k")]
TopKBy { .. } => mapper.with_same_dtype(),
#[cfg(feature = "dtype-struct")]
ValueCounts { .. } => mapper.map_dtype(|dt| {
ValueCounts {
sort: _,
parallel: _,
name,
} => mapper.map_dtype(|dt| {
DataType::Struct(vec![
Field::new(fields[0].name().as_str(), dt.clone()),
Field::new("count", IDX_DTYPE),
Field::new(name, IDX_DTYPE),
])
}),
#[cfg(feature = "unique_counts")]
Expand Down
16 changes: 10 additions & 6 deletions crates/polars-plan/src/dsl/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1714,12 +1714,16 @@ impl Expr {
#[cfg(feature = "dtype-struct")]
/// Count all unique values and create a struct mapping value to count.
/// (Note that it is better to turn parallel off in the aggregation context).
pub fn value_counts(self, sort: bool, parallel: bool) -> Self {
self.apply_private(FunctionExpr::ValueCounts { sort, parallel })
.with_function_options(|mut opts| {
opts.pass_name_to_apply = true;
opts
})
pub fn value_counts(self, sort: bool, parallel: bool, name: String) -> Self {
self.apply_private(FunctionExpr::ValueCounts {
sort,
parallel,
name,
})
.with_function_options(|mut opts| {
opts.pass_name_to_apply = true;
opts
})
}

#[cfg(feature = "unique_counts")]
Expand Down
4 changes: 2 additions & 2 deletions docs/src/rust/user-guide/expressions/structs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
let out = ratings
.clone()
.lazy()
.select([col("Theatre").value_counts(true, true)])
.select([col("Theatre").value_counts(true, true, "count".to_string())])
.collect()?;
println!("{}", &out);
// --8<-- [end:state_value_counts]
Expand All @@ -26,7 +26,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
let out = ratings
.clone()
.lazy()
.select([col("Theatre").value_counts(true, true)])
.select([col("Theatre").value_counts(true, true, "count".to_string())])
.unnest(["Theatre"])
.collect()?;
println!("{}", &out);
Expand Down
4 changes: 2 additions & 2 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2975,7 +2975,7 @@ def write_excel(
... ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title)
Export a table containing two different types of sparklines. Use default
options for the "trend" sparkline and customised options (and positioning)
options for the "trend" sparkline and customized options (and positioning)
for the "+/-" win_loss sparkline, with non-default integer dtype formatting,
column totals, a subtle two-tone heatmap and hidden worksheet gridlines:
Expand All @@ -2995,7 +2995,7 @@ def write_excel(
... sparklines={
... # default options; just provide source cols
... "trend": ["q1", "q2", "q3", "q4"],
... # customised sparkline type, with positioning directive
... # customized sparkline type, with positioning directive
... "+/-": {
... "columns": ["q1", "q2", "q3", "q4"],
... "insert_after": "id",
Expand Down
27 changes: 22 additions & 5 deletions py-polars/polars/expr/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -10909,7 +10909,9 @@ def extend_constant(self, value: IntoExpr, n: int | IntoExprColumn) -> Self:
return self._from_pyexpr(self._pyexpr.extend_constant(value, n))

@deprecate_renamed_parameter("multithreaded", "parallel", version="0.19.0")
def value_counts(self, *, sort: bool = False, parallel: bool = False) -> Self:
def value_counts(
self, *, sort: bool = False, parallel: bool = False, name: str = "count"
) -> Self:
"""
Count the occurrences of unique values.
Expand All @@ -10924,6 +10926,8 @@ def value_counts(self, *, sort: bool = False, parallel: bool = False) -> Self:
.. note::
This option should likely not be enabled in a group by context,
as the computation is already parallelized per group.
name
Give the resulting count field a specific name; defaults to "count".
Returns
-------
Expand All @@ -10948,9 +10952,10 @@ def value_counts(self, *, sort: bool = False, parallel: bool = False) -> Self:
│ {"blue",3} │
└─────────────┘
Sort the output by count.
Sort the output by (descending) count and customize the count field name.
>>> df.select(pl.col("color").value_counts(sort=True))
>>> df = df.select(pl.col("color").value_counts(sort=True, name="n"))
>>> df
shape: (3, 1)
┌─────────────┐
│ color │
Expand All @@ -10961,8 +10966,20 @@ def value_counts(self, *, sort: bool = False, parallel: bool = False) -> Self:
│ {"red",2} │
│ {"green",1} │
└─────────────┘
"""
return self._from_pyexpr(self._pyexpr.value_counts(sort, parallel))
>>> df.unnest("color")
shape: (3, 2)
┌───────┬─────┐
│ color ┆ n │
│ --- ┆ --- │
│ str ┆ u32 │
╞═══════╪═════╡
│ blue ┆ 3 │
│ red ┆ 2 │
│ green ┆ 1 │
└───────┴─────┘
"""
return self._from_pyexpr(self._pyexpr.value_counts(sort, parallel, name))

def unique_counts(self) -> Self:
"""
Expand Down
30 changes: 17 additions & 13 deletions py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2762,7 +2762,9 @@ def hist(
else:
return out.struct.unnest()

def value_counts(self, *, sort: bool = False, parallel: bool = False) -> DataFrame:
def value_counts(
self, *, sort: bool = False, parallel: bool = False, name: str = "count"
) -> DataFrame:
"""
Count the occurrences of unique values.
Expand All @@ -2777,6 +2779,8 @@ def value_counts(self, *, sort: bool = False, parallel: bool = False) -> DataFra
.. note::
This option should likely not be enabled in a group by context,
as the computation is already parallelized per group.
name
Give the resulting count column a specific name; defaults to "count".
Returns
-------
Expand All @@ -2798,22 +2802,22 @@ def value_counts(self, *, sort: bool = False, parallel: bool = False) -> DataFra
│ blue ┆ 3 │
└───────┴───────┘
Sort the output by count.
Sort the output by count and customize the count column name.
>>> s.value_counts(sort=True)
>>> s.value_counts(sort=True, name="n")
shape: (3, 2)
┌───────┬───────
│ color ┆ count
│ --- ┆ ---
│ str ┆ u32
╞═══════╪═══════
│ blue ┆ 3
│ red ┆ 2
│ green ┆ 1
└───────┴───────
┌───────┬─────┐
│ color ┆ n
│ --- ┆ --- │
│ str ┆ u32 │
╞═══════╪═════╡
│ blue ┆ 3 │
│ red ┆ 2 │
│ green ┆ 1 │
└───────┴─────┘
"""
return pl.DataFrame._from_pydf(
self._s.value_counts(sort=sort, parallel=parallel)
self._s.value_counts(sort=sort, parallel=parallel, name=name)
)

def unique_counts(self) -> Series:
Expand Down
4 changes: 2 additions & 2 deletions py-polars/src/expr/general.rs
Original file line number Diff line number Diff line change
Expand Up @@ -250,8 +250,8 @@ impl PyExpr {
fn len(&self) -> Self {
self.inner.clone().len().into()
}
fn value_counts(&self, sort: bool, parallel: bool) -> Self {
self.inner.clone().value_counts(sort, parallel).into()
fn value_counts(&self, sort: bool, parallel: bool, name: String) -> Self {
self.inner.clone().value_counts(sort, parallel, name).into()
}
fn unique_counts(&self) -> Self {
self.inner.clone().unique_counts().into()
Expand Down
1 change: 1 addition & 0 deletions py-polars/src/lazyframe/visitor/expr_nodes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -957,6 +957,7 @@ pub(crate) fn into_py(py: Python<'_>, expr: &AExpr) -> PyResult<PyObject> {
FunctionExpr::ValueCounts {
sort: _,
parallel: _,
name: _,
} => return Err(PyNotImplementedError::new_err("value counts")),
FunctionExpr::UniqueCounts => {
return Err(PyNotImplementedError::new_err("unique counts"))
Expand Down
4 changes: 2 additions & 2 deletions py-polars/src/series/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -732,10 +732,10 @@ impl PySeries {
self.series.tail(Some(n)).into()
}

fn value_counts(&self, sort: bool, parallel: bool) -> PyResult<PyDataFrame> {
fn value_counts(&self, sort: bool, parallel: bool, name: String) -> PyResult<PyDataFrame> {
let out = self
.series
.value_counts(sort, parallel)
.value_counts(sort, parallel, name)
.map_err(PyPolarsErr::from)?;
Ok(out.into())
}
Expand Down
4 changes: 2 additions & 2 deletions py-polars/tests/unit/io/test_spreadsheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -526,7 +526,7 @@ def test_read_excel_all_sheets_with_sheet_name(path_xlsx: Path, engine: str) ->
"column_totals": True,
"float_precision": 0,
},
# slightly customised formatting, with some formulas
# slightly customized formatting, with some formulas
{
"position": (0, 0),
"table_style": {
Expand Down Expand Up @@ -555,7 +555,7 @@ def test_read_excel_all_sheets_with_sheet_name(path_xlsx: Path, engine: str) ->
"column_totals": True,
"row_totals": True,
},
# heavily customised formatting/definition
# heavily customized formatting/definition
{
"position": "A1",
"table_name": "PolarsFrameData",
Expand Down
19 changes: 15 additions & 4 deletions py-polars/tests/unit/operations/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,20 @@ def test_value_counts_expr() -> None:


def test_value_counts_duplicate_name() -> None:
s = pl.Series("count", [1])
s = pl.Series("count", [1, 0, 1])

with pytest.raises(pl.DuplicateError, match="count"):
# default name is 'count' ...
with pytest.raises(
pl.DuplicateError,
match="duplicate column names; change `name` to fix",
):
s.value_counts()

def test_count() -> None:
assert pl.Series([None, 1, None, 2, 3]).count() == 3
# ... but can customize that
assert_frame_equal(
pl.DataFrame({"count": [1, 0], "n": [2, 1]}, schema_overrides={"n": pl.UInt32}),
s.value_counts(name="n", sort=True),
)

df = pl.DataFrame({"a": [None, 1, None, 2, 3]})
assert df.select(pl.col("a").count()).item() == 3
Expand All @@ -66,3 +73,7 @@ def test_count() -> None:
"literal": [1],
"a": [3],
}


def test_count() -> None:
assert pl.Series([None, 1, None, 2, 3]).count() == 3

0 comments on commit d5f9c3b

Please sign in to comment.