value_counts add sorted argument (#4094)
ritchie46 committed Jul 20, 2022
1 parent 547229d commit 46c40a8
Showing 10 changed files with 69 additions and 52 deletions.
11 changes: 0 additions & 11 deletions polars/polars-core/src/series/ops/unique.rs
(The eager value_counts implementation moves from here to the new file polars/polars-ops/src/series/ops/various.rs below, gaining the sorted argument on the way.)
@@ -26,17 +26,6 @@ where
 }
 
 impl Series {
-    /// Create a [`DataFrame`] with the unique `values` of this [`Series`] and a column `"counts"`
-    /// with dtype [`IdxType`]
-    pub fn value_counts(&self, multithreaded: bool) -> Result<DataFrame> {
-        let groups = self.group_tuples(multithreaded, false);
-        let values = unsafe { self.agg_first(&groups) };
-        let counts = groups.group_lengths("counts");
-        let cols = vec![values.into_series(), counts.into_series()];
-        let df = DataFrame::new_no_checks(cols);
-        df.sort(&["counts"], true)
-    }
-
     /// Returns a count of the unique values in the order of appearance.
     #[cfg(feature = "unique_counts")]
    #[cfg_attr(docsrs, doc(cfg(feature = "unique_counts")))]
4 changes: 2 additions & 2 deletions polars/polars-lazy/src/dsl/mod.rs
@@ -2110,10 +2110,10 @@ impl Expr {
     #[cfg_attr(docsrs, doc(cfg(feature = "dtype-struct")))]
     /// Count all unique values and create a struct mapping value to count
     /// Note that it is better to turn multithreaded off in the aggregation context
-    pub fn value_counts(self, multithreaded: bool) -> Self {
+    pub fn value_counts(self, multithreaded: bool, sorted: bool) -> Self {
         self.apply(
             move |s| {
-                s.value_counts(multithreaded)
+                s.value_counts(multithreaded, sorted)
                     .map(|df| df.into_struct(s.name()).into_series())
             },
             GetOutput::map_field(|fld| {
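Because the expression version packs each value/count pair into a single struct column, the new flag is easiest to see from Python. A minimal usage sketch, assuming a polars build that includes this commit; it mirrors the doctest updated in expr.py below:

```python
import polars as pl

df = pl.DataFrame({"id": ["a", "b", "b", "c", "c", "c"]})

# As an expression, value_counts yields one column of structs
# {"id": <value>, "counts": <count>}; sort=True orders the structs
# by count, most frequent first.
out = df.select(pl.col("id").value_counts(sort=True))
print(out.to_series().to_list())
# [{'id': 'c', 'counts': 3}, {'id': 'b', 'counts': 2}, {'id': 'a', 'counts': 1}]
```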
28 changes: 0 additions & 28 deletions polars/polars-ops/src/series/_trait.rs
@@ -1,6 +1,4 @@
 use super::*;
-#[cfg(feature = "hash")]
-use polars_core::export::ahash;
 use std::ops::Deref;
 
 #[cfg(feature = "to_dummies")]
@@ -16,28 +14,13 @@ macro_rules! invalid_operation {
     };
 }
 
-#[cfg(feature = "hash")]
-macro_rules! invalid_operation_panic {
-    ($s:expr) => {
-        panic!(
-            "this operation is not implemented/valid for this dtype: {:?}",
-            $s.dtype()
-        )
-    };
-}
-
 pub trait SeriesOps {
     fn dtype(&self) -> &DataType;
 
     #[cfg(feature = "to_dummies")]
     fn to_dummies(&self) -> Result<DataFrame> {
         invalid_operation!(self)
     }
-
-    #[cfg(feature = "hash")]
-    fn hash(&self, _build_hasher: ahash::RandomState) -> UInt64Chunked {
-        invalid_operation_panic!(self)
-    }
 }
 
 impl SeriesOps for Series {
@@ -48,15 +31,4 @@ impl SeriesOps for Series {
     fn to_dummies(&self) -> Result<DataFrame> {
         self.to_ops().to_dummies()
     }
-
-    #[cfg(feature = "hash")]
-    fn hash(&self, build_hasher: ahash::RandomState) -> UInt64Chunked {
-        match self.dtype() {
-            DataType::List(_) => {
-                let ca = self.list().unwrap();
-                crate::chunked_array::hash::hash(ca, build_hasher)
-            }
-            _ => UInt64Chunked::from_vec(self.name(), self.0.vec_hash(build_hasher)),
-        }
-    }
 }
3 changes: 3 additions & 0 deletions polars/polars-ops/src/series/ops/mod.rs
@@ -1,8 +1,11 @@
 #[cfg(feature = "log")]
 mod log;
+mod various;
 
 #[cfg(feature = "log")]
 pub use log::*;
 use polars_core::prelude::*;
+pub use various::*;
+
 pub trait SeriesSealed {
     fn as_series(&self) -> &Series;
39 changes: 39 additions & 0 deletions polars/polars-ops/src/series/ops/various.rs
@@ -0,0 +1,39 @@
+use crate::series::ops::SeriesSealed;
+use polars_core::prelude::*;
+
+#[cfg(feature = "hash")]
+use polars_core::export::ahash;
+
+pub trait SeriesMethods: SeriesSealed {
+    /// Create a [`DataFrame`] with the unique `values` of this [`Series`] and a column `"counts"`
+    /// with dtype [`IdxType`]
+    fn value_counts(&self, multithreaded: bool, sorted: bool) -> Result<DataFrame> {
+        let s = self.as_series().to_physical_repr();
+        let s = s.as_ref();
+        // we need to sort here as well in case of `maintain_order` because duplicates behavior is undefined
+        let groups = s.group_tuples(multithreaded, sorted);
+        let values = unsafe { s.agg_first(&groups) };
+        let counts = groups.group_lengths("counts");
+        let cols = vec![values.into_series(), counts.into_series()];
+        let df = DataFrame::new_no_checks(cols);
+        if sorted {
+            df.sort(&["counts"], true)
+        } else {
+            Ok(df)
+        }
+    }
+
+    #[cfg(feature = "hash")]
+    fn hash(&self, build_hasher: ahash::RandomState) -> UInt64Chunked {
+        let s = self.as_series().to_physical_repr();
+        match s.dtype() {
+            DataType::List(_) => {
+                let ca = s.list().unwrap();
+                crate::chunked_array::hash::hash(ca, build_hasher)
+            }
+            _ => UInt64Chunked::from_vec(s.name(), s.0.vec_hash(build_hasher)),
+        }
+    }
+}
+
+impl SeriesMethods for Series {}
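Note how the sorted flag does double duty here: it is forwarded to group_tuples as the maintain-order flag and it gates the final sort on "counts", so groups tied on their count come back in first-appearance order. A small sketch of that behavior through the eager Python API (again assuming a build with this commit); the updated test at the end of this commit asserts exactly this ordering:

```python
import polars as pl

s = pl.Series("id", ["a", "b", "b", "c", "c", "c", "d", "d"])

# sort=True sorts by "counts", descending; "b" and "d" are tied at 2
# and come back in order of first appearance ("b" before "d").
print(s.value_counts(sort=True))
# rows: ("c", 3), ("b", 2), ("d", 2), ("a", 1)
```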
8 changes: 5 additions & 3 deletions py-polars/polars/internals/expr.py
@@ -4738,14 +4738,16 @@ def extend_constant(self, value: int | float | str | bool | None, n: int) -> Expr:
         """
         return wrap_expr(self._pyexpr.extend_constant(value, n))
 
-    def value_counts(self, multithreaded: bool = False) -> Expr:
+    def value_counts(self, multithreaded: bool = False, sort: bool = False) -> Expr:
         """
         Count all unique values and create a struct mapping value to count
 
         Parameters
         ----------
         multithreaded:
             Better to turn this off in the aggregation context, as it can lead to contention.
+        sort:
+            Ensure the output is sorted from most values to least.
 
         Returns
         -------
@@ -4760,7 +4762,7 @@ def value_counts(self, multithreaded: bool = False) -> Expr:
         ... )
         >>> df.select(
         ...     [
-        ...         pl.col("id").value_counts(),
+        ...         pl.col("id").value_counts(sort=True),
         ...     ]
         ... )
         shape: (3, 1)
@@ -4777,7 +4779,7 @@ def value_counts(self, multithreaded: bool = False) -> Expr:
         └───────────┘
         """
-        return wrap_expr(self._pyexpr.value_counts(multithreaded))
+        return wrap_expr(self._pyexpr.value_counts(multithreaded, sort))
 
     def unique_counts(self) -> Expr:
         """
9 changes: 7 additions & 2 deletions py-polars/polars/internals/series.py
@@ -910,10 +910,15 @@ def to_dummies(self) -> pli.DataFrame:
         """
         return pli.wrap_df(self._s.to_dummies())
 
-    def value_counts(self) -> pli.DataFrame:
+    def value_counts(self, sort: bool = False) -> pli.DataFrame:
         """
         Count the unique values in a Series.
 
+        Parameters
+        ----------
+        sort:
+            Ensure the output is sorted from most values to least.
+
         Examples
         --------
         >>> s = pl.Series("a", [1, 2, 2, 3])
@@ -932,7 +937,7 @@ def value_counts(self) -> pli.DataFrame:
         └─────┴────────┘
         """
-        return pli.wrap_df(self._s.value_counts())
+        return pli.wrap_df(self._s.value_counts(sort))
 
     def unique_counts(self) -> Series:
         """
7 changes: 5 additions & 2 deletions py-polars/src/lazy/dsl.rs
@@ -195,8 +195,11 @@ impl PyExpr {
     pub fn count(&self) -> PyExpr {
         self.clone().inner.count().into()
     }
-    pub fn value_counts(&self, multithreaded: bool) -> PyExpr {
-        self.inner.clone().value_counts(multithreaded).into()
+    pub fn value_counts(&self, multithreaded: bool, sorted: bool) -> PyExpr {
+        self.inner
+            .clone()
+            .value_counts(multithreaded, sorted)
+            .into()
     }
     pub fn unique_counts(&self) -> PyExpr {
         self.inner.clone().unique_counts().into()
7 changes: 5 additions & 2 deletions py-polars/src/series.rs
@@ -559,8 +559,11 @@ impl PySeries {
         Ok(unique.into())
     }
 
-    pub fn value_counts(&self) -> PyResult<PyDataFrame> {
-        let df = self.series.value_counts(true).map_err(PyPolarsErr::from)?;
+    pub fn value_counts(&self, sorted: bool) -> PyResult<PyDataFrame> {
+        let df = self
+            .series
+            .value_counts(true, sorted)
+            .map_err(PyPolarsErr::from)?;
         Ok(df.into())
     }
5 changes: 3 additions & 2 deletions py-polars/tests/test_struct.py
@@ -116,14 +116,14 @@ def test_struct_function_expansion() -> None:
 def test_value_counts_expr() -> None:
     df = pl.DataFrame(
         {
-            "id": ["a", "b", "b", "c", "c", "c"],
+            "id": ["a", "b", "b", "c", "c", "c", "d", "d"],
         }
     )
 
     out = (
         df.select(
             [
-                pl.col("id").value_counts(),
+                pl.col("id").value_counts(sort=True),
             ]
         )
         .to_series()
@@ -132,6 +132,7 @@ def test_value_counts_expr() -> None:
     assert out == [
         {"id": "c", "counts": 3},
         {"id": "b", "counts": 2},
+        {"id": "d", "counts": 2},
         {"id": "a", "counts": 1},
     ]
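The docstring's advice to leave multithreaded off in an aggregation context deserves a quick illustration. The frame and column names below are invented for the example, assuming a build with this commit:

```python
import polars as pl

df = pl.DataFrame(
    {
        "group": ["x", "x", "x", "y", "y"],
        "id": ["a", "b", "b", "c", "c"],
    }
)

# Inside a groupby-agg, polars already parallelizes across groups, so
# the default multithreaded=False avoids contention; sort=True still
# orders each group's value/count structs from most frequent to least.
out = df.groupby("group", maintain_order=True).agg(
    [pl.col("id").value_counts(sort=True)]
)
print(out)
```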
