value_counts add sorted argument (#4094)
ritchie46 committed Jul 20, 2022
1 parent 547229d commit 46c40a8
Showing 10 changed files with 69 additions and 52 deletions.
11 changes: 0 additions & 11 deletions polars/polars-core/src/series/ops/unique.rs
(The eager value_counts implementation moves from here to the new file polars/polars-ops/src/series/ops/various.rs below, gaining the sorted argument on the way.)
@@ -26,17 +26,6 @@ where
 }
 
 impl Series {
-    /// Create a [`DataFrame`] with the unique `values` of this [`Series`] and a column `"counts"`
-    /// with dtype [`IdxType`]
-    pub fn value_counts(&self, multithreaded: bool) -> Result<DataFrame> {
-        let groups = self.group_tuples(multithreaded, false);
-        let values = unsafe { self.agg_first(&groups) };
-        let counts = groups.group_lengths("counts");
-        let cols = vec![values.into_series(), counts.into_series()];
-        let df = DataFrame::new_no_checks(cols);
-        df.sort(&["counts"], true)
-    }
-
     /// Returns a count of the unique values in the order of appearance.
     #[cfg(feature = "unique_counts")]
    #[cfg_attr(docsrs, doc(cfg(feature = "unique_counts")))]
4 changes: 2 additions & 2 deletions polars/polars-lazy/src/dsl/mod.rs
@@ -2110,10 +2110,10 @@ impl Expr {
     #[cfg_attr(docsrs, doc(cfg(feature = "dtype-struct")))]
     /// Count all unique values and create a struct mapping value to count
     /// Note that it is better to turn multithreaded off in the aggregation context
-    pub fn value_counts(self, multithreaded: bool) -> Self {
+    pub fn value_counts(self, multithreaded: bool, sorted: bool) -> Self {
         self.apply(
             move |s| {
-                s.value_counts(multithreaded)
+                s.value_counts(multithreaded, sorted)
                     .map(|df| df.into_struct(s.name()).into_series())
             },
             GetOutput::map_field(|fld| {
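Because the expression version packs each value/count pair into a single struct column, the new flag is easiest to see from Python. A minimal usage sketch, assuming a polars build that includes this commit; it mirrors the doctest updated in expr.py below:

```python
import polars as pl

df = pl.DataFrame({"id": ["a", "b", "b", "c", "c", "c"]})

# As an expression, value_counts yields one column of structs
# {"id": <value>, "counts": <count>}; sort=True orders the structs
# by count, most frequent first.
out = df.select(pl.col("id").value_counts(sort=True))
print(out.to_series().to_list())
# [{'id': 'c', 'counts': 3}, {'id': 'b', 'counts': 2}, {'id': 'a', 'counts': 1}]
```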
28 changes: 0 additions & 28 deletions polars/polars-ops/src/series/_trait.rs
@@ -1,6 +1,4 @@
 use super::*;
-#[cfg(feature = "hash")]
-use polars_core::export::ahash;
 use std::ops::Deref;
 
 #[cfg(feature = "to_dummies")]
@@ -16,28 +14,13 @@ macro_rules! invalid_operation {
     };
 }
 
-#[cfg(feature = "hash")]
-macro_rules! invalid_operation_panic {
-    ($s:expr) => {
-        panic!(
-            "this operation is not implemented/valid for this dtype: {:?}",
-            $s.dtype()
-        )
-    };
-}
-
 pub trait SeriesOps {
     fn dtype(&self) -> &DataType;
 
     #[cfg(feature = "to_dummies")]
     fn to_dummies(&self) -> Result<DataFrame> {
         invalid_operation!(self)
     }
-
-    #[cfg(feature = "hash")]
-    fn hash(&self, _build_hasher: ahash::RandomState) -> UInt64Chunked {
-        invalid_operation_panic!(self)
-    }
 }
 
 impl SeriesOps for Series {
@@ -48,15 +31,4 @@ impl SeriesOps for Series {
     fn to_dummies(&self) -> Result<DataFrame> {
         self.to_ops().to_dummies()
     }
-
-    #[cfg(feature = "hash")]
-    fn hash(&self, build_hasher: ahash::RandomState) -> UInt64Chunked {
-        match self.dtype() {
-            DataType::List(_) => {
-                let ca = self.list().unwrap();
-                crate::chunked_array::hash::hash(ca, build_hasher)
-            }
-            _ => UInt64Chunked::from_vec(self.name(), self.0.vec_hash(build_hasher)),
-        }
-    }
 }
3 changes: 3 additions & 0 deletions polars/polars-ops/src/series/ops/mod.rs
@@ -1,8 +1,11 @@
 #[cfg(feature = "log")]
 mod log;
+mod various;
 
 #[cfg(feature = "log")]
 pub use log::*;
 use polars_core::prelude::*;
+pub use various::*;
+
 pub trait SeriesSealed {
     fn as_series(&self) -> &Series;
39 changes: 39 additions & 0 deletions polars/polars-ops/src/series/ops/various.rs
@@ -0,0 +1,39 @@
+use crate::series::ops::SeriesSealed;
+use polars_core::prelude::*;
+
+#[cfg(feature = "hash")]
+use polars_core::export::ahash;
+
+pub trait SeriesMethods: SeriesSealed {
+    /// Create a [`DataFrame`] with the unique `values` of this [`Series`] and a column `"counts"`
+    /// with dtype [`IdxType`]
+    fn value_counts(&self, multithreaded: bool, sorted: bool) -> Result<DataFrame> {
+        let s = self.as_series().to_physical_repr();
+        let s = s.as_ref();
+        // we need to sort here as well in case of `maintain_order` because duplicates behavior is undefined
+        let groups = s.group_tuples(multithreaded, sorted);
+        let values = unsafe { s.agg_first(&groups) };
+        let counts = groups.group_lengths("counts");
+        let cols = vec![values.into_series(), counts.into_series()];
+        let df = DataFrame::new_no_checks(cols);
+        if sorted {
+            df.sort(&["counts"], true)
+        } else {
+            Ok(df)
+        }
+    }
+
+    #[cfg(feature = "hash")]
+    fn hash(&self, build_hasher: ahash::RandomState) -> UInt64Chunked {
+        let s = self.as_series().to_physical_repr();
+        match s.dtype() {
+            DataType::List(_) => {
+                let ca = s.list().unwrap();
+                crate::chunked_array::hash::hash(ca, build_hasher)
+            }
+            _ => UInt64Chunked::from_vec(s.name(), s.0.vec_hash(build_hasher)),
+        }
+    }
+}
+
+impl SeriesMethods for Series {}
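Note how the sorted flag does double duty here: it is forwarded to group_tuples as the maintain-order flag and it gates the final sort on "counts", so groups tied on their count come back in first-appearance order. A small sketch of that behavior through the eager Python API (again assuming a build with this commit); the updated test at the end of this commit asserts exactly this ordering:

```python
import polars as pl

s = pl.Series("id", ["a", "b", "b", "c", "c", "c", "d", "d"])

# sort=True sorts by "counts", descending; "b" and "d" are tied at 2
# and come back in order of first appearance ("b" before "d").
print(s.value_counts(sort=True))
# rows: ("c", 3), ("b", 2), ("d", 2), ("a", 1)
```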
8 changes: 5 additions & 3 deletions py-polars/polars/internals/expr.py
@@ -4738,14 +4738,16 @@ def extend_constant(self, value: int | float | str | bool | None, n: int) -> Expr:
         """
         return wrap_expr(self._pyexpr.extend_constant(value, n))
 
-    def value_counts(self, multithreaded: bool = False) -> Expr:
+    def value_counts(self, multithreaded: bool = False, sort: bool = False) -> Expr:
         """
         Count all unique values and create a struct mapping value to count
 
         Parameters
         ----------
         multithreaded:
             Better to turn this off in the aggregation context, as it can lead to contention.
+        sort:
+            Ensure the output is sorted from most values to least.
 
         Returns
         -------
@@ -4760,7 +4762,7 @@ def value_counts(self, multithreaded: bool = False) -> Expr:
         ... )
         >>> df.select(
         ...     [
-        ...         pl.col("id").value_counts(),
+        ...         pl.col("id").value_counts(sort=True),
         ...     ]
         ... )
         shape: (3, 1)
@@ -4777,7 +4779,7 @@ def value_counts(self, multithreaded: bool = False) -> Expr:
         └───────────┘
         """
-        return wrap_expr(self._pyexpr.value_counts(multithreaded))
+        return wrap_expr(self._pyexpr.value_counts(multithreaded, sort))
 
     def unique_counts(self) -> Expr:
         """
9 changes: 7 additions & 2 deletions py-polars/polars/internals/series.py
@@ -910,10 +910,15 @@ def to_dummies(self) -> pli.DataFrame:
         """
         return pli.wrap_df(self._s.to_dummies())
 
-    def value_counts(self) -> pli.DataFrame:
+    def value_counts(self, sort: bool = False) -> pli.DataFrame:
         """
         Count the unique values in a Series.
 
+        Parameters
+        ----------
+        sort:
+            Ensure the output is sorted from most values to least.
+
         Examples
         --------
         >>> s = pl.Series("a", [1, 2, 2, 3])
@@ -932,7 +937,7 @@ def value_counts(self) -> pli.DataFrame:
         └─────┴────────┘
         """
-        return pli.wrap_df(self._s.value_counts())
+        return pli.wrap_df(self._s.value_counts(sort))
 
     def unique_counts(self) -> Series:
         """
7 changes: 5 additions & 2 deletions py-polars/src/lazy/dsl.rs
@@ -195,8 +195,11 @@ impl PyExpr {
     pub fn count(&self) -> PyExpr {
         self.clone().inner.count().into()
     }
-    pub fn value_counts(&self, multithreaded: bool) -> PyExpr {
-        self.inner.clone().value_counts(multithreaded).into()
+    pub fn value_counts(&self, multithreaded: bool, sorted: bool) -> PyExpr {
+        self.inner
+            .clone()
+            .value_counts(multithreaded, sorted)
+            .into()
     }
     pub fn unique_counts(&self) -> PyExpr {
         self.inner.clone().unique_counts().into()
7 changes: 5 additions & 2 deletions py-polars/src/series.rs
@@ -559,8 +559,11 @@ impl PySeries {
         Ok(unique.into())
     }
 
-    pub fn value_counts(&self) -> PyResult<PyDataFrame> {
-        let df = self.series.value_counts(true).map_err(PyPolarsErr::from)?;
+    pub fn value_counts(&self, sorted: bool) -> PyResult<PyDataFrame> {
+        let df = self
+            .series
+            .value_counts(true, sorted)
+            .map_err(PyPolarsErr::from)?;
         Ok(df.into())
     }
5 changes: 3 additions & 2 deletions py-polars/tests/test_struct.py
@@ -116,14 +116,14 @@ def test_struct_function_expansion() -> None:
 def test_value_counts_expr() -> None:
     df = pl.DataFrame(
         {
-            "id": ["a", "b", "b", "c", "c", "c"],
+            "id": ["a", "b", "b", "c", "c", "c", "d", "d"],
         }
     )
 
     out = (
         df.select(
             [
-                pl.col("id").value_counts(),
+                pl.col("id").value_counts(sort=True),
             ]
         )
         .to_series()
@@ -132,6 +132,7 @@ def test_value_counts_expr() -> None:
     assert out == [
         {"id": "c", "counts": 3},
         {"id": "b", "counts": 2},
+        {"id": "d", "counts": 2},
         {"id": "a", "counts": 1},
     ]
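The docstring's advice to leave multithreaded off in an aggregation context deserves a quick illustration. The frame and column names below are invented for the example, assuming a build with this commit:

```python
import polars as pl

df = pl.DataFrame(
    {
        "group": ["x", "x", "x", "y", "y"],
        "id": ["a", "b", "b", "c", "c"],
    }
)

# Inside a groupby-agg, polars already parallelizes across groups, so
# the default multithreaded=False avoids contention; sort=True still
# orders each group's value/count structs from most frequent to least.
out = df.groupby("group", maintain_order=True).agg(
    [pl.col("id").value_counts(sort=True)]
)
print(out)
```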
