Skip to content

Commit

Permalink
add unique_counts
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Mar 23, 2022
1 parent 99bb515 commit ad6d121
Show file tree
Hide file tree
Showing 14 changed files with 149 additions and 10 deletions.
1 change: 1 addition & 0 deletions polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ ewma = ["polars-core/ewma", "polars-lazy/ewma"]
dot_diagram = ["polars-lazy/dot_diagram"]
dataframe_arithmetic = ["polars-core/dataframe_arithmetic"]
product = ["polars-core/product"]
unique_counts = ["polars-core/unique_counts", "polars-lazy/unique_counts"]

series_from_anyvalue = ["polars-core/series_from_anyvalue"]

Expand Down
2 changes: 2 additions & 0 deletions polars/polars-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ abs = []
ewma = ["polars-utils"]
dataframe_arithmetic = []
product = []
unique_counts = []

dynamic_groupby = ["dtype-datetime", "dtype-date"]

Expand Down Expand Up @@ -133,6 +134,7 @@ docs-selection = [
"dataframe_arithmetic",
"string_encoding",
"product",
"unique_counts",
]

[dependencies]
Expand Down
51 changes: 49 additions & 2 deletions polars/polars-core/src/series/ops/unique.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,61 @@
#[cfg(feature = "unique_counts")]
use crate::frame::groupby::hashing::HASHMAP_INIT_SIZE;
use crate::prelude::*;
#[cfg(feature = "unique_counts")]
use crate::utils::NoNull;
#[cfg(feature = "unique_counts")]
use std::hash::Hash;

/// Count occurrences of each distinct item, preserving first-appearance order.
///
/// Returns one count per unique value; the i-th count belongs to the i-th
/// distinct item encountered in `items`.
#[cfg(feature = "unique_counts")]
#[cfg_attr(docsrs, doc(cfg(feature = "unique_counts")))]
fn unique_counts<I, J>(items: I) -> IdxCa
where
    I: Iterator<Item = J>,
    J: Hash + Eq,
{
    // Insertion-ordered map: keys keep first-seen order, values accumulate counts.
    let mut counts = PlIndexMap::<J, IdxSize>::with_capacity_and_hasher(
        HASHMAP_INIT_SIZE,
        Default::default(),
    );
    for item in items {
        // Single lookup per item: insert 0 on first sight, then bump.
        *counts.entry(item).or_insert(0) += 1;
    }
    // The values iterator yields in insertion order; no nulls are possible here.
    let ca: NoNull<IdxCa> = counts.into_values().collect();
    ca.into_inner()
}

impl Series {
/// Create a [`DataFrame`] with the unique `values` of this [`Series`] and a column `"counts"`
/// with dtype [`IdxType`]
pub fn value_counts(&self) -> Result<DataFrame> {
let groups = self.group_tuples(true, false);
pub fn value_counts(&self, multithreaded: bool) -> Result<DataFrame> {
let groups = self.group_tuples(multithreaded, false);
let values = self.agg_first(&groups);
let counts = groups.group_lengths("counts");
let cols = vec![values.into_series(), counts.into_series()];
let df = DataFrame::new_no_checks(cols);
df.sort(&["counts"], true)
}

/// Returns a count of the unique values in the order of appearance.
#[cfg(feature = "unique_counts")]
#[cfg_attr(docsrs, doc(cfg(feature = "unique_counts")))]
pub fn unique_counts(&self) -> IdxCa {
    match self.dtype() {
        // Numeric types are counted via their physical bit representation so a
        // single generic hashing implementation covers all widths.
        dt if dt.is_numeric() => {
            if self.bit_repr_is_large() {
                unique_counts(self.bit_repr_large().into_iter())
            } else {
                unique_counts(self.bit_repr_small().into_iter())
            }
        }
        DataType::Utf8 => unique_counts(self.utf8().unwrap().into_iter()),
        dt => panic!("'unique_counts' not implemented for {} data types", dt),
    }
}
}
1 change: 1 addition & 0 deletions polars/polars-lazy/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ random = ["polars-core/random"]
dynamic_groupby = ["polars-core/dynamic_groupby"]
ewma = ["polars-core/ewma"]
dot_diagram = []
unique_counts = ["polars-core/unique_counts"]

# no guarantees whatsoever
private = ["polars-time/private"]
Expand Down
21 changes: 18 additions & 3 deletions polars/polars-lazy/src/dsl/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2073,10 +2073,11 @@ impl Expr {
#[cfg(feature = "dtype-struct")]
#[cfg_attr(docsrs, doc(cfg(feature = "dtype-struct")))]
/// Count all unique values and create a struct mapping value to count
pub fn value_counts(self) -> Self {
/// Note that it is better to turn multithreaded off in the aggregation context
pub fn value_counts(self, multithreaded: bool) -> Self {
self.apply(
|s| {
s.value_counts()
move |s| {
s.value_counts(multithreaded)
.map(|df| df.into_struct(s.name()).into_series())
},
GetOutput::map_field(|fld| {
Expand All @@ -2086,6 +2087,20 @@ impl Expr {
)
}),
)
.with_fmt("value_counts")
}

#[cfg(feature = "unique_counts")]
#[cfg_attr(docsrs, doc(cfg(feature = "unique_counts")))]
/// Returns a count of the unique values in the order of appearance.
///
/// This method differs from [`Expr::value_counts`] in that it does not return the
/// values, only the counts, and might be faster.
pub fn unique_counts(self) -> Self {
    self.apply(
        |s| Ok(s.unique_counts().into_series()),
        // Counts are always the index type, regardless of the input dtype.
        GetOutput::from_type(IDX_DTYPE),
    )
    .with_fmt("unique_counts")
}

#[cfg(feature = "strings")]
Expand Down
1 change: 1 addition & 0 deletions polars/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@
//! - `product` - Compute the product of a Series.
//! - `diff` - `diff` operation.
//! - `pct_change` - Compute change percentages.
//! - `unique_counts` - Count unique values in expressions.
//! * `DataFrame` pretty printing (Choose one or none, but not both):
//! - `fmt` - Activate DataFrame formatting
//!
Expand Down
1 change: 1 addition & 0 deletions py-polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ features = [
"product",
"ndarray",
"series_from_anyvalue",
"unique_counts",
]

# [patch.crates-io]
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/expression.rst
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ Computations
Expr.log
Expr.log10
Expr.exp
Expr.unique_counts
Expr.value_counts

Manipulation/ selection
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ Descriptive stats
:toctree: api/

Series.describe
Series.unique_counts
Series.value_counts
Series.chunk_lengths
Series.n_chunks
Expand Down
45 changes: 43 additions & 2 deletions py-polars/polars/internals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -2786,10 +2786,15 @@ def extend_constant(
"""
return wrap_expr(self._pyexpr.extend_constant(value, n))

def value_counts(self) -> "Expr":
def value_counts(self, multithreaded: bool = False) -> "Expr":
"""
Count all unique values and create a struct mapping value to count
Parameters
----------
multithreaded:
Better to turn this off in the aggregation context, as it can lead to contention.
Returns
-------
Dtype Struct
Expand Down Expand Up @@ -2821,7 +2826,43 @@ def value_counts(self) -> "Expr":
└─────────────────────────────────────┘
"""
return wrap_expr(self._pyexpr.value_counts())
return wrap_expr(self._pyexpr.value_counts(multithreaded))

def unique_counts(self) -> "Expr":
    """
    Returns a count of the unique values in the order of appearance.

    This method differs from `value_counts` in that it does not return the
    values, only the counts, and might be faster.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "id": ["a", "b", "b", "c", "c", "c"],
    ...     }
    ... )
    >>> df.select(
    ...     [
    ...         pl.col("id").unique_counts(),
    ...     ]
    ... )
    shape: (3, 1)
    ┌─────┐
    │ id │
    │ --- │
    │ u32 │
    ╞═════╡
    │ 1 │
    ├╌╌╌╌╌┤
    │ 2 │
    ├╌╌╌╌╌┤
    │ 3 │
    └─────┘
    """
    return wrap_expr(self._pyexpr.unique_counts())

# Below are the namespaces defined. Keep these at the end of the definition of Expr, as to not confuse mypy with
# the type annotation `str` with the namespace "str"
Expand Down
19 changes: 19 additions & 0 deletions py-polars/polars/internals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -901,6 +901,25 @@ def value_counts(self) -> "pli.DataFrame":
"""
return pli.wrap_df(self._s.value_counts())

def unique_counts(self) -> "Series":
    """
    Returns a count of the unique values in the order of appearance.

    Examples
    --------
    >>> s = pl.Series("id", ["a", "b", "b", "c", "c", "c"])
    >>> s.unique_counts()
    shape: (3,)
    Series: 'id' [u32]
    [
        1
        2
        3
    ]
    """
    # Delegate to the expression engine so Series and Expr share one implementation.
    return pli.select(pli.lit(self).unique_counts()).to_series()

@property
def name(self) -> str:
"""
Expand Down
7 changes: 5 additions & 2 deletions py-polars/src/lazy/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -175,8 +175,11 @@ impl PyExpr {
pub fn count(&self) -> PyExpr {
self.clone().inner.count().into()
}
pub fn value_counts(&self) -> PyExpr {
self.inner.clone().value_counts().into()
pub fn value_counts(&self, multithreaded: bool) -> PyExpr {
self.inner.clone().value_counts(multithreaded).into()
}
/// Python binding: forwards to `Expr::unique_counts` on a clone of the inner expression.
pub fn unique_counts(&self) -> PyExpr {
    self.inner.clone().unique_counts().into()
}
pub fn cast(&self, data_type: Wrap<DataType>, strict: bool) -> PyExpr {
let dt = data_type.0;
Expand Down
2 changes: 1 addition & 1 deletion py-polars/src/series.rs
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,7 @@ impl PySeries {
}

pub fn value_counts(&self) -> PyResult<PyDataFrame> {
let df = self.series.value_counts().map_err(PyPolarsErr::from)?;
let df = self.series.value_counts(true).map_err(PyPolarsErr::from)?;
Ok(df.into())
}

Expand Down
6 changes: 6 additions & 0 deletions py-polars/tests/test_exprs.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,3 +156,9 @@ def test_unique_and_drop_stability() -> None:
# meaning that the a.unique was executed twice, which is an unstable algorithm
df = pl.DataFrame({"a": [1, None, 1, None]})
assert df.select(pl.col("a").unique().drop_nulls()).to_series()[0] == 1


def test_unique_counts() -> None:
    # Counts follow first-appearance order: "a" once, "b" twice, "c" three times,
    # and the result dtype is the index type (u32).
    s = pl.Series("id", ["a", "b", "b", "c", "c", "c"])
    expected = pl.Series("id", [1, 2, 3], dtype=pl.UInt32)
    verify_series_and_expr_api(s, expected, "unique_counts")

0 comments on commit ad6d121

Please sign in to comment.