Skip to content

Commit

Permalink
refactor value_counts (#2781)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Feb 26, 2022
1 parent c6c443e commit 1e73b5f
Show file tree
Hide file tree
Showing 14 changed files with 28 additions and 75 deletions.
7 changes: 0 additions & 7 deletions polars/polars-core/src/chunked_array/ops/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -484,13 +484,6 @@ pub trait ChunkUnique<T> {
))
}

/// Count the unique values.
///
/// This is the fallback used when an implementor does not override the method:
/// it performs no work and always returns `PolarsError::InvalidOperation`.
fn value_counts(&self) -> Result<DataFrame> {
    // The message names this operation (it previously said "is_duplicated",
    // which pointed callers at the wrong method).
    Err(PolarsError::InvalidOperation(
        "value_counts is not implemented for this dtype".into(),
    ))
}

/// The most occurring value(s). Can return multiple Values
#[cfg(feature = "mode")]
#[cfg_attr(docsrs, doc(cfg(feature = "mode")))]
Expand Down
32 changes: 0 additions & 32 deletions polars/polars-core/src/chunked_array/ops/unique/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ use crate::chunked_array::object::ObjectType;
use crate::datatypes::PlHashSet;
use crate::frame::groupby::{GroupsProxy, IntoGroupsProxy};
use crate::prelude::*;
use crate::utils::NoNull;
use rayon::prelude::*;
use std::hash::Hash;

Expand Down Expand Up @@ -167,22 +166,6 @@ macro_rules! arg_unique_ca {
}};
}

// Expands to an expression that builds a value-counts `DataFrame` for `$self`:
// one column of values taken from `$self` (one per group) and a "counts" column
// holding the number of members of each group, sorted on "counts".
macro_rules! impl_value_counts {
($self:expr) => {{
// Group the rows of `$self`; each entry is consumed below as a pair whose
// first field is a row index and whose second field is the group's members.
let group_tuples = $self.group_tuples(true, false).into_idx();
// Pick one value per group using the index stored in the first tuple field.
// NOTE(review): unchecked take — presumably sound because the indices were
// produced by `group_tuples` on this same array; confirm.
let values =
unsafe { $self.take_unchecked(group_tuples.iter().map(|t| t.0 as usize).into()) };
// One count per group: the length of the group's member list.
let mut counts: NoNull<IdxCa> = group_tuples
.into_iter()
.map(|(_, groups)| groups.len() as IdxSize)
.collect();
counts.rename("counts");
let cols = vec![values.into_series(), counts.into_inner().into_series()];
// Built without validation; both columns have one entry per group.
let df = DataFrame::new_no_checks(cols);
// NOTE(review): the `true` flag presumably requests descending order — confirm.
df.sort(&["counts"], true)
}};
}

impl<T> ChunkUnique<T> for ChunkedArray<T>
where
T: PolarsIntegerType,
Expand All @@ -206,11 +189,6 @@ where
is_unique_duplicated!(self, true)
}

// TODO! implement on series. Not worth the compile times here.
/// Count the unique values by expanding `impl_value_counts!` on `self`.
fn value_counts(&self) -> Result<DataFrame> {
impl_value_counts!(self)
}

fn n_unique(&self) -> Result<usize> {
if self.null_count() > 0 {
Ok(fill_set(self.into_iter().flatten()).len() + 1)
Expand Down Expand Up @@ -245,10 +223,6 @@ impl ChunkUnique<Utf8Type> for Utf8Chunked {
is_unique_duplicated!(self, true)
}

/// Count the unique values by expanding `impl_value_counts!` on `self`.
fn value_counts(&self) -> Result<DataFrame> {
impl_value_counts!(self)
}

fn n_unique(&self) -> Result<usize> {
if self.null_count() > 0 {
Ok(fill_set(self.into_iter().flatten()).len() + 1)
Expand Down Expand Up @@ -396,9 +370,6 @@ impl ChunkUnique<Float32Type> for Float32Chunked {
/// Compute the duplicate mask on the array returned by `bit_repr_small()`
/// instead of on the float values directly.
fn is_duplicated(&self) -> Result<BooleanChunked> {
self.bit_repr_small().is_duplicated()
}
/// Count the unique values by expanding `impl_value_counts!` on `self`.
fn value_counts(&self) -> Result<DataFrame> {
impl_value_counts!(self)
}
}

impl ChunkUnique<Float64Type> for Float64Chunked {
Expand All @@ -421,9 +392,6 @@ impl ChunkUnique<Float64Type> for Float64Chunked {
/// Compute the duplicate mask on the array returned by `bit_repr_large()`
/// instead of on the float values directly.
fn is_duplicated(&self) -> Result<BooleanChunked> {
self.bit_repr_large().is_duplicated()
}
/// Count the unique values by expanding `impl_value_counts!` on `self`.
fn value_counts(&self) -> Result<DataFrame> {
impl_value_counts!(self)
}
}

#[cfg(feature = "is_first")]
Expand Down
13 changes: 13 additions & 0 deletions polars/polars-core/src/frame/groupby/proxy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,19 @@ impl GroupsProxy {
}
}

/// Collect the size of every group into an [`IdxCa`] called `name`.
///
/// For index-based groups the size is the length of each group's member list;
/// for slice-based groups the second element of each entry is used as the size.
/// The result contains one value per group, in group order.
pub fn group_lengths(&self, name: &str) -> IdxCa {
    let lengths: NoNull<IdxCa> = match self {
        GroupsProxy::Slice(slices) => slices
            .iter()
            .map(|slice| slice[1])
            .collect_trusted(),
        GroupsProxy::Idx(idx) => idx
            .iter()
            .map(|(_, members)| members.len() as IdxSize)
            .collect_trusted(),
    };
    // Unwrap the no-null helper and label the resulting column.
    let mut out = lengths.into_inner();
    out.rename(name);
    out
}

#[cfg(feature = "private")]
pub fn par_iter(&self) -> GroupsProxyParIter {
GroupsProxyParIter::new(self)
Expand Down
4 changes: 0 additions & 4 deletions polars/polars-core/src/series/implementations/categorical.rs
Original file line number Diff line number Diff line change
Expand Up @@ -303,10 +303,6 @@ impl SeriesTrait for SeriesWrap<CategoricalChunked> {
self.0.cast(data_type)
}

/// Delegate to `value_counts` on the wrapped array (`self.0`).
fn value_counts(&self) -> Result<DataFrame> {
self.0.value_counts()
}

/// Return the value at `index` as an `AnyValue`, via the wrapped array's
/// `get_any_value`.
fn get(&self, index: usize) -> AnyValue {
self.0.get_any_value(index)
}
Expand Down
4 changes: 0 additions & 4 deletions polars/polars-core/src/series/implementations/dates_time.rs
Original file line number Diff line number Diff line change
Expand Up @@ -391,10 +391,6 @@ macro_rules! impl_dyn_series {
self.0.to_dummies()
}

/// Delegate to `value_counts` on the wrapped array (`self.0`).
fn value_counts(&self) -> Result<DataFrame> {
self.0.value_counts()
}

/// Return the value at `index` as an `AnyValue`, via the wrapped array's
/// `get_any_value`.
fn get(&self, index: usize) -> AnyValue {
self.0.get_any_value(index)
}
Expand Down
4 changes: 0 additions & 4 deletions polars/polars-core/src/series/implementations/datetime.rs
Original file line number Diff line number Diff line change
Expand Up @@ -397,10 +397,6 @@ impl SeriesTrait for SeriesWrap<DatetimeChunked> {
self.0.to_dummies()
}

/// Delegate to `value_counts` on the wrapped array (`self.0`).
fn value_counts(&self) -> Result<DataFrame> {
self.0.value_counts()
}

/// Return the value at `index` as an `AnyValue`, via the wrapped array's
/// `get_any_value`.
fn get(&self, index: usize) -> AnyValue {
self.0.get_any_value(index)
}
Expand Down
4 changes: 0 additions & 4 deletions polars/polars-core/src/series/implementations/duration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -388,10 +388,6 @@ impl SeriesTrait for SeriesWrap<DurationChunked> {
self.0.to_dummies()
}

/// Delegate to `value_counts` on the wrapped array (`self.0`).
fn value_counts(&self) -> Result<DataFrame> {
self.0.value_counts()
}

/// Return the value at `index` as an `AnyValue`, via the wrapped array's
/// `get_any_value`.
fn get(&self, index: usize) -> AnyValue {
self.0.get_any_value(index)
}
Expand Down
4 changes: 0 additions & 4 deletions polars/polars-core/src/series/implementations/floats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -413,10 +413,6 @@ macro_rules! impl_dyn_series {
ToDummies::to_dummies(&self.0)
}

/// Delegate to the `ChunkUnique::value_counts` implementation of the wrapped
/// array (`self.0`); the trait is named explicitly in the call.
fn value_counts(&self) -> Result<DataFrame> {
ChunkUnique::value_counts(&self.0)
}

/// Return the value at `index` as an `AnyValue`, via the wrapped array's
/// `get_any_value`.
fn get(&self, index: usize) -> AnyValue {
self.0.get_any_value(index)
}
Expand Down
4 changes: 0 additions & 4 deletions polars/polars-core/src/series/implementations/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -580,10 +580,6 @@ macro_rules! impl_dyn_series {
ToDummies::to_dummies(&self.0)
}

/// Delegate to the `ChunkUnique::value_counts` implementation of the wrapped
/// array (`self.0`); the trait is named explicitly in the call.
fn value_counts(&self) -> Result<DataFrame> {
ChunkUnique::value_counts(&self.0)
}

/// Return the value at `index` as an `AnyValue`, via the wrapped array's
/// `get_any_value`.
fn get(&self, index: usize) -> AnyValue {
self.0.get_any_value(index)
}
Expand Down
4 changes: 0 additions & 4 deletions polars/polars-core/src/series/implementations/object.rs
Original file line number Diff line number Diff line change
Expand Up @@ -170,10 +170,6 @@ where
))
}

/// Delegate to the `ChunkUnique::value_counts` implementation of the wrapped
/// array (`self.0`); the trait is named explicitly in the call.
fn value_counts(&self) -> Result<DataFrame> {
ChunkUnique::value_counts(&self.0)
}

/// Return the value at `index` as an `AnyValue`, via
/// `ObjectChunked::get_any_value` on the wrapped array.
fn get(&self, index: usize) -> AnyValue {
ObjectChunked::get_any_value(&self.0, index)
}
Expand Down
4 changes: 0 additions & 4 deletions polars/polars-core/src/series/implementations/utf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -252,10 +252,6 @@ impl SeriesTrait for SeriesWrap<Utf8Chunked> {
ToDummies::to_dummies(&self.0)
}

/// Delegate to the `ChunkUnique::value_counts` implementation of the wrapped
/// array (`self.0`); the trait is named explicitly in the call.
fn value_counts(&self) -> Result<DataFrame> {
ChunkUnique::value_counts(&self.0)
}

/// Return the value at `index` as an `AnyValue`, via the wrapped array's
/// `get_any_value`.
fn get(&self, index: usize) -> AnyValue {
self.0.get_any_value(index)
}
Expand Down
1 change: 1 addition & 0 deletions polars/polars-core/src/series/ops/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ pub mod pct_change;
#[cfg(feature = "round_series")]
mod round;
mod to_list;
mod unique;

#[derive(Copy, Clone)]
pub enum NullBehavior {
Expand Down
14 changes: 14 additions & 0 deletions polars/polars-core/src/series/ops/unique.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
use crate::prelude::*;

impl Series {
    /// Build a [`DataFrame`] holding the unique `values` of this [`Series`] next to a
    /// `"counts"` column of dtype [`IdxType`], sorted on `"counts"`.
    ///
    /// The values column is produced by taking the first element of every group; the
    /// counts column holds the matching group lengths.
    ///
    /// NOTE(review): the frame is assembled without validation, so a [`Series`] that is
    /// itself named `"counts"` would presumably yield two columns with the same name —
    /// confirm how `sort` resolves that.
    pub fn value_counts(&self) -> Result<DataFrame> {
        let groups = self.group_tuples(true, false);
        let columns = vec![
            self.agg_first(&groups).into_series(),
            groups.group_lengths("counts").into_series(),
        ];
        // NOTE(review): the `true` flag presumably requests descending order — confirm.
        DataFrame::new_no_checks(columns).sort(&["counts"], true)
    }
}
4 changes: 0 additions & 4 deletions polars/polars-core/src/series/series_trait.rs
Original file line number Diff line number Diff line change
Expand Up @@ -607,10 +607,6 @@ pub trait SeriesTrait:
invalid_operation_panic!(self)
}

/// Default implementation for series types that do not override this method:
/// invokes the `invalid_operation_panic!` macro with `self`.
fn value_counts(&self) -> Result<DataFrame> {
invalid_operation_panic!(self)
}

/// Get a single value by index. Don't use this operation for loops as a runtime cast is
/// needed for every iteration.
fn get(&self, _index: usize) -> AnyValue {
Expand Down

0 comments on commit 1e73b5f

Please sign in to comment.