Skip to content

Commit

Permalink
add Series::mode
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Jul 23, 2021
1 parent 70fc44f commit a5e96ce
Show file tree
Hide file tree
Showing 15 changed files with 113 additions and 2 deletions.
4 changes: 3 additions & 1 deletion polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ concat_str = ["polars-core/concat_str", "polars-lazy/concat_str"]
row_hash = ["polars-core/row_hash"]
reinterpret = ["polars-core/reinterpret", "polars-core/dtype-u64"]
decompress = ["polars-io/decompress"]
mode = ["polars-core/mode", "polars-lazy/mode"]

# don't use this
private = []
Expand Down Expand Up @@ -143,7 +144,8 @@ docs-selection = [
"asof_join",
"cross_join",
"concat_str",
"decompress"
"decompress",
"mode"
]

[dependencies]
Expand Down
4 changes: 3 additions & 1 deletion polars/polars-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ dot_product = []
concat_str = []
row_hash = []
reinterpret = []
mode = []


# opt-in datatypes for Series
Expand Down Expand Up @@ -99,7 +100,8 @@ docs-selection = [
"cross_join",
"dot_product",
"concat_str",
"row_hash"
"row_hash",
"mode"
]

[dependencies]
Expand Down
9 changes: 9 additions & 0 deletions polars/polars-core/src/chunked_array/ops/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -618,6 +618,15 @@ pub trait ChunkUnique<T> {
"is_duplicated is not implemented for this dtype".into(),
))
}

/// Return the value(s) that occur most often; the result can hold several values.
///
/// This default implementation always fails with `PolarsError::InvalidOperation`.
#[cfg(feature = "mode")]
#[cfg_attr(docsrs, doc(cfg(feature = "mode")))]
fn mode(&self) -> Result<ChunkedArray<T>> {
    let reason = "mode is not implemented for this dtype".into();
    Err(PolarsError::InvalidOperation(reason))
}
}

pub trait ToDummies<T>: ChunkUnique<T> {
Expand Down
49 changes: 49 additions & 0 deletions polars/polars-core/src/chunked_array/ops/unique.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,39 @@ where
unique
}

/// Compute the mode of `ca`: the value(s) that occur most often.
///
/// Every group whose occurrence count equals the highest count contributes one value,
/// so the result holds more than one value when several values tie for the maximum.
/// An empty array is returned as a clone of the input.
#[cfg(feature = "mode")]
// The indices are deliberately collected into a `Vec` first (see the TODO below).
#[allow(clippy::needless_collect)]
fn mode<T>(ca: &ChunkedArray<T>) -> ChunkedArray<T>
where
    ChunkedArray<T>: IntoGroupTuples + ChunkTake,
{
    if ca.is_empty() {
        return ca.clone();
    }
    let groups = ca.group_tuples(true);

    // The length of a group's `.1` is used as the number of occurrences of its value.
    // The maximum has to be looked up explicitly: reading it from the first group of
    // an ascending sort yields the *minimum* and silently drops tied modes.
    let max_occur = groups.iter().map(|g| g.1.len()).max().unwrap_or(0);

    // collect until we don't take with trusted len anymore
    // TODO! take directly from iter, but first remove standard trusted-length collect.
    let idx = groups
        .iter()
        .filter(|g| g.1.len() == max_occur)
        .map(|g| g.0)
        .collect::<Vec<_>>();
    // Safety:
    // group indices are in bounds
    unsafe { ca.take_unchecked(idx.into_iter().map(|i| i as usize).into()) }
}

macro_rules! arg_unique_ca {
($ca:expr) => {{
match $ca.null_count() {
Expand Down Expand Up @@ -193,6 +226,11 @@ where
// Size of the set that `fill_set` builds from this array's iterator.
fn n_unique(&self) -> Result<usize> {
    let distinct = fill_set(self.into_iter());
    Ok(distinct.len())
}

// Delegates to the free function `mode`, which cannot fail, and wraps it in `Ok`.
#[cfg(feature = "mode")]
fn mode(&self) -> Result<Self> {
    let most_frequent = mode(self);
    Ok(most_frequent)
}
}

impl ChunkUnique<Utf8Type> for Utf8Chunked {
Expand Down Expand Up @@ -225,6 +263,11 @@ impl ChunkUnique<Utf8Type> for Utf8Chunked {
// Size of the set that `fill_set` builds from this array's iterator.
fn n_unique(&self) -> Result<usize> {
    let distinct = fill_set(self.into_iter());
    Ok(distinct.len())
}

// Delegates to the free function `mode`, which cannot fail, and wraps it in `Ok`.
#[cfg(feature = "mode")]
fn mode(&self) -> Result<Self> {
    let most_frequent = mode(self);
    Ok(most_frequent)
}
}

impl ChunkUnique<CategoricalType> for CategoricalChunked {
Expand Down Expand Up @@ -257,6 +300,12 @@ impl ChunkUnique<CategoricalType> for CategoricalChunked {
// Reports the length of `categorical_map`.
// Panics (via `unwrap`) when `categorical_map` is `None`.
fn n_unique(&self) -> Result<usize> {
    let mapping = self.categorical_map.as_ref().unwrap();
    Ok(mapping.len())
}
// Computes the mode on the `UInt32Type` cast of this array, then re-attaches this
// array's `categorical_map` and casts the result back to the categorical type.
#[cfg(feature = "mode")]
fn mode(&self) -> Result<Self> {
    let physical = self.cast::<UInt32Type>()?;
    let mut modes = physical.mode()?;
    modes.categorical_map = self.categorical_map.clone();
    modes.cast()
}
}

#[cfg(feature = "dtype-u8")]
Expand Down
4 changes: 4 additions & 0 deletions polars/polars-core/src/series/implementations/dates.rs
Original file line number Diff line number Diff line change
Expand Up @@ -715,6 +715,10 @@ macro_rules! impl_dyn_series {
// Returns a reference to the wrapped inner value (`self.0`) as `&dyn Any`.
fn as_any(&self) -> &dyn Any {
&self.0
}
// Computes the mode by handing `self` and the method name `mode` to the
// `try_physical_dispatch!` macro (defined outside this view).
// NOTE(review): presumably this runs `mode` on the physical representation and
// converts the result back to this logical dtype — confirm against the macro.
#[cfg(feature = "mode")]
fn mode(&self) -> Result<Series> {
try_physical_dispatch!(self, mode,)
}
}
};
}
Expand Down
4 changes: 4 additions & 0 deletions polars/polars-core/src/series/implementations/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -866,6 +866,10 @@ macro_rules! impl_dyn_series {
// Returns a reference to the wrapped inner value (`self.0`) as `&dyn Any`.
fn as_any(&self) -> &dyn Any {
&self.0
}
// Computes the mode of the wrapped array (`self.0`) and converts the result into a
// `Series`; an error from the inner call is passed through unchanged.
#[cfg(feature = "mode")]
fn mode(&self) -> Result<Series> {
    self.0.mode().map(|modes| modes.into_series())
}
}
};
}
Expand Down
6 changes: 6 additions & 0 deletions polars/polars-core/src/series/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1053,6 +1053,12 @@ pub trait SeriesTrait:
fn is_first(&self) -> Result<BooleanChunked> {
// Default body: panics via `unimplemented!()` rather than returning an `Err`.
unimplemented!()
}

/// Compute the most occurring value(s). Can return multiple values.
///
/// # Panics
///
/// This default implementation panics via `unimplemented!()`; it never returns an
/// `Err`.
#[cfg(feature = "mode")]
#[cfg_attr(docsrs, doc(cfg(feature = "mode")))]
fn mode(&self) -> Result<Series> {
unimplemented!()
}
}

impl<'a> (dyn SeriesTrait + 'a) {
Expand Down
1 change: 1 addition & 0 deletions polars/polars-lazy/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ is_first = ["polars-core/is_first"]
cross_join = ["polars-core/cross_join"]
dot_product = ["polars-core/dot_product"]
concat_str = ["polars-core/concat_str"]
mode = ["polars-core/mode"]

# no guarantees whatsoever
private = []
Expand Down
6 changes: 6 additions & 0 deletions polars/polars-lazy/src/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1143,6 +1143,12 @@ impl Expr {
Some(Field::new(l.name(), l.data_type().clone()))
})
}

/// Compute the most occurring value(s). Can return multiple values.
///
/// Built on `map`: the closure calls `mode` on the incoming series, and `None` is
/// passed as the second argument to `map`.
#[cfg(feature = "mode")]
#[cfg_attr(docsrs, doc(cfg(feature = "mode")))]
pub fn mode(self) -> Expr {
    self.map(
        |s| {
            let modes = s.mode()?;
            Ok(modes.into_series())
        },
        None,
    )
}
}

/// Create a Column Expression based on a column name.
Expand Down
1 change: 1 addition & 0 deletions py-polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ features = [
"row_hash",
"reinterpret",
"decompress",
"mode"
]

#[patch.crates-io]
Expand Down
6 changes: 6 additions & 0 deletions py-polars/polars/eager/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1348,6 +1348,12 @@ def dot(self, other: "Series") -> Optional[float]:
"""
return self._s.dot(other._s)

def mode(self) -> "Series":
    """
    Return the most frequently occurring value(s) of this Series.

    The result may contain more than one value.
    """
    modes = self._s.mode()
    return wrap_s(modes)

def apply(
self,
func: Callable[[Any], Any],
Expand Down
6 changes: 6 additions & 0 deletions py-polars/polars/lazy/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,12 @@ def dot(self, other: "Expr") -> "Expr":
other = expr_to_lit_or_expr(other, str_to_lit=False)
return wrap_expr(self._pyexpr.dot(other._pyexpr))

def mode(self) -> "Expr":
    """
    Return the most frequently occurring value(s).

    The result may contain more than one value.
    """
    mode_expr = self._pyexpr.mode()
    return wrap_expr(mode_expr)

def cast(self, dtype: Type[Any]) -> "Expr":
"""
Cast an expression to a different data types.
Expand Down
3 changes: 3 additions & 0 deletions py-polars/src/lazy/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,9 @@ impl PyExpr {
};
self.clone().inner.map(function, Some(dt)).into()
}
// Clones the wrapped expression, applies `mode` to the clone and converts the
// result back into a `PyExpr`.
pub fn mode(&self) -> PyExpr {
    let expr = self.inner.clone();
    expr.mode().into()
}
}

impl From<dsl::Expr> for PyExpr {
Expand Down
5 changes: 5 additions & 0 deletions py-polars/src/series.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1123,6 +1123,11 @@ impl PySeries {
let s = reinterpret(&self.series, signed).map_err(PyPolarsEr::from)?;
Ok(s.into())
}

// Computes the mode of the wrapped series. An error is wrapped in `PyPolarsEr`
// and then converted into the error type of `PyResult`.
pub fn mode(&self) -> PyResult<Self> {
    match self.series.mode() {
        Ok(modes) => Ok(modes.into()),
        Err(err) => Err(PyPolarsEr::from(err).into()),
    }
}
}

macro_rules! impl_ufuncs {
Expand Down
7 changes: 7 additions & 0 deletions py-polars/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,3 +451,10 @@ def test_reinterpret():
assert s.reinterpret(signed=True).dtype == pl.Int64
df = pl.DataFrame([s])
assert df[[pl.col("a").reinterpret(signed=True)]]["a"].dtype == pl.Int64


def test_mode():
    # `1` occurs twice and `2` once, so the mode is `[1]` for both the eager
    # Series call and the lazy expression selected from a DataFrame.
    series = pl.Series("a", [1, 1, 2])
    assert series.mode() == [1]
    frame = pl.DataFrame([series])
    selected = frame[[pl.col("a").mode()]]
    assert selected["a"] == [1]

0 comments on commit a5e96ce

Please sign in to comment.