Skip to content

Commit

Permalink
improve unique performance (#4070)
Browse files: browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Jul 18, 2022
1 parent 083745a commit b062e40
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 24 deletions.
12 changes: 6 additions & 6 deletions polars/polars-core/src/chunked_array/ops/bit_repr.rs
Original file line number | Diff line number | Diff line change
Expand Up @@ -121,7 +121,7 @@ impl Reinterpret for Int64Chunked {
}

impl UInt64Chunked {
pub(crate) fn reinterpret_float(&self) -> Series {
pub(crate) fn reinterpret_float(&self) -> Float64Chunked {
let chunks = self
.downcast_iter()
.map(|array| {
Expand All @@ -145,11 +145,11 @@ impl UInt64Chunked {
)) as ArrayRef
})
.collect::<Vec<_>>();
Float64Chunked::from_chunks(self.name(), chunks).into()
Float64Chunked::from_chunks(self.name(), chunks)
}
}
impl UInt32Chunked {
pub(crate) fn reinterpret_float(&self) -> Series {
pub(crate) fn reinterpret_float(&self) -> Float32Chunked {
let chunks = self
.downcast_iter()
.map(|array| {
Expand All @@ -173,7 +173,7 @@ impl UInt32Chunked {
)) as ArrayRef
})
.collect::<Vec<_>>();
Float32Chunked::from_chunks(self.name(), chunks).into()
Float32Chunked::from_chunks(self.name(), chunks)
}
}

Expand All @@ -187,7 +187,7 @@ impl Float32Chunked {
let s = self.bit_repr_small().into_series();
let out = f(&s);
let out = out.u32().unwrap();
out.reinterpret_float()
out.reinterpret_float().into()
}
}
impl Float64Chunked {
Expand All @@ -198,6 +198,6 @@ impl Float64Chunked {
let s = self.bit_repr_large().into_series();
let out = f(&s);
let out = out.u64().unwrap();
out.reinterpret_float()
out.reinterpret_float().into()
}
}
59 changes: 41 additions & 18 deletions polars/polars-core/src/chunked_array/ops/unique/mod.rs
Original file line number | Diff line number | Diff line change
Expand Up @@ -4,10 +4,12 @@ pub(crate) mod rank;
#[cfg(feature = "object")]
use crate::chunked_array::object::ObjectType;
use crate::datatypes::PlHashSet;
use crate::frame::groupby::hashing::HASHMAP_INIT_SIZE;
use crate::frame::groupby::GroupsProxy;
#[cfg(feature = "mode")]
use crate::frame::groupby::IntoGroupsProxy;
use crate::prelude::*;
use crate::series::IsSorted;
use std::hash::Hash;

fn finish_is_unique_helper(
Expand Down Expand Up @@ -171,12 +173,20 @@ macro_rules! arg_unique_ca {
impl<T> ChunkUnique<T> for ChunkedArray<T>
where
T: PolarsIntegerType,
T::Native: Hash + Eq,
T::Native: Hash + Eq + Ord,
ChunkedArray<T>: ChunkOps + IntoSeries,
{
// Diff view of the integer `unique`: the first two body lines are the removed
// hash-set implementation; the `match` that follows is the added implementation.
fn unique(&self) -> Result<Self> {
// (removed) Collect all values into a set via `fill_set` and rebuild the array from it.
let set = fill_set(self.into_iter());
Ok(Self::from_iter_options(self.name(), set.iter().copied()))
// (added) Dispatch on the sortedness reported by `is_sorted2()`.
match self.is_sorted2() {
IsSorted::Ascending | IsSorted::Descending => {
// Mask of positions where the array is not equal to itself shifted by one;
// `filter` then keeps the positions selected by that mask.
// NOTE(review): how the slot introduced by `shift(1)` and any null values are
// treated by `not_equal` / `filter` is not visible here — confirm.
let mask = self.not_equal(&self.shift(1));
self.filter(&mask)
}
IsSorted::Not => {
// Sort first, then call `unique` again on the sorted copy.
// NOTE(review): termination presumably relies on the sorted result reporting
// Ascending/Descending from `is_sorted2()` — confirm, including for empty input.
let sorted = self.sort(false);
sorted.unique()
}
}
}

fn arg_unique(&self) -> Result<IdxCa> {
Expand Down Expand Up @@ -207,11 +217,30 @@ where

impl ChunkUnique<Utf8Type> for Utf8Chunked {
// Diff view of the Utf8 `unique`: the first statement pair (`fill_set` + `from_iter_options`)
// is the removed implementation; the `match` on `null_count()` is the added implementation.
fn unique(&self) -> Result<Self> {
// (removed) Collect all optional values into a set via `fill_set` and rebuild from it.
let set = fill_set(self.into_iter());
Ok(Utf8Chunked::from_iter_options(
self.name(),
set.iter().copied(),
))
// (added) Branch on whether the array contains nulls.
match self.null_count() {
0 => {
// No nulls: fill the set from each chunk's `values_iter()` and build the
// result with `from_iter_values`.
// The initial capacity is the smaller of HASHMAP_INIT_SIZE and the array length.
let mut set =
PlHashSet::with_capacity(std::cmp::min(HASHMAP_INIT_SIZE, self.len()));
for arr in self.downcast_iter() {
set.extend(arr.values_iter())
}
Ok(Utf8Chunked::from_iter_values(
self.name(),
set.iter().copied(),
))
}
_ => {
// At least one null: fill the set from each chunk's `iter()` and build the
// result with `from_iter_options`.
// Same capacity rule as the no-null branch.
let mut set =
PlHashSet::with_capacity(std::cmp::min(HASHMAP_INIT_SIZE, self.len()));
for arr in self.downcast_iter() {
set.extend(arr.iter())
}
Ok(Utf8Chunked::from_iter_options(
self.name(),
set.iter().copied(),
))
}
}
}

fn arg_unique(&self) -> Result<IdxCa> {
Expand Down Expand Up @@ -269,11 +298,8 @@ impl ChunkUnique<BooleanType> for BooleanChunked {
impl ChunkUnique<Float32Type> for Float32Chunked {
// Diff view of the Float32 `unique`: the `fill_set` / `f32::from_bits` chain is the removed
// implementation; the final two statements are the added implementation.
fn unique(&self) -> Result<ChunkedArray<Float32Type>> {
// Both versions operate on the array returned by `bit_repr_small()` rather than on the floats.
let ca = self.bit_repr_small();
// (removed) Fill a set from the bit representation and map each entry back with `f32::from_bits`.
let set = fill_set(ca.into_iter());
Ok(set
.into_iter()
.map(|opt_v| opt_v.map(f32::from_bits))
.collect())
// (added) Delegate to `unique` of the bit-representation array, then convert the result
// back with `reinterpret_float()`.
let ca = ca.unique()?;
Ok(ca.reinterpret_float())
}

fn arg_unique(&self) -> Result<IdxCa> {
Expand All @@ -291,11 +317,8 @@ impl ChunkUnique<Float32Type> for Float32Chunked {
impl ChunkUnique<Float64Type> for Float64Chunked {
// Diff view of the Float64 `unique`: the `fill_set` / `f64::from_bits` chain is the removed
// implementation; the final two statements are the added implementation.
fn unique(&self) -> Result<ChunkedArray<Float64Type>> {
// Both versions operate on the array returned by `bit_repr_large()` rather than on the floats.
let ca = self.bit_repr_large();
// (removed) Fill a set from the bit representation and map each entry back with `f64::from_bits`.
let set = fill_set(ca.into_iter());
Ok(set
.into_iter()
.map(|opt_v| opt_v.map(f64::from_bits))
.collect())
// (added) Delegate to `unique` of the bit-representation array, then convert the result
// back with `reinterpret_float()`.
let ca = ca.unique()?;
Ok(ca.reinterpret_float())
}

fn arg_unique(&self) -> Result<IdxCa> {
Expand Down

0 comments on commit b062e40

Please sign in to comment.