Skip to content

Commit

Permalink
perf: create UniqueKernel and improve bool implementation (#17160)
Browse files Browse the repository at this point in the history
  • Loading branch information
coastalwhite committed Jun 24, 2024
1 parent 00ceed7 commit 8f987c6
Show file tree
Hide file tree
Showing 6 changed files with 181 additions and 88 deletions.
74 changes: 0 additions & 74 deletions crates/polars-compute/src/distinct_count.rs
Original file line number Diff line number Diff line change
@@ -1,74 +0,0 @@
use arrow::array::{Array, BooleanArray};

/// Kernel to calculate the number of unique (distinct) elements in an array.
pub trait DistinctCountKernel {
/// Calculate the number of unique elements in [`Self`].
///
/// A null is also considered a unique value, so it contributes to the returned count.
fn distinct_count(&self) -> usize;

/// Calculate the number of unique non-null elements in [`Self`].
///
/// Unlike [`DistinctCountKernel::distinct_count`], nulls are not counted.
fn distinct_non_null_count(&self) -> usize;
}

impl DistinctCountKernel for BooleanArray {
    /// Counts the distinct elements of the array, treating null as its own distinct value.
    ///
    /// The result is always in `0..=3`: an empty array yields `0`, and otherwise each of null,
    /// `false` and `true` that occurs contributes one.
    fn distinct_count(&self) -> usize {
        let len = self.len();
        if len == 0 {
            return 0;
        }

        let nulls = self.null_count();
        if nulls == len {
            // Only nulls are present.
            return 1;
        }

        let bits = self.values();

        if nulls == 0 {
            // Without nulls the array is uniform when the bits are all set or all unset.
            let zeros = bits.unset_bits();
            return if zeros == 0 || zeros == bits.len() {
                1
            } else {
                2
            };
        }

        // At least one null and at least one valid element: null plus one or two valid values.
        let mask = self.validity().unwrap();
        // Number of elements that are both valid and `true`.
        let valid_trues = bits.num_intersections_with(mask);
        if valid_trues == 0 || valid_trues == mask.set_bits() {
            2
        } else {
            3
        }
    }

    /// Counts the distinct non-null elements by dropping the null from `distinct_count` when the
    /// array contains one.
    #[inline]
    fn distinct_non_null_count(&self) -> usize {
        let total = self.distinct_count();
        if self.null_count() > 0 {
            total - 1
        } else {
            total
        }
    }
}

#[test]
fn test_boolean_distinct_count() {
    use arrow::bitmap::Bitmap;
    use arrow::datatypes::ArrowDataType;

    // Builds a `BooleanArray` from plain vectors and reports its distinct count.
    let distinct_count_of = |values: Vec<bool>, validity: Option<Vec<bool>>| -> usize {
        let validity: Option<Bitmap> = validity.map(|v| Bitmap::from_iter(v));
        let arr = BooleanArray::new(ArrowDataType::Boolean, Bitmap::from_iter(values), validity);
        arr.distinct_count()
    };

    // (values, validity, expected distinct count)
    let cases: Vec<(Vec<bool>, Option<Vec<bool>>, usize)> = vec![
        (vec![], None, 0),
        (vec![], Some(vec![]), 0),
        (vec![true], None, 1),
        (vec![true], Some(vec![true]), 1),
        (vec![true], Some(vec![false]), 1),
        (vec![true, false], None, 2),
        (vec![true, false, false], None, 2),
        (vec![true, false, false], Some(vec![true, true, false]), 3),
        // Copied from https://github.com/pola-rs/polars/pull/16765#discussion_r1629426159
        (
            vec![true, true, true, true, true],
            Some(vec![true, false, true, false, false]),
            2,
        ),
        (
            vec![false, true, false, true, true],
            Some(vec![true, false, true, false, false]),
            2,
        ),
        (
            vec![true, false, true, false, true, true],
            Some(vec![true, true, false, true, false, false]),
            3,
        ),
    ];

    for (values, validity, expected) in cases {
        assert_eq!(distinct_count_of(values, validity), expected);
    }
}
2 changes: 1 addition & 1 deletion crates/polars-compute/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ use arrow::types::NativeType;

pub mod arithmetic;
pub mod comparisons;
pub mod distinct_count;
pub mod filter;
pub mod float_sum;
pub mod if_then_else;
pub mod min_max;
pub mod unique;

pub mod arity;

Expand Down
142 changes: 142 additions & 0 deletions crates/polars-compute/src/unique/boolean.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
use arrow::array::{Array, BooleanArray};
use arrow::bitmap::MutableBitmap;

use super::UniqueKernel;

/// Collects the distinct elements (`None`, `Some(false)`, `Some(true)`) found in `fst` followed
/// by every array in `arrs`, in order of first appearance.
///
/// The returned array has at most three elements and reuses the data type of `fst`. A validity
/// bitmap is attached only when a null was encountered.
fn bool_unique_fold<'a>(
    fst: &'a BooleanArray,
    arrs: impl Iterator<Item = &'a BooleanArray>,
) -> BooleanArray {
    // Every possible element gets a non-zero 2-bit code, so that `0` can mean "empty slot".
    const NULL: u32 = 1;
    const FALSE: u32 = 2;
    const TRUE: u32 = 3;
    // Bits 1, 2 and 3 set (0b1110): every code has been seen.
    const ALL_SEEN: u32 = (1 << NULL) | (1 << FALSE) | (1 << TRUE);

    // `seen`: bit number `code` is set once that code has appeared.
    // `slots`: the codes in order of first appearance, packed 2 bits each from the low end.
    let mut seen = 0u32;
    let mut slots = 0u32;

    for arr in std::iter::once(fst).chain(arrs) {
        for item in arr {
            let code = match item {
                None => NULL,
                Some(false) => FALSE,
                Some(true) => TRUE,
            };
            let flag = 1 << code;

            if seen & flag == 0 {
                // The number of codes seen so far is the index of the next free slot.
                slots |= code << (seen.count_ones() * 2);
                seen |= flag;
            }

            if seen == ALL_SEEN {
                break;
            }
        }
    }

    let has_null = (seen & (1 << NULL)) != 0;

    let mut values = MutableBitmap::with_capacity(3);
    // Validity is only materialized when a null is part of the output.
    let mut validity = has_null.then(|| MutableBitmap::with_capacity(3));

    // Unpack the slots from the low end; occupied slots are contiguous, so `0` means done.
    while slots != 0 {
        let code = slots & 0b11;
        values.push(code > FALSE);
        if let Some(validity) = validity.as_mut() {
            validity.push(code > NULL);
        }
        slots >>= 2;
    }

    BooleanArray::new(
        fst.data_type().clone(),
        values.freeze(),
        validity.map(|validity| validity.freeze()),
    )
}

impl UniqueKernel for BooleanArray {
    /// Folds the unique elements of `fst` and all of `others` into a single array.
    fn unique_fold<'a>(fst: &'a Self, others: impl Iterator<Item = &'a Self>) -> Self {
        bool_unique_fold(fst, others)
    }

    /// Unique elements of this array alone, i.e. a fold with no further arrays.
    fn unique(&self) -> Self {
        Self::unique_fold(self, std::iter::empty())
    }

    /// Same computation as `unique`; sortedness is not exploited for booleans.
    fn unique_sorted(&self) -> Self {
        Self::unique_fold(self, std::iter::empty())
    }

    /// Counts the distinct elements of the array, treating null as its own distinct value.
    ///
    /// The result is always in `0..=3`: an empty array yields `0`, and otherwise each of null,
    /// `false` and `true` that occurs contributes one.
    fn n_unique(&self) -> usize {
        let len = self.len();
        if len == 0 {
            return 0;
        }

        let nulls = self.null_count();
        if nulls == len {
            // Only nulls are present.
            return 1;
        }

        let bits = self.values();

        if nulls == 0 {
            // Without nulls the array is uniform when the bits are all set or all unset.
            let zeros = bits.unset_bits();
            return if zeros == 0 || zeros == bits.len() {
                1
            } else {
                2
            };
        }

        // At least one null and at least one valid element: null plus one or two valid values.
        let mask = self.validity().unwrap();
        // Number of elements that are both valid and `true`.
        let valid_trues = bits.num_intersections_with(mask);
        if valid_trues == 0 || valid_trues == mask.set_bits() {
            2
        } else {
            3
        }
    }

    /// Counts the distinct non-null elements by dropping the null from `n_unique` when the array
    /// contains one.
    #[inline]
    fn n_unique_non_null(&self) -> usize {
        let total = self.n_unique();
        if self.null_count() > 0 {
            total - 1
        } else {
            total
        }
    }
}

#[test]
fn test_boolean_distinct_count() {
    use arrow::bitmap::Bitmap;
    use arrow::datatypes::ArrowDataType;

    // Builds a `BooleanArray` from plain vectors and reports its number of unique elements.
    let n_unique_of = |values: Vec<bool>, validity: Option<Vec<bool>>| -> usize {
        let validity: Option<Bitmap> = validity.map(|v| Bitmap::from_iter(v));
        let arr = BooleanArray::new(ArrowDataType::Boolean, Bitmap::from_iter(values), validity);
        arr.n_unique()
    };

    // (values, validity, expected number of unique elements)
    let cases: Vec<(Vec<bool>, Option<Vec<bool>>, usize)> = vec![
        (vec![], None, 0),
        (vec![], Some(vec![]), 0),
        (vec![true], None, 1),
        (vec![true], Some(vec![true]), 1),
        (vec![true], Some(vec![false]), 1),
        (vec![true, false], None, 2),
        (vec![true, false, false], None, 2),
        (vec![true, false, false], Some(vec![true, true, false]), 3),
        // Copied from https://github.com/pola-rs/polars/pull/16765#discussion_r1629426159
        (
            vec![true, true, true, true, true],
            Some(vec![true, false, true, false, false]),
            2,
        ),
        (
            vec![false, true, false, true, true],
            Some(vec![true, false, true, false, false]),
            2,
        ),
        (
            vec![true, false, true, false, true, true],
            Some(vec![true, true, false, true, false, false]),
            3,
        ),
    ];

    for (values, validity, expected) in cases {
        assert_eq!(n_unique_of(values, validity), expected);
    }
}
25 changes: 25 additions & 0 deletions crates/polars-compute/src/unique/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
use arrow::array::Array;

/// Kernel to calculate the unique elements of an array, as well as the number of unique elements
pub trait UniqueKernel: Array {
/// Calculate the set of unique elements in `fst` and `others` and fold the result into one
/// array.
///
/// `fst` is passed separately from `others`, so at least one array is always provided.
fn unique_fold<'a>(fst: &'a Self, others: impl Iterator<Item = &'a Self>) -> Self;

/// Calculate the set of unique elements in [`Self`] where we have no further information about
/// `self`.
fn unique(&self) -> Self;

/// Calculate the set of unique elements in [`Self`] where `self` is sorted.
fn unique_sorted(&self) -> Self;

/// Calculate the number of unique elements in [`Self`]
///
/// A null is also considered a unique value
fn n_unique(&self) -> usize;

/// Calculate the number of unique non-null elements in [`Self`]
///
/// Unlike [`UniqueKernel::n_unique`], nulls are not counted.
fn n_unique_non_null(&self) -> usize;
}

mod boolean;
22 changes: 11 additions & 11 deletions crates/polars-core/src/chunked_array/ops/unique/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -236,17 +236,17 @@ impl ChunkUnique for BinaryChunked {

impl ChunkUnique for BooleanChunked {
fn unique(&self) -> PolarsResult<Self> {
// can be None, Some(true), Some(false)
let mut unique = Vec::with_capacity(3);
for v in self {
if unique.len() == 3 {
break;
}
if !unique.contains(&v) {
unique.push(v)
}
}
Ok(ChunkedArray::new(self.name(), &unique))
let mut iter = self.downcast_iter();
let Some(arr) = iter.next() else {
return Ok(Self::with_chunk(
self.name(),
BooleanArray::new_empty(self.field.as_ref().data_type().to_arrow(false)),
));
};

let unique = polars_compute::unique::UniqueKernel::unique_fold(arr, iter);

Ok(Self::with_chunk(self.name(), unique))
}

fn arg_unique(&self) -> PolarsResult<IdxCa> {
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-parquet/src/arrow/write/boolean/basic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,14 +84,14 @@ pub(super) fn build_statistics(
array: &BooleanArray,
options: &StatisticsOptions,
) -> ParquetStatistics {
use polars_compute::distinct_count::DistinctCountKernel;
use polars_compute::min_max::MinMaxKernel;
use polars_compute::unique::UniqueKernel;

BooleanStatistics {
null_count: options.null_count.then(|| array.null_count() as i64),
distinct_count: options
.distinct_count
.then(|| array.distinct_non_null_count().try_into().ok())
.then(|| array.n_unique_non_null().try_into().ok())
.flatten(),
max_value: options
.max_value
Expand Down

0 comments on commit 8f987c6

Please sign in to comment.