Skip to content

Commit

Permalink
fix unique/n_unique for Categorical
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Dec 1, 2021
1 parent d4c9e26 commit 1c655bf
Show file tree
Hide file tree
Showing 9 changed files with 61 additions and 15 deletions.
4 changes: 2 additions & 2 deletions polars/polars-arrow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ description = "Arrow interfaces for Polars DataFrame library"

[dependencies]
# arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "e9a6c3ef7e1a328c298bd45e36ac2abf8ae44ebb", default-features = false }
# arrow = { package = "arrow2", git = "https://github.com/ritchie46/arrow2", default-features = false, features = ["compute"], branch = "fn_to" }
arrow = { package = "arrow2", version = "0.8", default-features = false }
arrow = { package = "arrow2", git = "https://github.com/ritchie46/arrow2", default-features = false, features = ["compute"], branch = "offset_pub" }
# arrow = { package = "arrow2", version = "0.8", default-features = false }
num = "^0.4"
thiserror = "^1.0"

Expand Down
4 changes: 3 additions & 1 deletion polars/polars-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,9 @@ unsafe_unwrap = "^0.1.0"

[dependencies.arrow]
package = "arrow2"
version = "0.8"
git = "https://github.com/ritchie46/arrow2"
branch = "offset_pub"
# version = "0.8"
default-features = false
features = [
"compute_aggregate",
Expand Down
8 changes: 7 additions & 1 deletion polars/polars-core/src/chunked_array/builder/from.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::prelude::{BooleanChunked, ChunkedArray, PolarsNumericType};
use crate::prelude::*;
use arrow::array::{BooleanArray, PrimitiveArray, Utf8Array};
use std::sync::Arc;

Expand All @@ -11,6 +11,12 @@ impl<T: PolarsNumericType> From<(&str, PrimitiveArray<T::Native>)> for ChunkedAr
}
}

/// Conversion from a borrowed slice of native values into a [`ChunkedArray`].
///
/// Delegates to `new_from_slice`, passing the empty string as the array name.
impl<T> From<&[T::Native]> for ChunkedArray<T>
where
    T: PolarsNumericType,
{
    fn from(slice: &[T::Native]) -> Self {
        let unnamed = "";
        Self::new_from_slice(unnamed, slice)
    }
}

impl From<(&str, BooleanArray)> for BooleanChunked {
fn from(tpl: (&str, BooleanArray)) -> Self {
let name = tpl.0;
Expand Down
5 changes: 4 additions & 1 deletion polars/polars-core/src/chunked_array/categorical/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -174,12 +174,15 @@ impl CategoricalChunkedBuilder {
}

/// Consumes the builder and assembles the categorical `ChunkedArray`.
///
/// The builder's field, the array produced by `array_builder` (as the only chunk) and the
/// finished reverse mapping are moved into the result; the `1 << 4` flag is stored in
/// `bit_settings`.
pub fn finish(self) -> ChunkedArray<CategoricalType> {
// both for the local and the global map, we own a map that has all unique keys
let bit_settings = 1u8 << 4;

ChunkedArray {
field: Arc::new(self.field),
// a single array comes out of the builder, so `chunks` has exactly one entry
chunks: vec![self.array_builder.into_arc()],
phantom: PhantomData,
categorical_map: Some(Arc::new(self.reverse_mapping.finish())),
// NOTE(review): the `..Default::default()` line below looks like the deleted side of this
// diff hunk (hunk summary: 4 additions, 1 deletion). A struct-update base must be the last
// item of a struct expression, so it cannot precede `bit_settings` in compiling code — confirm.
..Default::default()
bit_settings,
}
}
}
Expand Down
21 changes: 21 additions & 0 deletions polars/polars-core/src/chunked_array/categorical/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,13 @@ impl CategoricalChunked {
self
}

/// Tells whether `unique` / `n_unique` may be answered from the categorical map alone.
///
/// All three conditions must hold: the `1 << 4` flag is set in `bit_settings`, there is
/// exactly one chunk, and the values buffer of that chunk starts at offset 0.
pub(crate) fn can_fast_unique(&self) -> bool {
    // Mask of the flag that is tested in `bit_settings`.
    const FAST_UNIQUE_MASK: u8 = 1 << 4;

    let flag_is_set = self.bit_settings & FAST_UNIQUE_MASK != 0;
    if !flag_is_set || self.chunks.len() != 1 {
        return false;
    }

    // `chunks` has exactly one entry here; the `unwrap` relies on the iterator yielding it.
    let only_chunk = self.downcast_iter().next().unwrap();
    only_chunk.values().offset() == 0
}

/// Create an `[Iterator]` that iterates over the `&str` values of the `[CategoricalChunked]`.
pub fn iter_str(&self) -> CatIter<'_> {
let iter = self.deref().into_iter();
Expand Down Expand Up @@ -178,4 +185,18 @@ mod test {
assert_eq!(appended.str_value(4), "\"x\"");
assert_eq!(appended.str_value(5), "\"y\"");
}

/// Checks `n_unique` on a categorical `Series` directly after the cast and again
/// after `take` and `slice`.
#[test]
fn test_fast_unique() {
    // `s` is only read through `&self` methods, so the binding is not declared `mut`
    // (a `mut` here would only trigger the `unused_mut` warning).
    let s = Series::new("1", vec!["a", "b", "c"])
        .cast(&DataType::Categorical)
        .unwrap();

    // freshly cast series: all three categories are present
    assert_eq!(s.n_unique().unwrap(), 3);
    // make sure that it does not take the fast path after take / slice
    let out = s.take(&([1, 2].as_ref()).into()).unwrap();
    assert_eq!(out.n_unique().unwrap(), 2);
    let out = s.slice(1, 2);
    assert_eq!(out.n_unique().unwrap(), 2);
}
}
2 changes: 2 additions & 0 deletions polars/polars-core/src/chunked_array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,8 @@ pub struct ChunkedArray<T> {
/// third bit dtype list: fast_explode
/// - unset: unknown or not all arrays have at least one value
/// - set: all list arrays are filled (this allows for cheap explode)
/// fourth flag, stored at bit index 4 (mask `1 << 4`): original local categorical
/// meaning that n_unique is the same as the cat map length
pub(crate) bit_settings: u8,
}

Expand Down
20 changes: 15 additions & 5 deletions polars/polars-core/src/chunked_array/ops/unique/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -264,11 +264,17 @@ impl ChunkUnique<Utf8Type> for Utf8Chunked {
impl ChunkUnique<CategoricalType> for CategoricalChunked {
fn unique(&self) -> Result<Self> {
let cat_map = self.categorical_map.as_ref().unwrap();
let mut ca = match &**cat_map {
RevMapping::Local(a) => UInt32Chunked::new_from_iter(self.name(), 0..(a.len() as u32)),
RevMapping::Global(map, _, _) => {
UInt32Chunked::new_from_iter(self.name(), map.keys().copied())
let mut ca = if self.can_fast_unique() {
match &**cat_map {
RevMapping::Local(a) => {
UInt32Chunked::new_from_iter(self.name(), 0..(a.len() as u32))
}
RevMapping::Global(map, _, _) => {
UInt32Chunked::new_from_iter(self.name(), map.keys().copied())
}
}
} else {
self.deref().unique()?
};
ca.categorical_map = self.categorical_map.clone();
Ok(ca.into())
Expand All @@ -289,7 +295,11 @@ impl ChunkUnique<CategoricalType> for CategoricalChunked {
impl_value_counts!(self)
}
/// Counts the distinct values of the categorical array.
///
/// When `can_fast_unique()` returns true the length of the categorical map is returned
/// directly; otherwise the result of `n_unique` on `self.deref()` is returned.
fn n_unique(&self) -> Result<usize> {
// NOTE(review): the unconditional `Ok(...)` line below looks like the pre-change line that
// this diff hunk replaces with the `if`/`else` (the hunk grows from 7 to 11 lines); the two
// cannot both be present in compiling code — confirm.
Ok(self.categorical_map.as_ref().unwrap().len())
if self.can_fast_unique() {
Ok(self.categorical_map.as_ref().unwrap().len())
} else {
self.deref().n_unique()
}
}
#[cfg(feature = "mode")]
fn mode(&self) -> Result<Self> {
Expand Down
4 changes: 2 additions & 2 deletions polars/polars-io/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ private = []
ahash = "0.7"
anyhow = "1.0"
# arrow = { package = "arrow2", git = "https://github.com/jorgecarleitao/arrow2", rev = "e9a6c3ef7e1a328c298bd45e36ac2abf8ae44ebb", default-features = false }
# arrow = { package = "arrow2", git = "https://github.com/ritchie46/arrow2", default-features = false, features = ["compute"], branch = "fn_to" }
arrow = { package = "arrow2", version = "0.8", default-features = false }
arrow = { package = "arrow2", git = "https://github.com/ritchie46/arrow2", default-features = false, features = ["compute"], branch = "offset_pub" }
# arrow = { package = "arrow2", version = "0.8", default-features = false }
csv-core = { version = "0.1.10", optional = true }
dirs = "4.0"
flate2 = { version = "1", optional = true, default-features = false }
Expand Down
8 changes: 5 additions & 3 deletions py-polars/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 1c655bf

Please sign in to comment.