implement list hash for simply nested lists (#4090)
ritchie46 committed Jul 20, 2022
1 parent 2cbe793 commit 453af3c
Showing 10 changed files with 128 additions and 17 deletions.
4 changes: 4 additions & 0 deletions polars/polars-core/src/export.rs
@@ -2,6 +2,8 @@ pub use arrow
#[cfg(feature = "temporal")]
pub use chrono;

#[cfg(feature = "private")]
pub use ahash;
#[cfg(feature = "private")]
pub use num;
#[cfg(feature = "private")]
@@ -11,3 +13,5 @@ pub use rayon;
#[cfg(feature = "private")]
#[cfg(any(feature = "strings", feature = "temporal"))]
pub use regex;

pub use crate::vector_hasher::_boost_hash_combine;
6 changes: 3 additions & 3 deletions polars/polars-core/src/frame/mod.rs
@@ -32,7 +32,7 @@ mod upstream_traits;

#[cfg(feature = "sort_multiple")]
use crate::prelude::sort::prepare_argsort;
-use crate::vector_hasher::boost_hash_combine;
+use crate::vector_hasher::_boost_hash_combine;
#[cfg(feature = "row_hash")]
use crate::vector_hasher::df_rows_to_hashes_threaded;
use crate::POOL;
@@ -435,8 +435,8 @@ impl DataFrame {
rval.hash(&mut h);
let rh2 = h.finish();
(
-boost_hash_combine(lhash, rhash),
-boost_hash_combine(lh2, rh2),
+_boost_hash_combine(lhash, rhash),
+_boost_hash_combine(lh2, rh2),
n,
)
},
7 changes: 0 additions & 7 deletions polars/polars-core/src/series/mod.rs
@@ -489,13 +489,6 @@ impl Series {
(self * other).sum::<f64>()
}

-#[cfg(feature = "row_hash")]
-#[cfg_attr(docsrs, doc(cfg(feature = "row_hash")))]
-/// Get a hash of this Series
-pub fn hash(&self, build_hasher: ahash::RandomState) -> UInt64Chunked {
-UInt64Chunked::from_vec(self.name(), self.0.vec_hash(build_hasher))
-}

/// Get the sum of the Series as a new Series of length 1.
///
/// If the [`DataType`] is one of `{Int8, UInt8, Int16, UInt16}` the `Series` is
12 changes: 6 additions & 6 deletions polars/polars-core/src/vector_hasher.rs
@@ -94,7 +94,7 @@
.zip(&mut hashes[offset..])
.for_each(|(v, h)| {
let l = T::Native::get_hash(v, &random_state);
-*h = boost_hash_combine(l, *h)
+*h = _boost_hash_combine(l, *h)
}),
_ => {
let validity = arr.validity().unwrap();
@@ -104,7 +104,7 @@
.zip(&mut hashes[offset..])
.zip(arr.values().as_slice())
.for_each(|((valid, h), l)| {
-*h = boost_hash_combine(
+*h = _boost_hash_combine(
[null_h, T::Native::get_hash(l, &random_state)][valid as usize],
*h,
)
@@ -137,7 +137,7 @@ impl VecHash for Utf8Chunked {
Some(v) => str::get_hash(v, &random_state),
None => null_h,
};
-boost_hash_combine(l, *h)
+_boost_hash_combine(l, *h)
},
hashes,
)
@@ -162,7 +162,7 @@ impl VecHash for BooleanChunked {
|opt_v, h| {
let mut hasher = random_state.build_hasher();
opt_v.hash(&mut hasher);
-boost_hash_combine(hasher.finish(), *h)
+_boost_hash_combine(hasher.finish(), *h)
},
hashes,
)
@@ -216,7 +216,7 @@ where
|opt_v, h| {
let mut hasher = random_state.build_hasher();
opt_v.hash(&mut hasher);
-boost_hash_combine(hasher.finish(), *h)
+_boost_hash_combine(hasher.finish(), *h)
},
hashes,
)
@@ -493,7 +493,7 @@ where

// hash combine from C++'s Boost library
#[inline]
-pub(crate) fn boost_hash_combine(l: u64, r: u64) -> u64 {
+pub fn _boost_hash_combine(l: u64, r: u64) -> u64 {
l ^ r.wrapping_add(0x9e3779b9u64.wrapping_add(l << 6).wrapping_add(r >> 2))
}

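For reference, here is a minimal, self-contained sketch (not part of this commit) of how a Boost-style combiner like _boost_hash_combine folds per-element hashes into a single value. The names combine and hash_one are illustrative, and std's DefaultHasher stands in for the ahash::RandomState used by the column hashers above.

// Standalone sketch: fold per-element hashes with a Boost-style combiner.
// `combine` mirrors _boost_hash_combine; DefaultHasher is only a stand-in
// for the ahash::RandomState polars uses.
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

fn combine(l: u64, r: u64) -> u64 {
    l ^ r.wrapping_add(0x9e3779b9u64.wrapping_add(l << 6).wrapping_add(r >> 2))
}

fn hash_one<T: Hash>(v: &T) -> u64 {
    let mut h = DefaultHasher::new();
    v.hash(&mut h);
    h.finish()
}

fn main() {
    // Start from a fixed seed and fold in each element's hash.
    let seed = 9069731903u64;
    let agg = [1i64, 2, 3].iter().fold(seed, |acc, v| combine(acc, hash_one(v)));
    let rev = [3i64, 2, 1].iter().fold(seed, |acc, v| combine(acc, hash_one(v)));
    // The combiner is order-sensitive, so these aggregates almost certainly differ.
    println!("{agg} {rev}");
}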
2 changes: 1 addition & 1 deletion polars/polars-lazy/Cargo.toml
@@ -68,7 +68,7 @@ cumulative_eval = []
chunked_ids = []
list_to_struct = ["polars-ops/list_to_struct"]
python = ["pyo3"]
-row_hash = ["polars-core/row_hash"]
+row_hash = ["polars-core/row_hash", "polars-ops/hash"]
string_justify = ["polars-ops/string_justify"]
arg_where = []

1 change: 1 addition & 0 deletions polars/polars-ops/Cargo.toml
@@ -31,3 +31,4 @@ diff = ["polars-core/diff"]
strings = ["polars-core/strings"]
string_justify = ["polars-core/strings"]
log = []
hash = []
75 changes: 75 additions & 0 deletions polars/polars-ops/src/chunked_array/list/hash.rs
@@ -0,0 +1,75 @@
use super::*;
use polars_core::export::{
_boost_hash_combine,
ahash::{self, CallHasher},
rayon::prelude::*,
};
use polars_core::utils::NoNull;
use std::hash::Hash;

fn hash_agg<T>(ca: &ChunkedArray<T>, random_state: &ahash::RandomState) -> u64
where
T: PolarsIntegerType,
T::Native: Hash + CallHasher,
{
// Note that we don't use the no-null branch! This can break in unexpected ways.
// For instance, with threading we split an array into n_threads chunks, which may lead to
// splits that have no nulls and splits that have nulls. One split would then be hashed with
// Option<T> and the other with T, so the results could not be compared.
// By always hashing on Option<T>, the random_state is the only deterministic seed.

// just some large prime
let mut hash_agg = 9069731903u64;

// just some large prime
let null_hash = 2413670057;

ca.downcast_iter().for_each(|arr| {
for opt_v in arr.iter() {
match opt_v {
Some(v) => {
let r = T::Native::get_hash(v, random_state);
hash_agg = _boost_hash_combine(hash_agg, r);
}
None => {
hash_agg = _boost_hash_combine(hash_agg, null_hash);
}
}
}
});
hash_agg
}

pub(crate) fn hash(ca: &ListChunked, build_hasher: ahash::RandomState) -> UInt64Chunked {
if !ca.inner_dtype().to_physical().is_numeric() {
panic!(
"Hashing a list with a non-numeric inner type not supported. Got dtype: {:?}",
ca.dtype()
);
}

// just some large prime
let null_hash = 1969099309u64;

let out: NoNull<UInt64Chunked> = ca
.par_iter()
.map(|opt_s: Option<Series>| match opt_s {
None => null_hash,
Some(s) => {
let s = s.to_physical_repr();
if s.bit_repr_is_large() {
let ca = s.bit_repr_large();
hash_agg(&ca, &build_hasher)
} else {
let ca = s.bit_repr_small();
hash_agg(&ca, &build_hasher)
}
}
})
.collect();

let mut out = out.into_inner();
out.rename(ca.name());
out
}
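To illustrate the pitfall the comment in hash_agg warns about, here is a small standalone sketch (not polars code): hashing a value directly and hashing it through Option feed different byte streams to the hasher, so two splits of the same column hashed through different code paths would not produce comparable results. hash_one is an illustrative helper and DefaultHasher stands in for ahash.

use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

fn hash_one<T: Hash>(v: &T) -> u64 {
    let mut h = DefaultHasher::new();
    v.hash(&mut h);
    h.finish()
}

fn main() {
    // Option's Hash impl also feeds the variant discriminant to the hasher,
    // so 42 and Some(42) generally hash to different values.
    let plain = hash_one(&42i64);
    let wrapped = hash_one(&Some(42i64));
    println!("plain = {plain}, wrapped = {wrapped}");
    // hash_agg sidesteps this by pushing every element through one fixed
    // scheme: a value hash for Some(v) and a constant null_hash for None.
}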
2 changes: 2 additions & 0 deletions polars/polars-ops/src/chunked_array/list/mod.rs
@@ -1,5 +1,7 @@
use polars_core::prelude::*;

#[cfg(feature = "hash")]
pub(crate) mod hash;
#[cfg(feature = "list")]
#[cfg_attr(docsrs, doc(cfg(feature = "list")))]
mod namespace;
28 changes: 28 additions & 0 deletions polars/polars-ops/src/series/_trait.rs
@@ -1,4 +1,6 @@
use super::*;
#[cfg(feature = "hash")]
use polars_core::export::ahash;
use std::ops::Deref;

#[cfg(feature = "to_dummies")]
@@ -14,13 +16,28 @@ macro_rules! invalid_operation {
};
}

#[cfg(feature = "hash")]
macro_rules! invalid_operation_panic {
($s:expr) => {
panic!(
"this operation is not implemented/valid for this dtype: {:?}",
$s.dtype()
)
};
}

pub trait SeriesOps {
fn dtype(&self) -> &DataType;

#[cfg(feature = "to_dummies")]
fn to_dummies(&self) -> Result<DataFrame> {
invalid_operation!(self)
}

#[cfg(feature = "hash")]
fn hash(&self, _build_hasher: ahash::RandomState) -> UInt64Chunked {
invalid_operation_panic!(self)
}
}

impl SeriesOps for Series {
@@ -31,4 +48,15 @@ impl SeriesOps for Series {
fn to_dummies(&self) -> Result<DataFrame> {
self.to_ops().to_dummies()
}

#[cfg(feature = "hash")]
fn hash(&self, build_hasher: ahash::RandomState) -> UInt64Chunked {
match self.dtype() {
DataType::List(_) => {
let ca = self.list().unwrap();
crate::chunked_array::list::hash::hash(ca, build_hasher)
}
_ => UInt64Chunked::from_vec(self.name(), self.0.vec_hash(build_hasher)),
}
}
}
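A hypothetical usage sketch of the new trait method follows (not part of the commit; it assumes the hash feature is enabled, that SeriesOps is reachable via polars_ops::prelude, and that the &[Series] constructor shorthand for list Series applies in this polars version).

use polars_core::export::ahash;
use polars_core::prelude::*;
use polars_ops::prelude::*;

fn main() {
    // A list Series with three rows; rows 0 and 2 hold equal lists.
    let list = Series::new(
        "a",
        &[
            Series::new("", &[1i64, 2, 3]),
            Series::new("", &[3i64, 4]),
            Series::new("", &[1i64, 2, 3]),
        ],
    );

    // Dispatches to the list hashing added in this commit.
    let hashed: UInt64Chunked = SeriesOps::hash(&list, ahash::RandomState::default());
    assert_eq!(hashed.get(0), hashed.get(2));
}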
8 changes: 8 additions & 0 deletions py-polars/tests/test_lists.py
@@ -378,3 +378,11 @@ def test_list_concat_supertype() -> None:
assert df.with_column(pl.concat_list(pl.col(["a", "b"])).alias("concat_list"))[
"concat_list"
].to_list() == [[1, 10000], [2, 20000]]


def test_list_hash() -> None:
out = pl.DataFrame({"a": [[1, 2, 3], [3, 4], [1, 2, 3]]}).with_column(
pl.col("a").hash().alias("b")
)
assert out.dtypes == [pl.List(pl.Int64), pl.UInt64]
assert out[0, "b"] == out[2, "b"]
