Skip to content

Commit

Permalink
refactor(rust, python): use xxhash3 for string types (#5617)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Nov 24, 2022
1 parent 6e07e82 commit a05be6e
Show file tree
Hide file tree
Showing 7 changed files with 32 additions and 19 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ rayon = "1.5"
thiserror = "^1"
num = "0.4"
ahash = "0.8"
xxhash-rust = { version = "0.8.6", features = ["xxh3"] }
# todo! remove
anyhow = "1"
hashbrown = { version = "0.13.1", features = ["rayon", "ahash"] }
Expand Down
1 change: 1 addition & 0 deletions polars/polars-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ serde = { version = "1", features = ["derive"], optional = true }
serde_json = { version = "1", optional = true }
smartstring = { version = "1" }
thiserror.workspace = true
xxhash-rust.workspace = true

[dev-dependencies]
bincode = "1"
Expand Down
24 changes: 15 additions & 9 deletions polars/polars-core/src/vector_hasher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use hashbrown::HashMap;
use polars_arrow::utils::CustomIterTools;
use polars_utils::HashSingle;
use rayon::prelude::*;
use xxhash_rust::xxh3::xxh3_64_with_seed;

use crate::datatypes::UInt64Chunked;
use crate::prelude::*;
Expand Down Expand Up @@ -178,25 +179,30 @@ impl VecHash for Utf8Chunked {
fn vec_hash(&self, random_state: RandomState, buf: &mut Vec<u64>) {
buf.clear();
buf.reserve(self.len());
let null_h = get_null_hash_value(random_state.clone());
let null_h = get_null_hash_value(random_state);

self.downcast_iter().for_each(|arr| {
if arr.null_count() == 0 {
buf.extend(arr.values_iter().map(|v| random_state.hash_single(v)))
// Use the null hash (derived from `random_state`) as the xxh3 seed, so the string hashes still depend on the `random_state` that was passed in.
buf.extend(
arr.values_iter()
.map(|v| xxh3_64_with_seed(v.as_bytes(), null_h)),
)
} else {
buf.extend(arr.into_iter().map(|opt_v| match opt_v {
Some(v) => random_state.hash_single(v),
Some(v) => xxh3_64_with_seed(v.as_bytes(), null_h),
None => null_h,
}))
}
});
}

fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) {
let null_h = get_null_hash_value(random_state.clone());
let null_h = get_null_hash_value(random_state);
self.apply_to_slice(
|opt_v, h| {
let l = match opt_v {
Some(v) => random_state.hash_single(v),
Some(v) => xxh3_64_with_seed(v.as_bytes(), null_h),
None => null_h,
};
_boost_hash_combine(l, *h)
Expand All @@ -211,21 +217,21 @@ impl VecHash for BinaryChunked {
fn vec_hash(&self, random_state: RandomState, buf: &mut Vec<u64>) {
buf.clear();
buf.reserve(self.len());
let null_h = get_null_hash_value(random_state.clone());
let null_h = get_null_hash_value(random_state);
self.downcast_iter().for_each(|arr| {
buf.extend(arr.into_iter().map(|opt_v| match opt_v {
Some(v) => random_state.hash_single(v),
Some(v) => xxh3_64_with_seed(v, null_h),
None => null_h,
}))
});
}

fn vec_hash_combine(&self, random_state: RandomState, hashes: &mut [u64]) {
let null_h = get_null_hash_value(random_state.clone());
let null_h = get_null_hash_value(random_state);
self.apply_to_slice(
|opt_v, h| {
let l = match opt_v {
Some(v) => random_state.hash_single(v),
Some(v) => xxh3_64_with_seed(v, null_h),
None => null_h,
};
_boost_hash_combine(l, *h)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ use polars_core::frame::row::AnyValueBuffer;
use polars_core::prelude::*;
use polars_core::utils::{_set_partition_size, accumulate_dataframes_vertical_unchecked};
use polars_core::POOL;
use polars_utils::hash_to_partition;
use polars_utils::slice::GetSaferUnchecked;
use polars_utils::unwrap::UnwrapUncheckedRelease;
use polars_utils::{hash_to_partition, HashSingle};
use rayon::prelude::*;

use super::aggregates::AggregateFn;
Expand Down Expand Up @@ -174,7 +174,6 @@ impl Sink for Utf8GroupbySink {
state.set_input_schema(self.input_schema.clone())
}
let num_aggs = self.number_of_aggs();
self.hashes.reserve(chunk.data.height());

// todo! amortize allocation
for phys_e in self.aggregation_columns.iter() {
Expand All @@ -186,6 +185,7 @@ impl Sink for Utf8GroupbySink {
.key_column
.evaluate(&chunk, context.execution_state.as_any())?;
let s = s.rechunk();
s.vec_hash(self.hb.clone(), &mut self.hashes).unwrap();
// write the hashes to self.hashes buffer
// s.vec_hash(self.hb.clone(), &mut self.hashes).unwrap();
// now we have written hashes, we take the pointer to this buffer
Expand All @@ -196,9 +196,7 @@ impl Sink for Utf8GroupbySink {
// array of the keys
let keys_arr = s.utf8().unwrap().downcast_iter().next().unwrap().clone();

for (iteration_idx, key_val) in keys_arr.iter().enumerate() {
let h = self.hb.hash_single(key_val);

for (iteration_idx, (key_val, &h)) in keys_arr.iter().zip(&self.hashes).enumerate() {
let partition = hash_to_partition(h, self.pre_agg_partitions.len());
let current_partition =
unsafe { self.pre_agg_partitions.get_unchecked_release_mut(partition) };
Expand Down
7 changes: 7 additions & 0 deletions py-polars/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions py-polars/polars/internals/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6493,10 +6493,10 @@ def hash_rows(
shape: (4,)
Series: '' [u64]
[
12239174968153954787
17976148875586754089
10783150408545073287
1438741209321515184
10047419486152048166
13766281409932363907
2047317070637311557
]
"""
Expand Down
4 changes: 2 additions & 2 deletions py-polars/polars/internals/expr/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3629,11 +3629,11 @@ def hash(
│ --- ┆ --- │
│ u64 ┆ u64 │
╞══════════════════════╪══════════════════════╡
│ 9774092659964970114 ┆ 6959506404929392568
│ 9774092659964970114 ┆ 13614470193936745724
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1101441246220388612 ┆ 11638928888656214026 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 11638928888656214026 ┆ 11040941213715918520
│ 11638928888656214026 ┆ 13382926553367784577
└──────────────────────┴──────────────────────┘
"""
Expand Down

0 comments on commit a05be6e

Please sign in to comment.