Skip to content

Commit bf461cd

Browse files
committed
Use CPython hash algorithm for frozenset
The original hash algorithm just XOR'd all the hashes of the elements of the set, which is problematic. The CPython algorithm is required to pass the tests. - Replace `PyFrozenSet::hash` with CPython's algorithm - Remove unused `hash_iter_unorded` functions - Add `frozenset` benchmark - Enable tests - Lower performance expectations on effectiveness test - Adjust `slot::hash_wrapper` so that it doesn't rehash the computed hash value in the process of converting PyInt to PyHash.
1 parent bdf228e commit bf461cd

File tree

7 files changed

+41
-28
lines changed

7 files changed

+41
-28
lines changed

Lib/test/test_collections.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1838,8 +1838,6 @@ def __repr__(self):
18381838
self.assertTrue(f1 != l1)
18391839
self.assertTrue(f1 != l2)
18401840

1841-
# TODO: RUSTPYTHON
1842-
@unittest.expectedFailure
18431841
def test_Set_hash_matches_frozenset(self):
18441842
sets = [
18451843
{}, {1}, {None}, {-1}, {0.0}, {"abc"}, {1, 2, 3},

Lib/test/test_set.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -707,8 +707,6 @@ def test_hash_caching(self):
707707
f = self.thetype('abcdcda')
708708
self.assertEqual(hash(f), hash(f))
709709

710-
# TODO: RUSTPYTHON
711-
@unittest.expectedFailure
712710
def test_hash_effectiveness(self):
713711
n = 13
714712
hashvalues = set()
@@ -730,7 +728,14 @@ def powerset(s):
730728
for i in range(len(s)+1):
731729
yield from map(frozenset, itertools.combinations(s, i))
732730

733-
for n in range(18):
731+
# TODO the original test has:
732+
# for n in range(18):
733+
# Due to general performance overhead, hashing a frozenset takes
734+
# about 50 times longer than in CPython. This test amplifies that
735+
# exponentially, so the best we can do here reasonably is 13.
736+
# Even if the internal hash function did nothing, it would still be
737+
# about 40 times slower than CPython.
738+
for n in range(13):
734739
t = 2 ** n
735740
mask = t - 1
736741
for nums in (range, zf_range):
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
fs = frozenset(range(0, ITERATIONS))
2+
3+
# ---
4+
5+
hash(fs)

common/src/hash.rs

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -130,20 +130,6 @@ pub fn hash_float(value: f64) -> Option<PyHash> {
130130
Some(fix_sentinel(x as PyHash * value.signum() as PyHash))
131131
}
132132

133-
pub fn hash_iter_unordered<'a, T: 'a, I, F, E>(iter: I, hashf: F) -> Result<PyHash, E>
134-
where
135-
I: IntoIterator<Item = &'a T>,
136-
F: Fn(&'a T) -> Result<PyHash, E>,
137-
{
138-
let mut hash: PyHash = 0;
139-
for element in iter {
140-
let item_hash = hashf(element)?;
141-
// xor is commutative and hash should be independent of order
142-
hash ^= item_hash;
143-
}
144-
Ok(fix_sentinel(mod_int(hash)))
145-
}
146-
147133
pub fn hash_bigint(value: &BigInt) -> PyHash {
148134
let ret = match value.to_i64() {
149135
Some(i) => mod_int(i),

vm/src/builtins/set.rs

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -434,7 +434,28 @@ impl PySetInner {
434434
}
435435

436436
fn hash(&self, vm: &VirtualMachine) -> PyResult<PyHash> {
437-
crate::utils::hash_iter_unordered(self.elements().iter(), vm)
437+
// Work to increase the bit dispersion for closely spaced hash values.
438+
// This is important because some use cases have many combinations of a
439+
// small number of elements with nearby hashes so that many distinct
440+
// combinations collapse to only a handful of distinct hash values.
441+
fn _shuffle_bits(h: u64) -> u64 {
442+
((h ^ 89869747) ^ (h.wrapping_shl(16))).wrapping_mul(3644798167)
443+
}
444+
// Factor in the number of active entries
445+
let mut hash: u64 = (self.elements().len() as u64 + 1).wrapping_mul(1927868237);
446+
// Xor-in shuffled bits from every entry's hash field because xor is
447+
// commutative and a frozenset hash should be independent of order.
448+
for element in self.elements().iter() {
449+
hash ^= _shuffle_bits(element.hash(vm)? as u64);
450+
}
451+
// Disperse patterns arising in nested frozensets
452+
hash ^= (hash >> 11) ^ (hash >> 25);
453+
hash = hash.wrapping_mul(69069).wrapping_add(907133923);
454+
// -1 is reserved as an error code
455+
if hash == u64::MAX {
456+
hash = 590923713;
457+
}
458+
Ok(hash as PyHash)
438459
}
439460

440461
// Run operation, on failure, if item is a set/set subclass, convert it

vm/src/types/slot.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ use crate::{
1515
AsObject, Py, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, VirtualMachine,
1616
};
1717
use crossbeam_utils::atomic::AtomicCell;
18+
use malachite_bigint::BigInt;
1819
use num_traits::{Signed, ToPrimitive};
1920
use std::{borrow::Borrow, cmp::Ordering, ops::Deref};
2021

@@ -254,7 +255,11 @@ fn hash_wrapper(zelf: &PyObject, vm: &VirtualMachine) -> PyResult<PyHash> {
254255
let py_int = hash_obj
255256
.payload_if_subclass::<PyInt>(vm)
256257
.ok_or_else(|| vm.new_type_error("__hash__ method should return an integer".to_owned()))?;
257-
Ok(rustpython_common::hash::hash_bigint(py_int.as_bigint()))
258+
let big_int = py_int.as_bigint();
259+
let hash: PyHash = big_int
260+
.to_i64()
261+
.unwrap_or_else(|| (big_int % BigInt::from(u64::MAX)).to_i64().unwrap());
262+
Ok(hash)
258263
}
259264

260265
/// Marks a type as unhashable. Similar to PyObject_HashNotImplemented in CPython

vm/src/utils.rs

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,6 @@ pub fn hash_iter<'a, I: IntoIterator<Item = &'a PyObjectRef>>(
1111
vm.state.hash_secret.hash_iter(iter, |obj| obj.hash(vm))
1212
}
1313

14-
pub fn hash_iter_unordered<'a, I: IntoIterator<Item = &'a PyObjectRef>>(
15-
iter: I,
16-
vm: &VirtualMachine,
17-
) -> PyResult<rustpython_common::hash::PyHash> {
18-
rustpython_common::hash::hash_iter_unordered(iter, |obj| obj.hash(vm))
19-
}
20-
2114
impl ToPyObject for std::convert::Infallible {
2215
fn to_pyobject(self, _vm: &VirtualMachine) -> PyObjectRef {
2316
match self {}

0 commit comments

Comments
 (0)