From a300bc7b09b362b98a66be2bb2726bd8c94edaa2 Mon Sep 17 00:00:00 2001
From: Clark Gaebel <cg.wowus.cg@gmail.com>
Date: Fri, 16 Jan 2015 15:13:10 -0500
Subject: [PATCH] [collections] Increase HashMap Load Factor

I previously pulled some sketchy math out of my ass to figure out the
hashmap load factor. Turns out, the paper has a chapter on this and
I didn't use it. As a result, turns out we can push this hashmap all
the way up to a load factor of 98%. That's pretty cool.

Paper: https://cs.uwaterloo.ca/research/tr/1986/CS-86-14.pdf
Relevant discussion starts on page 31 of the pdf.

r? @Gankro
r? [someone that's good at closely reading papers]
---
 src/libstd/collections/hash/map.rs | 52 +++++++++++-------------------
 1 file changed, 19 insertions(+), 33 deletions(-)

diff --git a/src/libstd/collections/hash/map.rs b/src/libstd/collections/hash/map.rs
index 80ae3076df37a..2ed8c0076fdf6 100644
--- a/src/libstd/collections/hash/map.rs
+++ b/src/libstd/collections/hash/map.rs
@@ -48,10 +48,10 @@ use super::state::HashState;
 const INITIAL_LOG2_CAP: uint = 5;
 pub const INITIAL_CAPACITY: uint = 1 << INITIAL_LOG2_CAP; // 2^5
 
-/// The default behavior of HashMap implements a load factor of 90.9%.
+/// The default behavior of HashMap implements a load factor of 98%.
 /// This behavior is characterized by the following condition:
 ///
-/// - if size > 0.909 * capacity: grow the map
+/// - if size > 0.98 * capacity: grow the map
 #[derive(Clone)]
 struct DefaultResizePolicy;
 
@@ -66,7 +66,7 @@ impl DefaultResizePolicy {
         // on capacity:
         //
         // - if `cap < size * 1.1`: grow the map
-        usable_size * 11 / 10
+        (usable_size as f64 * (1.0/0.98)) as uint
     }
 
     /// An inverse of `min_capacity`, approximately.
@@ -76,12 +76,7 @@ impl DefaultResizePolicy {
         // min_capacity(size) must be smaller than the internal capacity,
         // so that the map is not resized:
         // `min_capacity(usable_capacity(x)) <= x`.
-        // The lef-hand side can only be smaller due to flooring by integer
-        // division.
-        //
-        // This doesn't have to be checked for overflow since allocation size
-        // in bytes will overflow earlier than multiplication by 10.
-        cap * 10 / 11
+        (cap as f64 * 0.98) as uint
     }
 }
 
@@ -103,36 +98,25 @@ fn test_resize_policy() {
 //    is higher than how far we've already probed, swap the elements.
 //
 // This massively lowers variance in probe distance, and allows us to get very
-// high load factors with good performance. The 90% load factor I use is rather
+// high load factors with good performance. The 98% load factor I use is rather
 // conservative.
 //
-// > Why a load factor of approximately 90%?
+// > Why a load factor of 98%?
+//
+// Let the expected value of a probe sequence be Epsl, and the load factor be α.
 //
-// In general, all the distances to initial buckets will converge on the mean.
-// At a load factor of α, the odds of finding the target bucket after k
-// probes is approximately 1-α^k. If we set this equal to 50% (since we converge
-// on the mean) and set k=8 (64-byte cache line / 8-byte hash), α=0.92. I round
-// this down to make the math easier on the CPU and avoid its FPU.
-// Since on average we start the probing in the middle of a cache line, this
-// strategy pulls in two cache lines of hashes on every lookup. I think that's
-// pretty good, but if you want to trade off some space, it could go down to one
-// cache line on average with an α of 0.84.
+// From the robin hood hashing paper (page 35) [1],
 //
-// > Wait, what? Where did you get 1-α^k from?
+//   Epsl[α] = -ln(1 - α)/α
 //
-// On the first probe, your odds of a collision with an existing element is α.
-// The odds of doing this twice in a row is approximately α^2. For three times,
-// α^3, etc. Therefore, the odds of colliding k times is α^k. The odds of NOT
-// colliding after k tries is 1-α^k.
+// Since one cache line is 8 usizes on 64-bit systems, to keep probes within the
+// same cache line, on average, we should shoot for a probe sequence length of 4.
+// Solving the above equation:
 //
-// The paper from 1986 cited below mentions an implementation which keeps track
-// of the distance-to-initial-bucket histogram. This approach is not suitable
-// for modern architectures because it requires maintaining an internal data
-// structure. This allows very good first guesses, but we are most concerned
-// with guessing entire cache lines, not individual indexes. Furthermore, array
-// accesses are no longer linear and in one direction, as we have now. There
-// is also memory and cache pressure that this would entail that would be very
-// difficult to properly see in a microbenchmark.
+//   Epsl[0.98] = -ln(1 - 0.98)/0.98 = 3.9919
+//
+// we get a load factor of 98%! For reference, the variance of probe sequence
+// length will be around 1.5 (from the graph on page 30 of the paper).
 //
 // ## Future Improvements (FIXME!)
 //
@@ -203,6 +187,8 @@ fn test_resize_policy() {
 // produces identical results to a linear naive reinsertion from the same
 // element.
 //
+// 1. Pedro Celis. ["Robin Hood Hashing"](https://cs.uwaterloo.ca/research/tr/1986/CS-86-14.pdf)
+
 // FIXME(Gankro, pczarn): review the proof and put it all in a separate doc.rs
 
 /// A hash map implementation which uses linear probing with Robin