Skip to content

Commit

Permalink
Replace bit shift with deruijn in hyperloglog
Browse files Browse the repository at this point in the history
  • Loading branch information
panzhongxian committed Apr 17, 2024
1 parent 804110a commit fe2ae98
Showing 1 changed file with 28 additions and 7 deletions.
35 changes: 28 additions & 7 deletions src/hyperloglog.c
Expand Up @@ -186,6 +186,7 @@ struct hllhdr {

static char *invalid_hll_err = "-INVALIDOBJ Corrupted HLL object detected";


/* =========================== Low level bit macros ========================= */

/* Macros to access the dense representation.
Expand Down Expand Up @@ -425,11 +426,19 @@ uint64_t MurmurHash64A (const void * key, int len, unsigned int seed) {
return h;
}

/* Lookup table used to find the index of the single set bit in a 64 bit
 * word. For a word equal to 2^k, multiplying it by the de Bruijn constant
 * 0x03f79d71b4ca8b09 and keeping the top six bits of the product (>> 58)
 * yields a distinct slot for every k in 0..63; the entry stored in that
 * slot is k itself.
 *
 * The table is never written to, so it is declared const. */
static const int hll_de_bruijn_64_lookup[64] = {
     0,  1, 56,  2, 57, 49, 28,  3, 61, 58, 42, 50, 38, 29, 17,  4,
    62, 47, 59, 36, 45, 43, 51, 22, 53, 39, 33, 30, 24, 18, 12,  5,
    63, 55, 48, 27, 60, 41, 37, 16, 46, 35, 44, 21, 52, 32, 23, 11,
    54, 26, 40, 15, 34, 20, 31, 10, 25, 14, 19,  9, 13,  8,  7,  6,
};


/* Given a string element to add to the HyperLogLog, returns the length
* of the pattern 000..1 of the element hash. As a side effect 'regp' is
* set to the register index this element hashes to. */
int hllPatLen(unsigned char *ele, size_t elesize, long *regp) {
uint64_t hash, bit, index;
uint64_t hash, index;
int count;

/* Count the number of zeroes starting from bit HLL_REGISTERS
Expand All @@ -448,12 +457,24 @@ int hllPatLen(unsigned char *ele, size_t elesize, long *regp) {
hash >>= HLL_P; /* Remove bits used to address the register. */
hash |= ((uint64_t)1<<HLL_Q); /* Make sure the loop terminates
and count will be <= Q+1. */
bit = 1;
count = 1; /* Initialized to 1 since we count the "00000...1" pattern. */
while((hash & bit) == 0) {
count++;
bit <<= 1;
}


/* x & -x leaves only the right-most set bit in the word.
*
* Let k be the index of that bit. Since only a single bit is set, the value
* is two to the power of k. Multiplying by a power of two is equivalent to
* left shifting, in this case by k bits. The de Bruijn (64-bit) constant is
* such that all of its six-bit consecutive substrings are distinct.
* Therefore, given a left-shifted version of this constant, we can find by
* how many bits it was shifted by looking at which six-bit substring ended
* up at the top of the word. (Knuth, volume 4, section 7.1.3)
*/
hash &= -hash;
hash *= 0x03f79d71b4ca8b09;
hash >>= 58;
/* The lookup yields the index of the lowest set bit, i.e. the number of
* trailing zeroes. Add 1 since we count the whole "00000...1" pattern. */
count = hll_de_bruijn_64_lookup[hash] + 1;

*regp = (int) index;
return count;
}
Expand Down

0 comments on commit fe2ae98

Please sign in to comment.