# The reference

http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf

(page 140 or 14)
- It has range corrections
- Bias corrections

In [1]:
import hashlib
import math
import numpy as np
import random
import statistics as st

In [67]:
error_rate = 0.01
class BaseAlg:
            
    def __init__(self):
        self.k = 1
        self.m = 2**self.k

        
    def hashString(self, value):
        if not isinstance(value, str):
            value = str(value)
        
        hashcode=hashlib.sha1(value.encode('utf-8')).hexdigest()
        bin_code = bin(int(hashcode, 16))[-24:].zfill(24)
    
        return bin_code

    
    def initialise_registers(self):
        self.k = 4
        if error_rate is not None:
            self.m = (1.04/error_rate)**2
            self.k = math.ceil(math.log(self.m, 2))
        self.m = 2**self.k
        self.hll_struct = {r: None for r in range(self.m)}
    

    def alg(self, stream):
        self.initialise_registers()
        for d in test_stream:
            dh = self.hashString(d)
            r = int(dh[:self.k], 2)
            q = dh[self.k:]
            
            if q.find("1") == -1:
                count = 1
            else:
                count = q.find("1")+1
        
            if (self.hll_struct[r] is None) or (count > self.hll_struct[r])  :
                self.hll_struct[r] = count
        #v = [x for x in self.hll_struct.values() if x is not None]
        v = [2**x for x in self.hll_struct.values() if x is not None]
        print("Cardinality lower threshold: %1.1f" % (2.5*self.m))
        print("%d/%d registers holding some count" % (len(v), self.m))
        
        z = st.harmonic_mean(v)
    
        alfa_dict = {16: 0.673, 32: 0.697, 64: 0.709, 128: 0.7213/(1+(1.079/self.m))}
        alfa = 0.7213/(1+(1.079/self.m))
        #return alfa*(self.m**2)*z
        return alfa*self.m*z
        

    def verify(self, stream):
        print("Estimated distinct values: %f" % self.alg(stream))
        print("Actual distinct values: %d" % len(set(stream)))

In [54]:
# max_range = random.randint(0, 512)
# stream_size = 1024
# test_stream = [random.randint(0, max_range) for x in range(stream_size)]
test_stream = random.sample(range(0, 100000), 50000)

In [55]:
len(set(test_stream))

50000

In [68]:
hll_alg = BaseAlg()
hll_alg.verify(test_stream)

Cardinality lower threshold: 40960.0
15626/16384 registers holding some count
Estimated distinct values: 59942.821634
Actual distinct values: 50000
