# The reference

http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf

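(See page 140 in the paper's own numbering, which is page 14 of the PDF.)

The algorithm described there includes:
- Range corrections (small range and large range)
- Bias correction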

In [19]:
import hashlib
import math
import numpy as np
import random
import statistics as st

In [40]:
# Target relative error of the estimate; it determines how many registers are
# allocated. Set to None to fall back to the minimum register count.
error_rate = 0.01


class BaseAlg:
    """HyperLogLog-style estimator of the number of distinct values in a stream.

    Each value is hashed to a fixed-width bit string. The leading ``k`` bits
    select one of ``m = 2**k`` registers, and the register keeps the largest
    "rank" (1-based position of the first ``1`` bit) seen in the remaining
    bits. The estimate is derived from the harmonic mean of ``2**register``
    over all registers, followed by small/large range corrections.
    """

    # Number of low-order SHA-1 bits kept for every hashed value.
    _HASH_BITS = 24
    # Smallest index width: the alpha constants below start at m = 2**4, and
    # an index width of 0 would leave no bits to select a register with.
    _MIN_INDEX_BITS = 4

    def __init__(self):
        # Placeholders; the real sizes are set by initialise_registers().
        self.k = 1
        self.m = 2**self.k
        self.hll_struct = {}

    def hashString(self, value):
        """Return the low 24 bits of the SHA-1 of ``value`` as a '0'/'1' string.

        Non-string values are converted with ``str()`` before hashing. The
        result is always exactly 24 characters long, left-padded with zeros.
        """
        if not isinstance(value, str):
            value = str(value)

        digest = hashlib.sha1(value.encode('utf-8')).hexdigest()
        # Mask instead of slicing bin() output so that the "0b" prefix can
        # never leak into the result for numerically small digests.
        low_bits = int(digest, 16) & ((1 << self._HASH_BITS) - 1)
        return format(low_bits, "0%db" % self._HASH_BITS)

    def initialise_registers(self):
        """Size the register table from ``error_rate`` and reset every register to 0.

        The register count is the smallest power of two that is at least
        ``(1.04 / error_rate) ** 2``, but never fewer than ``2**4`` registers.
        """
        index_bits = self._MIN_INDEX_BITS
        if error_rate is not None:
            wanted_registers = (1.04 / error_rate) ** 2
            index_bits = max(
                self._MIN_INDEX_BITS,
                math.ceil(math.log2(wanted_registers)),
            )
        self.k = index_bits
        self.m = 2**self.k
        self.hll_struct = dict.fromkeys(range(self.m), 0)

    def alg(self, stream):
        """Estimate the number of distinct values in ``stream``.

        Args:
            stream: iterable of values; each one is hashed via ``hashString``.

        Returns:
            The estimated distinct count as a number (0 for an empty stream).

        Side effects:
            Resets the registers, then prints the per-register ``2**rank``
            values, the number of non-empty registers, and which range
            correction was applied.
        """
        self.initialise_registers()
        for item in stream:
            bits = self.hashString(item)
            register = int(bits[:self.k], 2)
            suffix = bits[self.k:]

            # Rank = 1-based position of the first 1 bit in the suffix. When
            # the suffix holds no 1 bit at all, the rank is one past its end.
            first_one = suffix.find("1")
            rank = first_one + 1 if first_one != -1 else len(suffix) + 1

            if rank > self.hll_struct[register]:
                self.hll_struct[register] = rank

        v = [2**x for x in self.hll_struct.values()]
        print(v)
        occupied = sum(1 for x in self.hll_struct.values() if x != 0)
        print("%d/%d registers holding some count" % (occupied, self.m))

        z = st.harmonic_mean(v)

        # Bias-correction constant: fixed values for the small register
        # counts, closed-form approximation for everything larger.
        alfa_dict = {16: 0.673, 32: 0.697, 64: 0.709}
        alfa = alfa_dict.get(self.m, 0.7213 / (1 + (1.079 / self.m)))

        raw = alfa * self.m * z

        if raw <= 2.5 * self.m:
            print("Small range correction")
            empty = sum(1 for x in self.hll_struct.values() if x == 0)
            if empty != 0:
                # Linear counting on the number of untouched registers.
                return self.m * math.log(self.m / empty)
            return raw

        # NOTE(review): the 2**32 constants below assume a 32-bit hash, while
        # hashString() keeps only 24 bits -- confirm which width is intended.
        if raw <= (1 / 30) * 2**32:
            print("Intermediate range correction")
            return raw

        print("Large range correction")
        return -(2**32) * math.log(1 - raw / (2**32))

    def verify(self, stream):
        """Print the estimated and the exact distinct counts for ``stream``.

        ``stream`` is iterated twice (once by ``alg``, once to build a set),
        so it must be re-iterable, e.g. a list rather than a generator.
        """
        print("Estimated distinct values: %f" % self.alg(stream))
        print("Actual distinct values: %d" % len(set(stream)))

In [41]:
# Alternative input with many repeated values (disabled):
#   max_range = random.randint(0, 512)
#   stream_size = 1024
#   test_stream = [random.randint(0, max_range) for x in range(stream_size)]
#
# Active input: 50,000 integers sampled without replacement from
# range(100000), so every value in the stream is distinct.
test_stream = random.sample(range(100_000), 50_000)

In [42]:
# Exact number of distinct values in the test stream, for comparison with the estimate.
len(set(test_stream))

50000

In [43]:
# Build an estimator and print its estimate next to the exact distinct count
# of the test stream (verify() also prints the register contents via alg()).
hll_alg = BaseAlg()
hll_alg.verify(test_stream)

[16, 16, 8, 16, 1, 2, 256, 4, 32, 4, 4, 32, 1024, 2, 32, 8, 2, 4, 4, 8, 128, 32, 32, 2, 32, 16, 8, 16, 8, 512, 8, 16, 16, 16, 16, 8, 8, 64, 2, 4, 16, 2, 2, 64, 8, 32, 8, 8, 16, 16, 64, 2, 4, 4, 4, 8, 16, 8, 8, 16, 16, 16, 4, 4, 16, 4, 64, 1, 16, 32, 1, 4, 16, 256, 4, 32, 4, 4, 4, 4, 16, 4, 2, 8, 4, 16, 4, 8, 32, 32, 16, 16, 2, 2, 8, 32, 2, 32, 64, 32, 32, 32, 8, 2, 2, 2, 16, 8, 32, 32, 2, 8, 2, 4, 8, 8, 32, 8, 16, 16, 16, 32, 16, 4, 4, 2, 2, 1, 2, 2, 4, 16, 8, 4, 8, 2, 8, 64, 8, 2, 1024, 16, 128, 2, 16, 8, 2, 2, 2, 2, 8, 2, 32, 4, 2, 8, 32, 4, 4, 4, 2, 256, 4, 4, 64, 16, 8, 32, 2, 8, 8, 8, 8, 32, 8, 2, 64, 1, 8, 2, 16, 16, 32, 8, 32, 4, 128, 16, 16, 4, 256, 2, 2, 2, 1, 1, 2, 2, 4, 2, 16, 2, 8, 16, 2, 4, 4, 32, 2, 4, 4, 8, 32, 16, 8, 2, 4, 8, 4, 4, 4, 16, 4, 8, 2, 32, 4, 16, 4, 32, 4, 64, 2, 16, 2, 4, 4, 128, 2, 8, 4, 32, 8, 32, 64, 8, 64, 8, 256, 2, 32, 8, 2, 4, 4, 4, 64, 16, 2, 16, 4, 128, 8, 2, 4, 32, 2, 16, 16, 4, 2, 16, 16, 4, 8, 8, 4, 8, 128, 8, 64, 1024, 2, 16, 16, 32, 4, 2, 2, 2