# The reference

http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf

(page 140 or 14)
- It has range corrections
- Bias corrections

In [3]:
import hashlib
import math
import numpy as np
import random
import statistics as st

In [4]:
error_rate = 0.0001
class BaseAlg:
            
    def __init__(self):
        self.k = 1
        self.m = 2**self.k

        
    def hashString(self, value):
        if not isinstance(value, str):
            value = str(value)
        
        hashcode=hashlib.sha1(value.encode('utf-8')).hexdigest()
        bin_code = bin(int(hashcode, 16))[-24:].zfill(24)
    
        return bin_code

    
    def initialise_registers(self):
        self.k = 4
        if error_rate is not None:
            self.m = (1.04/error_rate)**2
            self.k = math.ceil(math.log(self.m, 2))
        self.m = 2**self.k
        self.hll_struct = {r: 0 for r in range(self.m)}
    

    def alg(self, stream):
        self.initialise_registers()
        for d in stream:
            dh = self.hashString(d)
            r = int(dh[:self.k], 2)
            q = dh[self.k:]
            
            if q.find("1") == -1:
                count = 1
            else:
                count = q.find("1")+1
        
            if (count > self.hll_struct[r])  :
                self.hll_struct[r] = count
        v = [x for x in self.hll_struct.values() if x is not None]
        v = [2**x for x in self.hll_struct.values()]
        #print(v)
        print("%d/%d registers holding some count" % (len(v), self.m))
        
        z = st.harmonic_mean(v)
    
        alfa_dict = {16: 0.673, 32: 0.697, 64: 0.709, 128: 0.7213/(1+(1.079/self.m))}
        alfa = alfa_dict[128]

        raw = alfa*self.m*z
        
        if raw <= 2.5*self.m:
            print("Small range correction")
            u = len([x for x in self.hll_struct.values() if x==0])
            if u != 0:
                return self.m*math.log(self.m/u)
            else:
                return raw
        elif raw <= (1/30)*2**32:
            print("Intermediate range correction")
            return raw
        else:
            print("Large range correction")
            return -(2**32)*math.log(1-raw/(2**32))
                
        return 0
      

    def verify(self, stream):
        print("Estimated distinct values: %f" % self.alg(stream))
        print("Actual distinct values: %d" % len(set(stream)))

In [5]:
ip_stream = []
for i in range(50000):
    ip = ".".join(map(str, (random.randint(0, 255) 
                        for _ in range(4))))
    ip_stream.append(ip)

#test_stream = random.sample(range(0, 100000), 50000)

In [6]:
len(set(ip_stream))

50000

In [7]:
hll_alg = BaseAlg()
hll_alg.verify(ip_stream)

134217728/134217728 registers holding some count
Small range correction
Estimated distinct values: 49933.287238
Actual distinct values: 50000


In [9]:
ip_stream[:3]

['166.103.235.52', '104.255.224.140', '88.95.22.55']

In [4]:
import random, socket, struct
ip_stream = []

total_num = 1000
ip_stream = [(socket.inet_ntoa(struct.pack('>I', random.randint(1, 0xffffffff)))) for _ in range(10000)]

'7.108.237.29'