# The reference

http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf

(see page 140 of the proceedings, i.e. page 14 of the PDF)
- It has range corrections
- Bias corrections

In [25]:
import hashlib
import math
import numpy as np
import random
import statistics as st

In [26]:
# Let's follow the article here

# The desired error rate determines the number of registers (m) to be used:
# m is chosen so that 1.04 / sqrt(m) does not exceed the error rate.
error_rate = 0.01


class BaseAlg:
    """HyperLogLog cardinality estimator following Flajolet et al. (2007).

    Each item is hashed to a fixed-length bit string. The first ``k`` bits
    select one of ``m = 2**k`` registers and the remaining bits are scanned
    for the position of their first ``1``. Every register keeps the largest
    position seen, and the estimate is derived from the harmonic mean of
    ``2**register`` over all registers, with small- and large-range
    corrections applied.

    Attributes set by ``initialise_registers``:
        k: number of hash bits used as the register index.
        m: number of registers (``2**k``).
        hll_struct: dict mapping register index -> largest count seen.
    """

    # Number of low-order SHA-1 bits kept by hashString. The size of the hash
    # space (2**HASH_BITS) is also what the large-range correction must use.
    HASH_BITS = 24

    def __init__(self, error_rate=None):
        """Create an estimator.

        Args:
            error_rate: optional per-instance target error rate. When left as
                None, the module-level ``error_rate`` is read each time the
                registers are initialised.
        """
        self.error_rate = error_rate

    def hashString(self, value):
        """Hash ``value`` to a string of ``HASH_BITS`` binary characters.

        Non-string values are converted with ``str`` first. The result is the
        low-order ``HASH_BITS`` bits of the SHA-1 digest, left-padded with
        zeros.
        """
        if not isinstance(value, str):
            value = str(value)

        digest = hashlib.sha1(value.encode('utf-8')).hexdigest()
        # Mask and format explicitly: slicing bin() output could pick up the
        # "0b" prefix when the integer happens to be very small.
        low_bits = int(digest, 16) & ((1 << self.HASH_BITS) - 1)
        return format(low_bits, "0%db" % self.HASH_BITS)

    def initialise_registers(self):
        """Reset all registers to 0, sizing them from the target error rate.

        The instance's ``error_rate`` is used when set, otherwise the
        module-level ``error_rate``. If both are None, a previously assigned
        ``self.k`` is used as is.
        """
        rate = getattr(self, "error_rate", None)
        if rate is None:
            rate = error_rate
        if rate is not None:
            wanted_m = (1.04 / rate) ** 2
            # Round up to the next power of two so k bits index every register.
            self.k = math.ceil(math.log2(wanted_m))
        self.m = 2 ** self.k
        self.hll_struct = dict.fromkeys(range(self.m), 0)

    @staticmethod
    def _alpha(m):
        """Return the bias-correction constant alpha_m from the article."""
        small_m = {16: 0.673, 32: 0.697, 64: 0.709}
        if m in small_m:
            return small_m[m]
        return 0.7213 / (1 + (1.079 / m))

    @staticmethod
    def _rank(bits):
        """Return the 1-based position of the first "1" in ``bits``.

        A string without any "1" yields ``len(bits) + 1``, as if the first 1
        sat just past the end of the string.
        """
        first_one = bits.find("1")
        if first_one == -1:
            return len(bits) + 1
        return first_one + 1

    def alg(self, stream):
        """Estimate the number of distinct items in ``stream``.

        Args:
            stream: iterable of items; it is consumed exactly once.

        Returns:
            The estimated cardinality as a float.

        Side effects:
            Re-initialises the registers and prints which range correction
            was applied.
        """
        self.initialise_registers()
        registers = self.hll_struct
        k = self.k

        for d in stream:
            dh = self.hashString(d)
            # The first k bits pick the register; the rest carry the count.
            r = int(dh[:k], 2)
            count = self._rank(dh[k:])
            # Keep only the largest count ever seen for this register.
            if count > registers[r]:
                registers[r] = count

        # A register holding x means a pattern of probability 2**-x was
        # observed, so 2**x is the inverse of that probability.
        v = [2 ** x for x in registers.values()]
        z = st.harmonic_mean(v)

        # The article writes alpha * m**2 * Z with Z = 1 / sum(2**-M[j]);
        # the harmonic mean is m / sum(2**-M[j]), so alpha * m * z is the same.
        raw = self._alpha(self.m) * self.m * z

        if raw <= 2.5 * self.m:
            print("Small range correction")
            # Linear counting on the number of registers that are still empty.
            u = sum(1 for x in registers.values() if x == 0)
            if u != 0:
                return self.m * math.log(self.m / u)
            return raw

        # The article's 2**32 is its hash space for 32-bit hashes; the hashes
        # here have HASH_BITS bits, so the correction must use 2**HASH_BITS.
        hash_space = 2 ** self.HASH_BITS
        if raw <= hash_space / 30:
            print("Intermediate range correction")
            return raw

        print("Large range correction")
        if raw >= hash_space:
            # The estimate saturates the hash space and the logarithm below
            # is undefined, so return the uncorrected value.
            return raw
        return -hash_space * math.log(1 - raw / hash_space)

    def verify(self, stream):
        """Print the estimated and the exact distinct counts of ``stream``."""
        # Materialise once so one-shot iterators can be both estimated and
        # counted exactly.
        items = list(stream)
        print("Estimated distinct values: %f" % self.alg(items))
        print("Actual distinct values: %d" % len(set(items)))

In [27]:
# Build 50000 random dotted-quad strings, each octet drawn uniformly from 0..255.
ip_stream = [
    ".".join(str(random.randint(0, 255)) for _ in range(4))
    for _ in range(50000)
]

#test_stream = random.sample(range(0, 100000), 50000)

In [28]:
# Exact number of distinct values in ip_stream.
len(set(ip_stream))

50000

In [29]:
# Run the estimator on ip_stream; verify() prints the estimate and the exact distinct count.
hll_alg = BaseAlg()
hll_alg.verify(ip_stream)

16384
16384
16384/16384 registers holding some count
Intermediate range correction
Estimated distinct values: 50457.835143
Actual distinct values: 50000


In [30]:
# Show the first three entries of the stream.
ip_stream[:3]

['178.95.193.138', '45.124.145.28', '187.188.84.110']

In [1]:
import random, socket, struct
# NOTE(review): total_num is assigned but never used; the stream size below is
# hard-coded to 10000 -- confirm which of the two values is intended.
total_num = 1000

# Draw a random 32-bit integer in 1..0xffffffff and render its big-endian
# bytes as a dotted-quad IPv4 string.
ip_stream = [
    socket.inet_ntoa(random.randint(1, 0xFFFFFFFF).to_bytes(4, "big"))
    for _ in range(10000)
]

In [2]:
# Display the generated stream.
ip_stream

['220.52.233.217',
 '87.193.221.129',
 '162.222.82.224',
 '43.130.82.177',
 '215.212.87.179',
 '194.164.62.7',
 '169.143.149.80',
 '86.12.142.23',
 '190.30.56.13',
 '73.49.40.241',
 '103.81.188.192',
 '90.200.36.195',
 '112.192.16.245',
 '147.84.146.73',
 '90.140.50.207',
 '163.128.96.143',
 '218.24.29.94',
 '9.254.129.202',
 '153.68.160.205',
 '73.222.109.179',
 '96.224.28.68',
 '242.187.106.129',
 '33.164.136.229',
 '218.238.144.119',
 '119.223.34.18',
 '140.194.38.54',
 '88.169.74.227',
 '158.94.173.211',
 '255.130.8.123',
 '28.117.200.64',
 '188.186.140.54',
 '144.159.102.213',
 '215.241.229.156',
 '181.25.159.158',
 '169.116.95.170',
 '125.69.41.191',
 '75.203.80.29',
 '248.42.79.64',
 '221.213.222.229',
 '133.31.62.190',
 '174.95.141.237',
 '48.42.234.54',
 '186.82.245.6',
 '79.218.21.125',
 '132.49.3.10',
 '234.210.239.191',
 '112.17.54.10',
 '150.23.103.215',
 '182.145.251.92',
 '204.243.149.29',
 '92.76.249.213',
 '204.136.100.251',
 '38.40.89.235',
 '166.223.41.111',
 '57.207