In [1]:
import hyperloglog
import random
import string
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
# Parameters
k = 16  # Number of HLL centers
m = 1600  # Total unique elements
precision = 0.01  # Value handed to every HyperLogLog(...) constructor in this notebook
SEED = 42  # Fixed so item generation and center assignment repeat on Restart & Run All

# Both random_string() and the item-to-center assignment draw from the global
# `random` generator, so seeding it once here makes the whole run reproducible.
random.seed(SEED)

In [3]:
def random_string(length=30):
    """Return a random string made of `length` ASCII letters.

    Characters are drawn independently (with replacement) from
    `string.ascii_letters` using the global `random` generator.
    """
    letters = random.choices(string.ascii_letters, k=length)
    return ''.join(letters)

# Workload: m random strings. Uniqueness is not enforced here; each string is
# generated independently by random_string().
items = [random_string() for _ in range(m)]

### Get the estimate as if processing everything at one center

In [4]:
# Baseline: one sketch that receives every item.
test_one_center = hyperloglog.HyperLogLog(error_rate=precision)

In [5]:
# Feed every item into the single sketch.
for item in items:
    test_one_center.add(item)
# len() on the sketch yields its cardinality estimate.
one_center_cardinality = len(test_one_center)
print(f"Processing everything at one center: {one_center_cardinality}")

Processing everything at one center: 1607


In [14]:
# Exact distinct count of the generated strings. `m` is later reported as the
# actual number of items, which only holds if no two random strings collide,
# so enforce that instead of just displaying the number.
unique_item_count = len(set(items))
assert unique_item_count == m, f"expected {m} unique items, got {unique_item_count}"
unique_item_count

1600

### Processing at k different centers

In [6]:
# One independent sketch per center, all constructed with the same setting
centers = [hyperloglog.HyperLogLog(error_rate=precision) for _ in range(k)]

In [7]:
# Route every item to exactly one center, chosen uniformly at random
for item in items:
    center_idx = random.randrange(k)  # uniform over 0 .. k-1
    centers[center_idx].add(item)

In [8]:
# Per-center estimates: len() of each sketch, in center order
estimates = list(map(len, centers))
for i, est in enumerate(estimates):
    print(f"Center {i}: Estimated count = {est}")

# Each item was added to exactly one center, so the per-center estimates can
# be summed and compared against the number of generated items.
total_estimated = sum(estimates)
print(f"\nSum of individual HLL estimates: {total_estimated}")
print(f"Actual number of items: {m}")

Center 0: Estimated count = 93
Center 1: Estimated count = 81
Center 2: Estimated count = 105
Center 3: Estimated count = 115
Center 4: Estimated count = 112
Center 5: Estimated count = 109
Center 6: Estimated count = 94
Center 7: Estimated count = 79
Center 8: Estimated count = 100
Center 9: Estimated count = 98
Center 10: Estimated count = 102
Center 11: Estimated count = 105
Center 12: Estimated count = 120
Center 13: Estimated count = 102
Center 14: Estimated count = 101
Center 15: Estimated count = 83

Sum of individual HLL estimates: 1599
Actual number of items: 1600


In [9]:
# Empty sketch into which every center's sketch gets merged.
coordinator = hyperloglog.HyperLogLog(error_rate=precision)

In [10]:
# Merge the centers into the coordinator one sketch at a time.
for center in centers:
    coordinator.update(center)

In [11]:
# Estimate read from the merged sketch, for comparison with the single-center run.
distributed_cardinality = len(coordinator)
print(f"Processing in a distributed manner: {distributed_cardinality}")

Processing in a distributed manner: 1607


In [12]:
# Compare the single-sketch estimate with the estimate from the merged sketch.
estimates_match = one_center_cardinality == distributed_cardinality
print(
    "The estimate of processing all the items at one center and in a "
    f"distributed setting is equal: {estimates_match}"
)

The estimate of processing all the items at once center and in a distributed setting is equal: True
