In [222]:
import pandas as pd
import numpy as np

In [223]:
# Read the CSV and keep only the 'ipv4Src' column as a Series.
data = pd.read_csv('2017-07-03.csv')
source_ips = data['ipv4Src']
# dropna=False counts a missing value as one distinct entry, like len(unique()).
unique_source_ips = source_ips.nunique(dropna=False)
print(f'numer of unique source ips: {unique_source_ips}')

numer of unique source ips: 9640


In [224]:
# Exact number of rows per source IP; used as ground truth when scoring the sketch.
# value_counts tallies in vectorized code instead of a Python-level loop, and
# dropna=False keeps missing values in the tally rather than silently dropping them.
true_frequencies = source_ips.value_counts(dropna=False).to_dict()

print(f'true frequency of 35.160.100.86 is: {true_frequencies["35.160.100.86"]}')

true frequency of 35.160.100.86 is: 753


In [225]:
def init_hash_functions(d, w, keys=None):
    """Draw ``d`` random lookup-table hash functions onto ``0 .. w - 1``.

    Each hash function is a dict that maps every key to an integer drawn
    uniformly from ``[0, w)``. Values are drawn independently per key, so
    distinct keys can land in the same bucket (the tables are not collision-free).

    Args:
        d: Number of hash functions to generate.
        w: Number of buckets; every hash value lies in ``[0, w)``.
        keys: Optional iterable of items to hash (duplicates are ignored).
            Defaults to the unique values of the notebook-level ``source_ips``.

    Returns:
        A list of ``d`` dicts, each mapping key -> bucket index.
    """
    if keys is None:
        keys = source_ips.unique()
    else:
        # De-duplicate while keeping first-seen order.
        keys = list(dict.fromkeys(keys))
    # The key set is identical for every table, so it is computed once above
    # instead of being re-derived twice per hash function.
    return [dict(zip(keys, np.random.randint(0, w, len(keys)))) for _ in range(d)]

## Count Min Sketch

In [226]:
class CountMinSketch:
    """Count-Min sketch over a stream, optionally backed by Morris counters.

    The sketch is a dict keyed by ``(row, bucket)``: ``row`` is the index of a
    hash function and ``bucket`` is that function's value for an item. Cells
    are created lazily, so a bucket no stream item hashed to is simply absent.

    With ``use_morris_counters=False`` each cell is an exact integer count.
    With ``use_morris_counters=True`` each cell is a NumPy integer array of
    ``morris_estimators_num`` independent Morris counters. Every counter stores
    an exponent ``c`` whose count estimate is ``2**c - 1``; the cell's estimate
    is the mean over its counters.
    """

    def __init__(self, stream, d, w, morris_sigma=0.2, morris_delta=0.9, use_morris_counters=True):
        """Build the sketch by consuming ``stream`` once.

        Args:
            stream: Iterable of items; each must be a key of the tables
                returned by ``init_hash_functions``.
            d: Number of hash functions (sketch rows).
            w: Number of buckets per row.
            morris_sigma: Only used to size the Morris counters (see below);
                presumably the target relative error -- TODO confirm.
            morris_delta: Only used to size the Morris counters (see below);
                presumably the allowed failure probability -- TODO confirm.
            use_morris_counters: Store approximate Morris counters instead of
                exact integer counts.
        """
        self.d = d
        self.w = w
        # int() truncates, so large sigma/delta would give 0 counters and make
        # query() average over an empty list; always keep at least one.
        self.morris_estimators_num = max(1, int(1 / (morris_delta * (morris_sigma ** 2))))
        self.use_morris_counters = use_morris_counters
        self.hash_functions = init_hash_functions(d, w)
        self.sketch = self.create_sketch(stream, d, w)

    def create_sketch(self, stream, d, w):
        """Consume ``stream`` and return the populated ``{(row, bucket): cell}`` dict.

        ``d`` and ``w`` are kept in the signature for compatibility but are not
        read; rows and buckets are determined by ``self.hash_functions``.
        """
        sketch = {}
        counters_per_cell = self.morris_estimators_num

        for item in stream:
            for row, hash_function in enumerate(self.hash_functions):
                cell = (row, hash_function[item])
                if self.use_morris_counters:
                    counters = sketch.get(cell)
                    if counters is None:
                        counters = sketch[cell] = np.zeros(counters_per_cell, dtype=np.int64)
                    # Morris update: bump exponent c with probability 2**-c.
                    # A single vectorized draw replaces one RNG call per counter,
                    # which dominated the run time of the pure-Python version.
                    bump = np.random.uniform(0, 1, counters_per_cell) <= 0.5 ** counters
                    counters[bump] += 1
                else:
                    sketch[cell] = sketch.get(cell, 0) + 1
        return sketch

    def query(self, item):
        """Return the estimated frequency of ``item``: the minimum over all rows.

        Returns 0 when any of the item's buckets was never touched by the
        stream. Raises ``KeyError`` if ``item`` is not a key of the hash tables.
        """
        estimates = []
        for row, hash_function in enumerate(self.hash_functions):
            cell = self.sketch.get((row, hash_function[item]))
            if cell is None:
                # An untouched bucket holds a count of 0, which is also the minimum.
                return 0
            if self.use_morris_counters:
                # Convert each exponent c to its estimate 2**c - 1, then average.
                estimates.append(float(np.mean(2.0 ** np.asarray(cell) - 1)))
            else:
                estimates.append(cell)
        return min(estimates)

In [227]:
def evaluate_avg_bias(items, sketch, true_frequencies):
    """Return the mean relative error of ``sketch`` over the distinct ``items``.

    For each distinct item the error is ``|estimate / true - 1|``, i.e. how far
    the ratio of estimated to true frequency is from 1 in either direction.

    Args:
        items: Iterable of hashable items (e.g. a pandas Series); duplicates
            are evaluated once.
        sketch: Object exposing ``query(item)`` that returns the estimated
            frequency of ``item``.
        true_frequencies: Mapping item -> exact, non-zero frequency.

    Returns:
        The mean error rounded to two decimals, or ``0.0`` if ``items`` is empty.

    Raises:
        KeyError: If an item is missing from ``true_frequencies``.
    """
    # dict.fromkeys de-duplicates any iterable while keeping first-seen order.
    unique_items = list(dict.fromkeys(items))
    if not unique_items:
        # Nothing to score; avoid dividing by zero below.
        return 0.0

    errors = []
    for item in unique_items:
        true_frequency = true_frequencies[item]
        ratio = float(sketch.query(item) / true_frequency)
        errors.append(abs(ratio - 1))
    return round(sum(errors) / len(errors), 2)

In [228]:
# Seed NumPy's global RNG so the randomly drawn hash tables and the Morris-counter
# coin flips (and therefore the reported error) are reproducible across runs.
SEED = 0
np.random.seed(SEED)

d = 2      # number of hash functions (sketch rows)
w = 1000   # number of buckets per row
count_min_sketch = CountMinSketch(source_ips, d, w)

KeyboardInterrupt: 

In [None]:
# Score the sketch against the exact counts; the result is shown as the cell output.
evaluate_avg_bias(items=source_ips, sketch=count_min_sketch, true_frequencies=true_frequencies)