In [95]:
import pandas as pd
import numpy as np
import os
import glob
from collections import Counter
import math
from bitarray import bitarray
import mmh3
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

In [56]:
# Location of the input file. A raw string keeps every backslash of the Windows
# path literal; the non-raw spelling contains "\U", which Python 3 rejects as a
# truncated unicode escape.
# NOTE(review): hard-coded absolute local path - point this at your own copy.
IP_FILE = r"C:\Users\Apourva\Downloads\ips.txt"

# The file has no header row; the values of its first column become the IP list.
df_ip = pd.read_table(IP_FILE, header=None)
iplist = df_ip[0].tolist()

In [107]:
# Split the IP list at its midpoint: the first part is the training set, the
# rest is the test set. A single floor-division split point guarantees the two
# slices partition the list exactly. (floor(len/2) and ceil(len/2) only agree
# when "/" truncates; with true division an odd-length list would lose its
# middle element from both halves.)
l1 = l2 = len(iplist) // 2
training = iplist[:l1]
test = iplist[l2:]

# Every address must land in exactly one of the two halves.
assert len(training) + len(test) == len(iplist)

print(len(training) + len(test))
print(len(training))

3414010
1707005


In [58]:
# NOTE(review): this dict is never read in the visible cells; kept in case a
# later cell relies on it.
match_test_ip_in_training = {}

# Distinct IP addresses that occur in both the training and the test half.
# These are the true members the Bloom filter is expected to report.
intr = list(set(training) & set(test))

# One pre-formatted string keeps the printed text identical while being valid
# under both Python 2 and Python 3.
print('Number of IP addresses common in Test and Training:  %d' % len(intr))

Number of IP addresses common in Test and Training:  50273


In [82]:
def bloom_filter(n, items=None, seeds=(42, 24, 84)):
    """Build a Bloom filter of ``n`` bits from a collection of items.

    Every item is converted to ``str`` and hashed once per seed with
    ``mmh3.hash``; each hash value is reduced modulo ``n`` and the bit at
    that index is set to 1.

    Parameters
    ----------
    n : int
        Number of bits in the filter (the size of the hash space).
    items : iterable, optional
        Values to insert. When omitted, the notebook-level ``training`` list
        is used, which is what the original one-argument call did.
    seeds : sequence of int, optional
        One seed per hash function. The default reproduces the three seeds
        the notebook has always used; a lookup must use the same seeds.

    Returns
    -------
    bitarray
        A bit array of length ``n`` with the bits of all inserted items set.
    """
    if items is None:
        items = training

    bit_array = bitarray(n)
    bit_array.setall(0)

    for item in items:
        key = str(item)  # stringify once instead of once per hash function
        for seed in seeds:
            # Python's % yields a value in [0, n) for positive n even if the
            # hash is negative, so the result is always a valid index.
            bit_array[mmh3.hash(key, seed) % n] = 1

    return bit_array




In [86]:
def calculate_false_positives(bit_array, n, candidates=None, known=None,
                              seeds=(42, 24, 84)):
    """Return the items a Bloom filter reports as present although they are not.

    Each candidate is converted to ``str`` (the same way ``bloom_filter``
    prepares its keys) and hashed once per seed with ``mmh3.hash``. A
    candidate counts as "found" when every bit at ``hash % n`` is set. Found
    candidates that are not in ``known`` are the false positives.

    Parameters
    ----------
    bit_array : indexable of bits
        The filter to query, e.g. the result of ``bloom_filter(n)``.
    n : int
        Size of the hash space; must be the value the filter was built with.
    candidates : iterable, optional
        Values to look up. Defaults to the notebook-level ``test`` list.
    known : iterable, optional
        Values that really are members. Defaults to the notebook-level
        ``intr`` list (IPs present in both training and test).
    seeds : sequence of int, optional
        One seed per hash function; must match the seeds used to build the
        filter. The default reproduces the notebook's original three seeds.

    Returns
    -------
    list
        The distinct false positives, in no particular order.
    """
    if candidates is None:
        candidates = test
    if known is None:
        known = intr

    reported = set()
    for item in candidates:
        # Stringify exactly like bloom_filter does, so that a value is looked
        # up under the same key it was inserted with.
        key = str(item)
        if all(bit_array[mmh3.hash(key, seed) % n] for seed in seeds):
            reported.add(item)

    return list(reported - set(known))

In [87]:
# Experiment 1: hash space of 10K bits.
n = 10000
bit_array_10k = bloom_filter(n)
# A single pre-formatted string prints the same text as before and is valid
# under both Python 2 and Python 3.
print("number of false positives:  %d" % len(calculate_false_positives(bit_array_10k, n)))

number of false positives:  200825


In [88]:
# Experiment 2: hash space of 100K bits.
n = 100000
bit_array_100k = bloom_filter(n)
# A single pre-formatted string prints the same text as before and is valid
# under both Python 2 and Python 3.
print("number of false positives:  %d" % len(calculate_false_positives(bit_array_100k, n)))

number of false positives:  200815


In [89]:
# Experiment 3: hash space of 1M bits.
n = 1000000
bit_array_1M = bloom_filter(n)
# A single pre-formatted string prints the same text as before and is valid
# under both Python 2 and Python 3.
print("number of false positives:  %d" % len(calculate_false_positives(bit_array_1M, n)))

number of false positives:  57833


In [92]:
# Experiment 4: hash space of 10M bits.
n = 10000000
bit_array_10M = bloom_filter(n)
# A single pre-formatted string prints the same text as before and is valid
# under both Python 2 and Python 3.
print("number of false positives:  %d" % len(calculate_false_positives(bit_array_10M, n)))

number of false positives:  209


In [93]:
# Experiment 5: hash space of 1B bits.
n = 1000000000
bit_array_1B = bloom_filter(n)
# A single pre-formatted string prints the same text as before and is valid
# under both Python 2 and Python 3.
print("number of false positives:  %d" % len(calculate_false_positives(bit_array_1B, n)))

number of false positives:  0


We split the list of IP addresses into two equal halves. Let's call the first half the training dataset and the second half the test dataset. We create a Bloom filter for hash-space sizes of 10K, 100K, 1M, 10M and 1B bits, using three hash functions (the same hash with three different seed values). The IP addresses in the training dataset are mapped into the hash space of each size.

For each IP address in the test dataset, we check whether the Bloom filter reports it as present in the training dataset: we apply the same hash functions to the address and test whether all of the corresponding bits are set.

In this dataset, there are actually 50273 IP addresses in the test dataset which are also present in the training dataset. Now we check how many IP addresses in the test dataset have been found in the training data by the Bloom filter. If the Bloom filter finds an IP address in the hash space even though it doesn't actually exist in the training dataset, this is a false positive.

The table below shows the number of false positives generated for different sizes of hash space. 

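The probability of false positives increases when the size of the hash space is much smaller than the number of distinct values to be mapped into it. In this case, an IP address in the test data that is not actually present in the training data may get hashed to the same positions as IP addresses that are indeed present, leading to a false positive. This is also why the counts for 10K and 100K are almost identical: with about 1.7 million training addresses and three hash functions, practically every bit of such a small filter ends up set, so nearly every test address that is absent from the training data is reported as present.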

However, as the size of the hash space increases, we see that the number of collisions decreases, and for a hash space of 1 billion bits there are no false positives.

| Hash Space Size | Number of False Positives |
|-----------------|---------------------------|
| 10000           | 200825                    |
| 100000          | 200815                    |
| 1000000         | 57833                     |
| 10000000        | 209                       |
| 1000000000      | 0                         |