In [17]:
import hashlib
import math
from bitarray import bitarray


class SimpleBloomFilter:
    """
    A fixed-size Bloom filter backed by a bitarray.

    Items are never stored; each one only sets ``num_hash_functions`` bits.
    Bits are never cleared, so an item that was added always tests as present,
    while an item that was never added may still test as present when all of
    its bit positions happen to have been set by other items.
    """

    def __init__(self, size, num_hash_functions):
        """
        Initialize the Bloom Filter
        :param size: The size of the bit array (must be positive)
        :param num_hash_functions: The number of hash functions to use (must be positive)
        :raises ValueError: If size or num_hash_functions is not positive
        """
        # Reject degenerate configurations up front: size == 0 would otherwise
        # only surface later as a ZeroDivisionError in _hash, and a
        # non-positive hash count would make contains() report True for
        # every item because there would be no bit to check.
        if size <= 0:
            raise ValueError("size must be a positive integer, got %r" % (size,))
        if num_hash_functions <= 0:
            raise ValueError(
                "num_hash_functions must be a positive integer, got %r"
                % (num_hash_functions,)
            )
        self.size = size
        self.num_hash_functions = num_hash_functions
        self.bit_array = bitarray(size)
        self.bit_array.setall(0)  # Initialize all bits to 0

    def _hash(self, item, seed):
        """
        A simple hash function that uses MD5 with a seed
        :param item: The item to hash (a string)
        :param seed: A seed to ensure different hash results
        :return: A hash value in the range of the bit array
        """
        # The seed is appended to the item text before hashing, so each seed
        # value yields a different digest for the same item.
        item_hash = hashlib.md5((item + str(seed)).encode()).hexdigest()
        return int(item_hash, 16) % self.size

    def _positions(self, item):
        """
        Yield the bit positions that represent an item
        :param item: The item whose positions are wanted
        :return: A generator of num_hash_functions indices into the bit array
        """
        # Seeds are 0 .. num_hash_functions - 1; add() and contains() must
        # derive positions identically, which is why both go through here.
        for seed in range(self.num_hash_functions):
            yield self._hash(item, seed)

    def add(self, item):
        """
        Add an item to the Bloom Filter
        :param item: The item to add
        """
        for pos in self._positions(item):
            self.bit_array[pos] = True

    def contains(self, item):
        """
        Check if an item is in the Bloom Filter
        :param item: The item to check
        :return: True if the item is likely in the set, False otherwise
        """
        # all() stops at the first unset bit, so later hashes are not computed.
        return all(self.bit_array[pos] for pos in self._positions(item))

    def __contains__(self, item):
        """
        Support the ``item in bloom_filter`` syntax
        :param item: The item to check
        :return: Same result as contains(item)
        """
        return self.contains(item)

    def __str__(self):
        """
        Return a string representation of the bit array
        """
        return self.bit_array.to01()  # Converts the bitarray to a string of 0s and 1s


# Example usage
bloom_filter = SimpleBloomFilter(size=128, num_hash_functions=5)
print("Bit array:")
print(bloom_filter)  # Calls __str__; all zeros because nothing has been added yet

# Add some items. "banana" is listed twice; the second add only sets bits
# that are already set, so it changes nothing.
sample_items = [
    "apple", "banana", "car", "bike", "bake",
    "bark", "bus", "banana", "motor", "cycle",
    "swim", "bad", "omen", "oman", "modi",
    "narendar", "suren", "deven", "dharmen", "briti",
]
for item in sample_items:
    bloom_filter.add(item)

# Print the bit array
print("Bit array:")
print(bloom_filter)  # Calls __str__

# Check if the items are in the set. Each label names the item actually being
# queried (the 'banana' line previously queried "oman" by mistake).
print("Contains 'apple':", bloom_filter.contains("apple"))  # Should be True
print("Contains 'banana':", bloom_filter.contains("banana"))  # Should be True
print("Contains 'cherry':", bloom_filter.contains("cherry"))  # Likely False
print("Contains 'sherry':", bloom_filter.contains("sherry"))  # Likely False


Bit array:
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
Bit array:
11111010000101010010100111011001100101100001100111001111101000110100110101010110000111001110100011000011010011101001101011100010
Contains 'apple': True
Contains 'banana': True
Contains 'cherry': False
Contains 'sherry': False


In [32]:
# Convert each line of comma-separated values into a 128-bit Bloom-filter
# signature and write the signatures, one per line, to a second file.
LABELS_PATH = '/mnt/nvme0/datasets/wiki_rnd1m/wikipedia_query_labels_numeric_double_common.txt'
BLOOM_PATH = '/mnt/nvme0/datasets/wiki_rnd1m/wikipedia_query_labels_double_common_bloom.txt'

# Both files are opened with an explicit utf-8 encoding so decoding does not
# depend on the platform default.
with open(LABELS_PATH, 'r', encoding='utf-8') as src, \
        open(BLOOM_PATH, 'w', encoding='utf-8') as dst:
    for line in src:
        # A fresh filter per line: each output row describes only that line.
        bloom_filter = SimpleBloomFilter(size=128, num_hash_functions=5)
        # Strip newline characters and split by commas
        values = line.strip().split(',')
        # NOTE(review): a blank line yields [''], so the empty string is still
        # added and the row is not all zeros - confirm this is intended.
        for val in values:
            bloom_filter.add(val)
        # str() gives the filter's '0'/'1' text form, one row per input line.
        dst.write(str(bloom_filter) + '\n')