In [1]:
import hashlib

def compute_word_hash(word, bit_size=8):
    """Return the lowest ``bit_size`` bits of the MD5 digest of *word*.

    MD5 is used here only as a fast, well-distributed fingerprint, not for
    any security purpose.

    Args:
        word: Text to hash; it is encoded with the default ``str.encode()``
            codec (UTF-8) before hashing.
        bit_size: Number of bits to return. Sizes larger than the 128-bit
            digest are left-padded with zeros; ``0`` yields an empty list.

    Returns:
        A list of ``bit_size`` ints (each 0 or 1), most significant bit
        first.

    Raises:
        ValueError: If ``bit_size`` is negative.
    """
    if bit_size < 0:
        raise ValueError(f"bit_size must be non-negative, got {bit_size}")

    digest = hashlib.md5(word.encode()).digest()
    hash_int = int.from_bytes(digest, "big")

    # Pull the bits out arithmetically instead of slicing bin()'s output:
    # bin() prefixes its result with '0b', so a slice wider than the number
    # of significant bits would pick up the 'b' and fail in int().
    return [(hash_int >> shift) & 1 for shift in range(bit_size - 1, -1, -1)]

def calculate_simhash(text, bit_size=8):
    """Build a SimHash fingerprint for *text* from per-word bit votes.

    The text is split on whitespace. Every word occurrence is hashed with
    ``compute_word_hash`` and votes +1 on each bit position where its hash
    has a 1 and -1 where it has a 0. A position ends up as '1' in the
    fingerprint only when its tally is strictly positive.

    Args:
        text: Input string to fingerprint.
        bit_size: Width of the fingerprint in bits.

    Returns:
        A ``(simhash_binary, word_hashes)`` tuple: the fingerprint as a
        string of '0'/'1' characters, and a dict mapping each distinct word
        to its list of hash bits.
    """
    tallies = [0] * bit_size
    word_hashes = {}

    for token in text.split():
        bits = compute_word_hash(token, bit_size)

        # Repeated words overwrite the same dict entry but still vote again.
        word_hashes[token] = bits

        tallies = [
            tally + (1 if bits[pos] == 1 else -1)
            for pos, tally in enumerate(tallies)
        ]

    # A tally of exactly zero (a tie, or no words at all) resolves to '0'.
    simhash_binary = ''.join('1' if tally > 0 else '0' for tally in tallies)
    return simhash_binary, word_hashes

# Demo: fingerprint one sample sentence with the default 8-bit width
sample_text = (
    "In the fast-paced world of technology, innovations like artificial "
    "intelligence and quantum computing are changing the landscape rapidly."
)
simhash_value, hashes = calculate_simhash(sample_text)

# Show each distinct word's hash bits, then the combined fingerprint
print("Word Hashes:")
for token, bits in hashes.items():
    print(f"{token}: {bits}")
print(f"\nSimHash (8-bit): {simhash_value}")


Word Hashes:
In: [0, 1, 0, 0, 1, 1, 0, 0]
the: [0, 1, 0, 1, 0, 1, 1, 1]
fast-paced: [0, 1, 1, 1, 0, 0, 1, 1]
world: [1, 1, 1, 0, 0, 1, 1, 1]
of: [1, 1, 1, 0, 0, 1, 0, 0]
technology,: [0, 0, 1, 1, 0, 0, 1, 1]
innovations: [1, 1, 1, 1, 0, 1, 1, 1]
like: [0, 0, 0, 0, 1, 0, 1, 1]
artificial: [1, 1, 0, 1, 0, 1, 1, 0]
intelligence: [0, 0, 0, 0, 0, 1, 0, 1]
and: [0, 1, 1, 1, 1, 0, 0, 0]
quantum: [0, 0, 0, 1, 0, 1, 1, 1]
computing: [1, 0, 0, 0, 0, 0, 1, 0]
are: [1, 0, 0, 0, 0, 1, 1, 1]
changing: [0, 0, 0, 0, 0, 1, 0, 1]
landscape: [0, 1, 0, 1, 0, 1, 0, 0]
rapidly.: [1, 1, 1, 0, 0, 0, 1, 1]

SimHash (8-bit): 01000111
