In [14]:
# Step 1: Read the input files
file1_words = []
file2_words = []
file3_words = []

# Function to read words from a file
def read_words_from_file(file_path):
    with open(file_path, 'r') as file:
        words = file.read().split()
    return words

# Replace 'file1.txt', 'file2.txt', and 'file3.txt' with actual file paths
file1_words = read_words_from_file('file1.txt')
file2_words = read_words_from_file('file2.txt')
file3_words = read_words_from_file('file3.txt')

print("File 1 words:", file1_words)
print("File 2 words:", file2_words)
print("File 3 words:", file3_words)


File 1 words: ['hey', 'how', 'are', 'you', 'doing']
File 2 words: ['an', 'understanding', 'man', 'is', 'smart']
File 3 words: ['Hard', 'times', 'always', 'lead', 'to', 'something', 'great']


In [15]:
# Step 2: Generate singles for words and letters
def generate_singles(words):
    singles = set()
    for word in words:
        singles.update(word.lower())  # Considering case-insensitive singles
    return sorted(list(singles))

file1_singles = generate_singles(file1_words)
file2_singles = generate_singles(file2_words)
file3_singles = generate_singles(file3_words)

print("File 1 singles:", file1_singles)
print("File 2 singles:", file2_singles)
print("File 3 singles:", file3_singles)


File 1 singles: ['a', 'd', 'e', 'g', 'h', 'i', 'n', 'o', 'r', 'u', 'w', 'y']
File 2 singles: ['a', 'd', 'e', 'g', 'i', 'm', 'n', 'r', 's', 't', 'u']
File 3 singles: ['a', 'd', 'e', 'g', 'h', 'i', 'l', 'm', 'n', 'o', 'r', 's', 't', 'w', 'y']


In [16]:
# Step 3: Create an incidence matrix
unique_words = list(set(file1_words + file2_words + file3_words))
incidence_matrix = []

for word in unique_words:
    row = [1 if word in file1_words else 0,
           1 if word in file2_words else 0,
           1 if word in file3_words else 0]
    incidence_matrix.append(row)

print("Incidence matrix:")
for row in incidence_matrix:
    print(row)


Incidence matrix:
[1, 0, 0]
[1, 0, 0]
[1, 0, 0]
[1, 0, 0]
[0, 0, 1]
[0, 1, 0]
[0, 1, 0]
[0, 1, 0]
[0, 0, 1]
[0, 0, 1]
[0, 0, 1]
[0, 1, 0]
[0, 0, 1]
[0, 0, 1]
[1, 0, 0]
[0, 1, 0]
[0, 0, 1]


In [17]:
# Step 4: Min-hashing on incidence matrix with 2 hash functions
num_hashes = 2  # Number of hash functions

def hash_function_1(x):
    return (2 * x + 1) % 11

def hash_function_2(x):
    return (5 * x + 2) % 11

def generate_min_hash(incidence_matrix, num_hashes):
    min_hash_values = []
    num_docs = len(incidence_matrix[0])

    for i in range(num_hashes):
        min_hash = []
        for word_row in incidence_matrix:
            hash_val = float('inf')
            for doc_idx in range(num_docs):
                if word_row[doc_idx] == 1:
                    if i == 0:
                        hash_val = min(hash_val, hash_function_1(doc_idx))
                    elif i == 1:
                        hash_val = min(hash_val, hash_function_2(doc_idx))
            min_hash.append(hash_val)
        min_hash_values.append(min_hash)

    return min_hash_values

min_hash_values = generate_min_hash(incidence_matrix, num_hashes)

print("Min-hash values:")
for i, min_hash in enumerate(min_hash_values):
    print(f"Hash Function {i+1}: {min_hash}")


Min-hash values:
Hash Function 1: [1, 1, 1, 1, 5, 3, 3, 3, 5, 5, 5, 3, 5, 5, 1, 3, 5]
Hash Function 2: [2, 2, 2, 2, 1, 7, 7, 7, 1, 1, 1, 7, 1, 1, 2, 7, 1]


In [13]:
import re
from tabulate import tabulate

# Function to generate K-shingles from a document
def generate_k_shingles(document, k):
    words = re.findall(r'\w+', document.lower())
    shingles = set()
    for i in range(len(words) - k + 1):
        shingle = ' '.join(words[i:i+k])
        shingles.add(shingle)
    return shingles

# Function to create K-shingles incidence matrix
def create_incidence_matrix(documents, k):
    incidence_matrix = {}
    for i, doc in enumerate(documents):
        shingles = generate_k_shingles(doc, k)
        for shingle in shingles:
            if shingle in incidence_matrix:
                incidence_matrix[shingle][i] = 1
            else:
                incidence_matrix[shingle] = [0] * len(documents)
                incidence_matrix[shingle][i] = 1
    return incidence_matrix

# Function to apply Minhash functions and get signature
def get_minhash_signature(incidence_matrix, num_hashes):
    signature_matrix = []
    for shingle, row in incidence_matrix.items():
        signature = []
        for i in range(num_hashes):
            minhash_value = float('inf')
            for j, val in enumerate(row):
                if val == 1:
                    hash_value = (2 * j + 3) % 11  # Apply h1 = (2x + 3) mod 11
                    minhash_value = min(minhash_value, hash_value)
            signature.append(minhash_value)
        signature_matrix.append(signature)
    return signature_matrix

# Example usage
file1 = "file1.txt"
file2 = "file2.txt"
file3 = "file3.txt"

documents = []

# Read the contents of the files
with open(file1, "r") as f:
    document1 = f.read()
    documents.append(document1)

with open(file2, "r") as f:
    document2 = f.read()
    documents.append(document2)

with open(file3, "r") as f:
    document3 = f.read()
    documents.append(document3)

k = 3

incidence_matrix = create_incidence_matrix(documents, k)

# Convert incidence matrix to a table format
table = []
header = ["Shingle"] + [f"Document {i+1}" for i in range(len(documents))]
for shingle, row in incidence_matrix.items():
    table.append([shingle] + row)

# Print the incidence matrix in table format
print("Incidence Matrix:")
print(tabulate(table, headers=header, tablefmt="grid"))

num_hashes = 2
signature_matrix = get_minhash_signature(incidence_matrix, num_hashes)

# Convert signature matrix to a table format
table = []
header = ["Shingle"] + [f"Hash {i+1}" for i in range(num_hashes)]
for i, signature in enumerate(signature_matrix):
    table.append([f"Shingle {i+1}"] + signature)

# Print the Minhash signature matrix in table format
print("\nMinhash Signature Matrix:")
print(tabulate(table, headers=header, tablefmt="grid"))


Incidence Matrix:
+----------------------+--------------+--------------+--------------+
| Shingle              |   Document 1 |   Document 2 |   Document 3 |
| are you doing        |            1 |            0 |            0 |
+----------------------+--------------+--------------+--------------+
| how are you          |            1 |            0 |            0 |
+----------------------+--------------+--------------+--------------+
| hey how are          |            1 |            0 |            0 |
+----------------------+--------------+--------------+--------------+
| man is smart         |            0 |            1 |            0 |
+----------------------+--------------+--------------+--------------+
| an understanding man |            0 |            1 |            0 |
+----------------------+--------------+--------------+--------------+
| understanding man is |            0 |            1 |            0 |
+----------------------+--------------+--------------+--------------+
| 

In [20]:
import numpy as np
from tabulate import tabulate

# Define the hash functions
def hash_function_1(x):
    return (2 * x + 1) % 11

def hash_function_2(x):
    return (5 * x + 2) % 11

# Function to generate shingles for words and letters
def generate_shingles(text):
    words = text.split()
    letters = "".join(text.split())
    return set(words), set(letters)

# Function to create the incidence matrix for unique words in the documents
def create_incidence_matrix(documents):
    unique_words = sorted(set(word for doc in documents for word in doc))
    incidence_matrix = np.zeros((len(unique_words), len(documents)), dtype=int)

    for j, doc in enumerate(documents):
        for i, word in enumerate(unique_words):
            incidence_matrix[i, j] = int(word in doc)

    return unique_words, incidence_matrix

# Function to perform min-hashing on the incidence matrix
def min_hashing(incidence_matrix, hash_functions):
    num_hashes = len(hash_functions)
    num_docs = incidence_matrix.shape[1]
    min_hash_signatures = np.full((num_hashes, num_docs), np.inf)

    for i in range(incidence_matrix.shape[0]):
        for j in range(num_docs):
            if incidence_matrix[i, j] == 1:
                for k, hash_func in enumerate(hash_functions):
                    hash_value = hash_func(i)
                    min_hash_signatures[k, j] = min(min_hash_signatures[k, j], hash_value)

    return min_hash_signatures

# Main function
def main():
    # Read three input files, each containing five words
    with open("file1.txt", "r") as file:
        input_file1 = file.read().strip()

    with open("file2.txt", "r") as file:
        input_file2 = file.read().strip()

    with open("file3.txt", "r") as file:
        input_file3 = file.read().strip()

    # Generate shingles for words and letters
    shingles1_words, shingles1_letters = generate_shingles(input_file1)
    shingles2_words, shingles2_letters = generate_shingles(input_file2)
    shingles3_words, shingles3_letters = generate_shingles(input_file3)

    # Create the incidence matrix for unique words in the documents
    unique_words, incidence_matrix = create_incidence_matrix(
        [shingles1_words, shingles2_words, shingles3_words]
    )

    # Perform min-hashing on the incidence matrix
    hash_functions = [hash_function_1, hash_function_2]
    min_hash_signatures = min_hashing(incidence_matrix, hash_functions)

    # Print the incidence matrix using tabulate
    table_data = [["Word"] + [f"Document {i+1}" for i in range(3)]]
    for i, word in enumerate(unique_words):
        row = [word] + list(incidence_matrix[i])
        table_data.append(row)

    print("Incidence Matrix:")
    print(tabulate(table_data, headers="firstrow"))

    # Print the min-hash signatures
    print("\nMin-Hash Signatures:")
    for i, signature in enumerate(min_hash_signatures):
        print(f"Signature for hash function {i+1}: {list(signature)}")

if __name__ == "__main__":
    main()


Incidence Matrix:
Word             Document 1    Document 2    Document 3
-------------  ------------  ------------  ------------
Hard                      0             0             1
always                    0             0             1
an                        0             1             0
are                       1             0             0
doing                     1             0             0
great                     0             0             1
hey                       1             0             0
how                       1             0             0
is                        0             1             0
lead                      0             0             1
man                       0             1             0
smart                     0             1             0
something                 0             0             1
times                     0             0             1
to                        0             0             1
understanding             0   