In [None]:
import pandas as pd
import numpy as np

def shingle_word_based(text, k):
    """Return the word-level k-shingles of *text*, in order of appearance.

    The text is split on runs of whitespace (``str.split()`` with no
    argument), and every window of ``k`` consecutive words is joined back
    together with single spaces. Duplicate shingles are kept.

    Args:
        text: The document to shingle.
        k: Number of words per shingle; must be at least 1.

    Returns:
        A list of shingle strings. The list is empty when the text holds
        fewer than ``k`` words.

    Raises:
        ValueError: If ``k`` is less than 1.
    """
    # Without this guard k == 0 yields len(words) + 1 empty strings and a
    # negative k yields overlapping slices that are not shingles at all.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    words = text.split()
    return [" ".join(words[i:i + k]) for i in range(len(words) - k + 1)]

# Load the three lyric files as UTF-8 text, in the order doc1, doc2, doc3.
texts = []
for path in ("song.txt", "song2.txt", "song3.txt"):
    with open(path, "r", encoding="utf-8") as file:
        texts.append(file.read())
doc1, doc2, doc3 = texts

# Number of consecutive words that make up one shingle.
k = 3

# Word-based shingles of each document (duplicates kept, document order).
shingle1 = shingle_word_based(doc1, k)
shingle2 = shingle_word_based(doc2, k)
shingle3 = shingle_word_based(doc3, k)

# Universe of distinct shingles across the three documents.
# The list is sorted because a bare set() iterates in string-hash order,
# which Python randomizes per process. The hash functions further down are
# computed from each shingle's row position, so an unstable order would make
# the signature matrix differ from one run to the next.
all_shingles = sorted(set(shingle1 + shingle2 + shingle3))

# Build the incidence matrix: one row per unique shingle and one 0/1 column
# per document, where 1 means the shingle occurs in that document.
df = pd.DataFrame({"Shingle": all_shingles})
for column, doc_shingles in (
    ("doc1", shingle1),
    ("doc2", shingle2),
    ("doc3", shingle3),
):
    # isin() tests every row for membership in one pass, instead of
    # re-scanning the whole "Shingle" column once per shingle occurrence.
    df[column] = df["Shingle"].isin(doc_shingles).astype("int64")

# Four linear hash functions of the row index, h(x) = (a * x + b) % m,
# stored as extra columns for building the signature matrix.
hash_params = {
    "Hash1": (10, 5, 213),
    "Hash2": (7, 3, 243),
    "Hash3": (10, 7, 179),
    "Hash4": (13, 9, 227),
}
for hash_name, (a, b, m) in hash_params.items():
    # map() runs eagerly, so the lambda sees this iteration's a, b and m.
    df[hash_name] = df.index.map(lambda x: (a * x + b) % m)

# Signature matrix: one row per hash function, one column per document.
# Every cell starts at +inf so the first hash value seen always replaces it.
doc_columns = ("doc1", "doc2", "doc3")
hash_columns = ("Hash1", "Hash2", "Hash3", "Hash4")
sig = np.full((4, 3), np.inf)

# For each shingle row, lower the running minimum of every hash function
# for each document that contains that shingle.
for _, record in df.iterrows():
    for doc_pos, doc_col in enumerate(doc_columns):
        if record[doc_col] != 1:
            continue
        for hash_pos, hash_col in enumerate(hash_columns):
            sig[hash_pos, doc_pos] = min(sig[hash_pos, doc_pos], record[hash_col])

# Show the incidence matrix (with hash columns) and the resulting signatures.
print("\nIncident Matrix:\n", df)
print("\nSignature Matrix:\n", sig)



Incident Matrix:
                Shingle  doc1  doc2  doc3  Hash1  Hash2  Hash3  Hash4
0        to come wrong     0     0     1      5      3      7      9
1     (shit) Yeah, you     0     0     1     15     10     17     22
2      up (shit) Yeah,     0     0     1     25     17     27     35
3       time, get your     1     0     0     35     24     37     48
4       shit it's been     0     0     1     45     31     47     61
5        Yeah, you try     0     0     1     55     38     57     74
6        this shit, we     0     0     1     65     45     67     87
7        more at stake     0     1     0     75     52     77    100
8       stake when you     0     1     0     85     59     87    113
9    Just come outside     1     0     0     95     66     97    126
10       live, hol' up     0     0     1    105     73    107    139
11   just been poppin'     0     0     1    115     80    117    152
12         you in your     0     1     0    125     87    127    165
13      hol' up