In [67]:
import random

def shingle (text, k):
    """Generate k-shingles from the input text.

    Args:
        text (str): The input text to generate shingles from.
        k (int): The length of each shingle.

    Returns:
        set: A set of k-shingles.
    """
    shingles = set()
    text_length = len(text)
    for i in range(text_length - k + 1):
        shingle = text[i:i + k]
        shingles.add(shingle)
    return shingles

def hashShingle (text, k):
    """Generate hashed k-shingles from the input text.

    Args:
        text (str): The input text to generate hashed shingles from.
        k (int): The length of each shingle.

    Returns:
        set: A set of hashed k-shingles.
    """
    shingles = shingle(text, k)
    hashed_shingles = set()
    for sh in shingles:
        hashed_shingles.add(hash(sh))
    return sorted(hashed_shingles)

j1 = hashShingle('hello world', 3)
j2 = hashShingle('hello worre', 3)
j3 = hashShingle('hello wirld', 3)

print(j1)
print(j2)
print(j3)


[-8439986855660661576, -5065720352883301865, -3713402869938616423, -459681181410095671, 2461237164284091991, 4398989733311370737, 7348565982777001185, 8042806337379098953, 8411936057670289373]
[-9087552286682518329, -8439986855660661576, -6669396354271870675, -3713402869938616423, -459681181410095671, 4398989733311370737, 7348565982777001185, 8042806337379098953, 8411936057670289373]
[-8777481093212949885, -8439986855660661576, -8292704271379463032, -5065720352883301865, -3713402869938616423, -459681181410095671, 450088397784698936, 4398989733311370737, 8411936057670289373]


In [68]:
def jaccardSimilarity (j1, j2):
    """Calculate the Jaccard similarity between two sets.

    Args:
        set1 (set): The first set.
        set2 (set): The second set.

    Returns:
        float: The Jaccard similarity between the two sets.
    """
    set1 = set(j1)
    set2 = set(j2)
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    if not union:
        return 0.0
    return len(intersection) / len(union)

[jaccardSimilarity(j1, j2),jaccardSimilarity(j1, j3),jaccardSimilarity(j2, j3)]

[0.6363636363636364, 0.5, 0.38461538461538464]

In [69]:


p = 2**61 - 1
a = [random.randint(1, p-1) for _ in range(100)]
b = [random.randint(0, p-1) for _ in range(100)]

def hash_i(i, x):
    return (a[i] * hash(x) + b[i]) % p

def minHash(shingle, numHashes):
    signatures = []
    for i in range(numHashes):
        minHash = float('inf')
        for sh in shingle:
            hashCode = hash_i(i, sh)
            if hashCode < minHash:
                minHash = hashCode
        signatures.append(minHash)
    return signatures

m1 = minHash(j1, 100)
m2 = minHash(j2, 100)
m3 = minHash(j3, 100)
print(m1)
print(m2)
print(m3)

[121840782535675343, 170007799896181717, 204106486424697733, 285646772855307836, 162398514592364001, 433199326366291059, 290770770414554076, 273434974804958903, 20335042248245030, 93426149152212674, 24430531517790750, 55037947641016558, 454390471900602752, 244999812165070112, 717192462444833321, 147892944025665731, 293120425336736737, 228387553237210698, 595701252525075593, 289413555966564422, 284650820710813494, 164007281547380724, 346738247936087013, 134261367475509623, 290134640536437043, 58889866112864619, 98504453686413211, 215611883955040422, 349542672608362423, 130794866618096172, 179106903459904567, 112016454616756174, 118083292249110592, 43469176258480003, 279676915560228989, 506145452529003007, 191457203283808297, 900038508958587279, 243243260481932754, 134728097487442804, 287840607232214408, 61548084235969853, 747503646777224849, 62109282093194117, 45400672440164720, 32737335045212536, 207611494115280534, 147807682396229805, 199782807221826901, 225848243740088861, 6728386672

In [70]:
def compareSignatures (m1, m2):
    agree = 0
    for i in range(len(m1)):
        if m1[i] == m2[i]:
            agree += 1
    return agree / len(m1)

[compareSignatures(m1, m2),compareSignatures(m1, m3),compareSignatures(m2, m3)]

[0.67, 0.4, 0.34]

In [71]:
from collections import defaultdict
from itertools import combinations

def lsh_candidates_debug(signatures, bands, rows_per_band):
    """
    signatures: dict of {doc_id: [signature_values]}
    bands: number of bands (b)
    rows_per_band: number of rows per band (r)

    Returns:
        set of candidate document pairs (tuples)
    """
    assert all(len(sig) == bands * rows_per_band for sig in signatures.values()), \
        "Signature length must equal bands * rows_per_band"

    buckets = [defaultdict(list) for _ in range(bands)]
    candidates = set()

    print("=== STEP 1: SPLIT SIGNATURES INTO BANDS AND HASH ===")
    for doc_id, sig in signatures.items():
        print(f"\nDocument: {doc_id}")
        for band in range(bands):
            start = band * rows_per_band
            end = start + rows_per_band
            band_slice = tuple(sig[start:end])
            bucket_hash = hash(band_slice)
            buckets[band][bucket_hash].append(doc_id)
            print(f"  Band {band}: {band_slice} → hash={bucket_hash % 1000} (added to bucket {band})")

    print("\n=== STEP 2: BUCKET CONTENTS ===")
    for band, band_dict in enumerate(buckets):
        print(f"\nBand {band}:")
        for h, docs in band_dict.items():
            print(f"  Bucket hash {h % 1000}: {docs}")

    print("\n=== STEP 3: CANDIDATE GENERATION ===")
    for band, band_dict in enumerate(buckets):
        for doc_list in band_dict.values():
            if len(doc_list) > 1:
                for d1, d2 in combinations(sorted(set(doc_list)), 2):
                    candidates.add((d1, d2))
                    print(f"  Band {band}: {d1} and {d2} share a bucket → candidate pair")

    print("\n=== STEP 4: FINAL CANDIDATES ===")
    if candidates:
        for c in sorted(candidates):
            print(f"  {c}")
    else:
        print("  No candidate pairs found.")

    return candidates


In [72]:
def compare_candidates(signatures, candidate_pairs, threshold=0.8):
    """
    Compare candidate document pairs and filter those that meet the similarity threshold.
    
    Args:
        signatures (dict): {doc_id: [signature_values]}
        candidate_pairs (set): set of (doc1, doc2) tuples from LSH
        threshold (float): minimum similarity fraction (0–1)
        
    Returns:
        dict: { (doc1, doc2): similarity_value } for pairs meeting threshold
    """
    print("\n=== STEP 5: OFFICIAL CANDIDATE COMPARISON ===")
    print(f"Similarity threshold: {threshold}\n")

    results = {}

    for d1, d2 in sorted(candidate_pairs):
        sig1, sig2 = signatures[d1], signatures[d2]
        assert len(sig1) == len(sig2), "Signatures must be of the same length"
        
        # Count matching components
        matches = sum(1 for i in range(len(sig1)) if sig1[i] == sig2[i])
        similarity = matches / len(sig1)
        
        status = "✅ KEPT" if similarity >= threshold else "❌ REJECTED"
        print(f"{d1}-{d2}: {matches}/{len(sig1)} components match → similarity={similarity:.3f} → {status}")
        
        if similarity >= threshold:
            results[(d1, d2)] = similarity

    if not results:
        print("\nNo pairs met the similarity threshold.")
    else:
        print("\n=== STEP 6: FINAL SIMILAR DOCUMENT PAIRS ===")
        for (d1, d2), sim in results.items():
            print(f"  ({d1}, {d2}) → similarity={sim:.3f}")

    return results


signatures = {
    "m1": m1,
    "m2": m2,
    "m3": m3
}

candidates = lsh_candidates_debug(signatures, bands=20, rows_per_band=5)
similar_pairs = compare_candidates(signatures, candidates, threshold=0.8)



=== STEP 1: SPLIT SIGNATURES INTO BANDS AND HASH ===

Document: m1
  Band 0: (121840782535675343, 170007799896181717, 204106486424697733, 285646772855307836, 162398514592364001) → hash=657 (added to bucket 0)
  Band 1: (433199326366291059, 290770770414554076, 273434974804958903, 20335042248245030, 93426149152212674) → hash=194 (added to bucket 1)
  Band 2: (24430531517790750, 55037947641016558, 454390471900602752, 244999812165070112, 717192462444833321) → hash=554 (added to bucket 2)
  Band 3: (147892944025665731, 293120425336736737, 228387553237210698, 595701252525075593, 289413555966564422) → hash=594 (added to bucket 3)
  Band 4: (284650820710813494, 164007281547380724, 346738247936087013, 134261367475509623, 290134640536437043) → hash=503 (added to bucket 4)
  Band 5: (58889866112864619, 98504453686413211, 215611883955040422, 349542672608362423, 130794866618096172) → hash=586 (added to bucket 5)
  Band 6: (179106903459904567, 112016454616756174, 118083292249110592, 4346917625848000