In [None]:
import pandas as pd

# Load the posts, keep only rows that have a body, and build one text per post
# by joining the (possibly missing) title and the body with a single space.
df = (
    pd.read_csv("reddit_wsb.csv")
    .dropna(subset=['body'])
    .assign(
        document=lambda posts: (posts['title'].fillna('') + ' ' + posts['body']).str.strip()
    )
    .reset_index(drop=True)
)

# Plain Python list of the combined texts, in row order.
documents = df['document'].tolist()

print(f"Loaded {len(documents)} documents")

In [None]:
import re

def normalize(text):
    """Lowercase ``text`` and reduce it to space-separated ASCII letters and digits.

    Every character other than ``a``-``z``, ``0``-``9`` or whitespace is turned
    into a space (this includes accented letters and emoji), after which runs
    of whitespace are collapsed to a single space and both ends are trimmed.

    Args:
        text (str): Raw text.

    Returns:
        str: The normalized text; may be the empty string.
    """
    lowered = text.lower()
    alnum_only = re.sub(r'[^a-z0-9\s]', ' ', lowered)
    return re.sub(r'\s+', ' ', alnum_only).strip()

# Normalize every document; list positions are unchanged.
documents = list(map(normalize, documents))

# Sample Normalized Document
print(documents[0])

In [None]:
import random
import hashlib

def shingle(text, k):
    """Collect the distinct length-k character windows of ``text``.

    Args:
        text (str): Text to slide the window over.
        k (int): Window (shingle) length in characters.

    Returns:
        set: Every distinct substring ``text[i:i + k]`` for window starts
        ``i`` in ``range(len(text) - k + 1)``. The set is empty when
        ``text`` is shorter than ``k``.
    """
    window_starts = range(len(text) - k + 1)
    return {text[start:start + k] for start in window_starts}

def stable_hash64(s: str) -> int:
    """Map a string to a reproducible unsigned 64-bit integer.

    The string is UTF-8 encoded and hashed with SHA-1; the leading 8 bytes of
    the digest are then read as a big-endian integer, so the result does not
    vary between runs or processes.
    """
    digest = hashlib.sha1(s.encode('utf-8')).digest()
    leading_bytes = digest[:8]
    return int.from_bytes(leading_bytes, byteorder='big')

def hashShingle(text, k):
    """Hash every k-shingle of ``text`` and return the distinct hashes in order.

    Args:
        text (str): The input text to shingle.
        k (int): The length of each shingle.

    Returns:
        list: The distinct ``stable_hash64`` values of the text's k-shingles,
        sorted in ascending order (a list, not a set). Empty when the text
        produces no shingles.
    """
    return sorted({stable_hash64(piece) for piece in shingle(text, k)})


# Hashed 5-shingles per document, keyed "doc1", "doc2", ... in list order.
shingle_sets = {
    f"doc{position}": hashShingle(text, 5)
    for position, text in enumerate(documents, start=1)
}

In [None]:
def jaccardSimilarity(j1, j2):
    """Compute the Jaccard similarity of two collections.

    Both arguments are converted to sets first, so duplicates and ordering
    are ignored.

    Args:
        j1 (iterable): The first collection of hashable items.
        j2 (iterable): The second collection of hashable items.

    Returns:
        float: ``|A ∩ B| / |A ∪ B|``, or ``0.0`` when both collections are
        empty.
    """
    first, second = set(j1), set(j2)
    combined = first | second
    if len(combined) == 0:
        return 0.0
    shared = first & second
    return len(shared) / len(combined)

In [None]:
# Parameters of the hash family h_i(x) = (a[i] * x + b[i]) mod p.
# The coefficients are drawn from a dedicated, seeded generator so that the
# MinHash signatures (and therefore the LSH candidates) are identical on every
# run, and so that the global `random` state is left untouched.
MINHASH_SEED = 42
p = 2**61 - 1  # Mersenne prime used as the modulus
_coefficient_rng = random.Random(MINHASH_SEED)
a = [_coefficient_rng.randint(1, p - 1) for _ in range(100)]  # non-zero multipliers
b = [_coefficient_rng.randint(0, p - 1) for _ in range(100)]  # offsets

def hash_i(i, x):
    """Apply the i-th hash function, ``(a[i] * x + b[i]) mod p``.

    Args:
        i (int): Index into the module-level coefficient lists ``a`` and ``b``.
        x (int): Value to hash.

    Returns:
        int: The hashed value, reduced modulo ``p``.
    """
    scale, shift = a[i], b[i]
    return (scale * x + shift) % p

def minHash(shingle, numHashes):
    """Build a MinHash signature for a collection of hashed shingles.

    Args:
        shingle (iterable of int): Hashed shingle values of one document.
        numHashes (int): Signature length; hash functions ``hash_i(0, .)``
            through ``hash_i(numHashes - 1, .)`` are used.

    Returns:
        list: For each hash function, the smallest hash value over all
        shingles. When there are no shingles, every position is
        ``float('inf')``.
    """
    no_value = float('inf')
    return [
        min((hash_i(index, value) for value in shingle), default=no_value)
        for index in range(numHashes)
    ]

# Documents without any shingles (normalized text shorter than the shingle
# length, e.g. emoji-only posts) are left out: minHash would give each of them
# the same all-inf signature, so every pair of them would land in the same LSH
# buckets and be reported as duplicates with similarity 1.0, even though
# jaccardSimilarity treats two empty sets as 0.0.
minhash_signatures = {
    doc_id: minHash(hashed, 100)
    for doc_id, hashed in shingle_sets.items()
    if hashed
}

In [None]:
def compareSignatures(m1, m2):
    """Estimate similarity as the fraction of signature positions that agree.

    Args:
        m1 (sequence): First signature.
        m2 (sequence): Second signature; must have the same length as ``m1``.

    Returns:
        float: Number of positions where ``m1[i] == m2[i]`` divided by the
        signature length, or ``0.0`` when both signatures are empty.

    Raises:
        ValueError: If the two signatures have different lengths.
    """
    # A length mismatch used to either raise IndexError (m2 shorter) or be
    # silently ignored (m2 longer); reject it explicitly instead.
    if len(m1) != len(m2):
        raise ValueError(
            f"Signatures must be of the same length (got {len(m1)} and {len(m2)})"
        )
    # Avoid dividing by zero for empty signatures.
    if len(m1) == 0:
        return 0.0
    agree = sum(1 for left, right in zip(m1, m2) if left == right)
    return agree / len(m1)


In [None]:
from collections import defaultdict
from itertools import combinations

def lsh_candidates_debug(signatures, bands, rows_per_band):
    """
    Find LSH candidate pairs by banding signatures, printing every step.

    Each signature is cut into ``bands`` consecutive slices of
    ``rows_per_band`` values. Within a band, documents whose slices hash to
    the same bucket become candidate pairs.

    Args:
        signatures: dict of {doc_id: [signature_values]}
        bands: number of bands (b)
        rows_per_band: number of rows per band (r)

    Returns:
        set of candidate document pairs; each pair is a tuple of two doc ids
        in sorted order

    Note:
        Prints one line per document per band, plus the full bucket contents,
        so the output grows quickly with the number of documents.
    """
    assert all(len(sig) == bands * rows_per_band for sig in signatures.values()), \
        "Signature length must equal bands * rows_per_band"

    # One {bucket_hash: [doc_id, ...]} table per band.
    buckets = [defaultdict(list) for _ in range(bands)]
    candidates = set()

    print("=== STEP 1: SPLIT SIGNATURES INTO BANDS AND HASH ===")
    for doc_id, sig in signatures.items():
        print(f"\nDocument: {doc_id}")
        for band, band_buckets in enumerate(buckets):
            start = band * rows_per_band
            band_slice = tuple(sig[start:start + rows_per_band])
            bucket_hash = stable_hash64(str(band_slice))
            band_buckets[bucket_hash].append(doc_id)
            # Only the last three digits of the hash are shown, for readability.
            print(f"  Band {band}: {band_slice} → hash={bucket_hash % 1000} (added to bucket {band})")

    print("\n=== STEP 2: BUCKET CONTENTS ===")
    for band, band_buckets in enumerate(buckets):
        print(f"\nBand {band}:")
        for h, docs in band_buckets.items():
            print(f"  Bucket hash {h % 1000}: {docs}")

    print("\n=== STEP 3: CANDIDATE GENERATION ===")
    for band, band_buckets in enumerate(buckets):
        for members in band_buckets.values():
            if len(members) < 2:
                continue
            # Sorting makes each pair appear in one canonical order.
            for pair in combinations(sorted(set(members)), 2):
                candidates.add(pair)
                d1, d2 = pair
                print(f"  Band {band}: {d1} and {d2} share a bucket → candidate pair")

    print("\n=== STEP 4: FINAL CANDIDATES ===")
    if not candidates:
        print("  No candidate pairs found.")
    else:
        for c in sorted(candidates):
            print(f"  {c}")

    return candidates


In [None]:
def compare_candidates(signatures, candidate_pairs, threshold=0.8):
    """
    Score each candidate pair by signature agreement and keep those at or above the threshold.

    The similarity of a pair is the fraction of signature positions whose
    values are equal. Every comparison and the final selection are printed.

    Args:
        signatures (dict): {doc_id: [signature_values]}
        candidate_pairs (set): set of (doc1, doc2) tuples from LSH
        threshold (float): minimum similarity fraction (0 to 1), inclusive

    Returns:
        dict: { (doc1, doc2): similarity_value } for the pairs that met the
        threshold, in sorted pair order
    """
    print("\n=== STEP 5: OFFICIAL CANDIDATE COMPARISON ===")
    print(f"Similarity threshold: {threshold}\n")

    results = {}

    for pair in sorted(candidate_pairs):
        d1, d2 = pair
        sig1 = signatures[d1]
        sig2 = signatures[d2]
        assert len(sig1) == len(sig2), "Signatures must be of the same length"

        # Count the positions where both signatures hold the same value.
        matches = 0
        for position in range(len(sig1)):
            if sig1[position] == sig2[position]:
                matches += 1
        similarity = matches / len(sig1)

        kept = similarity >= threshold
        if kept:
            status = "✅ KEPT"
        else:
            status = "❌ REJECTED"
        print(f"{d1}-{d2}: {matches}/{len(sig1)} components match → similarity={similarity:.3f} → {status}")

        if kept:
            results[(d1, d2)] = similarity

    if results:
        print("\n=== STEP 6: FINAL SIMILAR DOCUMENT PAIRS ===")
        for (d1, d2), sim in results.items():
            print(f"  ({d1}, {d2}) → similarity={sim:.3f}")
    else:
        print("\nNo pairs met the similarity threshold.")

    return results


signatures = minhash_signatures

# LSH banding layout: 20 bands of 5 rows each covers the 100-value signatures.
lsh_shape = {"bands": 20, "rows_per_band": 5}

# Calculates candidate pairs based on LSH
candidates = lsh_candidates_debug(signatures, **lsh_shape)

# Compares candidate pairs and accepts only those above the threshold
similar_pairs = compare_candidates(signatures, candidates, threshold=0.8)

# Number of pairs that passed the similarity threshold.
print(len(similar_pairs))

