In [None]:
import pandas as pd

# Read the raw export, discard posts that have no body text, and renumber
# the surviving rows 0..n-1.
df = (
    pd.read_csv("reddit_wsb.csv")
    .dropna(subset=['body'])
    .reset_index(drop=True)
)

# One text per post: title (blank when missing), a space, then the body,
# with leading/trailing whitespace trimmed.
df['document'] = (df['title'].fillna('') + ' ' + df['body']).str.strip()

# Plain Python list of the combined texts, in row order.
documents = df['document'].tolist()

print(f"Loaded {len(documents)} documents")

In [None]:
import re

def normalize(text):
    """Canonicalize a document for shingling.

    The text is lowercased, every character that is not ``a-z``, ``0-9`` or
    whitespace is replaced by a space, and runs of whitespace are collapsed
    to a single space with the ends trimmed.

    Args:
        text (str): Raw document text.

    Returns:
        str: The normalized text (possibly empty).
    """
    lowered = text.lower()
    # Anything outside the ASCII alphanumerics becomes a separator.
    alnum_only = re.sub(r'[^a-z0-9\s]', ' ', lowered)
    return re.sub(r'\s+', ' ', alnum_only).strip()

# Replace each raw document with its normalized form (order and count unchanged).
documents = list(map(normalize, documents))

# Sanity check: show the first normalized document.
print(documents[0])

In [None]:
import random
import hashlib

def shingle (text, k):
    """Collect every distinct length-``k`` character window of ``text``.

    Args:
        text (str): The text to slide the window over.
        k (int): Window (shingle) length in characters.

    Returns:
        set: The distinct substrings ``text[i:i + k]`` for every start
        offset ``i`` in ``range(len(text) - k + 1)``; empty when the text
        is shorter than ``k``.
    """
    last_start = len(text) - k
    return {text[start:start + k] for start in range(last_start + 1)}

def stable_hash64(s: str) -> int:
    """Map a string to a reproducible unsigned 64-bit integer.

    The value is the first 8 bytes of the SHA-1 digest of the UTF-8 encoded
    string, read as a big-endian integer, so it does not vary between
    interpreter runs.
    """
    # 16 hex digits == the leading 8 digest bytes in big-endian order.
    leading_hex = hashlib.sha1(s.encode('utf-8')).hexdigest()[:16]
    return int(leading_hex, 16)

def hashShingle (text, k):
    """Shingle ``text`` and replace each shingle by its 64-bit hash.

    Args:
        text (str): The text to shingle.
        k (int): Shingle length in characters.

    Returns:
        list: The distinct ``stable_hash64`` values of the k-shingles of
        ``text``, sorted ascending. Note this is a list, not a set; it is
        empty when ``text`` yields no shingles.
    """
    distinct_hashes = {stable_hash64(piece) for piece in shingle(text, k)}
    return sorted(distinct_hashes)

# Map "doc<N>" (N is the 1-based position in `documents`) to that document's
# sorted hashed 5-shingles. Documents that produce no shingles are left out,
# so the keys may have gaps.
shingle_sets = {}
for doc_number, doc_text in enumerate(documents, start=1):
    hashed = hashShingle(doc_text, 5)
    if not hashed:
        continue
    shingle_sets[f"doc{doc_number}"] = hashed


In [None]:
def jaccardSimilarity (j1, j2):
    """Jaccard similarity |A ∩ B| / |A ∪ B| of two collections.

    Args:
        j1 (iterable): First collection of hashable items; duplicates are ignored.
        j2 (iterable): Second collection of hashable items; duplicates are ignored.

    Returns:
        float: Size of the intersection divided by size of the union, or
        0.0 when both collections are empty.
    """
    first, second = set(j1), set(j2)
    union_size = len(first | second)
    if union_size == 0:
        return 0.0
    return len(first & second) / union_size

In [None]:
import numpy as np

# Modulus of the hash family h_i(x) = (a_i * x + b_i) mod p.
# 2**61 - 1 is a Mersenne prime; the overflow-free arithmetic in
# _affine_mod_mersenne61 below relies on exactly this value.
p = 2**61 - 1
num_hashes = 100

# Fixed seed so the coefficients (and therefore every signature) are the same
# after a kernel restart.
MINHASH_SEED = 42
_minhash_rng = np.random.default_rng(MINHASH_SEED)

# One (a_i, b_i) pair per hash function, with 1 <= a_i < p and 0 <= b_i < p.
a = _minhash_rng.integers(1, p, size=num_hashes, dtype=np.uint64)
b = _minhash_rng.integers(0, p, size=num_hashes, dtype=np.uint64)


def _affine_mod_mersenne61(coef, offset, values):
    """Return ``(coef * values + offset) mod (2**61 - 1)`` without uint64 overflow.

    All three arguments are broadcastable ``uint64`` arrays whose entries are
    below 2**61. A direct ``coef * values`` can need up to 122 bits and would
    silently wrap modulo 2**64, so the product is assembled from 32-bit
    halves and folded using 2**61 ≡ 1 (mod 2**61 - 1).
    """
    u64 = np.uint64
    modulus = u64(2**61 - 1)
    mask32 = u64(0xFFFFFFFF)
    mask29 = u64((1 << 29) - 1)

    c_hi, c_lo = coef >> u64(32), coef & mask32        # c_hi < 2**29
    v_hi, v_lo = values >> u64(32), values & mask32    # v_hi < 2**29

    # c_hi*v_hi carries weight 2**64 = 8 * 2**61 ≡ 8; result < 2**61.
    high = (c_hi * v_hi) << u64(3)

    # The cross terms carry weight 2**32; their sum is < 2**62.
    cross = c_hi * v_lo + c_lo * v_hi
    # cross * 2**32 = q * 2**61 + r * 2**32 ≡ q + r * 2**32, with r the low 29 bits.
    cross = (cross >> u64(29)) + ((cross & mask29) << u64(32))

    # c_lo*v_lo < 2**64 fits in uint64 and only needs a plain reduction.
    low = (c_lo * v_lo) % modulus

    # Each of the four terms is at most about 2**61, so the sum stays below 2**64.
    return (high + cross + low + offset) % modulus


def minHash_vectorized(shingle_set):
    """Compute the MinHash signature of a collection of hashed shingles.

    Args:
        shingle_set (iterable of int): Non-negative integers that fit in 64
            bits (e.g. the output of ``hashShingle``).

    Returns:
        list[int]: ``num_hashes`` values; entry ``i`` is the minimum of
        ``(a[i] * x + b[i]) % p`` over all shingles ``x``.

    Raises:
        ValueError: If ``shingle_set`` is empty (a minimum is undefined).
    """
    x = np.array(list(shingle_set), dtype=np.uint64)
    if x.size == 0:
        raise ValueError("minHash_vectorized requires at least one shingle")

    # Reduce the shingles below p first; this does not change (a*x + b) mod p
    # and keeps every intermediate product within 64 bits.
    x = x % np.uint64(p)

    # Broadcast to a (num_hashes, num_shingles) grid: one row per hash function.
    hashes = _affine_mod_mersenne61(a[:, None], b[:, None], x[None, :])

    # The signature is the per-hash-function minimum.
    return hashes.min(axis=1).tolist()

# Apply to all documents
minhash_signatures = {k: minHash_vectorized(v) for k, v in shingle_sets.items()}

# Print one example signature. "doc1" is missing from shingle_sets when the
# first document produced no shingles, so fall back to the first available
# document instead of raising KeyError; print nothing if there are none.
if minhash_signatures:
    sample_doc_id = "doc1" if "doc1" in minhash_signatures else next(iter(minhash_signatures))
    print(minhash_signatures[sample_doc_id])

In [None]:
def compareSignatures (m1, m2):
    """Estimate similarity as the fraction of positions where two signatures agree.

    Args:
        m1 (sequence): First signature.
        m2 (sequence): Second signature; must have the same length as ``m1``.

    Returns:
        float: Number of indices ``i`` with ``m1[i] == m2[i]`` divided by the
        signature length, or 0.0 when both signatures are empty.

    Raises:
        ValueError: If the signatures differ in length. A positional
            comparison is meaningless in that case.
    """
    if len(m1) != len(m2):
        raise ValueError(
            f"signature lengths differ: {len(m1)} vs {len(m2)}"
        )
    if len(m1) == 0:
        # Avoid dividing by zero; matches jaccardSimilarity's empty-input result.
        return 0.0
    agree = sum(1 for left, right in zip(m1, m2) if left == right)
    return agree / len(m1)


In [None]:
from collections import defaultdict
from itertools import combinations
import numpy as np

def lsh_candidates(signatures, bands, rows_per_band):
    """Find candidate document pairs by banding MinHash signatures.

    Each signature is cut into ``bands`` consecutive slices of
    ``rows_per_band`` values. Two documents become a candidate pair when at
    least one of their slices is identical.

    Args:
        signatures (dict): Maps document id to its signature, a sequence of
            non-negative integers that fit in 64 bits. All signatures must
            have length ``bands * rows_per_band``.
        bands (int): Number of bands.
        rows_per_band (int): Signature values per band.

    Returns:
        set: Tuples ``(id_a, id_b)`` with ``id_a < id_b`` in the ids' own sort
        order; empty when ``signatures`` is empty.

    Raises:
        AssertionError: If the signature length is not ``bands * rows_per_band``.
    """
    if not signatures:
        # An empty matrix has no second dimension to validate; nothing to pair.
        return set()

    doc_ids = list(signatures.keys())
    sig_matrix = np.array(list(signatures.values()), dtype=np.uint64)
    assert sig_matrix.ndim == 2 and sig_matrix.shape[1] == bands * rows_per_band, (
        f"signature length must equal bands * rows_per_band = {bands * rows_per_band}"
    )

    candidates = set()
    for band in range(bands):
        start = band * rows_per_band
        band_slice = sig_matrix[:, start:start + rows_per_band]

        # Bucket documents by the exact bytes of their band slice. Using the
        # raw bytes as the dict key groups only truly identical slices and
        # avoids building and hashing a string per row.
        buckets = defaultdict(list)
        for doc_id, row in zip(doc_ids, band_slice):
            buckets[row.tobytes()].append(doc_id)

        # Every pair of documents sharing a bucket is a candidate. Ids are
        # dict keys, hence unique, so sorting alone gives canonical pairs.
        for doc_list in buckets.values():
            if len(doc_list) > 1:
                candidates.update(combinations(sorted(doc_list), 2))

    return candidates


In [None]:
def compare_candidates(signatures, candidate_pairs, threshold=0.8, verbose=False):
    """Score candidate pairs by signature agreement and keep those at or above ``threshold``.

    Args:
        signatures (dict): Maps document id to its signature sequence.
        candidate_pairs (iterable): ``(id_a, id_b)`` tuples to evaluate, in
            whatever order the iterable yields them.
        threshold (float): Minimum agreement fraction for a pair to be kept.
        verbose (bool): When true, print a header, a progress line every
            1000 pairs (including pair 0), and a final count.

    Returns:
        dict: ``{(id_a, id_b): similarity}`` for the qualifying pairs, where
        similarity is the number of matching positions divided by the length
        of the first signature.
    """
    if verbose:
        print("\n=== STEP 5: OFFICIAL CANDIDATE COMPARISON ===")
        print(f"Similarity threshold: {threshold}\n")

    results = {}

    for pair_index, pair in enumerate(candidate_pairs):
        first_id, second_id = pair
        first_sig = signatures[first_id]
        second_sig = signatures[second_id]

        # Fraction of aligned positions holding the same value.
        agreeing = sum(left == right for left, right in zip(first_sig, second_sig))
        similarity = agreeing / len(first_sig)

        if similarity >= threshold:
            results[(first_id, second_id)] = similarity

        if verbose and pair_index % 1000 == 0:
            print(f"Processed {pair_index} pairs... current matches: {len(results)}")

    if verbose:
        print(f"\nTotal qualifying pairs: {len(results)}")

    return results



# Work on the MinHash signatures computed earlier under a shorter name.
signatures = minhash_signatures

# Stage 1: LSH banding proposes candidate pairs. The band layout strongly
# affects how many candidates come out.
# NOTE(review): with 4 bands of 25 rows the standard banding estimate puts the
# 50% detection point near similarity (1/4)**(1/25) ≈ 0.95, so most pairs at
# the 0.8 threshold used below would never become candidates. Confirm this
# strictness is intended.
lsh_settings = {"bands": 4, "rows_per_band": 25}
candidates = lsh_candidates(signatures, **lsh_settings)

# Stage 2: keep only candidates whose signature agreement reaches the threshold.
similar_pairs = compare_candidates(signatures, candidates, threshold=0.8, verbose=True)

print(len(similar_pairs))



In [None]:
from collections import Counter
import random
import statistics

# 1) How many docs total and how many unique signatures?
num_docs = len(signatures)
unique_sigs = len({tuple(sig) for sig in signatures.values()})
print("docs:", num_docs, "unique signatures:", unique_sigs,
      f"({unique_sigs/num_docs:.2%} unique)")

# 2) Are any signatures literally identical objects? (accidental list * n bug)
first_signature = next(iter(signatures.values()))
same_object_count = sum(1 for s in signatures.values() if s is first_signature)
print("Example identical-object check (should be 1):", same_object_count)

# 3) Check signature length and a quick per-position entropy-ish check
sig_len = len(first_signature)
print("signature length:", sig_len)

# per-position distinct counts (spot collisions)
pos_counters = [Counter() for _ in range(sig_len)]
for sig in signatures.values():
    for i, v in enumerate(sig):
        pos_counters[i][v] += 1
distinct_counts = [len(c) for c in pos_counters]
print("distinct values per position (first 10):", distinct_counts[:10])
print("min/median/max distinct per position:", min(distinct_counts), sorted(distinct_counts)[len(distinct_counts)//2], max(distinct_counts))

# 4) Sample pairwise similarities (random small sample)
# The sampled ids are deliberately NOT named `a`/`b`: those module-level names
# hold the MinHash coefficient arrays read by minHash_vectorized, and
# rebinding them here would break any later call to it.
pairs = []
docs = list(signatures.keys())
if len(docs) >= 2:
    for _ in range(1000):
        doc_a, doc_b = random.sample(docs, 2)
        sigA, sigB = signatures[doc_a], signatures[doc_b]
        matches = sum(1 for x, y in zip(sigA, sigB) if x == y)
        pairs.append(matches/len(sigA))
    print("sample similarity: mean", statistics.mean(pairs), "median", statistics.median(pairs),
          "90th pct", sorted(pairs)[int(.9*len(pairs))])
else:
    # random.sample cannot draw two distinct documents from fewer than two.
    print("sample similarity: skipped (need at least two documents)")

# 5) Spread of the similarities that passed the threshold
sims = list(similar_pairs.values())
if sims:
    print("min", min(sims), "median", statistics.median(sims),
          "max", max(sims))
else:
    # min()/max()/median() raise on an empty sequence.
    print("no similar pairs at or above the threshold")
