In [None]:
import json 
import mmh3 
import re 

def shingle_hashes(raw_text, shingle_size=3):
    """Return the distinct 64-bit hashes of the word shingles of ``raw_text``.

    The text is lower-cased, every run of characters other than ``a-z``,
    ``0-9`` and whitespace is replaced by a space, and the result is split on
    whitespace. Each window of ``shingle_size`` consecutive words is joined
    with single spaces and hashed with ``mmh3.hash64``; the second element of
    the returned tuple is kept.

    Args:
        raw_text: The document text.
        shingle_size: Number of consecutive words per shingle (default 3).

    Returns:
        A list of distinct integer hashes. The list is empty only when the
        text contains no words at all.
    """
    words = re.sub(r'[^a-z0-9\s]+', ' ', raw_text.lower()).split()
    if not words:
        return []
    # A document shorter than one full shingle would otherwise produce no
    # hashes at all, which makes every such document indistinguishable from
    # every other one. Fall back to a single shingle holding all its words.
    n_shingles = max(len(words) - shingle_size + 1, 1)
    return list({
        mmh3.hash64(' '.join(words[start:start + shingle_size]))[1]
        for start in range(n_shingles)
    })


with open("./save/dataset.json", encoding="utf-8") as f:
    data = json.load(f)

texts = [d['text'] for d in data]

# One list of distinct shingle hashes per document, in dataset order.
hashed_texts = [shingle_hashes(raw_text) for raw_text in texts]

In [None]:
# Threaded version (for fun, it's actually slower than non-threaded)
from concurrent.futures import ThreadPoolExecutor
import numpy as np

# MinHash parameters.
K = 768                       # number of hash functions, i.e. signature length
M = np.uint64((1 << 61) - 1)  # modulus 2**61 - 1; also used as the "no hash seen yet" fill value

# Fixed seed so the same coefficients are produced on every run.
rng = np.random.default_rng(42)
# One (a, b) pair per hash function: a in [1, M), b in [0, M).
# Sized from K (not a repeated literal) so changing K keeps everything in step.
a = rng.integers(1, int(M), size=K, dtype=np.uint64)
b = rng.integers(0, int(M), size=K, dtype=np.uint64)

def worker(doc):
    """Compute the MinHash signature of one document.

    For every hash function ``i`` the signature holds the minimum of
    ``(a[i] * h + b[i]) % M`` over all hashes ``h`` in ``doc``. Uses the
    notebook-level ``K``, ``M``, ``a`` and ``b``.

    Args:
        doc: Sequence of integer hashes for one document (an entry of
            ``hashed_texts``). Values are cast to ``uint64``.

    Returns:
        ``uint64`` array of length ``K``. Every entry equals ``M`` when
        ``doc`` is empty.
    """
    hashes = np.array(doc).astype(np.uint64)
    sig = np.full(K, M, dtype=np.uint64)
    # Process the hashes in blocks: broadcasting a (chunk, 1) column against
    # the length-K coefficient vectors evaluates all K functions for many
    # hashes per NumPy call, instead of one Python iteration per hash. The
    # block size only bounds the temporary (chunk x K) array.
    chunk = 512
    for start in range(0, hashes.size, chunk):
        column = hashes[start:start + chunk, np.newaxis]
        # NOTE: all arithmetic is uint64, so a * h wraps modulo 2**64 before
        # the reduction by M (same as evaluating it one hash at a time).
        permuted = (a * column + b) % M
        np.minimum(sig, permuted.min(axis=0), out=sig)
    return sig

# Compute one signature per document on a pool of three threads. map()
# yields results in input order, so row i of `sigs` belongs to
# hashed_texts[i]; the rows are stacked into a single uint64 matrix.
with ThreadPoolExecutor(max_workers=3) as pool:
    sigs = [*pool.map(worker, hashed_texts)]

sigs = np.vstack(sigs).astype(np.uint64, copy=False)

In [14]:
# Imported here as well so this cell does not depend on the optional
# threaded cell having been executed first.
import numpy as np

# MinHash parameters.
K = 768                       # number of hash functions, i.e. signature length
M = np.uint64((1 << 61) - 1)  # modulus 2**61 - 1; also used as the "no hash seen yet" fill value

# Fixed seed so the same coefficients are produced on every run.
rng = np.random.default_rng(42)
# One (a, b) pair per hash function: a in [1, M), b in [0, M).
# Sized from K (not a repeated literal) so changing K keeps everything in step.
a = rng.integers(1, int(M), size=K, dtype=np.uint64)
b = rng.integers(0, int(M), size=K, dtype=np.uint64)

# Signature matrix: row i is the MinHash signature of hashed_texts[i].
# Every slot starts at M and is lowered to the minimum of
# (a * h + b) % M over the document's hashes h; a document without any
# hashes keeps a row of M.
sig = np.full((len(hashed_texts), K), M, dtype=np.uint64)

# Hashes are processed in blocks: broadcasting a (chunk, 1) column against
# the length-K coefficient vectors evaluates all K functions for many hashes
# per NumPy call instead of one Python iteration per hash. The block size
# only bounds the temporary (chunk x K) array.
HASH_CHUNK = 512
for i, doc in enumerate(hashed_texts):
    D = np.array(doc).astype(np.uint64)
    row = sig[i]  # view into sig, so the in-place minimum updates the matrix
    for start in range(0, D.size, HASH_CHUNK):
        column = D[start:start + HASH_CHUNK, np.newaxis]
        # NOTE: all arithmetic is uint64, so a * h wraps modulo 2**64 before
        # the reduction by M (same as evaluating it one hash at a time).
        np.minimum(row, ((a * column + b) % M).min(axis=0), out=row)

In [61]:
def top_k_docs_similar(doc_idx: int, k: int, signatures=None):
    """Find the ``k`` documents whose signatures agree most with one document.

    Similarity is the number of signature positions in which a document's
    row equals the query row.

    Args:
        doc_idx: Row index of the query document.
        k: Maximum number of neighbours to return. Fewer are returned when
            there are fewer than ``k`` other documents; ``k <= 0`` yields
            nothing.
        signatures: 2-D signature matrix with one row per document. Defaults
            to the notebook-level ``sig``.

    Returns:
        A one-shot ``zip`` iterator of ``(index, overlap)`` pairs, ordered by
        decreasing overlap (ties by increasing index). The query document
        itself is never included.

    Raises:
        IndexError: If ``doc_idx`` is out of range.
    """
    if signatures is None:
        signatures = sig
    overlap = np.sum(signatures == signatures[doc_idx], axis=-1)
    n_docs = overlap.shape[0]

    # Remove the query by index rather than by assuming it sorts first:
    # an exact duplicate ties with it at full overlap, and the tie order is
    # arbitrary, so positional removal could drop the duplicate instead.
    others = np.flatnonzero(np.arange(n_docs) != doc_idx % n_docs)

    k = min(k, others.size)  # argpartition rejects k larger than the array
    if k <= 0:
        return zip(others[:0], overlap[:0])

    scores = overlap[others]
    if k < others.size:
        # Partial selection of the k largest, then restore index order so the
        # stable sort below breaks ties deterministically.
        top = np.sort(np.argpartition(scores, -k)[-k:])
    else:
        top = np.arange(others.size)
    top = top[np.argsort(-scores[top], kind="stable")]

    best = others[top]
    return zip(best, overlap[best])

# Demo query: print the title of document 47, then the title and score of
# each of the (up to) 10 results returned by top_k_docs_similar.
doc_idx = 47
print(f"Doc: {data[doc_idx]['title']}")
for idx, sim in top_k_docs_similar(doc_idx, 10):
    print(f"{data[idx]['title']} {sim}")

Doc: Anniston Nobles
Tallassee Indians 153
Brewton Millers 127
Peoria Explorers 74
Brainerd-Little Falls Muskies 69
Western Warriors (baseball) 63
Sulphur Springs Spartans 62
San Francisco baseball team (California League) 56
Gladewater Bears 56
Greensburg Red Sox 55
Riverpoint Royals 53
