In [58]:
import random

def shingle (text, k):
    """Generate k-shingles from the input text.

    Args:
        text (str): The input text to generate shingles from.
        k (int): The length of each shingle.

    Returns:
        set: A set of k-shingles.
    """
    shingles = set()
    text_length = len(text)
    for i in range(text_length - k + 1):
        shingle = text[i:i + k]
        shingles.add(shingle)
    return shingles

def hashShingle (text, k):
    """Generate hashed k-shingles from the input text.

    Args:
        text (str): The input text to generate hashed shingles from.
        k (int): The length of each shingle.

    Returns:
        set: A set of hashed k-shingles.
    """
    shingles = shingle(text, k)
    hashed_shingles = set()
    for sh in shingles:
        hashed_shingles.add(hash(sh))
    return sorted(hashed_shingles)

j1 = hashShingle('hello world', 4)
j2 = hashShingle('hello worre', 4)

print(j1)
print(j2)


[-7993832516245132524, -7203876351467324098, -4783481853994946843, 2172036293903365578, 3320216886332486049, 3623040707048006776, 8286542814143441645, 8428288542705065602]
[-7203876351467324098, -4783481853994946843, 2172036293903365578, 3623040707048006776, 7796038657511069654, 8286542814143441645, 8373382038250351481, 8428288542705065602]


In [59]:
def jaccardSimilarity (j1, j2):
    """Calculate the Jaccard similarity between two sets.

    Args:
        set1 (set): The first set.
        set2 (set): The second set.

    Returns:
        float: The Jaccard similarity between the two sets.
    """
    set1 = set(j1)
    set2 = set(j2)
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    if not union:
        return 0.0
    return len(intersection) / len(union)

jaccardSimilarity(j1, j2)

0.6

In [60]:

p = 2**61 - 1
a = [random.randint(1, p-1) for _ in range(100)]
b = [random.randint(0, p-1) for _ in range(100)]

def hash_i(i, x):
    return (a[i] * hash(x) + b[i]) % p

def minHash(shingle, numHashes):
    signatures = []
    for i in range(numHashes):
        minHash = float('inf')
        for sh in shingle:
            hashCode = hash_i(i, sh)
            if hashCode < minHash:
                minHash = hashCode
        signatures.append(minHash)
    return signatures

m1 = minHash(j1, 100)
m2 = minHash(j2, 100)

p = 2**61 - 1
a = [random.randint(1, p-1) for _ in range(100)]
b = [random.randint(0, p-1) for _ in range(100)]

def hash_i(i, x):
    return (a[i] * hash(x) + b[i]) % p

def minHash(shingle, numHashes):
    signatures = []
    for i in range(numHashes):
        minHash = float('inf')
        for sh in shingle:
            hashCode = hash_i(i, sh)
            if hashCode < minHash:
                minHash = hashCode
        signatures.append(minHash)
    return signatures

m1 = minHash(j1, 100)
m2 = minHash(j2, 100)
print(m1)
print(m2)

[310306835800312309, 321905072394547799, 878155265163061919, 18334347284766884, 177833968886632846, 365994700263789287, 312171066172419336, 311966588608604674, 464977200636831586, 161442561519574865, 241888445239520521, 398612997018723496, 200286746252962461, 270335990042814936, 85410630574188088, 467967204704713175, 265475935740360441, 149296500970527489, 225097296673460140, 617182964163347198, 3559689674423592, 56705131291724667, 478990790613370138, 716036252111752928, 76377683673814128, 126763143804765378, 244157633931319432, 70223151493465399, 273864351210930605, 236261793463372042, 158434720857502466, 111126771483355764, 288375216231950790, 336525869265618748, 531035592520269149, 795588007000867024, 27535122909073499, 319397573414247873, 202640532781971173, 271179335956129879, 77496427687317089, 176181778448079024, 32945166264362187, 259559015131285086, 96336631663183231, 846268692521523230, 5556162956211740, 176830563464146492, 136310324857005301, 15144812097845560, 5971355862134

In [61]:
def compareSignatures (m1, m2):
    agree = 0
    for i in range(len(m1)):
        if m1[i] == m2[i]:
            agree += 1
    return agree / len(m1)

compareSignatures(m1, m2)

0.62