In [1]:
import random

def shingle (text, k):
    """Generate k-shingles from the input text.

    Args:
        text (str): The input text to generate shingles from.
        k (int): The length of each shingle.

    Returns:
        set: A set of k-shingles.
    """
    shingles = set()
    text_length = len(text)
    for i in range(text_length - k + 1):
        shingle = text[i:i + k]
        shingles.add(shingle)
    return shingles

def hashShingle (text, k):
    """Generate hashed k-shingles from the input text.

    Args:
        text (str): The input text to generate hashed shingles from.
        k (int): The length of each shingle.

    Returns:
        set: A set of hashed k-shingles.
    """
    shingles = shingle(text, k)
    hashed_shingles = set()
    for sh in shingles:
        hashed_shingles.add(hash(sh))
    return sorted(hashed_shingles)

j1 = hashShingle('hello world', 4)
j2 = hashShingle('hello worre', 4)

print(j1)
print(j2)


[-4443270549105619668, -4184497982029848588, -1604633567641877781, 1301593212456595709, 4353379842788791865, 6260578416602383294, 6571071075875533076, 7034224709457021269]
[-7132361104300084557, -4443270549105619668, -1604633567641877781, -641925319285622903, 1301593212456595709, 4353379842788791865, 6260578416602383294, 6571071075875533076]


In [2]:
def jaccardSimilarity (j1, j2):
    """Calculate the Jaccard similarity between two sets.

    Args:
        set1 (set): The first set.
        set2 (set): The second set.

    Returns:
        float: The Jaccard similarity between the two sets.
    """
    set1 = set(j1)
    set2 = set(j2)
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    if not union:
        return 0.0
    return len(intersection) / len(union)

jaccardSimilarity(j1, j2)

0.6

In [10]:


p = 2**61 - 1
a = [random.randint(1, p-1) for _ in range(100)]
b = [random.randint(0, p-1) for _ in range(100)]

def hash_i(i, x):
    return (a[i] * hash(x) + b[i]) % p

def minHash(shingle, numHashes):
    signatures = []
    for i in range(numHashes):
        minHash = float('inf')
        for sh in shingle:
            hashCode = hash_i(i, sh)
            if hashCode < minHash:
                minHash = hashCode
        signatures.append(minHash)
    return signatures

m1 = minHash(j1, 100)
m2 = minHash(j2, 100)
print(m1)
print(m2)

[112238507659011794, 757381322592271805, 53031754005010032, 550238610495658283, 225093440416797536, 137875087133960527, 337321896157379051, 85293888206725082, 445549138870079718, 161350356168165610, 48555486314453246, 629238263124491303, 625474681503561115, 53794985896665762, 533061788358958765, 454136978152145057, 771139587804147637, 77301307231910235, 425391690828323951, 93196756431272137, 36742101147622448, 246085790784158069, 304084348474768931, 139935909239197842, 66951620237703548, 192491459936306773, 33423978485538609, 193492352229528439, 423373166429783722, 66397754436061383, 486473724098293663, 301740273164367200, 32488385461737631, 478753713586215198, 33005520439951057, 62502584459363319, 73388600513197754, 308960152038451031, 6292604129643738, 257006229880316541, 238837904147877819, 409253591499930276, 13328652220865674, 51517490077237441, 90805875531823620, 183082074864324833, 117258014742803930, 293041020819044820, 171092026167435585, 287465726843893235, 520715858647933732

In [11]:
def compareSignatures (m1, m2):
    agree = 0
    for i in range(len(m1)):
        if m1[i] == m2[i]:
            agree += 1
    return agree / len(m1)

compareSignatures(m1, m2)

0.57