In [1]:
!pip install datasketch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasketch
  Downloading datasketch-1.5.8-py2.py3-none-any.whl (76 kB)
[K     |████████████████████████████████| 76 kB 793 kB/s 
Installing collected packages: datasketch
Successfully installed datasketch-1.5.8


In [11]:

import pandas as pd
import numpy as np
import random
from random import shuffle
import re
from datasketch import MinHash, MinHashLSHForest
from sklearn.metrics.pairwise import cosine_similarity

from google.colab import drive

In [3]:
def gen_shingles(text: str, k: int):
  """Return the set of all length-``k`` character shingles of ``text``.

  A shingle is a contiguous substring of exactly ``k`` characters; the window
  slides one character at a time, so a text of length ``n >= k`` has
  ``n - k + 1`` windows (duplicates collapse because the result is a set).

  Args:
    text: the string to shingle.
    k: shingle width in characters; must be at least 1.

  Returns:
    A ``set`` of substrings. Empty when ``text`` is shorter than ``k``.

  Raises:
    ValueError: if ``k`` is smaller than 1. Without this check ``k == 0``
      would yield ``{''}`` and a negative ``k`` would yield slices of the
      wrong length instead of failing.
  """
  if k < 1:
    raise ValueError("k must be a positive integer, got {!r}".format(k))

  return {text[i:i + k] for i in range(len(text) - k + 1)}

In [6]:
# Three toy documents (plain strings) and the shingle width shared by all of them.
df1, df2, df3 = 'sanjay', 'data mining', 'cmoe255 data mining assignmnet'
k = 3

# One set of k-character shingles per document.
t1, t2, t3 = (gen_shingles(doc, k) for doc in (df1, df2, df3))

t1, t2, t3

({'anj', 'jay', 'nja', 'san'},
 {' mi', 'a m', 'ata', 'dat', 'ing', 'ini', 'min', 'nin', 'ta '},
 {' as',
  ' da',
  ' mi',
  '255',
  '5 d',
  '55 ',
  'a m',
  'ass',
  'ata',
  'cmo',
  'dat',
  'e25',
  'g a',
  'gnm',
  'ign',
  'ing',
  'ini',
  'min',
  'mne',
  'moe',
  'net',
  'ng ',
  'nin',
  'nmn',
  'oe2',
  'sig',
  'ssi',
  'ta '})

In [7]:
# Vocabulary: every distinct shingle that occurs in any of the three documents.
vocab = t1 | t2 | t3
vocab

{' as',
 ' da',
 ' mi',
 '255',
 '5 d',
 '55 ',
 'a m',
 'anj',
 'ass',
 'ata',
 'cmo',
 'dat',
 'e25',
 'g a',
 'gnm',
 'ign',
 'ing',
 'ini',
 'jay',
 'min',
 'mne',
 'moe',
 'net',
 'ng ',
 'nin',
 'nja',
 'nmn',
 'oe2',
 'san',
 'sig',
 'ssi',
 'ta '}

In [8]:
# One-hot encode each document against the vocabulary: position j holds 1 when
# the j-th shingle (in vocab's iteration order) is in the document, else 0.
t1_enc, t2_enc, t3_enc = (
    [int(s in shingles) for s in vocab] for shingles in (t1, t2, t3)
)

t1_enc

[0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0]

In [9]:
# The integers 1..len(t1_enc), i.e. one number per position of the one-hot vector.
nums = [*range(1, len(t1_enc) + 1)]
print(nums)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]


In [12]:
# shuffle() works in place, so shuffle a copy: `nums` keeps its original order
# (the output printed for it stays true) and re-running this cell always starts
# from the same input instead of re-shuffling an already shuffled list.
random_nums = list(nums)
shuffle(random_nums)
random_nums

[7,
 10,
 15,
 18,
 24,
 11,
 27,
 5,
 31,
 29,
 6,
 32,
 19,
 4,
 22,
 17,
 14,
 30,
 8,
 2,
 23,
 1,
 3,
 26,
 16,
 13,
 9,
 12,
 25,
 28,
 20,
 21]

In [13]:
def create_hash(size):
  """Return a random permutation of the integers ``1..size``.

  The permutation plays the role of one MinHash "hash function": entry ``j``
  is the rank assigned to position ``j`` of a one-hot vector.

  Args:
    size: number of positions to permute. Previously this argument was
      ignored and the length of the global ``vocab`` was used instead, so the
      function only worked after ``vocab`` had been defined.

  Returns:
    A new list containing each of ``1..size`` exactly once, in random order.
  """
  # Local name avoids shadowing the built-in ``hash``.
  permutation = list(range(1, size + 1))
  shuffle(permutation)
  return permutation
     

In [14]:
def build_minhash_func(vocab_size: int, nbits: int):
    """Build ``nbits`` hash vectors by calling ``create_hash(vocab_size)`` that many times.

    Args:
        vocab_size: forwarded unchanged to every ``create_hash`` call.
        nbits: how many hash vectors to generate.

    Returns:
        A list with ``nbits`` entries, each the result of one ``create_hash`` call.
    """
    return [create_hash(vocab_size) for _ in range(nbits)]

# Build the 20 hash vectors that create_sig iterates over; each signature
# therefore ends up with (at most) 20 entries.
minhash = build_minhash_func(vocab_size=len(vocab), nbits=20)


In [15]:
def create_sig(onehot_enc: list, hashes=None):
    """Compute the MinHash signature of a one-hot encoded document.

    For every permutation in ``hashes`` the signature records the index (into
    ``onehot_enc``) of the set position that the permutation ranks lowest.
    This is the same value the original nested search produced by trying
    ranks 1, 2, 3, ... and stopping at the first one that lands on a 1.

    Args:
        onehot_enc: sequence of 0/1 flags, one per vocabulary position.
        hashes: optional list of permutations, each indexable by position and
            giving that position's rank. Defaults to the module-level
            ``minhash`` so existing one-argument calls keep working.

    Returns:
        A list with one index per permutation. If ``onehot_enc`` contains no
        1 at all, an empty list is returned (nothing can be selected).
    """
    if hashes is None:
        hashes = minhash

    # Positions of the shingles that are actually present in the document.
    present = [idx for idx, bit in enumerate(onehot_enc) if bit == 1]
    if not present:
        return []

    # Linear argmin per permutation instead of repeated func.index(i) scans,
    # which cost O(n) each inside an O(n) loop.
    return [min(present, key=func.__getitem__) for func in hashes]


In [16]:
# One signature per document, computed from its one-hot encoding.
t1_sig, t2_sig, t3_sig = (create_sig(enc) for enc in (t1_enc, t2_enc, t3_enc))

# Show the first two signatures, one per line.
print(t1_sig, t2_sig, sep="\n")
     

[18, 18, 18, 25, 3, 18, 3, 18, 30, 30, 3, 30, 18, 3, 30, 30, 25, 3, 3, 18]
[11, 27, 4, 21, 5, 5, 4, 10, 5, 21, 21, 4, 24, 23, 4, 4, 10, 5, 0, 4]


In [17]:
def jaccard(a, b):
  """Return the Jaccard similarity ``|a ∩ b| / |a ∪ b|`` of two sets.

  Args:
    a: a set (anything providing ``union`` and ``intersection``).
    b: a set or other iterable accepted by those methods.

  Returns:
    A float in ``[0.0, 1.0]``. When both inputs are empty the union is empty
    and the ratio is undefined; ``1.0`` is returned by convention (two empty
    sets are identical) instead of raising ``ZeroDivisionError``.
  """
  union = a.union(b)
  if not union:
    return 1.0
  return len(a.intersection(b)) / len(union)

In [19]:
(
    jaccard(t1, t2),                    # Jaccard of the two shingle sets
    jaccard(set(t1_sig), set(t2_sig)),  # Jaccard of the distinct signature values
)

(0.0, 0.0)

In [20]:
(
    jaccard(t1, t3),                    # Jaccard of the two shingle sets
    jaccard(set(t1_sig), set(t3_sig)),  # Jaccard of the distinct signature values
)

(0.0, 0.0)

In [21]:
(
    jaccard(t2, t3),                    # Jaccard of the two shingle sets
    jaccard(set(t2_sig), set(t3_sig)),  # Jaccard of the distinct signature values
)

(0.32142857142857145, 0.3333333333333333)

In [22]:

def fill_buckets(signatures, bands):
  """Split a signature into ``bands`` consecutive, equally sized slices.

  Args:
    signatures: the signature to split (any sliceable sequence).
    bands: number of bands to produce; must be positive and divide
      ``len(signatures)`` evenly.

  Returns:
    A list of ``bands`` slices, each ``len(signatures) // bands`` items long,
    in their original order. An empty signature yields an empty list.

  Raises:
    ValueError: if ``bands`` is not positive, or if the signature length is
      not a multiple of ``bands``. The second case previously produced an
      extra short band, or looped forever when ``bands`` exceeded the
      signature length because the step size became 0.
  """
  if bands <= 0:
    raise ValueError("bands must be a positive integer, got {!r}".format(bands))
  if len(signatures) % bands != 0:
    raise ValueError(
        "signature length {} is not divisible by bands={}".format(
            len(signatures), bands))

  rows = len(signatures) // bands
  if rows == 0:
    # Only reachable for an empty signature; range() rejects a step of 0.
    return []
  return [signatures[i:i + rows] for i in range(0, len(signatures), rows)]

In [23]:
# Cut every signature into 10 bands.
band_text1, band_text2, band_text3 = (
    fill_buckets(sig, 10) for sig in (t1_sig, t2_sig, t3_sig)
)

band_text1, band_text2, band_text3

([[18, 18],
  [18, 25],
  [3, 18],
  [3, 18],
  [30, 30],
  [3, 30],
  [18, 3],
  [30, 30],
  [25, 3],
  [3, 18]],
 [[11, 27],
  [4, 21],
  [5, 5],
  [4, 10],
  [5, 21],
  [21, 4],
  [24, 23],
  [4, 4],
  [10, 5],
  [0, 4]],
 [[14, 27],
  [4, 2],
  [29, 2],
  [26, 10],
  [5, 2],
  [7, 6],
  [8, 23],
  [8, 9],
  [10, 5],
  [0, 15]])

In [24]:
# Two documents are a candidate pair as soon as one band, at the same band
# position, is identical in both; stop at the first such band.
#
# The loop targets are named band_a / band_b on purpose: a `for` target stays
# bound after the loop ends, and using t1 / t2 / t3 here overwrote the shingle
# sets of the same names, so earlier cells failed when re-run afterwards.
for bands_a, bands_b in (
    (band_text1, band_text2),
    (band_text2, band_text3),
    (band_text3, band_text1),
):
  for band_a, band_b in zip(bands_a, bands_b):
    if band_a == band_b:
      print("Candidate Pairs = {},{}".format(band_a, band_b))
      break

Candidadte Pairs = [10, 5],[10, 5]
