In [0]:
!pip install -q datasketch

In [0]:
from datasketch import MinHash,MinHashLSH,LeanMinHash,MinHashLSHForest,MinHashLSHEnsemble,WeightedMinHashGenerator
# Note: libMHCUDA can be used for GPU speed acceleration

In [0]:
# Three toy "documents" represented as word sets.
# set2 differs from set1 in two words; set3 is set2 without the word 'a'.
set1 = {
    'minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
    'estimating', 'the', 'similarity', 'between', 'datasets',
}
set2 = {
    'minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
    'estimating', 'the', 'similarity', 'between', 'documents',
}
set3 = {
    'minhash', 'is', 'probability', 'data', 'structure', 'for',
    'estimating', 'the', 'similarity', 'between', 'documents',
}

In [0]:
# One empty MinHash signature per word set, each built with num_perm=128.
m1, m2, m3 = (MinHash(num_perm=128) for _ in range(3))

In [0]:
# Feed every word of each set into its matching MinHash.
# Words are UTF-8 encoded to bytes before being passed to update().
for mh, words in ((m1, set1), (m2, set2), (m3, set3)):
    for d in words:
        mh.update(d.encode('utf8'))

In [0]:
# Once no further update() calls are needed, wrap each MinHash in a LeanMinHash
# to save storage (the permutations and hash codes are not stored).
l_m1, l_m2, l_m3 = (LeanMinHash(mh) for mh in (m1, m2, m3))

In [0]:
# MinHashLSH: retrieve the indexed sets that are close to a query at the given threshold.
# `weights` is (false-positive weight, false-negative weight); the two sum to 1
# and steer the choice of bands 'b' and rows 'r'.
# With t = (1/b)^(1/r): a lower 't' avoids false negatives, a higher 't' avoids
# false positives.
lsh = MinHashLSH(threshold=0.7, num_perm=128, weights=(0.5, 0.5))
for key, lean in (("m1", l_m1), ("m2", l_m2)):
    lsh.insert(key, lean)

In [0]:
# Keys of the indexed sets returned as matches for set3's signature.
result = lsh.query(l_m3)
result

['m2']

In [0]:
# MinHashLSHForest: returns the top matches for a given set instead of
# everything above a threshold.
forest = MinHashLSHForest(num_perm=128)

In [0]:
# Register the two reference signatures under their string keys.
for key, lean in (("m1", l_m1), ("m2", l_m2)):
    forest.add(key, lean)

In [0]:
# index() must be called after adding keys; otherwise the forest does not work on them.
forest.index()
# Sanity check: both keys are present in the forest.
all(key in forest for key in ("m2", "m1"))

True

In [0]:
# Ask the forest for the top 2 matches for set3's signature.
result = forest.query(l_m3, k=2)
result

['m2', 'm1']

In [0]:
# LSH Ensemble for containment searches.
# Each indexed entry is a (key, signature, size of the original set) triple.
lshensemble = MinHashLSHEnsemble(threshold=0.8, num_perm=128, num_part=32)
entries = [("m2", l_m2, len(set2)), ("m1", l_m1, len(set1))]
lshensemble.index(entries)
# Sanity check: both keys are present in the ensemble.
all(key in lshensemble for key in ("m1", "m2"))

True

In [0]:
# Containment query for set3: pass its signature together with the set's size.
# Uses the frozen l_m3, like every other query in this notebook. The name `m3`
# is rebound to a weighted MinHash in a later cell, so querying with `m3` made
# this cell depend on execution order.
for key in lshensemble.query(l_m3, len(set3)):
    print(key)

m2
m1


In [0]:
#Weighted MinHash
import numpy as np

# Seeded generator so the example weight vectors, and every result derived
# from them, are the same on each Restart & Run All.
rng = np.random.default_rng(10)

# Three weight vectors of 10 integers each, drawn from [1, 8).
v1 = rng.integers(1, 8, size=10)
v2 = rng.integers(1, 8, size=10)
v3 = rng.integers(1, 8, size=10)

# Show the three vectors side by side, one component per row.
for i, j, k in zip(v1, v2, v3):
    print(i, " ", j, " ", k)

# `dim` is the fixed length of the weight vectors the generator will be given.
wmg = WeightedMinHashGenerator(dim=len(v1), sample_size=5, seed=10)

2   2   3
2   2   2
3   1   6
7   3   4
7   1   6
5   2   7
2   3   5
3   4   5
6   7   2
6   3   5


In [0]:
# Weighted MinHash signatures for the three weight vectors.
# NOTE: m1/m2/m3 and lsh are rebound here; from this cell on they no longer
# refer to the plain MinHash objects / index created earlier in the notebook.
m1 = wmg.minhash(v1)
m2 = wmg.minhash(v2)
m3 = wmg.minhash(v3)

# num_perm has to match the generator's sample size, so read it from `wmg`
# instead of repeating the literal 5.
lsh = MinHashLSH(threshold=0.4, num_perm=wmg.sample_size)
for key, weighted in (("m1", m1), ("m2", m2)):
    lsh.insert(key, weighted)

In [0]:
# Keys of the indexed weighted signatures returned as matches for v3's signature.
result = lsh.query(m3)
result

['m2', 'm1']