In [567]:
import numpy as np
from collections import Counter
import random
from copy import copy, deepcopy


In [710]:
def get_document_shingles(docs: list, k: int):
    """Split every document into its overlapping character k-shingles.

    Each document is lower-cased and stripped of all whitespace before it is
    shingled, so "How are" and "howare" produce the same shingles.

    Args:
        docs: List of document strings.
        k: Shingle length in characters.

    Returns:
        A list with one entry per document, in input order. Each entry is the
        list of that document's k-character substrings in order of occurrence
        (duplicates are kept). A document shorter than ``k`` after
        normalisation yields an empty list.
    """
    def normalise(text):
        # Lower-case, then drop every whitespace character.
        return "".join(text.lower().split())

    per_doc_shingles = []
    for text in map(normalise, docs):
        last_start = len(text) - k
        per_doc_shingles.append(
            [text[start:start + k] for start in range(last_start + 1)]
        )
    return per_doc_shingles


def get_shingles_vs_docs(doc_shingles : list):
    """Build the shingle-by-document matrix and the shingle -> row lookup.

    Every distinct shingle is assigned a row index in order of first
    appearance across ``doc_shingles``.

    Args:
        doc_shingles: One list of (hashable) shingles per document.

    Returns:
        A tuple ``(shingle_doc_matrix, shingle_store)``:

        * ``shingle_doc_matrix`` -- float array of shape
          ``(number of distinct shingles, number of documents)``. A cell is
          ``0`` when the document lacks the shingle and ``row index + 1``
          when it contains it.
        * ``shingle_store`` -- dict mapping each shingle to its 0-based row
          index.

        An empty ``doc_shingles`` gives a ``(0, 0)`` array and an empty dict.
    """
    shingle_store = {}
    for shingles in doc_shingles:
        for shingle in shingles:
            if shingle not in shingle_store:
                shingle_store[shingle] = len(shingle_store)

    number_of_docs = len(doc_shingles)
    shingle_doc_matrix = np.zeros((len(shingle_store), number_of_docs))

    for doc_id, shingles in enumerate(doc_shingles):
        # Visit each distinct shingle of the document once instead of testing
        # every known shingle against the document's list.
        for shingle in set(shingles):
            row = shingle_store[shingle]
            # Store row + 1 rather than row: writing the bare row index would
            # put 0 in row 0, making the first shingle look absent everywhere.
            shingle_doc_matrix[row, doc_id] = row + 1

    return shingle_doc_matrix, shingle_store
    

def get_random_hash_functions(n):
    """Draw ``n`` random coefficient pairs for linear hash functions.

    Args:
        n: Number of hash functions to generate.

    Returns:
        A list of ``n`` two-element lists ``[a, b]`` with ``1 <= a <= 1000``
        and ``0 <= b <= 1000``, suitable as the ``a`` and ``b`` arguments of
        ``get_hashed_value``. Values come from the global ``random`` module
        state, so results vary between runs unless the caller seeds it.
    """
    # a starts at 1: a multiplier of 0 would map every input to the same
    # value b, giving a signature row that cannot tell documents apart.
    return [[random.randint(1, 1000), random.randint(0, 1000)] for _ in range(n)]

def get_hashed_value(x, a, b):
    """Return the linear hash ``(x*a + b) mod 4294967311`` converted to ``int``.

    Args:
        x: Value to hash.
        a: Multiplier coefficient.
        b: Offset coefficient.
    """
    modulus = 4294967311
    linear = x * a + b
    return int(linear % modulus)

def get_min_hashed_matrix(matrix, n=1000):
    """Compute a MinHash signature for every document (column) of ``matrix``.

    For each document, the row indices of its non-zero cells are taken as the
    shingle ids. Each of ``n`` random linear hash functions is applied to
    those ids and the smallest hash value is kept.

    Args:
        matrix: 2-D NumPy array indexed ``[shingle_row, document]``; a
            non-zero cell means the document contains that shingle.
        n: Number of random hash functions, i.e. the signature length.

    Returns:
        A list with one entry per document; each entry is a list of ``n``
        minimum hash values. The layout is documents x hashes, the transpose
        of what ``minhash_doc_signature`` returns. A document with no
        shingles gets ``float('inf')`` in every position.
    """
    hash_funcs = get_random_hash_functions(n)
    all_doc_signature = []

    for doc_id in range(matrix.shape[1]):
        # Identify shingles by row index rather than by the stored cell value.
        # For matrices whose cells hold the row index this is the same id; it
        # also works for 0/1 matrices and hands plain ints to the hash
        # instead of 1-element arrays.
        shingle_ids = [
            row for row in range(matrix.shape[0]) if matrix[row, doc_id] != 0
        ]

        doc_signature = [
            # default= keeps an empty document from raising ValueError.
            min(
                (get_hashed_value(shingle_id, a, b) for shingle_id in shingle_ids),
                default=float("inf"),
            )
            for a, b in hash_funcs
        ]
        all_doc_signature.append(doc_signature)

    return all_doc_signature
        
def minhash_doc_signature(matrix,n=12):
    """Build the MinHash signature matrix for a shingle-by-document matrix.

    Args:
        matrix: 2-D NumPy array indexed ``[shingle_row, document]``; a
            non-zero cell marks the shingle as present in the document.
        n: Number of random hash functions to use.

    Returns:
        Float array of shape ``(n, number of documents)``. Entry ``[i, c]`` is
        the smallest value of hash function ``i`` over the row indices present
        in document ``c``, or ``inf`` if the document has no non-zero cell.
    """
    num_rows, num_docs = matrix.shape[0], matrix.shape[1]
    coefficients = get_random_hash_functions(n)

    # One lookup table per hash function: table[r] is the hash of row index r.
    row_hashes = []
    for coeff in coefficients:
        row_hashes.append(
            [get_hashed_value(r, coeff[0], coeff[1]) for r in range(num_rows)]
        )

    # Start every entry at +inf so the first real hash always replaces it.
    signature_matrix = np.full((n, num_docs), float('inf'))
    for col in range(num_docs):
        present_rows = [r for r in range(num_rows) if matrix[r, col] != 0]
        for h, table in enumerate(row_hashes):
            for r in present_rows:
                if table[r] < signature_matrix[h, col]:
                    signature_matrix[h, col] = table[r]
    return signature_matrix



In [715]:
# Sample corpus for the shingling / MinHash demo. Several documents share
# phrases, so some pairs overlap heavily and others not at all.
docs = [
    "how are you doing",
    "how are you",
    "the quick brown fox jumped over a lazy dog",
    "the quick dog fox jumped over a lazy cat",
    "a lazy dog",
    "minhashing is a technique",
    "minhash is effiencent",
    "I'm doing minhashing",
]
# Keep the individual names available alongside the list.
d0, d1, d2, d3, d4, d5, d6, d7 = docs

In [718]:
# Shingle length, in characters, used for every document.
k = 3
# Character k-shingles per document (lower-cased, whitespace removed).
doc_shingles = get_document_shingles(docs,k)
# `matrix` is the shingle-by-document array; `rows` maps shingle -> row index.
matrix, rows = get_shingles_vs_docs(doc_shingles)
# 1000 hash functions: one signature row per hash function, one column per document.
# NOTE(review): the hash coefficients come from the `random` module and no seed is
# set in the visible cells, so the signature values differ between runs.
signature_matrix = minhash_doc_signature(matrix,1000)
# Displayed as the cell result: (number of hash functions, number of documents).
signature_matrix.shape

(1000, 8)