# Exploring Locality Sensitive Hashing

In [56]:
# Imports

import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
import sys
import functools
import mongomock

In [14]:
# Permutation function

def get_permutations(len_permutations=2048, num_permutations=100):
    return map(lambda _: np.random.permutation(2048), range(num_permutations))

In [44]:
permutations = get_permutations()

In [45]:
def get_min_hash(mol, permutations):
    qfp = list(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048))
    min_hash = []
    for perm in permutations:
        for idx, i in enumerate(perm):
            if qfp_bits[i]:
                min_hash.append(idx)
                break            
    return min_hash

In [46]:
mol = Chem.MolFromSmiles('C1=CC=CC=C1OC')
min_hash = get_min_hash(mol, permutations)


In [54]:
def hash_to_buckets(min_hash, num_buckets=25, nBits=2048):
    if len(min_hash) % num_buckets:
        raise Exception('number of buckets must be divisiable by the hash length')
    buckets = []
    hash_per_bucket = int(len(min_hash) / num_buckets)
    num_bits = (nBits-1).bit_length()
#     if num_bits * hash_per_bucket > sys.maxint.bit_length():
#         raise Exception('numbers are too large to produce valid buckets')
    for b in range(num_buckets):
        buckets.append(functools.reduce(lambda x,y: (x << num_bits) + y, min_hash[b:(b + hash_per_bucket)]))
    return buckets

In [55]:
hash_to_buckets(min_hash)

[250056202389,
 1941707205052,
 782309908621,
 1281762813978,
 3814522409145,
 1211290208280,
 224114294943,
 1589238888575,
 206825584784,
 1366332571687,
 1091525753125,
 1237114759205,
 336236456125,
 2517006411838,
 318620430363,
 1623757740190,
 532689514536,
 232591015954,
 1357377474657,
 343673079808,
 155025670508,
 833224401069,
 1527081060,
 3127462010894,
 1486478143488]

In [57]:
client = mongomock.MongoClient()

In [58]:
db = client.db

In [59]:
db.list_collection_names()

[]

In [60]:
db.molecules.insert_one({'_id': 1, 'molecule': 'boom'})

<pymongo.results.InsertOneResult at 0x11d3e8550>

In [61]:
db.list_collection_names()

['molecules']

In [62]:
db.molecules.find()

<mongomock.collection.Cursor at 0x11af44950>

In [63]:
db.molecules.find_one()

{'_id': 1, 'molecule': 'boom'}