In [1]:
from pathlib import Path
from indigo import Indigo
import numpy as np
import pandas as pd

from substrucure_finder import BucketsInitializer
from substrucure_finder import SearchEngine

import fp_utils
from fp_utils.finders import Finder
from fp_utils.tests import FinderSpeedTester
from fp_utils.catch_time import CatchTime

In [2]:
# Run fp_utils' settings initialisation once, right after the imports.
# NOTE(review): the log printed by this cell suggests it also configures
# pandarallel (4 workers) — confirm in fp_utils.settings.
fp_utils.settings.init_fp_utils()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


# Build structure

In [3]:
# Directory passed to both BucketsInitializer and ComplexFinder in this notebook.
# The path is relative, so it resolves against the kernel's working directory.
data_path = Path("../data/search_data_0/")

In [4]:
# NOTE(review): columns_count=100 is an unexplained tuning value — presumably the
# number of fingerprint columns used when splitting into buckets; confirm against
# BucketsInitializer before changing it.
initializer = BucketsInitializer(data_path, columns_count=100)

In [62]:
%%time
# Build step, timed by the %%time cell magic (which must stay the first line of the cell).
# NOTE(review): this cell's execution count (62) is out of sequence with the rest of
# the notebook — re-run from a fresh kernel to confirm the notebook still reproduces.
initializer.init_buckets()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Start init 63
Finish init 63
Start init 64
Finish init 64
Start init 71
Finish init 71
Start init 72
Finish init 72
Start init 89
Finish init 89
Start init 90
Finish init 90
Start init 105
Finish init 105
Start init 106
Finish init 106
Start init 165
Finish init 165
Start init 166
Finish init 166
Start init 169
Finish init 169
Start init 170
Finish init 170
Start init 199
Finish init 199
Start init 200
Finish init 200
Start init 209
Finish init 209
Start init 210
Finish init 210
Start init 293
Finish init 293
Start init 294
Finish init 294
Start init 297
Finish init 297
Start init 298
Finish init 298
Start init 307
Finish init 307
Start init 308
Finish init 308
Start init 313
Finish init 313
Start init 314
Finish init 314
Start init 364
Finish init 364
Start init 366
Finish init 366
Start init 373
Finish init 373
Start init 374
Finish init 374
St

# Search Queries

In [5]:
# Text file with whitespace-separated integer column indices (read in the next cell).
# NOTE(review): presumably these are fingerprint columns that are always zero — confirm.
zero_columns_name = '../data/zero_columns'

In [6]:
# Parse every whitespace-separated token of the file as an integer column index.
with open(zero_columns_name, 'r') as f:
    zero_columns = [int(token) for token in f.read().split()]
# The indices are expected to be stored in ascending order already.
assert sorted(zero_columns) == zero_columns

In [7]:
# Ascending list of every column index in [0, 3736) that is not in zero_columns;
# 3736 matches the default num_of_bits of bin_format.
full_columns = sorted(set(range(3736)).difference(zero_columns))

In [8]:
def bin_format(number, num_of_bits=3736):
    """Expand a hexadecimal string into an array of its individual bits.

    Args:
        number: Hexadecimal string, parsed with ``int(number, 16)``.
        num_of_bits: Minimum length of the result. Shorter values are
            left-padded with zeros; longer values are not truncated.

    Returns:
        1-D integer numpy array of 0/1 values, most significant bit first.
    """
    bit_string = format(int(number, 16), 'b').zfill(num_of_bits)
    return np.fromiter((int(bit) for bit in bit_string), dtype=int)

In [9]:
def smiles_to_fingerprint(smiles):
    """Compute the "sub" fingerprint of a SMILES string, restricted to ``full_columns``.

    Args:
        smiles: SMILES string, loaded with ``Indigo().loadMolecule``.

    Returns:
        1-D integer numpy array holding the fingerprint bits at the positions
        listed in the module-level ``full_columns``, in that order.
    """
    molecule = Indigo().loadMolecule(smiles)
    # toString() is parsed as hexadecimal by bin_format, which returns a numpy array.
    full_fp = bin_format(molecule.fingerprint("sub").toString())
    # Select the kept columns with numpy integer-array indexing; no need to
    # round-trip through a pandas Series just to pick positions.
    return full_fp[full_columns]

In [10]:
@CatchTime("search")
def search(smiles: str, finder: Finder, ans_count=None):
    """Fingerprint ``smiles`` and collect the results of ``finder.find`` into a list.

    Args:
        smiles: Query molecule as a SMILES string.
        finder: Object whose ``find(fp, ans_count)`` is called with the fingerprint.
        ans_count: Forwarded unchanged to ``finder.find``.

    Returns:
        List built from whatever ``finder.find`` yields.
    """
    query_fp = smiles_to_fingerprint(smiles)
    matches = finder.find(query_fp, ans_count)
    return list(matches)

In [11]:
class ComplexFinder(SearchEngine, Finder):
    """Combines SearchEngine and Finder; ``find_all`` delegates to ``self.search``."""

    def find_all(self, fp):
        """Return the result of ``self.search`` for the fingerprint ``fp``."""
        matches = self.search(fp)
        return matches

In [12]:
# Finder instance used by the query loop; constructed from the same data_path
# that was given to BucketsInitializer.
complex_finder = ComplexFinder(data_path)

0.010s -- ComplexFinder init time


In [13]:
# The first whitespace-separated token of each line is taken as the query
# SMILES; anything after it on the line is ignored.
with open('../data/pubchem_994_queries.txt', 'r') as f:
    # Blank lines are skipped so that an empty file or a stray empty line does
    # not raise IndexError on split()[0].
    queries = [line.split()[0] for line in f if line.strip()]

In [14]:
# Sanity check: number of loaded queries and a preview of the first ten.
len(queries), queries[:10]

(157,
 ['Cc1c[n]c2[nH]ccc2c1Cl',
  'OC(=O)c1c[n]c2[nH]ccc2c1Cl',
  'Cc1c(Cl)c[n]c2[nH]ccc21',
  'Oc1c[n]c2[nH]ccc2c1Cl',
  'N[C@@H](C[S@](=O)CCO)C(O)=O',
  'OC(=O)c1ccc2[nH]ccc2[n]1',
  'N[C@@H](C[S@@](=O)CCO)C(O)=O',
  'COC(=O)[C@@H](O)CC([O-])=O',
  'OC(=O)c1[n]ccc2[nH]ccc21',
  'CC[NH2+]CC(C)C'])

In [None]:
# Run every query through complex_finder and report how many answers came back.
# NOTE(review): ans_count=10 presumably caps the number of answers per query — confirm in Finder.find.
for q in queries:
    print(f'\nQuery: {q}')
    answers = list(search(q, complex_finder, ans_count=10))
    answers_count = len(answers)
    print(f'Ans len: {answers_count}')


Query: Cc1c[n]c2[nH]ccc2c1Cl
7.858s -- search
Ans len: 3

Query: OC(=O)c1c[n]c2[nH]ccc2c1Cl
5.526s -- search
Ans len: 0

Query: Cc1c(Cl)c[n]c2[nH]ccc21
7.349s -- search
Ans len: 2

Query: Oc1c[n]c2[nH]ccc2c1Cl
7.105s -- search
Ans len: 0

Query: N[C@@H](C[S@](=O)CCO)C(O)=O
21.196s -- search
Ans len: 1

Query: OC(=O)c1ccc2[nH]ccc2[n]1
9.242s -- search
Ans len: 0

Query: N[C@@H](C[S@@](=O)CCO)C(O)=O
20.916s -- search
Ans len: 1

Query: COC(=O)[C@@H](O)CC([O-])=O
18.026s -- search
Ans len: 2

Query: OC(=O)c1[n]ccc2[nH]ccc21
7.817s -- search
Ans len: 1

Query: CC[NH2+]CC(C)C
0.384s -- search
Ans len: 10

Query: O=Cc1c[n]c2cc[nH]c2c1
9.736s -- search
Ans len: 1

Query: CCC[C@@H](C)NC
0.141s -- search
Ans len: 10

Query: COCCCOCCNC
0.247s -- search
Ans len: 10

Query: C[C@@H](CN)OC[C@@H]1CCCO1
0.131s -- search
Ans len: 10

Query: NCc1cc2[n]cc[n]c2cc1
0.352s -- search
Ans len: 10

Query: CN(C)C1C[C@@H]2CC[C@H](C1)N2
0.266s -- search
Ans len: 10

Query: CC[C@H](CN)OCCC
0.164s -- search
Ans le