In [1]:
from pathlib import Path
from indigo import Indigo
import numpy as np
import pandas as pd

from substrucure_finder import BucketsInitializer
from substrucure_finder import SearchEngine

import fp_utils
from fp_utils.finders import Finder
from fp_utils.tests import FinderSpeedTester
from fp_utils.catch_time import CatchTime

In [2]:
# One-time fp_utils setup; judging by the cell output, this is where
# pandarallel gets configured (number of workers, memory file system).
fp_utils.settings.init_fp_utils()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
# Directory with the search data; handed to both BucketsInitializer and
# ComplexFinder further down.
data_path = Path("../data/search_data_0/")

In [4]:
# Text file of whitespace-separated integer column indices; these columns are
# removed from the fingerprint in the "Search Queries" section.
# NOTE(review): presumably the fingerprint columns that are always zero - confirm.
zero_columns_name = '../data/zero_columns'

# Build structure

In [5]:
# Set up the bucket builder over the data directory.
# NOTE(review): columns_count=100 is a magic number - presumably how many
# fingerprint columns the buckets are built on; confirm and consider a named constant.
initializer = BucketsInitializer(data_path, columns_count=100)

In [6]:
%%time
# Build the buckets. The output shows a "Start init N" / "Finish init N" pair
# per unit of work (presumably one per bucket - confirm).
# NOTE(review): this step is not cached, so every Restart & Run All repeats it.
initializer.init_buckets()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Start init 63
Finish init 63
Start init 64
Finish init 64
Start init 71
Finish init 71
Start init 72
Finish init 72
Start init 75
Finish init 75
Start init 76
Finish init 76
Start init 77
Finish init 77
Start init 78
Finish init 78
Start init 81
Finish init 81
Start init 82
Finish init 82
Start init 83
Finish init 83
Start init 84
Finish init 84
Start init 87
Finish init 87
Start init 88
Finish init 88
Start init 89
Finish init 89
Start init 90
Finish init 90
Start init 99
Finish init 99
Start init 100
Finish init 100
Start init 103
Finish init 103
Start init 104
Finish init 104
Start init 105
Finish init 105
Start init 106
Finish init 106
Start init 107
Finish init 107
Start init 108
Finish init 108
Start init 109
Finish init 109
Start init 110
Finish init 110
Start init 111
Finish init 111
Start init 112
Finish init 112
Start init 115
Finish in

# Search Queries

In [7]:
# Read the whitespace-separated integer column indices from the zero-columns file.
with open(zero_columns_name, 'r') as f:
    zero_columns = [int(token) for token in f.read().split()]
# The indices are expected in ascending order; stop here if the file is not sorted.
assert zero_columns == sorted(zero_columns)

In [8]:
# Ascending list of the column indices in range(3736) that are NOT listed in
# zero_columns; 3736 matches the default bit width used by bin_format.
full_columns = sorted(set(range(3736)).difference(zero_columns))

In [9]:
def bin_format(number, num_of_bits=3736):
    """Expand a hexadecimal string into an array of its individual bits.

    Args:
        number: Hexadecimal string; parsed with ``int(number, 16)``.
        num_of_bits: Minimum length of the result. The binary representation
            is left-padded with zeros up to this width; values that need more
            bits are returned at their full length (no truncation).

    Returns:
        One-dimensional NumPy integer array of 0/1 values, most significant
        bit first.
    """
    bit_string = bin(int(number, 16))[2:].zfill(num_of_bits)
    return np.fromiter((int(bit) for bit in bit_string), dtype=int)

In [10]:
def smiles_to_fingerprint(smiles):
    """Compute the reduced substructure fingerprint for a SMILES string.

    The molecule is loaded with Indigo, its ``"sub"`` fingerprint is expanded
    to a bit array with ``bin_format``, and only the positions listed in the
    module-level ``full_columns`` are kept.

    Args:
        smiles: SMILES string of the query molecule.

    Returns:
        One-dimensional NumPy integer array of 0/1 values, one entry per
        index in ``full_columns`` (in that order).
    """
    molecule = Indigo().loadMolecule(smiles)
    full_fp = bin_format(molecule.fingerprint("sub").toString())
    # Integer-list indexing on the ndarray already returns a fresh array with
    # the same dtype, so no pandas Series / np.fromiter round-trip is needed.
    return full_fp[full_columns]

In [11]:
@CatchTime("search")
def search(smiles: str, finder: Finder, ans_count=None):
    """Look up a SMILES query with the given finder.

    Args:
        smiles: SMILES string of the query molecule.
        finder: Object whose ``find(fingerprint, ans_count)`` performs the lookup.
        ans_count: Forwarded unchanged to ``finder.find``; ``None`` by default.

    Returns:
        List built from whatever ``finder.find`` yields.
    """
    query_fp = smiles_to_fingerprint(smiles)
    matches = finder.find(query_fp, ans_count)
    return list(matches)

In [12]:
class ComplexFinder(SearchEngine, Finder):
    """SearchEngine exposed through the Finder interface via ``find_all``."""

    def find_all(self, fp):
        """Return the result of ``self.search`` for fingerprint ``fp``."""
        results = self.search(fp)
        return results

In [13]:
# Instantiate the finder over the same data directory used to build the
# buckets; the construction time is reported in the cell output.
complex_finder = ComplexFinder(data_path)

0.003s -- ComplexFinder init time


In [14]:
# Load the query SMILES: the first whitespace-separated token of each line.
# Blank or whitespace-only lines are skipped, so an empty file or a stray
# empty line yields no entry instead of an IndexError from `split()[0]`.
with open('../data/pubchem_994_queries.txt', 'r') as f:
    queries = [line.split()[0] for line in f if line.strip()]

In [15]:
# Sanity check: how many queries were loaded, plus a preview of the first ten.
len(queries), queries[:10]

(157,
 ['Cc1c[n]c2[nH]ccc2c1Cl',
  'OC(=O)c1c[n]c2[nH]ccc2c1Cl',
  'Cc1c(Cl)c[n]c2[nH]ccc21',
  'Oc1c[n]c2[nH]ccc2c1Cl',
  'N[C@@H](C[S@](=O)CCO)C(O)=O',
  'OC(=O)c1ccc2[nH]ccc2[n]1',
  'N[C@@H](C[S@@](=O)CCO)C(O)=O',
  'COC(=O)[C@@H](O)CC([O-])=O',
  'OC(=O)c1[n]ccc2[nH]ccc21',
  'CC[NH2+]CC(C)C'])

In [None]:
# Run every query through the finder, requesting ans_count=10 answers each.
# The "<seconds>s -- search" line in the output comes from the CatchTime
# decorator on search(), not from this loop.
for q in queries:
    print(f'\nQuery: {q}')
    # NOTE(review): search() already returns a list, so this list() is a redundant copy.
    answers = list(search(q, complex_finder, ans_count=10))
    print(f'Ans len: {len(answers)}')


Query: Cc1c[n]c2[nH]ccc2c1Cl
18.611s -- search
Ans len: 3

Query: OC(=O)c1c[n]c2[nH]ccc2c1Cl
13.209s -- search
Ans len: 0

Query: Cc1c(Cl)c[n]c2[nH]ccc21
13.798s -- search
Ans len: 2

Query: Oc1c[n]c2[nH]ccc2c1Cl
12.688s -- search
Ans len: 0

Query: N[C@@H](C[S@](=O)CCO)C(O)=O
40.424s -- search
Ans len: 1

Query: OC(=O)c1ccc2[nH]ccc2[n]1
15.583s -- search
Ans len: 0

Query: N[C@@H](C[S@@](=O)CCO)C(O)=O
38.689s -- search
Ans len: 1

Query: COC(=O)[C@@H](O)CC([O-])=O
34.491s -- search
Ans len: 2

Query: OC(=O)c1[n]ccc2[nH]ccc21
13.260s -- search
Ans len: 1

Query: CC[NH2+]CC(C)C
0.658s -- search
Ans len: 10

Query: O=Cc1c[n]c2cc[nH]c2c1
16.641s -- search
Ans len: 1

Query: CCC[C@@H](C)NC
0.261s -- search
Ans len: 10

Query: COCCCOCCNC
0.469s -- search
Ans len: 10

Query: C[C@@H](CN)OC[C@@H]1CCCO1
0.196s -- search
Ans len: 10

Query: NCc1cc2[n]cc[n]c2cc1
0.686s -- search
Ans len: 10

Query: CN(C)C1C[C@@H]2CC[C@H](C1)N2
0.702s -- search
Ans len: 10

Query: CC[C@H](CN)OCCC
0.657s -- search