In [1]:
from pathlib import Path
from indigo import Indigo
import numpy as np
import pandas as pd

from substrucure_finder import BucketsInitializer
from substrucure_finder import SearchEngine

import fp_utils
from fp_utils.finders import Finder
from fp_utils.tests import FinderSpeedTester
from fp_utils.catch_time import CatchTime

In [2]:
# Initialise fp_utils' global settings. Judging by the log output below, this
# also sets up Pandarallel (worker count and memory-file-system transfer).
fp_utils.settings.init_fp_utils()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
# Root directory of the search data set. It is handed to BucketsInitializer and
# ComplexFinder below and is the parent of the 'buckets' directory.
data_path = Path("../data/search_data_2/")

In [4]:
# File holding the zero-column indices as whitespace-separated integers
# (read back in the "Search Queries" section).
zero_columns_name = '../data/zero_columns'

# Build structure

In [5]:
# Bucket builder rooted at data_path. columns_count=100 is forwarded as-is to
# BucketsInitializer; its exact meaning is defined in substrucure_finder
# (TODO: document what the 100 columns are used for).
initializer = BucketsInitializer(data_path, columns_count=100)

In [6]:
from substrucure_finder import consts

In [7]:
# Display the fingerprint size (in bits) exposed by substrucure_finder.consts.
consts.fingerprint_size_in_bits

2584

In [8]:
# file = data_path / "raw_buckets/55/0.csv"

In [9]:
# with file.open('r') as f:
#     lines = f.readlines()
#     line = lines[105]

In [10]:
# line

In [11]:
# df = pd.read_csv(file, delimiter=',', header=0, index_col=0, dtype=dict((str(i), bool) for i in range(3)))

In [12]:
# df

In [13]:
# Ensure <data_path>/buckets exists; parents are created as needed and an
# already-existing directory is not an error. Presumably init_buckets() writes
# its results here -- TODO confirm.
(data_path / 'buckets').mkdir(parents=True, exist_ok=True)

In [14]:
%%time
# Build all buckets. The call logs "Start init N" / "Finish init N" for each
# bucket it processes (see output below); %%time reports the total cost.
initializer.init_buckets()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Start init 61
Finish init 61
Start init 62
Finish init 62
Start init 77
Finish init 77
Start init 78
Finish init 78
Start init 79
Finish init 79
Start init 80
Finish init 80
Start init 81
Finish init 81
Start init 82
Finish init 82
Start init 83
Finish init 83
Start init 84
Finish init 84
Start init 85
Finish init 85
Start init 86
Finish init 86
Start init 87
Finish init 87
Start init 88
Finish init 88
Start init 89
Finish init 89
Start init 90
Finish init 90
Start init 91
Finish init 91
Start init 92
Finish init 92
Start init 94
Finish init 94
Start init 95
Finish init 95
Start init 97
Finish init 97
Start init 98
Finish init 98
Start init 99
Finish init 99
Start init 100
Finish init 100
Start init 111
Finish init 111
Start init 112
Finish init 112
Start init 117
Finish init 117
Start init 118
Finish init 118
Start init 119
Finish init 119
Start

# Search Queries

In [25]:
# Read the zero-column indices (whitespace-separated integers) from disk.
with open(zero_columns_name, 'r') as f:
    zero_columns = [int(token) for token in f.read().split()]
# The indices are expected to be stored in ascending order.
assert zero_columns == sorted(zero_columns)

In [26]:
# Ascending list of the column indices in [0, 3736) that are NOT zero columns.
# 3736 matches the default width used by bin_format below.
# NOTE(review): len(full_columns) presumably equals
# consts.fingerprint_size_in_bits -- confirm.
full_columns = sorted(set(range(3736)).difference(zero_columns))

In [27]:
def bin_format(number, num_of_bits=3736):
    """Expand a hexadecimal string into a NumPy array of its individual bits.

    Args:
        number: Hexadecimal string (anything ``int(number, 16)`` accepts).
        num_of_bits: Minimum width of the result; the binary form is
            left-padded with zeros up to this many bits. Values that need
            more bits are not truncated.

    Returns:
        1-D integer array of 0/1 values, most significant bit first.
    """
    bit_string = format(int(number, 16), 'b').rjust(num_of_bits, '0')
    return np.fromiter((int(bit) for bit in bit_string), dtype=int)

In [28]:
def smiles_to_fingerprint(smiles):
    """Turn a SMILES string into the boolean fingerprint used for searching.

    The molecule is loaded with a fresh ``Indigo()`` instance, its ``"sub"``
    fingerprint is requested as a string and expanded to bits with
    ``bin_format`` (i.e. the string is parsed as hexadecimal), and only the
    positions listed in ``full_columns`` are kept.

    Args:
        smiles: Molecule in SMILES notation.

    Returns:
        1-D boolean array with one entry per index in ``full_columns``.
    """
    molecule = Indigo().loadMolecule(smiles)
    hex_fingerprint = molecule.fingerprint("sub").toString()
    all_bits = pd.Series(bin_format(hex_fingerprint))
    kept_bits = all_bits[full_columns].values
    return np.fromiter(kept_bits, dtype=bool)

In [29]:
@CatchTime("search")
def search(smiles: str, finder: Finder, ans_count=None):
    """Run one substructure query and return the matches as a list.

    The ``CatchTime("search")`` decorator reports the elapsed time of every
    call under the label "search" (visible in the run output).

    Args:
        smiles: Query molecule in SMILES notation.
        finder: Finder whose ``find`` method performs the lookup.
        ans_count: Forwarded unchanged as the second argument of
            ``finder.find``; presumably the maximum number of answers, with
            ``None`` meaning no limit -- confirm against ``Finder.find``.

    Returns:
        List built from whatever ``finder.find`` produces.
    """
    query_fp = smiles_to_fingerprint(smiles)
    matches = finder.find(query_fp, ans_count)
    return list(matches)

In [30]:
class ComplexFinder(SearchEngine, Finder):
    """Adapter that plugs ``SearchEngine`` into the ``Finder`` interface."""

    def find_all(self, fp):
        """Return the result of ``self.search`` for the fingerprint ``fp``.

        ``search`` is presumably inherited from ``SearchEngine`` -- confirm.
        """
        results = self.search(fp)
        return results

In [31]:
# Finder instance over the data under data_path; construction time is reported
# in the output below ("ComplexFinder init time").
complex_finder = ComplexFinder(data_path)

0.003s -- ComplexFinder init time


In [32]:
# Load the query SMILES: the first whitespace-separated token of each line
# (anything after it on the line is ignored). Blank or whitespace-only lines
# are skipped so they cannot raise IndexError, and an empty file yields [].
with open('../data/pubchem_994_queries.txt', 'r') as f:
    queries = [line.split()[0] for line in f if line.strip()]

In [33]:
# Sanity check: number of queries loaded and a preview of the first ten.
len(queries), queries[:10]

(157,
 ['Cc1c[n]c2[nH]ccc2c1Cl',
  'OC(=O)c1c[n]c2[nH]ccc2c1Cl',
  'Cc1c(Cl)c[n]c2[nH]ccc21',
  'Oc1c[n]c2[nH]ccc2c1Cl',
  'N[C@@H](C[S@](=O)CCO)C(O)=O',
  'OC(=O)c1ccc2[nH]ccc2[n]1',
  'N[C@@H](C[S@@](=O)CCO)C(O)=O',
  'COC(=O)[C@@H](O)CC([O-])=O',
  'OC(=O)c1[n]ccc2[nH]ccc21',
  'CC[NH2+]CC(C)C'])

In [34]:
# Run every query through complex_finder, asking for ans_count=10, and print
# how many answers came back. The per-query timing line in the output is
# emitted by the CatchTime decorator on search().
for q in queries:
    print(f'\nQuery: {q}')
    # search() already returns a list; the extra list() just copies it.
    answers = list(search(q, complex_finder, ans_count=10))
    print(f'Ans len: {len(answers)}')


Query: Cc1c[n]c2[nH]ccc2c1Cl
1.777s -- search
Ans len: 3

Query: OC(=O)c1c[n]c2[nH]ccc2c1Cl
0.613s -- search
Ans len: 0

Query: Cc1c(Cl)c[n]c2[nH]ccc21
0.747s -- search
Ans len: 2

Query: Oc1c[n]c2[nH]ccc2c1Cl
0.660s -- search
Ans len: 0

Query: N[C@@H](C[S@](=O)CCO)C(O)=O
4.181s -- search
Ans len: 1

Query: OC(=O)c1ccc2[nH]ccc2[n]1
1.003s -- search
Ans len: 0

Query: N[C@@H](C[S@@](=O)CCO)C(O)=O
3.718s -- search
Ans len: 1

Query: COC(=O)[C@@H](O)CC([O-])=O
2.529s -- search
Ans len: 2

Query: OC(=O)c1[n]ccc2[nH]ccc21
0.929s -- search
Ans len: 1

Query: CC[NH2+]CC(C)C
0.151s -- search
Ans len: 10

Query: O=Cc1c[n]c2cc[nH]c2c1
1.041s -- search
Ans len: 1

Query: CCC[C@@H](C)NC
0.286s -- search
Ans len: 10

Query: COCCCOCCNC
0.164s -- search
Ans len: 10

Query: C[C@@H](CN)OC[C@@H]1CCCO1
0.071s -- search
Ans len: 10

Query: NCc1cc2[n]cc[n]c2cc1
0.070s -- search
Ans len: 10

Query: CN(C)C1C[C@@H]2CC[C@H](C1)N2
0.079s -- search
Ans len: 10

Query: CC[C@H](CN)OCCC
0.249s -- search
Ans len: 

0.138s -- search
Ans len: 10

Query: CCN1C(=O)COc2c1cccc2N
0.415s -- search
Ans len: 1

Query: COC(=O)c1c[n]c(Cl)o1
4.618s -- search
Ans len: 0

Query: CN1CCN(CC1)c1cc(C=O)ccc1
0.420s -- search
Ans len: 10

Query: O=C1CCC[C@H]2Nc3ccccc3[C@@H]12
0.195s -- search
Ans len: 10

Query: CCC[n]1c[n][n]c1[C@@H](C)N
0.065s -- search
Ans len: 10

Query: CCc1[n]c[n]cc1Br
0.203s -- search
Ans len: 10

Query: CCC[n]1c[n][n]c1[C@H](C)N
0.091s -- search
Ans len: 10

Query: CC(C)[n]1c[n][n]c1[C@H](C)N
1.262s -- search
Ans len: 10

Query: CC[C@H](N)c1ccc(CC)cc1
0.092s -- search
Ans len: 10

Query: C[C@@H]1Oc2c(N)cccc2N(C)C1=O
0.369s -- search
Ans len: 0

Query: Cc1ccc(cc1)N1C[C@@H](N)CC1=O
0.129s -- search
Ans len: 10

Query: C[C@H]1Oc2c(N)cccc2N(C)C1=O
0.279s -- search
Ans len: 0

Query: Cc1cccc(CO)c1NC
0.200s -- search
Ans len: 10

Query: C[C@@H]1CC(=O)N(C)C1
0.078s -- search
Ans len: 10

Query: CN1C(=O)COc2c(N)cccc12
0.314s -- search
Ans len: 1

Query: CC(C)[n]1c[n][n]c1[C@@H](C)N
0.680s -- search
A