In [1]:
from pathlib import Path
from indigo import Indigo
import numpy as np
import pandas as pd
import asyncio

from substrucure_finder import BucketsInitializer
from substrucure_finder import SearchEngine

import fp_utils
from fp_utils.finders import Finder
from fp_utils.tests import FinderSpeedTester
from fp_utils.catch_time import CatchTime

SyntaxError: 'await' outside async function (search_engine.py, line 31)

In [None]:
# Run the initialisation routine exposed by fp_utils' settings module.
fp_utils.settings.init_fp_utils()

In [None]:
# Root data directory; the relative path is resolved against the kernel's working directory.
data_path = Path("../data/")

In [None]:
# File holding the fingerprint column indices that are excluded when `full_columns` is built.
# NOTE(review): repeats the "../data/" prefix already stored in data_path — consider deriving it from there.
zero_columns_name = '../data/zero_columns'

# Build structure

In [None]:
# Dataset sub-directory under data_path. With the empty string, `data_path / data_name`
# resolves to data_path itself.
data_name = ''

In [None]:
# Bucket initializer for the dataset directory, configured with columns_count=100.
initializer = BucketsInitializer(data_path / data_name, columns_count=100)

In [None]:
# Create the 'buckets' sub-directory; exist_ok=True makes re-running this cell harmless.
# The dataset directory itself must already exist because parents=True is not passed.
(data_path / data_name / 'buckets').mkdir(exist_ok=True)

In [None]:
%%time
# Build the buckets; the %%time cell magic (which must stay on the first line) reports the cell's run time.
initializer.init_buckets()

# Search Queries

In [None]:
# Parse the whitespace-separated column indices from the zero-columns file
# and verify that they are stored in ascending order.
with open(zero_columns_name, 'r') as f:
    zero_columns = [int(token) for token in f.read().split()]
assert zero_columns == sorted(zero_columns)

In [None]:
# Ascending list of the fingerprint column indices (out of 3736) that are not listed in zero_columns.
full_columns = sorted(set(range(3736)).difference(zero_columns))

In [None]:
def bin_format(number, num_of_bits=3736):
    """Expand a hexadecimal string into an integer array of its bits.

    Args:
        number: Hexadecimal string (anything accepted by ``int(number, 16)``).
        num_of_bits: Minimum length of the result; shorter bit strings are
            left-padded with zeros. Longer ones are NOT truncated.

    Returns:
        1-D ``numpy`` array of dtype ``int`` holding 0/1 values, most
        significant bit first.
    """
    value = int(number, 16)
    bit_string = bin(value)[2:].zfill(num_of_bits)
    return np.array([int(bit) for bit in bit_string], dtype=int)

In [None]:
def smiles_to_fingerprint(smiles):
    """Compute the reduced substructure fingerprint of a molecule.

    The molecule is loaded with a fresh ``Indigo`` instance, its "sub"
    fingerprint is expanded to bits with ``bin_format`` and only the
    positions listed in the notebook-level ``full_columns`` are kept.

    Args:
        smiles: Molecule description passed to ``Indigo.loadMolecule``.

    Returns:
        1-D boolean ``numpy`` array with one entry per index in ``full_columns``.
    """
    molecule = Indigo().loadMolecule(smiles)
    hex_fingerprint = molecule.fingerprint("sub").toString()
    all_bits = pd.Series(bin_format(hex_fingerprint))
    kept_bits = all_bits[full_columns].values
    return np.fromiter(kept_bits, dtype=bool)

In [None]:
@CatchTime("search")
def search(smiles: str, finder: Finder, ans_count=None):
    """Run a single query given as SMILES against ``finder``.

    Args:
        smiles: Query molecule; converted with ``smiles_to_fingerprint``.
        finder: Object whose ``find(fp, ans_count)`` yields the answers.
        ans_count: Forwarded unchanged to ``finder.find``.

    Returns:
        The answers produced by ``finder.find``, materialised as a list.
    """
    query_fp = smiles_to_fingerprint(smiles)
    matches = finder.find(query_fp, ans_count)
    return list(matches)

In [None]:
class ComplexFinder(SearchEngine, Finder):
    """Finder whose ``find_all`` delegates to ``SearchEngine.search``."""

    def find_all(self, fp):
        """Return whatever ``self.search`` produces for the fingerprint ``fp``."""
        matches = self.search(fp)
        return matches

In [None]:
class AsyncFinder(SearchEngine, Finder):
    """Finder whose ``find_all`` delegates to ``SearchEngine.async_search``."""

    def find_all(self, fp):
        """Return whatever ``self.async_search`` produces for the fingerprint ``fp``.

        NOTE(review): if ``async_search`` is a coroutine function, the value
        returned here is an un-awaited coroutine — confirm against SearchEngine.
        """
        matches = self.async_search(fp)
        return matches

In [None]:
# mcc100_2kk = ComplexFinder(data_path / '2kk_100_mcc')
# mcc100_2kk_async = AsyncFinder(data_path / '2kk_100_mcc')
# mcc100_17kk_heur2000 = ComplexFinder(data_path / '17kk_100_heur2000')
# mcc100_17kk_heur2000_async = AsyncFinder(data_path / '17kk_100_heur2000')
# mcc100_2kk_heur1000 = ComplexFinder(data_path / '2kk_100_heur1000')
# mcc100_2kk_heur1500 = ComplexFinder(data_path / '2kk_100_heur1500')
# mcc100_2kk_heur1700 = ComplexFinder(data_path / '2kk_100_heur1700')

In [None]:
# mcc100_17kk = ComplexFinder(data_path / '17kk_100_heur2000')

In [None]:
# Load the query SMILES: the first whitespace-separated token of every line.
# Blank or whitespace-only lines are skipped so they cannot raise IndexError on `split()[0]`.
with open('../data/pubchem_994_queries.txt', 'r') as f:
    queries = [line.split()[0] for line in f if line.strip()]

In [None]:
# Show how many queries were loaded together with the first ten of them.
len(queries), queries[:10]

In [None]:
# Fingerprint every query once, in the same order as `queries`.
queries_fp = list(map(smiles_to_fingerprint, queries))

In [None]:
import random

In [None]:
# Benchmark on 30 randomly chosen query fingerprints.
# A dedicated, seeded generator keeps the subset identical across kernel restarts
# (so timings are comparable between runs) without touching the global `random` state.
queries_30 = random.Random(42).sample(queries_fp, 30)

In [None]:
# Finder backed by SearchEngine.search (see ComplexFinder) over the '17kk_100_heur2000' directory.
mcc100_17kk_heur2000 = ComplexFinder(data_path / '17kk_100_heur2000')

In [None]:
# Baseline benchmark of the search()-based finder. It defines `stat_sync`, which the
# cell computing `stat` reads; with this cell disabled a fresh "Restart & Run All"
# stops there with a NameError. (`%%time` may be restored as the cell's first line.)
stat_sync = FinderSpeedTester([mcc100_17kk_heur2000]).test_all(queries_30, verbose=True, ans_count=10)

In [None]:
# Finder backed by SearchEngine.async_search (see AsyncFinder) over the same '17kk_100_heur2000' directory.
mcc100_17kk_heur2000_async = AsyncFinder(data_path / '17kk_100_heur2000')

In [None]:
%%time 
# Benchmark the async_search-based finder on the sampled query fingerprints, passing ans_count=10.
stat_async = FinderSpeedTester([mcc100_17kk_heur2000_async]).test_all(queries_30, verbose=True, ans_count=10)

In [None]:
# Combine both benchmark results with `+`.
# NOTE: `stat_sync` comes from the search()-based benchmark cell above, which must be
# enabled and executed before this cell.
stat = stat_sync + stat_async

In [None]:
# Display the combined statistics object.
stat

In [None]:
# Show the combined statistics through the object's as_boxplot() view.
stat.as_boxplot()

In [None]:
# Show the combined statistics through the object's as_plot() view.
stat.as_plot()

In [None]:
from fp_utils.draw import MoleculeDrawer

In [None]:
# Take the first loaded query (a SMILES string) as the example and draw it.
query = queries[0]
MoleculeDrawer.draw_one(query)

In [None]:
# Search the example query with the async_search-based finder, passing ans_count=10.
answers = search(query, mcc100_17kk_heur2000_async, ans_count=10)

In [None]:
# Number of answers returned for the example query.
len(answers)

In [None]:
# Draw the answers on a grid.
# NOTE(review): shape=(2, 3) is six cells while ans_count=10 was requested above —
# confirm how draw_many treats answers beyond the grid size.
MoleculeDrawer.draw_many(answers, figsize=(20, 20), shape=(2, 3))

In [None]:
# Repeat the example search with the search()-based finder and draw its answers.
# NOTE(review): shape=(2, 3) is six cells while ans_count=10 is requested —
# confirm how draw_many treats answers beyond the grid size.
answers = search(query, mcc100_17kk_heur2000, ans_count=10)
MoleculeDrawer.draw_many(answers, figsize=(20, 20), shape=(2, 3))