# Substructure Testing

In [1]:
from __future__ import print_function
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from rdkit.Avalon import pyAvalonTools
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit import rdBase
from rdkit import DataStructs
import random,gzip,time
import mongordkit
import time
import pymongo
import rdkit
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from os import sys
import pandas as pd
from rdkit import Chem
from statistics import mean, median
from rdkit.Chem import AllChem
from mongordkit.Database import write
from mongordkit.Search import similarity
from mongordkit.Search import substructure

In [2]:
# Open a MongoDB client with no explicit connection arguments and show which
# databases already exist on the server.
client = pymongo.MongoClient()
existing_databases = client.list_database_names()
print(existing_databases)
# Optional reset, left disabled: drop a database and list the names again.
# client.drop_database('db')
# print(client.list_database_names())

['admin', 'config', 'frags', 'leads', 'local', 'pieces', 'test']


In [3]:
# Handle to the database named "test"; every benchmark against the large
# collection below goes through this object.
db = client["test"]
# One-off ingestion of up to 100,000 molecules from the ChEMBL SDF. It is left
# commented out, presumably because the collection is already populated
# (verify before re-enabling):
# write.writeFromSDF(db, '../../../chembl_27.sdf', 'test', reg_option='standard_setting', index_option='inchikey', chunk_size=1000, limit=100000)

In [5]:
# Number of documents in the `molecules` collection (an empty filter matches all).
db["molecules"].count_documents({})

101000

In [6]:
# Load the three query sets as RDKit molecules. Each file is opened in a `with`
# block so the handle is closed even if parsing raises part-way through.

# frags / leads files: the SMILES is the first whitespace-separated token of
# each line.
with open('../../data/zinc.frags.500.q.smi') as f:
    frags = [Chem.MolFromSmiles(line.split()[0]) for line in f]

with open('../../data/zinc.leads.500.q.smi') as f:
    leads = [Chem.MolFromSmiles(line.split()[0]) for line in f]

# pieces file: the whole line is handed to the parser unchanged.
with open('../../data/fragqueries.q.txt') as f:
    pieces = [Chem.MolFromSmiles(line) for line in f]

## Benchmarking Begins

In [7]:
# Timing accumulators for the 500-fragment benchmark against the 101,000-molecule
# collection: one list for the fingerprint-screened search, one for the naive search.
frag_times, frag_times_naive = [], []

# The naive run is currently disabled; as written it covers only the first 25 fragments.
# for pattern in frags[:25]:
#     start = time.time()
#     substructure.SubSearchNaive(pattern, db)
#     print("finished one")
#     end = time.time()
#     frag_times_naive.append(end - start)
# print(f"Naive benchmarks. Median time: {median(frag_times_naive)}, Mean time: {mean(frag_times_naive)}")

In [8]:
# Time one SubSearch call per fragment query against `db` and report the
# median and mean per-query duration in seconds.
#
# The list is reset here so that re-running this cell does not append a second
# batch of timings to the results of an earlier run.
frag_times = []

for pattern in frags:
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # short durations; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    # NOTE(review): the result is discarded; if SubSearch returns a lazy
    # cursor/generator this times only its construction - confirm.
    substructure.SubSearch(pattern, db)
    frag_times.append(time.perf_counter() - start)
print(f"With fingerprint screening. Median time: {median(frag_times)}, Mean time: {mean(frag_times)}")

With fingerprint screening. Median time: 0.07276749610900879, Mean time: 0.08273182916641235


In [None]:
# Benchmark for search of 500 leads against 101,000 molecules with the naive substructure algorithm (currently disabled: all code in this cell is commented out).
# lead_times_naive = []

# for pattern in leads:
#     start = time.time()
#     substructure.SubSearchNaive(pattern, db)
#     end = time.time()
#     lead_times_naive.append(end - start)
# print(f"Median time: {median(lead_times_naive)}, Mean time: {mean(lead_times_naive)}")

In [9]:
# Time one SubSearch call per lead query against `db` and report the median
# and mean per-query duration in seconds.
lead_times = []

for pattern in leads:
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # short durations; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    # NOTE(review): the result is discarded; if SubSearch returns a lazy
    # cursor/generator this times only its construction - confirm.
    substructure.SubSearch(pattern, db)
    lead_times.append(time.perf_counter() - start)
print(f"Median time: {median(lead_times)}, Mean time: {mean(lead_times)}")

Median time: 0.0697939395904541, Mean time: 0.0744774112701416


In [None]:
# Benchmark for search of the pieces queries against 101,000 molecules, first
# with the naive substructure algorithm and then with SubSearch.
# Each strategy gets its own list so the two sets of statistics stay separate.
pieces_times = []
pieces_times_naive = []

# Naive search. Timings go into pieces_times_naive, which is the list the
# following print summarises (median()/mean() raise on an empty list).
for pattern in pieces:
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # short durations; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    substructure.SubSearchNaive(pattern, db)
    pieces_times_naive.append(time.perf_counter() - start)
print(f"Median time: {median(pieces_times_naive)}, Mean time: {mean(pieces_times_naive)}")

# SubSearch. pieces_times now holds only these timings.
for pattern in pieces:
    start = time.perf_counter()
    substructure.SubSearch(pattern, db)
    pieces_times.append(time.perf_counter() - start)
print(f"Median time: {median(pieces_times)}, Mean time: {mean(pieces_times)}")

In [None]:
# Benchmark for searching 500 pieces, fragments, and leads against themselves.

# One dedicated database per query set.
frag_db = client['frags']
pieces_db = client['pieces']
leads_db = client['leads']

# Timing accumulators, one per (query set, search strategy) pair.
frag2_times_naive = []
pieces2_times_naive = []
leads2_times_naive = []
frag2_times = []
pieces2_times = []
leads2_times = []

# Populate each database with its OWN query set, so that every set is really
# searched "against itself" (previously all three databases received `frags`).
# NOTE(review): this runs on every execution of the cell; confirm whether
# WriteMolList de-duplicates molecules that are already stored.
write.WriteMolList(frag_db, frags, 'test')
write.WriteMolList(pieces_db, pieces, 'test')
write.WriteMolList(leads_db, leads, 'test')

# Naive search of every fragment query against the fragments-only database;
# reports the median and mean per-query duration in seconds.
for pattern in frags:
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # short durations; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    substructure.SubSearchNaive(pattern, frag_db)
    frag2_times_naive.append(time.perf_counter() - start)
print(f"Median time: {median(frag2_times_naive)}, Mean time: {mean(frag2_times_naive)}")

# SubSearch of every fragment query against the fragments-only database;
# reports the median and mean per-query duration in seconds.
for pattern in frags:
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # short durations; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    substructure.SubSearch(pattern, frag_db)
    frag2_times.append(time.perf_counter() - start)
print(f"Median time: {median(frag2_times)}, Mean time: {mean(frag2_times)}")