In [1]:
from MCS_similarity import *
import pandas as pd

df = pd.read_csv('USPTO_mutations.csv')
all_smiles = df['molecule'].tolist()
all_mutations = df['mutations'].tolist()

In [2]:
idxs = np.random.choice(len(df), 200)

comps = {}
for i in idxs:
    smi1 = all_smiles[i]
    mol1 = Chem.MolFromSmiles(smi1)
    mutations = eval(all_mutations[i])
    comp = []
    for smi2 in mutations:
        mol2 = Chem.MolFromSmiles(smi2)
        t1, MCS1 = rdkit_MCS(mol1, mol2)
        t2, MCS2 = fast_MCS(mol1, mol2)
        comp.append((t1-t2), (MCS1.smartsString==MCS2.smartsString))

TypeError: list.append() takes exactly one argument (2 given)

In [5]:
MCS1.smartsString

'[#6&!R]-&!@[#6&R]1:&@[#6&R]:&@[#6&R]:&@[#6&R]2:&@[#6&R](:&@[#7&R]:&@1):&@[#7&R]:&@[#6&R]:&@[#6&R]:&@2-&!@[#6&!R]=&!@[#6&R]1-&@[#8&R]-&@[#6&R]2:&@[#6&R](-&@[#6&R]-&@1=&!@[#8&!R]):&@[#6&R]:&@[#6&R]:&@[#6&R](:&@[#6&R]:&@2-&!@[#6&!R])-&!@[#8&!R]'

In [6]:
MCS2.smartsString

'[#6&!R]-&!@[#6&!R]1:&!@[#6&!R]:&!@[#6&!R]:&!@[#6&!R]:&!@[#6&!R](:&!@[#6&!R]:&!@[#7&!R]:&!@[#6&!R]:&!@[#7&!R]:&!@1)-&!@[#6&!R]=&!@[#6&!R]-&!@[#8&!R]-&!@[#6&!R]:&!@[#6&!R]:&!@[#6&!R]:&!@[#6&!R]:&!@[#6&!R](-&!@[#8&!R]):&!@[#6&!R]-&!@[#6&!R]'

In [None]:
# finding the most similar molecule in dataset
import math
from tqdm import tqdm

Tam_Sims = {}
rdkit_MCS_Sims = {}
fast_MCS_Sims = {}
hybrid_Sims = {}

for i in idxs:
    smi1 = all_smiles[i]
    mol1 = Chem.MolFromSmiles(smi1)
    print (i, smi1)
    recorder1 = sim_recorder(smi1)
    recorder2 = sim_recorder(smi1)
    recorder3 = sim_recorder(smi1)
    recorder4 = sim_recorder(smi1)
    mutations = eval(all_mutations[i])
    for smi2 in mutations:
        mol2 = Chem.MolFromSmiles(smi2)
        t1, sim1 = Tam_Sim(mol1, mol2)
        t2, sim2 = rdkit_MCS_Sim(mol1, mol2)
        t3, sim3 = fast_MCS_Sim(mol1, mol2)
        t4 = t1+t3
        sim4 = math.sqrt(sim1*sim3) # hybrid similarity
        recorder1.record(t1, sim1)
        recorder2.record(t2, sim2)
        recorder3.record(t3, sim3)
        recorder4.record(t4, sim4)
    Tam_Sims[i] = recorder1.summerize(mutations)
    rdkit_MCS_Sims[i] = recorder2.summerize(mutations)
    fast_MCS_Sims[i] = recorder3.summerize(mutations)
    hybrid_Sims[i] = recorder4.summerize(mutations)

732 C[SiH](C)Oc1c(C(=O)c2cn(C(c3ccccc3)(c3ccccc3)c3ccccc3)cn2)ccc2cc(C(C)(C)C)ccc12


In [4]:
import numpy as np

# Tam_Sims, rdkit_MCS_Sims, fast_MCS_Sims, hybrid_Sims

def get_average_time(d):
    return np.mean([v[0] for v in d.values()])

get_average_time(rdkit_MCS_Sims), get_average_time(fast_MCS_Sims)

(0.0002039817112299545, 0.0002039817112299545)

In [5]:
smiles_list = [all_smiles[i] for i in idxs]

In [6]:
df = pd.DataFrame.from_dict(fast_MCS_Sims, orient = 'index')
df.columns = ['time', 'similar_mols']
df['smiles'] = smiles_list
# df['time'] = [tt in enumerate(df['time'])]
df.to_csv('fast_MCS_Mutations.csv')
df

Unnamed: 0,time,similar_mols,smiles
732,0.000323,[(C=CCOc1ccc2c(O[SiH](C)C)c(C(=O)c3cn(C(c4cccc...,C[SiH](C)Oc1c(C(=O)c2cn(C(c3ccccc3)(c3ccccc3)c...
773,0.000335,[(C[C@H]1COCCN1c1cc(C2(S(=O)(=O)c3cccnc3C(=O)N...,C[C@H]1COCCN1c1cc(C2(S(=O)(=O)c3cccnc3C(=O)N(C...
800,0.000187,"[(CC1(COCc2coc(-c3ccc(F)cc3)n2)COCOC1, 0.93333...",OCC1(COCc2coc(-c3ccc(F)cc3)n2)COCOC1
881,0.000193,[(O=C(c1ccc2c(CO)cnc(-c3c(F)cc(F)cc3F)c2c1)C1C...,COC(=O)c1ccc2c(CO)cnc(-c3c(F)cc(F)cc3F)c2c1
1705,0.000170,"[(C=COC(=O)C1CCCn2c(C(=O)c3ccccc3)ccc21, 0.930...",COC(=O)C1CCCn2c(C(=O)c3ccccc3)ccc21
...,...,...,...
45707,0.000224,[(C=CCCOC(=O)C1CN(CCCN2c3ccccc3CCc3ccccc32)CCN...,CCOC(=O)C1CN(CCCN2c3ccccc3CCc3ccccc32)CCN1
45949,0.000146,"[(CCCCOc1cc(/C=C/C(=O)O)ccc1I, 0.9142857142857...",CCCCOc1cc(/C=C/C(=O)OC)ccc1I
46045,0.000242,[(COC(=O)c1cc2cc(OCCN3CCOCC3)ccc2n1S(=O)(=O)c1...,COC(=O)c1cc2cc(OCc3ccccc3)ccc2n1S(=O)(=O)c1ccc...
47013,0.000142,"[(C=CCc1ccc(N(C)C(=O)[C@H]2CCCNC2)cc1, 0.88888...",CN(C(=O)[C@H]1CCCNC1)c1ccc(F)cc1
