In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# are reloaded before each cell executes; edits to the project-local helper
# modules are then picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole #Only needed if modifying defaults
# Notebook-wide defaults for how RDKit molecules are rendered inline.
IPythonConsole.ipython_useSVG = True
IPythonConsole.molSize = (300, 200)  # presumably (width, height) in pixels — confirm

# All remaining tweaks live on the shared draw-options object.
draw_opts = IPythonConsole.drawOptions
draw_opts.addStereoAnnotation = True
draw_opts.addAtomIndices = False
draw_opts.addBondIndices = False
draw_opts.bondLineWidth = 2  # int only (original author's note; may depend on the RDKit version)

In [7]:
import sys

# Make the project-local helper modules importable. The paths are relative,
# so they resolve against the directory the kernel was started in.
# The membership check keeps the cell idempotent: re-running it does not
# pile up duplicate entries on sys.path.
for _module_dir in ("../check_data", "../utils"):
    if _module_dir not in sys.path:
        sys.path.append(_module_dir)

In [8]:
# Small, hand-picked query set: compound name -> SMILES string.
smiles_dict = {'Benzene':'C1=CC=CC=C1',
               'Acetamide':'CC(=O)N',
               'Pyrimidine':'C1=CN=CN=C1',
               'Benzaldehyde':'C1=CC=C(C=C1)C=O',
               'Ibuprofen':'CC(C)CC1=CC=C(C=C1)C(C)C(=O)O',
               'Triazavirin':'CSC1=NC2=NN=C(C(=O)N2N1)[N+](=O)[O-]'}

# Parallel lists: names[i] labels smiles[i] and mols[i] (dicts keep insertion order).
names = list(smiles_dict)
smiles = list(smiles_dict.values())
mols = [Chem.MolFromSmiles(x) for x in smiles]

# MolFromSmiles signals a parse failure by returning None rather than raising,
# so fail loudly here instead of letting a None reach a later cell.
unparsed = [name for name, mol in zip(names, mols) if mol is None]
assert not unparsed, f"RDKit could not parse the SMILES for: {unparsed}"

# Large dataset that serves as the database for the searches below.
# Original author's note: the target_chembl_id is required and has to be
# looked up by searching with the UniProt ID.
from check_chembl import assay

target_chembl_id = 'CHEMBL2079846'
chembl_df = assay(target_chembl_id)

# Pull out the two columns the later cells use: SMILES and molecule IDs.
db_smiles = chembl_df['canonical_smiles']
db_names = chembl_df['molecule_chembl_id']

# Molecules in CHEMBL2079846:  424
# Loading(%): 11%, 23%, 35%, 47%, 58%, 70%, 82%, 94%, 

In [9]:
# Echo the query SMILES list as a quick visual check before searching.
smiles

['C1=CC=CC=C1',
 'CC(=O)N',
 'C1=CN=CN=C1',
 'C1=CC=C(C=C1)C=O',
 'CC(C)CC1=CC=C(C=C1)C(C)C(=O)O',
 'CSC1=NC2=NN=C(C(=O)N2N1)[N+](=O)[O-]']

In [11]:
from myrdkit import split_list

In [12]:
from myrdkit import sim_search

# Cutoff handed to sim_search as its third positional argument
# (presumably a minimum similarity score — TODO confirm in myrdkit).
sim_cutoff = 0.2
df = sim_search(smiles, db_smiles, sim_cutoff, names)

# NOTE(review): in the recorded output only the last query has any hits
# (412 of 424) while every other query has none — worth a sanity check
# that sim_search really compares each query against the database.
df

# Generting fingerprints...
# Searching...: 0  1  2  3  4  5  
# The time spent: 0.37


Unnamed: 0,id,smiles,matched_freq,matched_smiles
0,Benzene,C1=CC=CC=C1,0,[]
1,Acetamide,CC(=O)N,0,[]
2,Pyrimidine,C1=CN=CN=C1,0,[]
3,Benzaldehyde,C1=CC=C(C=C1)C=O,0,[]
4,Ibuprofen,CC(C)CC1=CC=C(C=C1)C(C)C(=O)O,0,[]
5,Triazavirin,CSC1=NC2=NN=C(C(=O)N2N1)[N+](=O)[O-],412,[COc1cc2nc(C)nc(N[C@H](C)c3cc(N)cc(C(F)(F)F)c3...


In [13]:
from myrdkit import ss_search
# Run ss_search for the query SMILES against the database SMILES, labelling
# rows with the query names. Judging by the name this is a substructure
# search — TODO confirm in myrdkit.
df = ss_search(smiles, db_smiles, names)
# Bare last expression so the notebook renders the result table.
df

# Generting mols...
# Searching...: 0  1  2  3  4  5  
# The time spent: 0.24


Unnamed: 0,id,smiles,matched_freq,matched_smiles
0,Benzene,C1=CC=CC=C1,424,[CC[C@H](C)[C@H](NC(=O)CN1C/C=C\CCC(=O)N[C@@H]...
1,Acetamide,CC(=O)N,177,[CC[C@H](C)[C@H](NC(=O)CN1C/C=C\CCC(=O)N[C@@H]...
2,Pyrimidine,C1=CN=CN=C1,252,[Cc1cc(Cn2c(N3CCNCC3)nc3c(-c4cncnc4)cc(Cl)cc32...
3,Benzaldehyde,C1=CC=C(C=C1)C=O,27,[O=C1Oc2ccccc2C(=O)/C1=C/N1C(=O)/C(=C/c2ccc([N...
4,Ibuprofen,CC(C)CC1=CC=C(C=C1)C(C)C(=O)O,0,[]
5,Triazavirin,CSC1=NC2=NN=C(C(=O)N2N1)[N+](=O)[O-],0,[]


In [14]:
from myrdkit import butina_cluster

# Cutoff handed to butina_cluster as its second positional argument.
# Whether it is a distance or a similarity threshold is decided inside
# myrdkit — TODO confirm before tuning it.
butina_cutoff = 0.6
df = butina_cluster(db_smiles, butina_cutoff, db_names)

# Bare last expression so the notebook renders the clustered table.
df

# Generting fingerprints...
# Generating distance matrix & clustering
# Size of Clusters:  [214, 96, 28, 22, 15, 11, 7, 5, 4, 4, 3, 2, 2]
# The spent time: 0.38


Unnamed: 0,id,smiles,cl_butina,cl_size
0,CHEMBL2086797,CC[C@H](C)[C@H](NC(=O)CN1C/C=C\CCC(=O)N[C@@H](...,0,1
1,CHEMBL4160864,N[C@@H](Cc1c[nH]c2ccccc12)C(=O)N1CCC2(CC1)CN(C...,4,22
2,CHEMBL4171085,N[C@@H](Cc1c[nH]c2ccccc12)C(=O)NCC1CCN(Cc2c[nH...,4,22
3,CHEMBL4160484,N[C@@H](Cc1c[nH]c2cc(Cl)ccc12)C(=O)NC1CCN(Cc2c...,4,22
4,CHEMBL4167713,N[C@@H](Cc1c[nH]c2ccc(Cl)cc12)C(=O)NC1CCN(Cc2c...,4,22
...,...,...,...,...
419,CHEMBL4644031,NS(=O)(=O)c1ccc(Oc2ccc(F)c(Cl)c2)c(CN2CCN(C3CC...,3,28
420,CHEMBL4645686,NS(=O)(=O)c1ccc(Oc2ccc(F)c(Cl)c2)c(CN2CCN(C3CC...,3,28
421,CHEMBL4638123,NS(=O)(=O)c1ccc(Oc2ccc(F)c(Cl)c2)c(CN2CCN(C3CN...,3,28
422,CHEMBL4637232,NCCCN1CCN(Cc2cc(S(N)(=O)=O)ccc2Oc2ccc(F)c(Cl)c...,3,28
