# Create chemical clusters based on their similarities
Note: to run this code, you need the RDKit package installed in your environment.

In [11]:
import os
import pandas as pd
from rdkit.Chem.Fingerprints import ClusterMols
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem import PandasTools
from rdkit.Chem import rdFingerprintGenerator
from rdkit import Chem, DataStructs
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import MACCSkeys
from rdkit.Chem import rdmolops
from rdkit.Chem.Fingerprints import FingerprintMols
import numpy as np
from tqdm import tqdm_notebook as tqdm
from rdkit import DataStructs
from rdkit.ML.Cluster import Butina

# Get chemicals mapping file

In [14]:
# Load the DrugBank/PubChem mapping table from the sibling "resources" directory.
# NOTE(review): the file has a .tsv extension but is parsed with read_csv's
# default comma separator -- confirm the file really is comma-delimited.
mapping_path = os.path.join(os.pardir, "resources", "drugbank_pubchem_mapping.tsv")
mapping_file = pd.read_csv(mapping_path)

In [15]:
# Preview the first five rows of the mapping table.
mapping_file.head(n=5)

Unnamed: 0,PubchemID,Smiles,DrugbankName
0,5311128,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,Goserelin
1,16051933,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...,Desmopressin
2,25074887,CC(C)C[C@H](NC(=O)[C@@H](CCCNC(N)=O)NC(=O)[C@H...,Cetrorelix
3,5280754,CCC1NC(=O)C(C(O)C(C)C\C=C\C)N(C)C(=O)C(C(C)C)N...,Cyclosporine
4,14257662,NCCCC[C@H](NC(=O)[C@@H]1CCCN1C(=O)[C@@H]1CSSC[...,Felypressin


In [16]:
# Discard every row that has a missing value in any column.
mapping_file = mapping_file.dropna(axis=0, how="any")

In [38]:
# Load the pickled graph from the sibling "resources" directory.
# `pybel` is not part of the notebook's import cell, so it is imported here;
# without this the cell raises NameError on a fresh kernel.
# NOTE(security): this reads a pickle file, and unpickling can execute
# arbitrary code -- only load graph files that come from a trusted source.
import pybel

full_graph = pybel.from_pickle(os.path.join(os.pardir, "resources", "fullgraph_without_sim.pickle"))

In [47]:
# Collect the identifier of every graph node whose namespace is 'pubchem'.
chem_list = [
    entity.identifier
    for entity in full_graph.nodes()
    if entity.namespace == 'pubchem'
]

In [48]:
# Build {PubchemID: RDKit molecule} for every mapped compound that also
# appears among the graph's PubChem identifiers.
#
# The identifiers are put into a set once so that each membership test is a
# hash lookup instead of a scan over the whole list for every table row.
pubchem_ids_in_graph = set(chem_list)

mols_dict = {}
# Only two columns are needed, so iterate them directly rather than
# materialising a Series per row with iterrows().
for pubchem_id, smiles in zip(mapping_file['PubchemID'], mapping_file['Smiles']):
    # Graph identifiers are compared as strings, hence the str() conversion.
    if str(pubchem_id) not in pubchem_ids_in_graph:
        continue
    # The parse result is stored as-is; entries that are None are skipped
    # later when fingerprints are generated.
    mols_dict[pubchem_id] = Chem.MolFromSmiles(smiles)

# Using distance matrix for clustering

# Clustering using Butina

In [50]:
# Compute a MACCS-keys fingerprint for every molecule that is not None.
# `fps_drug` maps compound ID -> fingerprint; `drugs` and `fps` are the same
# data as two position-aligned lists (dicts preserve insertion order).
fps_drug = {}
for compound_id, molecule in tqdm(mols_dict.items()):
    if molecule is not None:
        fps_drug[compound_id] = MACCSkeys.GenMACCSKeys(molecule)

drugs = list(fps_drug.keys())
fps = list(fps_drug.values())

HBox(children=(IntProgress(value=0, max=3759), HTML(value='')))

In [55]:
# Build the flattened lower-triangle distance list: for each fingerprint,
# its Tanimoto distance (1 - similarity) to every fingerprint that precedes it.
nfps = len(fps)
dists = []
for row_idx in tqdm(range(1, nfps)):
    similarities = DataStructs.BulkTanimotoSimilarity(fps[row_idx], fps[:row_idx])
    dists.extend(1 - similarity for similarity in similarities)

HBox(children=(IntProgress(value=0, max=3709), HTML(value='')))

In [56]:
# Cluster the compounds with the Butina algorithm, passing the precomputed
# distance list directly (isDistData=True) and a distance threshold of 0.3.
cs = Butina.ClusterData(
    data=dists,
    nPts=nfps,
    distThresh=0.3,
    isDistData=True,
)

In [57]:
# Start with an empty table holding one column for the compound ID and one
# for the cluster number.
cluster_columns = ['PubchemID', 'Cluster']
df = pd.DataFrame(columns=cluster_columns)

In [58]:
# Flatten the clustering result into one (PubchemID, Cluster) row per compound.
#
# Each member of a cluster is a 0-based position into the fingerprint list the
# distances were computed from, and `drugs` is aligned with that list, so the
# compound ID is `drugs[member]`. Indexing with `member - 1` would shift every
# assignment by one and map position 0 to the last compound.
rows = [
    (drugs[member], cluster_id)
    for cluster_id, cluster in enumerate(cs, start=1)  # cluster numbers start at 1
    for member in cluster
]

# Build the table in one step instead of growing it row by row with df.loc;
# the row labels still start at 1.
df = pd.DataFrame(
    rows,
    columns=['PubchemID', 'Cluster'],
    index=range(1, len(rows) + 1),
)

In [59]:
# Write the cluster assignments to the sibling "resources" directory,
# leaving out the DataFrame's row labels.
output_path = os.path.join(os.pardir, "resources", 'Clustered_chemicals.csv')
df.to_csv(output_path, index=False)