In [1]:
import pandas as pd
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold, rdScaffoldNetwork
from rdkit.Chem import rdFMCS 
from rdkit import DataStructs, Chem
from scipy.spatial.distance import squareform, cdist, pdist
from scipy.cluster.hierarchy import fcluster, linkage, dendrogram
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmilesFromSmiles
import tmap
from faerun import Faerun
from tqdm import tqdm
from glob import glob
import os

from matplotlib.colors import ListedColormap
import matplotlib as mpl

import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt
from mycolorpy import colorlist as mcp
import numpy as np

# Apply seaborn's 'white' style with a font scale of 2 to the figures drawn in
# this notebook.
sns.set(style = 'white', font_scale=2)


# Dataset classes from the project-local `clsar` package. They are not
# referenced in the cells visible here -- presumably used elsewhere; TODO confirm.
from clsar.dataset import LSSNS,HSSMS  # dataset

# Seed handed to tmap.Minhash in the layout cells below.
seed = 100

In [2]:


# ---- Configuration -------------------------------------------------------
# NOTE(review): dataset_dir is an absolute, machine-specific path; point it at
# your local copy of the benchmark before running.
dataset_dir = '/home/shenwanxiang/Research/MPCD/dataset/HSSMS/MoleculeACE_benchmark/'
dataset_name = 'CHEMBL2835_Ki'
data_save_folder = './results_tmap'

# Create the output folder up front: DataFrame.to_csv does not create missing
# parent directories, so a fresh checkout would otherwise fail on the first write.
os.makedirs(data_save_folder, exist_ok=True)

# ---- Load and annotate ---------------------------------------------------
# Both tables are indexed by SMILES so the ChEMBL ids from the raw file can be
# joined onto the benchmark table.
# NOTE(review): if the raw file lists a SMILES more than once, the join will
# duplicate rows -- confirm the raw index is unique.
df = pd.read_csv(os.path.join(dataset_dir, dataset_name + '.csv'), index_col='smiles')
df_raw = pd.read_csv(os.path.join(dataset_dir, 'raw', dataset_name + '.csv'), index_col='smiles')
df = df.join(df_raw.chembl_id).reset_index()

# Activity in nM -> negative log10 of the molar value: -log10(x * 1e-9) = 9 - log10(x).
df['pChEMBL Value'] = 9 - np.log10(df['exp_mean [nM]'])
# Turn the 1/0 flag into booleans.
df['cliff_mol'] = df['cliff_mol'].map({1: True, 0: False})

# ---- Split ---------------------------------------------------------------
df_test = df[df.split == 'test']
df_train = df[df.split == 'train']

# Only the training split is written to disk; the test split stays in memory.
df_train.to_csv(os.path.join(data_save_folder, 'df_train.csv'))

# Last expression of the cell so the split sizes are actually displayed.
len(df_test), len(df_train)



In [3]:
# ---- TMAP of the test split, rendered with Faerun --------------------------
# This cell rebinds `df` to the test rows and defines `dim` and `cfg`, both of
# which the training-split cell further down reuses.
df = df_test.reset_index(drop=True)
smiles_list = df.smiles.to_list()

dim = 2048  # fingerprint length; the same value is handed to Minhash and LSHForest

# SMILES -> RDKit molecules -> Morgan bit vectors (radius 2, `dim` bits).
mols = [Chem.MolFromSmiles(smi) for smi in smiles_list]
ECFP4_fps = [
    AllChem.GetMorganFingerprintAsBitVect(mol, 2, dim)
    for mol in tqdm(mols, ascii=True)
]
ecfps = [tmap.VectorUchar(list(bits)) for bits in ECFP4_fps]

# MinHash-encode the bit vectors and index them in an LSH forest.
enc = tmap.Minhash(dim, seed=seed)
lf = tmap.LSHForest(dim)
lf.batch_add(enc.batch_from_binary_array(ecfps))
lf.index()

# Layout settings, grouped by field prefix (see tmap's LayoutConfiguration
# reference for the meaning of each field).
cfg = tmap.LayoutConfiguration()
# nearest-neighbour query
cfg.k = 50
cfg.kc = 50
# fme_* fields
cfg.fme_randomize = False
cfg.fme_iterations = 40
# sl_* fields
cfg.sl_repeats = 1
cfg.sl_extra_scaling_steps = 2
cfg.sl_scaling_min = 1.0
cfg.sl_scaling_max = 1.0
cfg.sl_scaling_type = tmap.ScalingType.RelativeToDesiredLength
# placer / merger
cfg.placer = tmap.Placer.Barycenter
cfg.merger = tmap.Merger.LocalBiconnected
cfg.merger_factor = 2.0
cfg.merger_adjustment = 0
# remaining fields
cfg.node_size = 1/10
cfg.mmm_repeats = 1

# x/y: node coordinates; s/t: the "from"/"to" ends of the tree edges drawn below.
x, y, s, t, gp = tmap.layout_from_lsh_forest(lf, config=cfg)

# ---- Colour series (all lists below are index-aligned, one entry per series)
c1 = df['pChEMBL Value'].round(2)
c2 = df['cliff_mol']
c = [c1, c2]                              # the attribute data itself
series_title = ['pChEMBL', 'cliff_mol']   # display name of each attribute
categorical = [False, True]
cmap = ['jet_r', 'Set1']
legend_labels = [None, None]
min_legend_label = [float(c1.min()), None]
max_legend_label = [float(c1.max()), None]

point_scale = 15

# Hover label per point: "<smiles>__<chembl_id>: <pChEMBL>" -- the SMILES is
# displayed together with the label text in the plot.
labels = (
    df['smiles'] + "__" + df['chembl_id'] + ': ' + c1.astype(str)
).tolist()

# ---- Faerun scene ----------------------------------------------------------
faerun = Faerun(view="front", clear_color='#111111', coords=False)  # alternative background: '#ffffff'
faerun.add_scatter(
    dataset_name,
    {"x": x, "y": y, "c": c, "labels": labels},
    shader='smoothCircle',  # alternative: "sphere"
    point_scale=point_scale,
    colormap=cmap,
    categorical=categorical,
    series_title=series_title,
    has_legend=True,
    legend_labels=legend_labels,
    min_legend_label=min_legend_label,
    max_legend_label=max_legend_label,
)
faerun.add_tree(
    dataset_name + "_tree",
    {"from": s, "to": t},
    point_helper=dataset_name,
    color='#666666',
)

# The "smiles" template displays the structure on hover.
faerun.plot(dataset_name + '_test', path=data_save_folder, template="smiles", notebook_height=750)

100%|###########################################################################################| 126/126 [00:00<00:00, 24976.71it/s]


In [4]:
# ---- TMAP of the training split, rendered with Faerun ----------------------
# Relies on `dim`, `cfg` and `seed` defined in earlier cells. Rebinds `df`,
# `x`, `y`, `s` and `t`, which the export cells below read.
df = df_train.reset_index(drop=True)
smiles_list = df.smiles.to_list()

# SMILES -> RDKit molecules -> Morgan bit vectors (radius 2, `dim` bits).
mols = [Chem.MolFromSmiles(smi) for smi in smiles_list]
ECFP4_fps = [
    AllChem.GetMorganFingerprintAsBitVect(mol, 2, dim)
    for mol in tqdm(mols, ascii=True)
]
ecfps = [tmap.VectorUchar(list(bits)) for bits in ECFP4_fps]

# MinHash-encode the bit vectors and index them in a fresh LSH forest.
enc = tmap.Minhash(dim, seed=seed)
lf = tmap.LSHForest(dim)
lf.batch_add(enc.batch_from_binary_array(ecfps))
lf.index()

# x/y: node coordinates; s/t: the "from"/"to" ends of the tree edges drawn below.
x, y, s, t, gp = tmap.layout_from_lsh_forest(lf, config=cfg)

# ---- Colour series (all lists below are index-aligned, one entry per series)
c1 = df['pChEMBL Value'].round(2)
c2 = df['cliff_mol']
c = [c1, c2]                              # the attribute data itself
series_title = ['pChEMBL', 'cliff_mol']   # display name of each attribute
categorical = [False, True]
cmap = ['jet_r', 'Set1']
legend_labels = [None, None]
min_legend_label = [float(c1.min()), None]
max_legend_label = [float(c1.max()), None]

point_scale = 10

# Hover label per point: "<smiles>__<chembl_id>: <pChEMBL>" -- the SMILES is
# displayed together with the label text in the plot.
labels = (
    df['smiles'] + "__" + df['chembl_id'] + ': ' + c1.astype(str)
).tolist()

# ---- Faerun scene ----------------------------------------------------------
faerun = Faerun(view="front", clear_color='#111111', coords=False)  # alternative background: '#ffffff'
faerun.add_scatter(
    dataset_name,
    {"x": x, "y": y, "c": c, "labels": labels},
    shader='smoothCircle',  # alternative: "sphere"
    point_scale=point_scale,
    colormap=cmap,
    categorical=categorical,
    series_title=series_title,
    has_legend=True,
    legend_labels=legend_labels,
    min_legend_label=min_legend_label,
    max_legend_label=max_legend_label,
)
faerun.add_tree(
    dataset_name + "_tree",
    {"from": s, "to": t},
    point_helper=dataset_name,
    color='#666666',
)

# The "smiles" template displays the structure on hover.
faerun.plot(dataset_name + '_train', path=data_save_folder, template="smiles", notebook_height=750)

100%|###########################################################################################| 489/489 [00:00<00:00, 25981.61it/s]


In [5]:
# NOTE(review): this import would normally live in the first (imports) cell; it
# is kept here so the cell still runs on its own.
from joblib import dump, load

# Persist the tree edges as a (from, to) pair of plain Python lists.
# `s` and `t` are whatever the most recently executed layout cell produced --
# the training split when the notebook is run top to bottom.
data = (list(s), list(t))

# Write next to the other outputs via `data_save_folder` instead of repeating
# the folder name as a literal. Kept as the last expression so the written
# path is displayed.
dump(data, os.path.join(data_save_folder, 'chemical_space_edges_train.jb'))

['./results_tmap/chemical_space_edges_train.jb']

In [6]:
# Build a per-molecule node table (layout coordinates + annotations + drawing
# attributes) and save it as CSV.
# Relies on `x`, `y` and `df` from the most recently executed layout cell --
# the training split when the notebook is run top to bottom.
dfg = pd.DataFrame([x, y]).T
dfg.columns = ['x', 'y']
dfg.index = df.index
dfg['pChEMBL'] = df['pChEMBL Value']
dfg['smiles'] = df['smiles']
dfg['chembl_id'] = df['chembl_id']

v = dfg['pChEMBL'].values

# Colour range is pulled in by 0.2 at both ends; with clip=False the values
# outside [vmin, vmax] are left to the colormap's out-of-range handling.
vmin = v.min() + 0.2
vmax = v.max() - 0.2
if vmin >= vmax:
    # Activity spread of 0.4 or less: the trimmed range is empty or inverted
    # (Normalize raises on vmin > vmax), so fall back to the full data range.
    vmin, vmax = v.min(), v.max()

norm = mpl.colors.Normalize(vmin=vmin, vmax=vmax, clip=False)
mapper = mpl.cm.ScalarMappable(norm=norm, cmap=mpl.cm.jet_r)
node_color = [mpl.colors.to_hex(mapper.to_rgba(i)) for i in v]

# Per-node drawing attributes; every column except node_color is constant.
# Keyword order fixes the column order in the CSV.
dfg = dfg.assign(
    node_color=node_color,
    edgecolor='k',
    linewidths=0,
    node_size=200,
    label='data',
    alpha=1,
    node_shape='o',
)

# Write next to the other outputs via `data_save_folder` instead of repeating
# the folder name as a literal.
dfg.to_csv(os.path.join(data_save_folder, 'chemical_space_train.csv'))