# t-Distributed Stochastic Neighbor Embedding (t-SNE)


t-Distributed Stochastic Neighbor Embedding (t-SNE) is a nonlinear dimensionality reduction algorithm used to visualize high-dimensional data in a low-dimensional space (usually two or three dimensions). t-SNE models the neighborhood structure of the points in the high-dimensional space and tries to preserve it in the low-dimensional embedding, so that points that are similar in the original space end up close together. The algorithm is frequently used for data visualization, especially in fields such as bioinformatics, genomics, and data science in general.

In [None]:
from IPython.utils import io
import tqdm.notebook
import os, sys, random
total = 100
with tqdm.notebook.tqdm(total=total) as pbar:
    with io.capture_output() as captured:
      # Instalar rdkit
      !pip -q install rdkit.pypi==2021.9.4
      pbar.update(25)
      # Instalar molplotly
      !pip install molplotly
      pbar.update(50)
      # Instalar jupyter-dash
      !pip install jupyter-dash
      pbar.update(75)
      # Instalar el diseño de aplicación dash
      !pip install dash-bootstrap-components
      pbar.update(100)

  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# Import libraries
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import MACCSkeys, AllChem
from scipy.spatial.distance import pdist

In [None]:
# Compound table shared through Google Drive
share_link = "https://drive.google.com/file/d/1DyylcsCIuZ5vzNRr23JS0JY3ELH8exJQ/view?usp=drive_link"
# The file id is the second-to-last path segment of the sharing link
file_id = share_link.split('/')[-2]
# Rebuild the link in the direct-download form so pandas can read the CSV
url_data = 'https://drive.google.com/uc?id=' + file_id
data = pd.read_csv(url_data)
data

Unnamed: 0,ID,isomeric smiles,canonical smiles
0,NPDBEJECOL1,CCCCCC1=CC(=C(C(=C1)O)C2C=C(CCC2C(=C)C)C)O,C=C(C)C1CCC(C)=CC1c1c(O)cc(CCCCC)cc1O
1,NPDBEJECOL2,CCCCCC1=CC(=C2C3C=C(CCC3C(OC2=C1)(C)C)C)O,CCCCCc1cc(O)c2c(c1)OC(C)(C)C1CCC(C)=CC21
2,NPDBEJECOL3,CCCCCC1=CC(=C2C(=C1)OC(C3=C2C=C(C=C3)C)(C)C)O,CCCCCc1cc(O)c2c(c1)OC(C)(C)c1ccc(C)cc1-2
3,NPDBEJECOL4,COC1=C(C=CC(=C1)/C=C/C(=O)CC(=O)/C=C/C2=CC(=C(...,COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O
4,NPDBEJECOL5,COC1=C(C=CC(=C1)/C=C/C(=O)CC(=O)/C=C/C2=CC=C(C...,COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)cc2)ccc1O
...,...,...,...
228,NPDBEJECOL232,CN1C=NC2=C1C(=O)N(C(=O)N2C)C,Cn1c(=O)c2c(ncn2C)n(C)c1=O
229,NPDBEJECOL233,CN1C=NC2=C1C(=O)NC(=O)N2C,Cn1cnc2c1c(=O)[nH]c(=O)n2C
230,NPDBEJECOL234,C1=CC(=CC=C1C(=O)NC(CCC(=O)O)C(=O)O)NCC2=CN=C3...,N=c1[nH]c(=O)c2nc(CNc3ccc(C(=O)NC(CCC(=O)O)C(=...
231,NPDBEJECOL235,C1=CC(=C(C=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)O[C...,O=c1c(O[C@@H]2O[C@H]([C@H](O)CO)[C@H](O)[C@H]2...


In [None]:
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.rdMolDescriptors import CalcNumAliphaticRings,CalcNumAromaticRings, CalcNumAliphaticHeterocycles, CalcNumAromaticHeterocycles
from rdkit.Chem.rdMolDescriptors import CalcNumHeterocycles, CalcNumRings, CalcNumSpiroAtoms, CalcNumBridgeheadAtoms,CalcExactMolWt

In [None]:
from rdkit.Chem import rdMolDescriptors
def MW(Smiles):
    """Return the exact molecular weight of the molecule described by a SMILES string.

    Parameters
    ----------
    Smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    float
        Value computed by RDKit's ``CalcExactMolWt``, or ``nan`` when RDKit
        cannot parse ``Smiles``.
    """
    mol = Chem.MolFromSmiles(Smiles)
    # MolFromSmiles returns None for an unparsable string; passing None on to
    # CalcExactMolWt would raise an opaque Boost.Python argument error.
    if mol is None:
        return float("nan")
    return rdMolDescriptors.CalcExactMolWt(mol)

In [None]:
# Compute the molecular weight of every compound from its canonical SMILES.
# Mapping over the single column avoids building a Series object per row.
data["MW"] = data["canonical smiles"].map(MW)

In [None]:
# Display the table again to confirm that the MW column was added
data

Unnamed: 0,ID,isomeric smiles,canonical smiles,MW
0,NPDBEJECOL1,CCCCCC1=CC(=C(C(=C1)O)C2C=C(CCC2C(=C)C)C)O,C=C(C)C1CCC(C)=CC1c1c(O)cc(CCCCC)cc1O,314.224580
1,NPDBEJECOL2,CCCCCC1=CC(=C2C3C=C(CCC3C(OC2=C1)(C)C)C)O,CCCCCc1cc(O)c2c(c1)OC(C)(C)C1CCC(C)=CC21,314.224580
2,NPDBEJECOL3,CCCCCC1=CC(=C2C(=C1)OC(C3=C2C=C(C=C3)C)(C)C)O,CCCCCc1cc(O)c2c(c1)OC(C)(C)c1ccc(C)cc1-2,310.193280
3,NPDBEJECOL4,COC1=C(C=CC(=C1)/C=C/C(=O)CC(=O)/C=C/C2=CC(=C(...,COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O,368.125988
4,NPDBEJECOL5,COC1=C(C=CC(=C1)/C=C/C(=O)CC(=O)/C=C/C2=CC=C(C...,COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)cc2)ccc1O,338.115424
...,...,...,...,...
228,NPDBEJECOL232,CN1C=NC2=C1C(=O)N(C(=O)N2C)C,Cn1c(=O)c2c(ncn2C)n(C)c1=O,194.080376
229,NPDBEJECOL233,CN1C=NC2=C1C(=O)NC(=O)N2C,Cn1cnc2c1c(=O)[nH]c(=O)n2C,180.064725
230,NPDBEJECOL234,C1=CC(=CC=C1C(=O)NC(CCC(=O)O)C(=O)O)NCC2=CN=C3...,N=c1[nH]c(=O)c2nc(CNc3ccc(C(=O)NC(CCC(=O)O)C(=...,441.139681
231,NPDBEJECOL235,C1=CC(=C(C=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)O[C...,O=c1c(O[C@@H]2O[C@H]([C@H](O)CO)[C@H](O)[C@H]2...,464.095476


In [None]:
# Keep only the identifier, the canonical SMILES and the molecular weight
keep_cols = ["ID", 'canonical smiles', 'MW']
data = data.loc[:, keep_cols]
# Work on an independent copy so later changes leave `data` untouched
data1 = data.copy()

In [None]:
# Round the numeric columns to three decimal places
data1 = data1.round(decimals=3)

## MACCS keys

---



In [None]:
# Parse every SMILES exactly once; RDKit returns None for strings it cannot parse
smiles_list = data1['canonical smiles'].tolist()
mols = [Chem.MolFromSmiles(smi) for smi in smiles_list]

# Collect and report the SMILES strings that failed to parse
invalid_smiles = [smi for smi, mol in zip(smiles_list, mols) if mol is None]
print(f"Invalid SMILES strings: {invalid_smiles}")

# Remove the invalid SMILES strings from the DataFrame
data1 = data1[~data1['canonical smiles'].isin(invalid_smiles)]

# Compute the MACCS keys from the already-parsed molecules (one row per
# remaining compound, one 0/1 column per bit, in the same order as data1)
valid_mols = [mol for mol in mols if mol is not None]
fps = pd.DataFrame(
    [[int(bit) for bit in MACCSkeys.GenMACCSKeys(mol).ToBitString()] for mol in valid_mols]
)
fps

Invalid SMILES strings: []


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,157,158,159,160,161,162,163,164,165,166
0,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,1,1,1,1,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,1,1,1,1,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,1,1,1,1,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,1,1,1,1,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
229,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
230,0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
231,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,1,1,0


In [None]:
# Train the t-SNE model on the fingerprint matrix
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# Standardize every fingerprint column before computing the embedding
data_tsne = StandardScaler().fit_transform(fps)

# scikit-learn 1.5 renamed `n_iter` to `max_iter` and removes the old name in
# 1.7, so use whichever keyword the installed version exposes.
iter_kwarg = "max_iter" if "max_iter" in TSNE().get_params() else "n_iter"
tsne = TSNE(
    n_components=2,
    verbose=1,
    perplexity=40,
    random_state=42,  # fixed seed: t-SNE is stochastic, this makes reruns reproducible
    **{iter_kwarg: 300},
)
tsne_results = tsne.fit_transform(data_tsne)
tsne_results



[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 233 samples in 0.001s...
[t-SNE] Computed neighbors for 233 samples in 0.195s...
[t-SNE] Computed conditional probabilities for sample 233 / 233
[t-SNE] Mean sigma: 4.060589
[t-SNE] KL divergence after 250 iterations with early exaggeration: 49.468426
[t-SNE] KL divergence after 300 iterations: 0.299703


array([[ -4.4209247 ,   3.899646  ],
       [ -5.6780076 ,   4.368656  ],
       [ -6.000147  ,   4.3217316 ],
       [ -8.656068  ,  -5.4424043 ],
       [ -8.673213  ,  -5.3852043 ],
       [ -8.836827  ,  -4.8522286 ],
       [  2.1579945 ,   3.012707  ],
       [  5.309948  ,   8.26849   ],
       [  1.7268766 ,   2.9371774 ],
       [  4.579041  ,   8.966517  ],
       [  7.715059  ,   7.4624004 ],
       [  6.2592297 ,   4.62974   ],
       [  3.233261  ,   8.912417  ],
       [  2.6680036 ,  -0.53267497],
       [  6.3296866 ,   6.4681106 ],
       [ -5.752456  ,   5.8850546 ],
       [  1.6951993 ,   4.78495   ],
       [  4.944529  ,   0.31752038],
       [  0.58595806,   1.0330113 ],
       [  0.35830775,   0.94780195],
       [  2.8493545 ,   2.2635858 ],
       [  7.086214  ,   4.1053934 ],
       [  5.433775  ,   0.31841645],
       [  5.433775  ,   0.31841645],
       [ -5.277702  ,  -6.5917625 ],
       [  4.8577833 ,   3.3415813 ],
       [  7.015941  ,   1.1206659 ],
 

In [None]:
# Select the descriptive columns that accompany each embedded point.
# The index is reset so that it lines up with the 0..n-1 rows of the embedding.
label = data1[["ID", 'canonical smiles', 'MW']].reset_index(drop=True)
print(label.shape)

# Wrap the embedding in its own DataFrame so the coordinates stay numeric
# (concatenating them with the string columns as a NumPy array would turn
# every column into object dtype).
coords = pd.DataFrame(tsne_results, columns=['axis 1', 'axis 2'])
assert len(label) == len(coords), "embedding and metadata must have the same number of rows"

# New dataframe: metadata and coordinates side by side. The SMILES column keeps
# the name 'canonical smiles' because the plotting cell looks it up under that name.
tsne_dataset = pd.concat([label, coords], axis=1)
print(tsne_dataset.shape)
tsne_dataset

(233, 3)
(233, 5)


Unnamed: 0,ID,canonical_smiles,MW,axis 1,axis 2
0,NPDBEJECOL1,C=C(C)C1CCC(C)=CC1c1c(O)cc(CCCCC)cc1O,314.225,-4.420925,3.899646
1,NPDBEJECOL2,CCCCCc1cc(O)c2c(c1)OC(C)(C)C1CCC(C)=CC21,314.225,-5.678008,4.368656
2,NPDBEJECOL3,CCCCCc1cc(O)c2c(c1)OC(C)(C)c1ccc(C)cc1-2,310.193,-6.000147,4.321732
3,NPDBEJECOL4,COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O,368.126,-8.656068,-5.442404
4,NPDBEJECOL5,COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)cc2)ccc1O,338.115,-8.673213,-5.385204
...,...,...,...,...,...
228,NPDBEJECOL232,Cn1c(=O)c2c(ncn2C)n(C)c1=O,194.08,-6.169756,-11.514793
229,NPDBEJECOL233,Cn1cnc2c1c(=O)[nH]c(=O)n2C,180.065,-6.19267,-11.486111
230,NPDBEJECOL234,N=c1[nH]c(=O)c2nc(CNc3ccc(C(=O)NC(CCC(=O)O)C(=...,441.14,-6.689895,-10.892835
231,NPDBEJECOL235,O=c1c(O[C@@H]2O[C@H]([C@H](O)CO)[C@H](O)[C@H]2...,464.095,-11.269855,-1.752013


In [None]:
# Interactive scatter plot of the t-SNE embedding
import plotly.express as px
import molplotly

# `labels` maps a column name to the text shown on the axis, so the keys must
# be the DataFrame column names ('axis 1', 'axis 2').
fig_tsne = px.scatter(
    tsne_dataset,
    x='axis 1',
    y='axis 2',
    title='t-SNE',
    labels={'axis 1': 'Axis 1',
            'axis 2': 'Axis 2'},
    width=600,
    height=500,
)

fig_tsne.update_traces(marker=dict(color='red'))

# Wrap the figure in a Dash app that uses the SMILES column for the molecule
# and the ID column for the title of each point
app_marker = molplotly.add_molecules(
    fig=fig_tsne,
    df=tsne_dataset,
    smiles_col='canonical smiles',
    title_col='ID',
)
# fig_tsne.show() would render the plain figure without the molplotly app
app_marker.run(port=8060)


JupyterDash is deprecated, use Dash instead.
See https://dash.plotly.com/dash-in-jupyter for more details.



<IPython.core.display.Javascript object>

## Morgan2

---



In [None]:
# Parse every SMILES exactly once; RDKit returns None for strings it cannot parse
smiles_list = data1['canonical smiles'].tolist()
mols = [Chem.MolFromSmiles(smi) for smi in smiles_list]

# Collect and report the SMILES strings that failed to parse
invalid_smiles = [smi for smi, mol in zip(smiles_list, mols) if mol is None]
print(f"Invalid SMILES strings: {invalid_smiles}")

# Remove the invalid SMILES strings from the DataFrame
data1 = data1[~data1['canonical smiles'].isin(invalid_smiles)]

# Compute the Morgan fingerprints (radius 2, 1024 bits) from the already-parsed
# molecules: one row per remaining compound, one 0/1 column per bit
valid_mols = [mol for mol in mols if mol is not None]
fps = pd.DataFrame(
    [
        [int(bit) for bit in AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024).ToBitString()]
        for mol in valid_mols
    ]
)
fps

Invalid SMILES strings: []


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
229,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
230,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
231,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0


In [None]:
# Train the t-SNE model on the fingerprint matrix
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# Standardize every fingerprint column before computing the embedding
data_tsne = StandardScaler().fit_transform(fps)

# scikit-learn 1.5 renamed `n_iter` to `max_iter` and removes the old name in
# 1.7, so use whichever keyword the installed version exposes.
iter_kwarg = "max_iter" if "max_iter" in TSNE().get_params() else "n_iter"
tsne = TSNE(
    n_components=2,
    verbose=1,
    perplexity=40,
    random_state=42,  # fixed seed: t-SNE is stochastic, this makes reruns reproducible
    **{iter_kwarg: 300},
)
tsne_results = tsne.fit_transform(data_tsne)
tsne_results


'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.



[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 233 samples in 0.001s...
[t-SNE] Computed neighbors for 233 samples in 0.014s...
[t-SNE] Computed conditional probabilities for sample 233 / 233
[t-SNE] Mean sigma: 9.915769
[t-SNE] KL divergence after 250 iterations with early exaggeration: 80.985535
[t-SNE] KL divergence after 300 iterations: 1.013685


array([[  5.829528  , -13.442096  ],
       [ -4.5367107 , -14.7905    ],
       [  6.251253  , -19.771137  ],
       [-11.026577  ,   5.8463893 ],
       [-11.746675  ,   5.7301297 ],
       [-12.589678  ,   5.3304873 ],
       [-12.132943  , -23.046097  ],
       [  8.353455  ,   2.1388054 ],
       [  5.261265  ,   6.2273455 ],
       [ 13.440852  ,   2.7005858 ],
       [ -0.7472596 ,  16.248817  ],
       [ -1.4175615 ,  -6.711185  ],
       [ -1.214429  , -10.46733   ],
       [  3.088261  ,  12.069853  ],
       [-10.050189  , -12.038311  ],
       [  4.908583  ,  -8.232468  ],
       [ -7.313091  , -11.972645  ],
       [  3.406145  ,  21.629288  ],
       [ -0.7012709 ,   4.780862  ],
       [  5.882179  ,   9.974466  ],
       [-18.316647  ,  -0.82425916],
       [ -7.522126  ,  -4.1339145 ],
       [ -4.7159314 ,   2.4591467 ],
       [ -4.7159314 ,   2.4591467 ],
       [-11.248334  ,  -2.7640073 ],
       [-13.047515  ,  -7.06009   ],
       [ 16.106903  ,  -9.453519  ],
 

In [None]:
# Select the complementary information that accompanies each embedded point.
# The index is reset so that it lines up with the 0..n-1 rows of the embedding.
label = data1[["ID", 'canonical smiles', 'MW']].reset_index(drop=True)
print(label.shape)

# Wrap the embedding in its own DataFrame so the coordinates stay numeric
# (concatenating them with the string columns as a NumPy array would turn
# every column into object dtype).
coords = pd.DataFrame(tsne_results, columns=['axis 1', 'axis 2'])
assert len(label) == len(coords), "embedding and metadata must have the same number of rows"

# Create the new dataframe: metadata and coordinates side by side
tsne_dataset = pd.concat([label, coords], axis=1)
print(tsne_dataset.shape)
tsne_dataset

(233, 3)
(233, 5)


Unnamed: 0,ID,canonical smiles,MW,axis 1,axis 2
0,NPDBEJECOL1,C=C(C)C1CCC(C)=CC1c1c(O)cc(CCCCC)cc1O,314.225,5.829528,-13.442096
1,NPDBEJECOL2,CCCCCc1cc(O)c2c(c1)OC(C)(C)C1CCC(C)=CC21,314.225,-4.536711,-14.7905
2,NPDBEJECOL3,CCCCCc1cc(O)c2c(c1)OC(C)(C)c1ccc(C)cc1-2,310.193,6.251253,-19.771137
3,NPDBEJECOL4,COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O,368.126,-11.026577,5.846389
4,NPDBEJECOL5,COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)cc2)ccc1O,338.115,-11.746675,5.73013
...,...,...,...,...,...
228,NPDBEJECOL232,Cn1c(=O)c2c(ncn2C)n(C)c1=O,194.08,-1.476139,-18.219326
229,NPDBEJECOL233,Cn1cnc2c1c(=O)[nH]c(=O)n2C,180.065,-16.422354,-14.295237
230,NPDBEJECOL234,N=c1[nH]c(=O)c2nc(CNc3ccc(C(=O)NC(CCC(=O)O)C(=...,441.14,-15.853365,-10.50468
231,NPDBEJECOL235,O=c1c(O[C@@H]2O[C@H]([C@H](O)CO)[C@H](O)[C@H]2...,464.095,3.253144,17.739388


In [None]:
# Interactive scatter plot of the t-SNE embedding
import plotly.express as px
import molplotly

# `labels` maps a column name to the text shown on the axis, so the keys must
# be the DataFrame column names ('axis 1', 'axis 2').
fig_tsne = px.scatter(
    tsne_dataset,
    x='axis 1',
    y='axis 2',
    title='t-SNE',
    labels={'axis 1': 'Axis 1',
            'axis 2': 'Axis 2'},
    width=600,
    height=500,
)

fig_tsne.update_traces(marker=dict(color='orange'))

# Wrap the figure in a Dash app that uses the SMILES column for the molecule
# and the ID column for the title of each point
app_marker = molplotly.add_molecules(
    fig=fig_tsne,
    df=tsne_dataset,
    smiles_col='canonical smiles',
    title_col='ID',
)

# fig_tsne.show() would render the plain figure without the molplotly app
app_marker.run(port=8060)


JupyterDash is deprecated, use Dash instead.
See https://dash.plotly.com/dash-in-jupyter for more details.



<IPython.core.display.Javascript object>