# t-Distributed Stochastic Neighbor Embedding (t-SNE)


t-Distributed Stochastic Neighbor Embedding (t-SNE) is a nonlinear dimensionality reduction algorithm used to visualize high-dimensional data in a low-dimensional space (usually two or three dimensions). t-SNE looks at how points are distributed around each other in the high-dimensional space and tries to preserve these neighborhood patterns in the low-dimensional space. The algorithm is frequently used for data visualization, especially in fields such as bioinformatics, genomics, and data science in general.

In [None]:
from IPython.utils import io
import tqdm.notebook
import os, sys, random
total = 100
with tqdm.notebook.tqdm(total=total) as pbar:
    with io.capture_output() as captured:
      # Instalar rdkit
      !pip -q install rdkit.pypi==2021.9.4
      pbar.update(25)
      # Instalar molplotly
      !pip install molplotly
      pbar.update(50)
      # Instalar jupyter-dash
      !pip install jupyter-dash
      pbar.update(75)
      # Instalar el diseño de aplicación dash
      !pip install dash-bootstrap-components
      pbar.update(100)

  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# Import libraries
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import MACCSkeys, AllChem
from scipy.spatial.distance import pdist

In [None]:
# Compounds: turn the Google Drive sharing link into a direct-download URL
# and read the CSV it points to.
url_data = "https://drive.google.com/file/d/12lEtG19vJ0fXS6vvckV9jA3rpbiIfZRi/view?usp=drive_link"
file_id = url_data.split('/')[-2]  # second-to-last path segment of the sharing link
url_data = f"https://drive.google.com/uc?id={file_id}"
data = pd.read_csv(url_data)
data

Unnamed: 0,ID,All_fragments,All_fragments_h,Counts
0,NPDBEJECOL_All_frag_1,*C(C)C,CCC,34
1,NPDBEJECOL_All_frag_2,*O,O,20
2,NPDBEJECOL_All_frag_3,*C(C)CCC=C(C)C,CCCCC=C(C)C,13
3,NPDBEJECOL_All_frag_4,*CCCC(*)C,CCCCC,13
4,NPDBEJECOL_All_frag_5,*OCC,CCO,11
...,...,...,...,...
195,NPDBEJECOL_All_frag_196,*C(=O)CCCCCCCCCCCCCCC,CCCCCCCCCCCCCCCC=O,1
196,NPDBEJECOL_All_frag_197,*CC/C=C(\C)CCC*,CC/C=C(\C)CCC,1
197,NPDBEJECOL_All_frag_198,*CC/C=C(\C)CCC=C(C)C,CC/C=C(\C)CCC=C(C)C,1
198,NPDBEJECOL_All_frag_199,*CCC/C(C)=C/C/C=C(\C)C=C,C=C/C(C)=C/C/C=C(\C)CCC,1


In [None]:
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.rdMolDescriptors import CalcNumAliphaticRings,CalcNumAromaticRings, CalcNumAliphaticHeterocycles, CalcNumAromaticHeterocycles
from rdkit.Chem.rdMolDescriptors import CalcNumHeterocycles, CalcNumRings, CalcNumSpiroAtoms, CalcNumBridgeheadAtoms,CalcExactMolWt

In [None]:
from rdkit.Chem import rdMolDescriptors
def MW(Smiles):
    """Return the exact molecular weight for a SMILES string.

    Parameters
    ----------
    Smiles : str
        SMILES string to parse with ``Chem.MolFromSmiles``.

    Returns
    -------
    float
        The value of ``rdMolDescriptors.CalcExactMolWt`` for the parsed
        molecule, or NaN when ``Smiles`` is not a string or cannot be parsed.
    """
    # Missing values in a pandas column arrive as float NaN, not as strings.
    if not isinstance(Smiles, str):
        return float("nan")
    mol = Chem.MolFromSmiles(Smiles)
    # MolFromSmiles returns None for an unparseable SMILES; passing that on to
    # CalcExactMolWt would raise, so report NaN instead and let callers filter.
    if mol is None:
        return float("nan")
    return rdMolDescriptors.CalcExactMolWt(mol)

In [None]:
# Compute the molecular weight of every fragment. Mapping over the single
# SMILES column always yields a Series (also for an empty frame), unlike a
# row-wise DataFrame.apply.
data["MW"] = data["All_fragments_h"].map(MW)

In [None]:
# Display the frame, which now includes the MW column.
data

Unnamed: 0,ID,All_fragments,All_fragments_h,Counts,MW
0,NPDBEJECOL_All_frag_1,*C(C)C,CCC,34,44.062600
1,NPDBEJECOL_All_frag_2,*O,O,20,18.010565
2,NPDBEJECOL_All_frag_3,*C(C)CCC=C(C)C,CCCCC=C(C)C,13,112.125201
3,NPDBEJECOL_All_frag_4,*CCCC(*)C,CCCCC,13,72.093900
4,NPDBEJECOL_All_frag_5,*OCC,CCO,11,46.041865
...,...,...,...,...,...
195,NPDBEJECOL_All_frag_196,*C(=O)CCCCCCCCCCCCCCC,CCCCCCCCCCCCCCCC=O,1,240.245316
196,NPDBEJECOL_All_frag_197,*CC/C=C(\C)CCC*,CC/C=C(\C)CCC,1,112.125201
197,NPDBEJECOL_All_frag_198,*CC/C=C(\C)CCC=C(C)C,CC/C=C(\C)CCC=C(C)C,1,152.156501
198,NPDBEJECOL_All_frag_199,*CCC/C(C)=C/C/C=C(\C)C=C,C=C/C(C)=C/C/C=C(\C)CCC,1,164.156501


In [None]:
# Keep only the identifier, the SMILES column and the molecular weight,
# then continue on an independent copy.
columns_to_keep = ["ID", 'All_fragments_h', 'MW']
data = data[columns_to_keep]
data1 = data.copy()

In [None]:
# Round numeric columns to three decimal places.
data1 = data1.round(decimals=3)

## MACCS keys

---



In [None]:
# Collect every SMILES string that RDKit fails to parse
invalid_smiles = []
for smi in data1['All_fragments_h']:
    if Chem.MolFromSmiles(smi) is None:
        invalid_smiles.append(smi)

# Report what was found
print(f"Invalid SMILES strings: {invalid_smiles}")

# Keep only the rows whose SMILES parsed correctly
is_invalid = data1['All_fragments_h'].isin(invalid_smiles)
data1 = data1[~is_invalid]

# MACCS keys: one row per molecule, one 0/1 column per bit
bit_rows = []
for smi in data1['All_fragments_h']:
    bit_string = MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(smi)).ToBitString()
    bit_rows.append(np.array([int(bit) for bit in bit_string]))
fps = pd.DataFrame(bit_rows)
fps

Invalid SMILES strings: []


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,157,158,159,160,161,162,163,164,165,166
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
196,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
197,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
198,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [None]:
# Train the t-SNE model on the standardized fingerprint matrix
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
data_tsne = fps
data_tsne = StandardScaler().fit_transform(data_tsne)
# `max_iter` replaces `n_iter`, which scikit-learn renamed in 1.5 and removes in 1.7.
# A fixed `random_state` makes the embedding reproducible between runs.
tsne = TSNE(n_components=2, verbose=1, perplexity=40, max_iter=300, random_state=42)
tsne_results = tsne.fit_transform(data_tsne)
tsne_results


'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.



[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 200 samples in 0.001s...
[t-SNE] Computed neighbors for 200 samples in 0.004s...
[t-SNE] Computed conditional probabilities for sample 200 / 200
[t-SNE] Mean sigma: 3.980029
[t-SNE] KL divergence after 250 iterations with early exaggeration: 47.451500
[t-SNE] KL divergence after 300 iterations: 0.130581


array([[  1.8995826 ,   6.9096255 ],
       [  6.039369  ,   6.359945  ],
       [ -5.6699038 ,   5.4691386 ],
       [ -2.2001276 ,   5.045222  ],
       [ -4.352671  ,  -6.900762  ],
       [  5.0470433 ,   5.131785  ],
       [  7.130676  ,   7.0726156 ],
       [  5.0470433 ,   5.131785  ],
       [  0.93524   ,   2.4517267 ],
       [  6.8904796 ,   3.894285  ],
       [  0.61330706,   5.2080903 ],
       [ -1.6001188 ,  -4.349901  ],
       [ -1.1150337 ,  -0.70593214],
       [ -4.8644295 ,  -4.7499094 ],
       [  2.7660148 , -10.624658  ],
       [ -2.7800934 ,   3.2932043 ],
       [  4.377445  ,   2.6621225 ],
       [ -1.7040067 ,   5.6549263 ],
       [  4.809418  , -13.591264  ],
       [ -2.1535125 ,   3.814895  ],
       [  5.748706  , -13.045736  ],
       [ -0.36861438,  10.304724  ],
       [  5.295809  , -11.671763  ],
       [  8.943065  ,  -6.236764  ],
       [  7.2572536 ,  -8.514298  ],
       [  1.3305098 , -15.558576  ],
       [ -4.352671  ,  -6.900762  ],
 

In [None]:
# Select additional information
label = data1[["ID", 'All_fragments_h', 'MW']]
print(label.shape)
# Add the two t-SNE coordinates as new columns. Going through a single NumPy
# array mixing strings and floats would turn every column (MW and both axes
# included) into object dtype; assigning columns keeps them numeric.
tsne_dataset = label.reset_index(drop=True).assign(
    **{'axis 1': tsne_results[:, 0], 'axis 2': tsne_results[:, 1]}
)
print(tsne_dataset.shape)
tsne_dataset

(200, 3)
(200, 5)


Unnamed: 0,ID,All_fragments_h,MW,axis 1,axis 2
0,NPDBEJECOL_All_frag_1,CCC,44.063,1.899583,6.909626
1,NPDBEJECOL_All_frag_2,O,18.011,6.039369,6.359945
2,NPDBEJECOL_All_frag_3,CCCCC=C(C)C,112.125,-5.669904,5.469139
3,NPDBEJECOL_All_frag_4,CCCCC,72.094,-2.200128,5.045222
4,NPDBEJECOL_All_frag_5,CCO,46.042,-4.352671,-6.900762
...,...,...,...,...,...
195,NPDBEJECOL_All_frag_196,CCCCCCCCCCCCCCCC=O,240.245,-1.115034,-0.705932
196,NPDBEJECOL_All_frag_197,CC/C=C(\C)CCC,112.125,-8.350015,4.920345
197,NPDBEJECOL_All_frag_198,CC/C=C(\C)CCC=C(C)C,152.157,-6.85385,5.560457
198,NPDBEJECOL_All_frag_199,C=C/C(C)=C/C/C=C(\C)CCC,164.157,-7.030829,8.303562


In [None]:
# Interactive scatter plot of the embedding; hovering a point shows its structure
import plotly.express as px
import molplotly
fig_tsne = px.scatter(tsne_dataset,
                            x='axis 1',
                            y='axis 2',
                            title='t-SNE',
                            # `labels` maps column name -> displayed label
                            labels={'axis 1': 'Axis 1',
                                    'axis 2': 'Axis 2'},
                            width=600,
                            height=500)

fig_tsne.update_traces(marker=dict(color='red'))

# Wrap the figure in a Dash app that renders the molecule from the SMILES column
app_marker = molplotly.add_molecules(fig=fig_tsne,
                                         df=tsne_dataset,
                                         smiles_col='All_fragments_h',
                                         title_col='ID')
app_marker.run(port=8060)


JupyterDash is deprecated, use Dash instead.
See https://dash.plotly.com/dash-in-jupyter for more details.



<IPython.core.display.Javascript object>

## Morgan2

---



In [None]:
# Collect every SMILES string that RDKit fails to parse
invalid_smiles = []
for smi in data1['All_fragments_h']:
    if Chem.MolFromSmiles(smi) is None:
        invalid_smiles.append(smi)

# Report what was found
print(f"Invalid SMILES strings: {invalid_smiles}")

# Keep only the rows whose SMILES parsed correctly
is_invalid = data1['All_fragments_h'].isin(invalid_smiles)
data1 = data1[~is_invalid]

# Morgan fingerprints (radius 2, 1024 bits): one row per molecule, one 0/1 column per bit
bit_rows = []
for smi in data1['All_fragments_h']:
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smi), 2, nBits=1024)
    bit_rows.append(np.array([int(bit) for bit in bit_vector.ToBitString()]))
fps = pd.DataFrame(bit_rows)
fps

Invalid SMILES strings: []


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
196,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
197,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
198,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [None]:
# Train the t-SNE model on the standardized fingerprint matrix
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
data_tsne = fps
data_tsne = StandardScaler().fit_transform(data_tsne)
# `max_iter` replaces `n_iter`, which scikit-learn renamed in 1.5 and removes in 1.7.
# A fixed `random_state` makes the embedding reproducible between runs.
tsne = TSNE(n_components=2, verbose=1, perplexity=40, max_iter=300, random_state=42)
tsne_results = tsne.fit_transform(data_tsne)
tsne_results


'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.



[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 200 samples in 0.001s...
[t-SNE] Computed neighbors for 200 samples in 0.007s...
[t-SNE] Computed conditional probabilities for sample 200 / 200
[t-SNE] Mean sigma: 8.047468
[t-SNE] KL divergence after 250 iterations with early exaggeration: 70.553864
[t-SNE] KL divergence after 300 iterations: 0.638842


array([[ -8.661039  ,  -2.5960367 ],
       [ -1.0695682 ,   6.865111  ],
       [  4.96828   ,  -1.6779356 ],
       [ -5.93876   ,   4.224108  ],
       [  1.5100249 ,   5.6183424 ],
       [ -7.944598  ,   0.3035986 ],
       [  3.8758473 ,   6.6939588 ],
       [ -7.944598  ,   0.3035986 ],
       [  0.32756   ,   2.7573934 ],
       [-11.529414  ,  -0.915625  ],
       [ -6.5601587 ,  -4.475707  ],
       [  7.357223  ,  -0.9258332 ],
       [ -2.722078  ,   1.3882968 ],
       [-18.273794  ,  11.070692  ],
       [-13.021529  ,  10.368954  ],
       [ -3.6401253 ,   4.523091  ],
       [ -3.624335  ,  -5.1782403 ],
       [  0.73008394,   0.36807713],
       [ 14.450931  ,  10.519036  ],
       [ -4.6010404 ,   3.2318652 ],
       [ -8.724372  ,  -4.172484  ],
       [ -5.027643  ,  -4.491553  ],
       [ 11.086202  ,  12.672077  ],
       [  6.141717  ,   9.15481   ],
       [ -3.2731433 ,   8.396736  ],
       [ -2.676209  , -15.083777  ],
       [  1.5100249 ,   5.6183424 ],
 

In [None]:
# Select additional information
label = data1[["ID", 'All_fragments_h', 'MW']]
print(label.shape)
# Add the two t-SNE coordinates as new columns. Going through a single NumPy
# array mixing strings and floats would turn every column (MW and both axes
# included) into object dtype; assigning columns keeps them numeric.
tsne_dataset = label.reset_index(drop=True).assign(
    **{'axis 1': tsne_results[:, 0], 'axis 2': tsne_results[:, 1]}
)
print(tsne_dataset.shape)
tsne_dataset

(200, 3)
(200, 5)


Unnamed: 0,ID,All_fragments_h,MW,axis 1,axis 2
0,NPDBEJECOL_All_frag_1,CCC,44.063,-8.661039,-2.596037
1,NPDBEJECOL_All_frag_2,O,18.011,-1.069568,6.865111
2,NPDBEJECOL_All_frag_3,CCCCC=C(C)C,112.125,4.96828,-1.677936
3,NPDBEJECOL_All_frag_4,CCCCC,72.094,-5.93876,4.224108
4,NPDBEJECOL_All_frag_5,CCO,46.042,1.510025,5.618342
...,...,...,...,...,...
195,NPDBEJECOL_All_frag_196,CCCCCCCCCCCCCCCC=O,240.245,-2.722078,1.388297
196,NPDBEJECOL_All_frag_197,CC/C=C(\C)CCC,112.125,3.326176,3.405411
197,NPDBEJECOL_All_frag_198,CC/C=C(\C)CCC=C(C)C,152.157,6.524919,2.050775
198,NPDBEJECOL_All_frag_199,C=C/C(C)=C/C/C=C(\C)CCC,164.157,7.955836,-15.955557


In [None]:
# Interactive scatter plot of the embedding; hovering a point shows its structure
import plotly.express as px
import molplotly
fig_tsne = px.scatter(tsne_dataset,
                            x='axis 1',
                            y='axis 2',
                            title='t-SNE',
                            # `labels` maps column name -> displayed label
                            labels={'axis 1': 'Axis 1',
                                    'axis 2': 'Axis 2'},
                            width=600,
                            height=500)

fig_tsne.update_traces(marker=dict(color='orange'))

# Wrap the figure in a Dash app that renders the molecule from the SMILES column
app_marker = molplotly.add_molecules(fig=fig_tsne,
                                         df=tsne_dataset,
                                         smiles_col='All_fragments_h',
                                         title_col='ID')

app_marker.run(port=8060)


JupyterDash is deprecated, use Dash instead.
See https://dash.plotly.com/dash-in-jupyter for more details.



<IPython.core.display.Javascript object>