This notebook provides a short overview of different molecular representation methods.
Inspired by https://www.rdkit.org/docs/Cookbook.html, https://iwatobipen.wordpress.com/, https://gist.github.com/greglandrum, and https://practicalcheminformatics.blogspot.com/


Install RDKit, an open-source cheminformatics toolkit, and py3Dmol, a molecule visualiser.

In [None]:
!pip install rdkit

In [None]:
!pip install py3Dmol

In [None]:
#import all the modules needed for running this.
import py3Dmol
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import DataStructs
from rdkit.Chem.Draw.MolDrawing import MolDrawing, DrawingOptions

1.1. Working with SMILES
When we have a SMILES string, we can use MolFromSmiles to build the molecule. Keep in mind that a SMILES string doesn't contain atomic coordinates, so RDKit generates them when drawing the molecule.

In [None]:
# Build an RDKit molecule from a SMILES string.
# Leaving the molecule as the last expression of the cell renders its 2D depiction.
donz_smiles = 'COC1=C(C=C2C(=C1)CC(C2=O)CC3CCN(CC3)CC4=CC=CC=C4)OC'
donz = Chem.MolFromSmiles(donz_smiles)
donz

In [None]:
# Add explicit hydrogens to the structure.
# AddHs returns a new molecule; `donz` itself is left without explicit hydrogens.
donzH = Chem.AddHs(donz)
donzH

In [None]:
# Calculate Gasteiger partial charges and display them next to each atom.
# ComputeGasteigerCharges stores the value on every atom as "_GasteigerCharge".
AllChem.ComputeGasteigerCharges(donzH)

# Annotate a copy so that donzH itself does not pick up the drawing notes.
donzH_charges = Chem.Mol(donzH)
for atom in donzH_charges.GetAtoms():
    charge = atom.GetDoubleProp("_GasteigerCharge")
    # 'atomNote' is the atom property the drawing code prints beside the atom
    atom.SetProp('atomNote', '%.2f' % charge)
donzH_charges

In [None]:
# Render a molecule in 3D
# Generate a 3D conformer in place. The embedding is stochastic, so a fixed
# seed keeps the coordinates reproducible across "Restart & Run All".
donzH_conf_id = AllChem.EmbedMolecule(donzH, randomSeed=42)
if donzH_conf_id == -1:
    # EmbedMolecule reports failure through its return value instead of raising
    raise RuntimeError("3D embedding of donzH failed; no conformer was generated")
IPythonConsole.ipython_3d = True  # enable py3Dmol inline visualization
donzH

In [None]:
# Optimise the geometry with the MMFF force field.
# The conformer stored on donzH is relaxed in place.
IPythonConsole.ipython_3d = True  # enable py3Dmol inline visualization
AllChem.MMFFOptimizeMolecule(donzH)
donzH

In [None]:
# Substructure search with highlighting.
# The SMARTS below describes a six-membered aromatic carbon ring.
benzene_pattern = Chem.MolFromSmarts('c1ccccc1')

# Each match is a tuple of atom indices in `donz`
ring_matches = donz.GetSubstructMatches(benzene_pattern)
print(ring_matches)
donz

In [None]:
#Download sdf file containing multiple molcules.
!wget https://raw.githubusercontent.com/Rajnishphe/AIDD-2022/main/ML%20Based%20QSAR/example_compounds.sdf


In [None]:
from rdkit import Chem
from rdkit.Chem import SDMolSupplier

# Open the SDF file; the supplier yields one entry per record
supplier = SDMolSupplier('example_compounds.sdf')

# Keep only the records that produced a molecule (the supplier yields None otherwise)
mols = []
for entry in supplier:
    if entry is None:
        continue
    mols.append(entry)

print(f"Loaded {len(mols)} valid molecules.")


In [None]:
from rdkit.Chem import Draw

# Limit the grid to the first n_show molecules to keep it readable
n_show = 50
subset_mols = mols[:n_show]

# One legend per molecule, numbered from 1
grid_legends = [f"Mol {i+1}" for i, _ in enumerate(subset_mols)]

# useSVG=True: better rendering in Colab
img = Draw.MolsToGridImage(subset_mols, legends=grid_legends, molsPerRow=10, subImgSize=(200, 200), useSVG=True)

img


In [None]:
import ipywidgets as widgets
from rdkit.Chem import Draw
from ipywidgets import interactive  # Make sure to import interactive


def display_molecule_with_info(index):
    """Show the molecule at position ``index`` of ``mols`` with its name and activity.

    The name is read from the '_Name' property and the activity from the
    'Activity' property; 'No Name' and 'N/A' are printed when the respective
    property is absent.
    """
    selected = mols[index]

    if selected.HasProp('_Name'):
        name = selected.GetProp('_Name')
    else:
        name = 'No Name'

    # More properties can be read here in the same way
    if selected.HasProp('Activity'):
        activity = selected.GetProp('Activity')
    else:
        activity = 'N/A'

    depiction = Draw.MolToImage(selected, size=(300, 300))

    print(f"Name: {name}, Activity: {activity}")
    display(depiction)


# Slider over every valid position in `mols`
index_slider = widgets.IntSlider(min=0, max=len(mols)-1, step=1, value=0)
interactive_widget = interactive(display_molecule_with_info, index=index_slider)

# Show the widget
interactive_widget


In [None]:
# Generate fingerprints

from rdkit import Chem
from rdkit.Chem import DataStructs, SDMolSupplier, rdFingerprintGenerator
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Morgan fingerprint generator (radius=2, folded to 1024 bits)
fpg = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)

# One bit-vector fingerprint per molecule in 'mols'
fingerprints = [fpg.GetFingerprint(mol) for mol in mols]

# Convert each bit vector to a NumPy row. The rows are collected under their own
# name so that `fingerprint_matrix` only ever refers to the final 2D array.
fingerprint_rows = []
for fp in fingerprints:
    arr = np.zeros((1,), dtype=int)
    # ConvertToNumpyArray resizes `arr` to the fingerprint length and fills it in place
    DataStructs.ConvertToNumpyArray(fp, arr)
    fingerprint_rows.append(arr)

fingerprint_matrix = np.array(fingerprint_rows)  # shape: (n_molecules, n_bits)

fingerprint_matrix

In [None]:
# Apply t-SNE to reduce the dimensionality of fingerprints to 2D.
# scikit-learn requires perplexity < n_samples; the default of 30 therefore fails
# on small data sets, so clamp it. Larger data sets keep the default value.
n_samples = len(fingerprint_matrix)
perplexity = min(30.0, max(1, n_samples - 1))
tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
tsne_result = tsne.fit_transform(fingerprint_matrix)

# Plot the t-SNE result
plt.scatter(tsne_result[:, 0], tsne_result[:, 1])
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
plt.title('t-SNE of Molecule Fingerprints')
plt.show()


In [None]:
#Download a csv file

!wget https://raw.githubusercontent.com/Rajnishphe/AIDD-2022/main/ML%20Based%20QSAR/new_1.csv


In [None]:
import pandas as pd

# Read the downloaded CSV into a DataFrame
csv_path = 'new_1.csv'
df = pd.read_csv(csv_path)

# Preview the first rows
df.head()


In [None]:
# Convert the SMILES column into RDKit molecule objects.
# MolFromSmiles returns None for SMILES it cannot parse.
df['Mol'] = df['SMILES'].apply(Chem.MolFromSmiles)

# Visualize the molecules in a grid (5 molecules per row).
# Pass a plain list: after dropna() the Series index can have gaps, and the
# drawing code expects a sequence that can be indexed by position.
valid_mols = df['Mol'].dropna().tolist()
img = Draw.MolsToGridImage(valid_mols, molsPerRow=5, subImgSize=(200, 200))

# Display the grid image
img

In [None]:
# Shell escape: list the current working directory (presumably to confirm the downloaded files are present).
!ls

In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

# Step 1: read the CSV and parse every SMILES into an RDKit molecule
df = pd.read_csv('new_1.csv')
df['Mol'] = df['SMILES'].apply(Chem.MolFromSmiles)

# Step 2: descriptor functions to evaluate, keyed by the column name they fill
# (extend this mapping to compute more descriptors)
descriptors = dict(
    MolWt=Descriptors.MolWt,
    LogP=Descriptors.MolLogP,
    TPSA=Descriptors.TPSA,
    NumHDonors=Descriptors.NumHDonors,
    NumHAcceptors=Descriptors.NumHAcceptors,
)


def _none_safe(descriptor_function):
    """Wrap a descriptor so that a missing molecule (None) yields None instead of being evaluated."""
    def _evaluate(mol):
        return None if mol is None else descriptor_function(mol)
    return _evaluate


# Step 3: add one column per descriptor
for column_name, function in descriptors.items():
    df[column_name] = df['Mol'].apply(_none_safe(function))

# Preview the DataFrame with the new descriptor columns
df.head()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Requires the descriptor columns to be present in the DataFrame 'df'

# Descriptor columns to plot, one histogram each
descriptors = ['MolWt', 'LogP', 'TPSA', 'NumHDonors', 'NumHAcceptors']

# A single row of panels, one per descriptor
fig, ax = plt.subplots(1, len(descriptors), figsize=(15, 5))

for panel, descriptor in zip(ax, descriptors):
    values = df[descriptor].dropna()  # skip rows without a value
    panel.hist(values, bins=20, color='skyblue', edgecolor='black')
    panel.set(title=descriptor, xlabel=descriptor, ylabel='Frequency')

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Descriptor columns used as clustering features
feature_columns = ['MolWt', 'LogP', 'TPSA', 'NumHDonors', 'NumHAcceptors']

# Rows whose SMILES could not be parsed have missing descriptors, and KMeans
# rejects NaN input, so only complete rows are clustered.
# NOTE(review): the features are on very different scales (MolWt vs. donor
# counts); consider standardising them before clustering.
df_cluster = df[feature_columns].dropna()

# Apply K-means clustering
kmeans = KMeans(n_clusters=3, random_state=42)
cluster_labels = kmeans.fit_predict(df_cluster)

# Align the labels on the original index; rows that were skipped get NaN
df['Cluster'] = pd.Series(cluster_labels, index=df_cluster.index)

# Plot only the rows that were actually clustered
clustered = df.loc[df_cluster.index]
plt.scatter(clustered['MolWt'], clustered['LogP'], c=clustered['Cluster'], cmap='viridis')
plt.xlabel('MolWt')
plt.ylabel('LogP')
plt.title('Clustering of Molecules')
plt.show()


In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from rdkit.DataStructs import BulkTanimotoSimilarity, TanimotoSimilarity

# Compute the pairwise Tanimoto similarity matrix of the Morgan fingerprints.
# BulkTanimotoSimilarity compares one fingerprint against the whole list in a
# single call, replacing the n*n Python-level calls with n calls.
n = len(fingerprints)
sim_matrix = np.zeros((n, n))
for i, query_fp in enumerate(fingerprints):
    sim_matrix[i, :] = BulkTanimotoSimilarity(query_fp, fingerprints)

# Plot similarity heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(sim_matrix, cmap='viridis')
plt.title("Tanimoto Similarity Heatmap (Morgan Fingerprints)")
plt.xlabel("Molecules")
plt.ylabel("Molecules")
plt.show()


1.2. Reading a molfile

In [None]:
# MDL molfile (V2000) text for phenol: 7 heavy atoms and 7 bonds with 2D
# coordinates (all z values are 0) and the ring written with alternating
# single/double bonds. The layout inside the string is column-sensitive.
molblock = """phenol
  Mrv1682210081607082D

  7  7  0  0  0  0            999 V2000
   -0.6473    1.0929    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -1.3618    0.6804    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -1.3618   -0.1447    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -0.6473   -0.5572    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0671   -0.1447    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0671    0.6804    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.7816    1.0929    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
  1  2  1  0  0  0  0
  2  3  2  0  0  0  0
  3  4  1  0  0  0  0
  4  5  2  0  0  0  0
  5  6  1  0  0  0  0
  1  6  2  0  0  0  0
  6  7  1  0  0  0  0
M  END
"""
# Parse the molfile text into an RDKit molecule; ending the cell on `m` displays it.
m = Chem.MolFromMolBlock(molblock)
m

In [None]:
# Render a molecule in 3D
# Generate a 3D conformer in place. The embedding is stochastic, so a fixed
# seed keeps the coordinates reproducible across "Restart & Run All".
m_conf_id = AllChem.EmbedMolecule(m, randomSeed=42)
if m_conf_id == -1:
    # EmbedMolecule reports failure through its return value instead of raising
    raise RuntimeError("3D embedding of m failed; no conformer was generated")
IPythonConsole.ipython_3d = True  # enable py3Dmol inline visualization
m

In [None]:
# Add hydrogens to the structure.
# `m` already carries coordinates, so addCoords=True is needed to place the new
# hydrogens geometrically; without it they get no usable coordinates and the
# following force-field optimisation starts from a degenerate geometry.
mH = Chem.AddHs(m, addCoords=True)
mH

In [None]:
# Optimise the geometry with the MMFF force field to obtain a low-energy structure.
# The conformer stored on mH is relaxed in place.
IPythonConsole.ipython_3d = True  # enable py3Dmol inline visualization
AllChem.MMFFOptimizeMolecule(mH)
mH

1.3. Reading a PDB file (Protein data bank)

In [None]:
# Fetch PDB entry 1EVE and draw the whole structure as a spectrum-coloured cartoon
viewer = py3Dmol.view(query='pdb:1EVE')
viewer.setStyle({'cartoon':{'color': 'spectrum'}})
# Optional surface, left disabled:
#viewer.addSurface(py3Dmol.VDW,{'opacity':0.9,'color':'white'})

# Select by distance: residue E20 in chain A, expanded by 5 and taken by whole residues
selection = dict(resn='E20', chain='A', byres='true', expand=5)
ligand_only = {'resn': 'E20'}

# Style the surrounding selection first, then restyle the E20 residue itself
surroundings_style = {'stick':{'colorscheme':'orangeCarbon'}}
ligand_style = {'stick': {'colorscheme': 'greenCarbon'}}
viewer.setStyle(selection, surroundings_style)
viewer.setStyle(ligand_only, ligand_style)

# Centre the camera on the selection and render
viewer.zoomTo(selection)
viewer.show()
