<a href="https://colab.research.google.com/github/porekhov/drug_design_2024/blob/main/Chembl_Ro5_PAINS_filters.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title **Install Conda Colab, rdkit, and other dependences**

# NOTE(review): IPython documents that a cell magic such as `%%capture` must be
# the first line of the cell; here it follows the `#@title` line -- confirm it
# is actually applied in Colab.
%%capture
# Install condacolab so that the `conda` command used below is available
!pip install -q condacolab
import condacolab
# NOTE(review): condacolab's documentation says install() restarts the kernel --
# confirm the commands below still run when this cell is executed in one go.
condacolab.install()
# RDKit is installed from conda-forge; the ChEMBL client and UMAP from PyPI
!conda install -c conda-forge rdkit -y
!pip install chembl_webresource_client
!pip install umap-learn

In [None]:
import math
from pathlib import Path
from zipfile import ZipFile
from tempfile import TemporaryDirectory

from chembl_webresource_client.new_client import new_client

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm.auto import tqdm

from rdkit import Chem
from rdkit.Chem import Descriptors, Draw, PandasTools, rdFingerprintGenerator
from rdkit import DataStructs
from rdkit.Chem.FilterCatalog import FilterCatalog, FilterCatalogParams
from rdkit.ML.Cluster import Butina

import umap

In [None]:
# ChEMBL ID of the target whose bioactivity records are collected
chembl_id = 'CHEMBL203'

bioactivities_api = new_client.activity

# Fields requested for every activity record
activity_fields = (
    "activity_id",
    "assay_chembl_id",
    "assay_description",
    "assay_type",
    "molecule_chembl_id",
    "type",
    "standard_units",
    "relation",
    "standard_value",
    "target_chembl_id",
    "target_organism",
)

# Restrict the query to IC50 records of the target with relation "=" and
# assay type "B", then limit the returned fields
bioactivities = bioactivities_api.filter(
    target_chembl_id=chembl_id,
    type="IC50",
    relation="=",
    assay_type="B",
).only(*activity_fields)

print(f"Length of bioactivities object: {len(bioactivities)}")

In [None]:
# Turn the ChEMBL query result into a table
bioactivities_df = pd.DataFrame.from_dict(bioactivities)
print(f"DataFrame shape: {bioactivities_df.shape}")

# Cache the raw activities on disk. index=False keeps the row counter out of
# the file, so reading it back does not add an "Unnamed: 0" column.
bioactivities_df.to_csv('CHEMBL203.csv', index=False)

# Preview the table. A notebook only renders the value of the LAST expression
# of a cell, so the preview has to come after the file is written.
bioactivities_df.head()

In [None]:
# Reload the activity table from the CSV file written by the previous cell
bioactivities_df = pd.read_csv('CHEMBL203.csv')

In [None]:
# Drop the raw "units" and "value" columns: only the standardized values
# (standard_units / standard_value) are used from here on
bioactivities_df = bioactivities_df.drop(columns=["units", "value"])

# Convert the standard_value column to the float datatype
bioactivities_df = bioactivities_df.astype({"standard_value": "float64"})

# Drop every row that has a missing value in any column
bioactivities_df = bioactivities_df.dropna(axis=0, how="any")
print(f"DataFrame shape after removing NA: {bioactivities_df.shape}")

# Keep only the measurements reported in nM
bioactivities_df = bioactivities_df[bioactivities_df["standard_units"] == "nM"]
print(f"Units after filtering: {bioactivities_df['standard_units'].unique()}")
print(f"DataFrame shape after removing non-nM values: {bioactivities_df.shape}")

# Keep a single record per molecule (the first one, by ChEMBL molecule ID).
# The result is re-bound instead of modified in place because the frame is a
# filtered selection at this point.
bioactivities_df = bioactivities_df.drop_duplicates("molecule_chembl_id", keep="first")
print(f"DataFrame shape after removing duplicates: {bioactivities_df.shape}")

# Renumber the rows and give the value/unit columns their final names
bioactivities_df = bioactivities_df.reset_index(drop=True)
bioactivities_df = bioactivities_df.rename(
    columns={"standard_value": "IC50", "standard_units": "units"}
)

In [None]:
# Fetch compound data from ChEMBL for the molecules kept in the activity table
compounds_api = new_client.molecule

molecule_ids = list(bioactivities_df["molecule_chembl_id"])
compounds_provider = compounds_api.filter(molecule_chembl_id__in=molecule_ids).only(
    "molecule_chembl_id", "molecule_structures"
)

# Collect the query result into a list, with a tqdm progress bar
compounds = list(tqdm(compounds_provider))

compounds_df = pd.DataFrame.from_records(compounds)
print(f"DataFrame shape: {compounds_df.shape}")

# Keep a copy of the raw compound table on disk
compounds_df.to_csv('CHEMBL203_molecules.csv')

compounds_df.head()

In [None]:
# Filter out missing values and duplicates, then keep only the canonical SMILES

compounds_df.dropna(axis=0, how="any", inplace=True)
print(f"DataFrame shape after removing NA: {compounds_df.shape}")

compounds_df.drop_duplicates("molecule_chembl_id", keep="first", inplace=True)
print(f"DataFrame shape after removing duplicates: {compounds_df.shape}")

# Each "molecule_structures" entry is a dict of structure representations;
# take its canonical SMILES, or None when that key is absent. A comprehension
# is used instead of an iterrows() loop so that no loop variable overwrites
# the `compounds` list created when the records were fetched.
compounds_df["smiles"] = [
    structures.get("canonical_smiles")
    for structures in compounds_df["molecule_structures"]
]
compounds_df.drop("molecule_structures", axis=1, inplace=True)
print(f"DataFrame shape with canonical SMILES only: {compounds_df.shape}")

# Remove the molecules for which no canonical SMILES was found
compounds_df.dropna(axis=0, how="any", inplace=True)
print(f"DataFrame shape: {compounds_df.shape}")

In [None]:
# Combine the IC50 table with the SMILES table; pIC50 values
# (= -log10 of the IC50 in molar concentration) are added further below

# Join on the ChEMBL molecule ID and renumber the rows of the result
output_df = (
    bioactivities_df[["molecule_chembl_id", "IC50", "units"]]
    .merge(compounds_df, on="molecule_chembl_id")
    .reset_index(drop=True)
)

def convert_ic50_to_pic50(IC50_value):
    """
    Convert an IC50 value given in nM to pIC50.

    pIC50 is -log10 of the IC50 in molar concentration; for a value in nM
    this equals 9 - log10(IC50).

    Parameters
    ----------
    IC50_value : int or float
        IC50 in nM. Must be greater than zero.

    Returns
    -------
    float
        The pIC50 value.

    Raises
    ------
    ValueError
        If the IC50 is zero or negative (the logarithm is undefined).
    """
    try:
        return 9 - math.log10(IC50_value)
    except ValueError as err:
        # math.log10 only reports "math domain error"; say which value was
        # rejected so the offending record can be found in the table
        raise ValueError(
            f"IC50 must be a positive concentration in nM, got {IC50_value!r}."
        ) from err

# Convert every IC50 value to pIC50. Only the IC50 column is needed, so the
# conversion is mapped over that column instead of building a row object for
# every record with DataFrame.apply(axis=1).
output_df["pIC50"] = output_df["IC50"].apply(convert_ic50_to_pic50)

print(f"Dataset with {output_df.shape[0]} entries.")

# Save the merged table (IC50, SMILES and pIC50) to disk
output_df.to_csv('CHEMBL203_IC50_smiles.csv')

output_df.head()

In [None]:
# Histogram of the pIC50 values in the merged table
output_df.hist(column="pIC50")

Pharmacokinetics are mainly divided into four steps: Absorption, Distribution, Metabolism, and Excretion. These are summarized as ADME. Often, ADME also includes Toxicology and is thus referred to as ADMET or ADMETox. Below, the ADME steps are discussed in more detail (Wikipedia and Mol Pharm. (2010), 7(5), 1388-1405).

**Absorption**: The amount and the time of drug-uptake into the body depends on multiple factors which can vary between individuals and their conditions as well as on the properties of the substance. Factors such as (poor) compound solubility, gastric emptying time, intestinal transit time, chemical (in-)stability in the stomach, and (in-)ability to permeate the intestinal wall can all influence the extent to which a drug is absorbed after e.g. oral administration, inhalation, or contact with skin.

**Distribution**: The distribution of an absorbed substance, i.e. within the body, between blood and different tissues, and crossing the blood-brain barrier are affected by regional blood flow rates, molecular size and polarity of the compound, and binding to serum proteins and transporter enzymes. Critical effects in toxicology can be the accumulation of highly apolar substances in fatty tissue, or crossing of the blood-brain barrier.

**Metabolism**: After entering the body, the compound will be metabolized. This means that only part of this compound will actually reach its target. Mainly liver and kidney enzymes are responsible for the break-down of xenobiotics (substances that are extrinsic to the body).

**Excretion**: Compounds and their metabolites need to be removed from the body via excretion, usually through the kidneys (urine) or in the feces. Incomplete excretion can result in accumulation of foreign substances or adverse interference with normal metabolism.

**Lead-likeness and Lipinski’s rule of five (Ro5)**


Lead compounds are developmental drug candidates with promising properties. They are used as starting structures and modified with the aim to develop effective drugs. Besides bioactivity (the compound binds to the target of interest), favorable ADME properties are also important criteria for the design of efficient drugs.

The bioavailability of a compound is an important ADME property. Lipinski’s rule of five (Ro5, Adv. Drug Deliv. Rev. (1997), 23, 3-25) was introduced to estimate the bioavailability of a compound solely based on its chemical structure. The Ro5 states that poor absorption or permeation of a compound is more probable if the chemical structure violates more than one of the following rules:
1.   Molecular weight (MWT) <= 500 Da
2.   Number of hydrogen bond acceptors (HBAs) <= 10
3.   Number of hydrogen bond donors (HBD) <= 5
4.   Calculated LogP (octanol-water coefficient) <= 5

Note: All numbers in the Ro5 are multiples of five; this is the origin of the rule’s name.

In [None]:
def calculate_ro5_properties(smiles):
    """
    Compute the Ro5-relevant properties of a molecule and check Ro5 compliance.

    Parameters
    ----------
    smiles : str
        SMILES for a molecule.

    Returns
    -------
    pandas.Series
        Entries "molecular_weight" (RDKit's ExactMolWt), "n_hba", "n_hbd",
        "logp" and "ro5_fulfilled". The last one is True when at least three
        of the four Ro5 conditions hold, i.e. at most one rule is violated.
    """
    mol = Chem.MolFromSmiles(smiles)

    # Descriptors used by Lipinski's rule of five
    properties = {
        "molecular_weight": Descriptors.ExactMolWt(mol),
        "n_hba": Descriptors.NumHAcceptors(mol),
        "n_hbd": Descriptors.NumHDonors(mol),
        "logp": Descriptors.MolLogP(mol),
    }

    # One boolean per rule; True means the rule is satisfied
    rules_satisfied = [
        properties["molecular_weight"] <= 500,
        properties["n_hba"] <= 10,
        properties["n_hbd"] <= 5,
        properties["logp"] <= 5,
    ]
    properties["ro5_fulfilled"] = sum(rules_satisfied) >= 3

    return pd.Series(properties)

# Compute the Ro5 properties for every SMILES in the table; the function
# returns a pandas.Series per molecule, giving one column per property
ro5_properties = output_df["smiles"].apply(calculate_ro5_properties)
# Append the property columns to the compound table (column-wise concatenation)
output_df = pd.concat([output_df, ro5_properties], axis=1)
# Preview the extended table
output_df.head()

In [None]:
# Split the compounds into Ro5-compliant and Ro5-violating subsets
ro5_mask = output_df["ro5_fulfilled"]
output_df_ro5_fulfilled = output_df[ro5_mask]
output_df_ro5_violated = output_df[~ro5_mask]

print(f"# compounds in unfiltered data set: {output_df.shape[0]}")
print(f"# compounds in filtered data set: {output_df_ro5_fulfilled.shape[0]}")
print(f"# compounds not compliant with the Ro5: {output_df_ro5_violated.shape[0]}")

# Write the Ro5-compliant molecules to disk
output_df_ro5_fulfilled.to_csv('CHEMBL203_Ro5_passed.csv')

In the next section, we will draw radar plots with the statistics of the key descriptors used in the Ro5. Some values will be scaled so that every Ro5 threshold maps to 5:

1. scaled MWT = MWT / 100
2. scaled HBA = HBA / 2
3. HBD
4. LogP

In [None]:
def calculate_mean_std(dataframe):
    """
    Summarize each property of a dataset by its mean and standard deviation.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Properties (columns) for a set of items (rows).

    Returns
    -------
    pd.DataFrame
        One row per property, with the columns "mean" and "std".
    """
    # describe() lists the statistical measures as rows; pick the two we need
    # and flip the table so that the properties become the rows
    summary = dataframe.describe()
    return summary.loc[["mean", "std"]].T

# Property columns summarized for the radar plots
ro5_columns = ["molecular_weight", "n_hba", "n_hbd", "logp"]

# Mean and standard deviation per property, for each of the two subsets
output_df_ro5_fulfilled_stats = calculate_mean_std(output_df_ro5_fulfilled[ro5_columns])
output_df_ro5_violated_stats = calculate_mean_std(output_df_ro5_violated[ro5_columns])

def _scale_by_thresholds(stats, thresholds, scaled_threshold):
    """
    Scale values for different properties that have each an individually defined threshold.

    Parameters
    ----------
    stats : pd.DataFrame
        Dataframe with "mean" and "std" (columns) for each physicochemical property (rows).
    thresholds : dict of str: int
        Thresholds defined for each property.
    scaled_threshold : int or float
        Scaled thresholds across all properties.

    Returns
    -------
    pd.DataFrame
        DataFrame with scaled means and standard deviations for each physiochemical property.
    """
    # Raise error if scaling keys and data_stats indicies are not matching
    for property_name in stats.index:
        if property_name not in thresholds.keys():
            raise KeyError(f"Add property '{property_name}' to scaling variable.")
    # Scale property data
    stats_scaled = stats.apply(lambda x: x / thresholds[x.name] * scaled_threshold, axis=1)
    return stats_scaled

def _define_radial_axes_angles(n_axes):
    """Define angles (radians) for radial (x-)axes depending on the number of axes."""
    x_angles = [i / float(n_axes) * 2 * math.pi for i in range(n_axes)]
    x_angles += x_angles[:1]
    return x_angles

def plot_radar(
    y,
    thresholds,
    scaled_threshold,
    properties_labels,
    y_max=None,
    output_path=None,
):
    """
    Plot a radar chart based on the mean and standard deviation of a data set's properties.

    Parameters
    ----------
    y : pd.DataFrame
        Dataframe with "mean" and "std" (columns) for each physicochemical property (rows).
    thresholds : dict of str: int
        Thresholds defined for each property.
    scaled_threshold : int or float
        Scaled thresholds across all properties.
    properties_labels : list of str
        List of property names to be used as labels in the plot, in the row order of `y`.
    y_max : None or int or float
        Set maximum y value. If None, let matplotlib decide. For a non-integer
        value, radial ticks are placed at the whole numbers below it.
    output_path : None or pathlib.Path
        If not None, save plot to file.
    """

    # Define radial x-axes angles -- uses our helper function!
    x = _define_radial_axes_angles(len(y))
    # Scale y-axis values with respect to a defined threshold -- uses our helper function!
    y = _scale_by_thresholds(y, thresholds, scaled_threshold)
    # Since our chart will be circular we append the first value of each property to the end
    y = pd.concat([y, y.head(1)])

    # Set figure and subplot axis
    plt.figure(figsize=(6, 6))
    ax = plt.subplot(111, polar=True)

    # Plot data. Every artist carries its own legend label, so the legend does
    # not depend on the order in which matplotlib enumerates lines and patches.
    ax.fill(
        x,
        [scaled_threshold] * len(x),
        "cornflowerblue",
        alpha=0.2,
        label="rule of five area",
    )
    ax.plot(x, y["mean"], "b", lw=3, ls="-", label="mean")
    ax.plot(x, y["mean"] + y["std"], "orange", lw=2, ls="--", label="mean + std")
    ax.plot(x, y["mean"] - y["std"], "orange", lw=2, ls="-.", label="mean - std")

    # From here on, we only do plot cosmetics
    # Set 0° to 12 o'clock
    ax.set_theta_offset(math.pi / 2)
    # Set clockwise rotation
    ax.set_theta_direction(-1)

    # Set y-labels next to 180° radius axis
    ax.set_rlabel_position(180)
    # Set number of radial axes' ticks and remove labels
    plt.xticks(x, [])
    # Get maximal y-ticks value
    if not y_max:
        y_max = int(ax.get_yticks()[-1])
    # Set axes limits
    plt.ylim(0, y_max)
    # Set number and labels of y axis ticks; only the ring at the scaled
    # threshold gets a label. range() needs an integer bound, so a float
    # y_max is rounded up (an integer y_max is left unchanged).
    tick_values = range(1, math.ceil(y_max))
    plt.yticks(
        tick_values,
        ["5" if i == scaled_threshold else "" for i in tick_values],
        fontsize=16,
    )

    # Draw the property labels by hand to make sure they fit properly
    # Note that we use x[:-1] to exclude the last angle, which repeats the first one
    for angle, label in zip(x[:-1], properties_labels):
        if angle == 0:
            ha = "center"
        elif 0 < angle < math.pi:
            ha = "left"
        elif angle == math.pi:
            ha = "center"
        else:
            ha = "right"
        ax.text(
            x=angle,
            y=y_max + 1,
            s=label,
            size=16,
            horizontalalignment=ha,
            verticalalignment="center",
        )

    # Add legend relative to top-left plot; entries come from the artist labels
    ax.legend(loc=(1.1, 0.7), labelspacing=0.3, fontsize=16)

    # Save plot - use bbox_inches to include text boxes
    if output_path:
        plt.savefig(output_path, dpi=300, bbox_inches="tight", transparent=True)

    plt.show()

# Ro5 threshold of each property, and the common value they are scaled to
thresholds = {"molecular_weight": 500, "n_hba": 10, "n_hbd": 5, "logp": 5}
scaled_threshold = 5
# Axis labels, in the same order as the rows of the statistics tables
properties_labels = [
    "Molecular weight (Da) / 100",
    "# HBA / 2",
    "# HBD",
    "LogP",
]
y_max = 8

# One radar plot for the Ro5-compliant compounds, then one for the violating ones
for ro5_stats in (output_df_ro5_fulfilled_stats, output_df_ro5_violated_stats):
    plot_radar(
        ro5_stats,
        thresholds,
        scaled_threshold,
        properties_labels,
        y_max,
    )

In the next step, we can filter out **Pan Assay Interference Compounds (PAINS)**.

Substructures can be unfavorable, e.g., because they are toxic or reactive, due to unfavorable pharmacokinetic properties, or because they likely interfere with certain assays. Nowadays, drug discovery campaigns often involve high throughput screening. Filtering unwanted substructures can support assembling more efficient screening libraries, which can save time and resources.

PAINS are compounds that often occur as hits in HTS even though they actually are false positives. PAINS show activity at numerous targets rather than one specific target. Such behavior results from unspecific binding or interaction with assay components. Baell et al. (J. Med. Chem. (2010), 53, 2719-2740) focused on substructures interfering in assay signaling. They described substructures which can help to identify such PAINS and provided a list which can be used for substructure filtering.

In [None]:
# Load the Ro5-compliant compounds saved earlier
output_df_ro5_fulfilled = pd.read_csv('CHEMBL203_Ro5_passed.csv')

# Build RDKit molecules from the SMILES column (stored in a new "ROMol" column)
PandasTools.AddMoleculeColumnToFrame(output_df_ro5_fulfilled, smilesCol="smiles")

# Draw the first 3 molecules, labelled with their ChEMBL IDs
preview = output_df_ro5_fulfilled.head(3)
Draw.MolsToGridImage(
    list(preview.ROMol),
    legends=list(preview.molecule_chembl_id),
)

In [None]:
# Build an RDKit filter catalog from the PAINS catalog
params = FilterCatalogParams()
params.AddCatalog(FilterCatalogParams.FilterCatalogs.PAINS)
catalog = FilterCatalog(params)

# Screen every molecule: store the first PAINS match if there is one,
# otherwise remember the row index of the molecule
matches = []
clean = []
n_molecules = output_df_ro5_fulfilled.shape[0]
for index, row in tqdm(output_df_ro5_fulfilled.iterrows(), total=n_molecules):
    molecule = row.ROMol
    entry = catalog.GetFirstMatch(molecule)
    if entry is None:
        # no PAINS pattern found: keep this row
        clean.append(index)
        continue
    matches.append(
        {
            "chembl_id": row.molecule_chembl_id,
            "rdkit_molecule": molecule,
            "pains": entry.GetDescription().capitalize(),
        }
    )

matches = pd.DataFrame(matches)
# Keep only the molecules without a PAINS match
output_df_ro5_pains = output_df_ro5_fulfilled.loc[clean]

print(f"# compounds in unfiltered data set: {output_df_ro5_fulfilled.shape[0]}")
print(f"# compounds in filtered data set: {output_df_ro5_pains.shape[0]}")

The specific structural patterns corresponding to these PAINS fragments can be found in https://pubs.acs.org/doi/suppl/10.1021/jm901137j/suppl_file/jm901137j_si_001.pdf

In [None]:
# Draw the first three molecules with a PAINS match; the legend of each
# molecule is the description of the matched PAINS pattern
pains_preview = matches.head(3)
Draw.MolsToGridImage(
    list(pains_preview.rdkit_molecule),
    legends=list(pains_preview["pains"]),
)

In [None]:
# Renumber the rows of the PAINS-filtered table from zero
output_df_ro5_pains = output_df_ro5_pains.reset_index(drop=True)

# Morgan fingerprint generator with a fingerprint size of 1024
mfpgen = rdFingerprintGenerator.GetMorganGenerator(fpSize=1024)

# One fingerprint per molecule, stored in the new "FP" column
output_df_ro5_pains['FP'] = output_df_ro5_pains['ROMol'].apply(
    lambda mol: mfpgen.GetFingerprint(mol)
)

output_df_ro5_pains.head(3)

The UMAP algorithm has several hyperparameters that give the user a bit more control over the structure of the final embedding:

1. **metric** is the metric used to determine distance between points. Because we are comparing Morgan FPs, we use Jaccard distance (typically referred to as Tanimoto distance in cheminformatics).
2. **n_neighbors** determines the prioritization of local versus global structure in the embedding. This value constrains the number of neighbors that a given compound has in the graph representation of the dataset. If n_neighbors is small then the embedding focuses on optimizing the distances between similar compounds to ensure the small differences between them are well represented. If n_neighbors is larger, then the distances between less similar compounds is prioritized.
3. **min_dist** is the minimum distance between any two points. This affects the tightness of the embedding. The larger min_dist, the more spread out the compounds will be.

More about UMAP, other approaches (t-SNE, PCA), and the choice of hyperparameters: https://blog.reverielabs.com/mapping-chemical-space-with-umap/

Also, check TMAP for projecting extra-large datasets, https://tmap.gdb.tools/

TMAP utilizes MHFP6, a fast and efficient molecular fingerprint, https://github.com/reymond-group/mhfp

In [None]:
# UMAP needs a numerical matrix, so every bit vector is converted to a numpy
# array of 0/1 values (one row per molecule, one column per fingerprint bit).
# This replaces lists of the characters '0'/'1', which only worked through an
# implicit string-to-number conversion.
fingerprints = []
for fp in output_df_ro5_pains['FP']:
    bits = np.zeros((0,), dtype=np.int8)
    # ConvertToNumpyArray resizes `bits` to the fingerprint length and fills it
    DataStructs.ConvertToNumpyArray(fp, bits)
    fingerprints.append(bits)
fingerprints = np.array(fingerprints)

# run umap
mapper = umap.UMAP(n_neighbors=20, min_dist=0.5, n_components=2, metric='jaccard')
umap_result = mapper.fit_transform(fingerprints)

# scatter plot of the 2D embedding, colored by pIC50
plt.scatter(umap_result[:, 0], umap_result[:, 1], c=output_df_ro5_pains.pIC50, cmap='viridis', s=2)
plt.colorbar()
plt.title('UMAP Projection of Molecules')
plt.xlabel('UMAP 1')
plt.ylabel('UMAP 2')
plt.show()

Clustering is often essential to obtain a short list of representative compounds, e.g., when we have a limited amount of resources and can experimentally test only a few compounds, picked from the results of a virtual screening campaign, in a confirmatory assay.

Butina clustering (J. Chem. Inf. Model. (1999), 39 (4), 747) was developed to identify smaller but homogeneous clusters, with the prerequisite that (at least) the cluster centroid will be more similar than a given **threshold** to every other molecule in the cluster.

The higher the threshold (distance cutoff), the more molecules are considered similar and, therefore, grouped into fewer clusters. The lower the threshold, the more small clusters and “singletons” appear.

It makes sense to check different threshold values. However, 0.2-0.5 typically work well depending on the diversity of a given dataset.

In [None]:
# Use a plain list of fingerprints: integer indexing and slicing of a pandas
# Series goes through its index labels, which only matches positions while the
# index is exactly 0..n-1. A list is always positional.
fps = list(output_df_ro5_pains['FP'])
num_fps = len(fps)

# Flattened lower triangle of the distance matrix (distance = 1 - Tanimoto
# similarity): fingerprint i is compared with all fingerprints before it
dist_matrix = []
for i in range(1, num_fps):
    similarities = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
    dist_matrix.extend(1 - x for x in similarities)

# cluster
clusters = Butina.ClusterData(dist_matrix, num_fps, 0.6, isDistData=True) # distance cutoff = 0.6
print("number of clusters =", len(clusters))
num_clust_g5 = len([c for c in clusters if len(c) > 5])
print("number of clusters with more than 5 compounds =", num_clust_g5)

In [None]:
# Order the clusters from largest to smallest
clusters = sorted(clusters, key=len, reverse=True)
cluster_sizes = [len(cluster) for cluster in clusters]

# Short report: number of singletons and of clusters above a few size limits
num_clust_g1 = cluster_sizes.count(1)
num_clust_g5 = sum(size > 5 for size in cluster_sizes)
num_clust_g25 = sum(size > 25 for size in cluster_sizes)
num_clust_g100 = sum(size > 100 for size in cluster_sizes)

print("# clusters with only 1 compound: ", num_clust_g1)
print("# clusters with >5 compounds: ", num_clust_g5)
print("# clusters with >25 compounds: ", num_clust_g25)
print("# clusters with >100 compounds: ", num_clust_g100)

# Bar chart of the cluster sizes, numbered from 1 in sorted order
fig, ax = plt.subplots(figsize=(15, 4))
ax.set_xlabel("Cluster index")
ax.set_ylabel("Number of molecules")
ax.bar(range(1, len(cluster_sizes) + 1), cluster_sizes, lw=5)
plt.show()

In [None]:
# Take the first member of every cluster as its representative
# (RDKit's Butina documentation describes it as the cluster centroid -- confirm)
cluster_centers = [cluster[0] for cluster in clusters]
# The cluster members are row positions, hence the positional .iloc lookup
output_df_ro5_pains_clusters = output_df_ro5_pains.iloc[cluster_centers]
output_df_ro5_pains_clusters.head(3)