**Connect to ChEMBL database**
*   First, the ChEMBL web resource client as well as other Python libraries are imported.



In [None]:
import math
from pathlib import Path
from zipfile import ZipFile
from tempfile import TemporaryDirectory

In [None]:
!pip install "numpy==1.26.4" "pandas==2.1.4"
!pip install rdkit-pypi
#!pip install rdkit-pypi
import numpy as np
import pandas as pd

In [None]:
from rdkit import Chem
from rdkit.Chem import PandasTools
from tqdm.auto import tqdm

In [None]:
!pip install chembl_webresource_client

In [None]:
from chembl_webresource_client.new_client import new_client

In [None]:
# Base folder taken from the last entry of `_dh`, and the "data" folder beneath it
# NOTE(review): `_dh` is supplied by the IPython session, not defined in this notebook
HERE = Path(_dh[-1])
DATA = HERE.joinpath('data')

In [None]:
# Handles to the ChEMBL client resources used below:
# targets, molecules (compounds) and activities (bioactivities)
targets_api = new_client.target
compounds_api = new_client.molecule
bioactivities_api = new_client.activity

In [None]:
# Show which class the client returns for a resource handle
type(targets_api)

**Get target data (EGFR kinase)**


*   Get UniProt ID of the target of interest (EGFR kinase: P00533) from UniProt website
*   Use UniProt ID to get target information
*   Select a different UniProt ID, if you are interested in another target.

In [None]:
# UniProt accession used for the ChEMBL target lookup.
# Alternative accessions are left commented out; enable exactly one assignment.
#uniprot_id = "P00533"
#uniprot_id = "W8T1G8"
uniprot_id = "V5TFZ2"

In [None]:
# Fetch target data from ChEMBL for the chosen UniProt accession,
# restricting the returned information to the listed fields only
target_fields = ['target_chembl_id', 'organism', 'pref_name', 'target_type']
targets = targets_api.get(target_components__accession=uniprot_id).only(target_fields)


In [None]:
# Report the Python type of the query result
targets_type = type(targets)
print(f'The type of the targets is "{targets_type}"')

In [None]:
# Download target data from ChEMBL and tabulate the records
# NOTE: `targets` is rebound here from the query result to a DataFrame
targets = pd.DataFrame.from_records(targets)
targets

**Select target (target ChEMBL ID)**

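*   After checking the entries, we select the first entry as our target of interest.
*   For the EGFR example (UniProt ID P00533), this is CHEMBL203: a single protein that represents the human epidermal growth factor receptor (EGFR, also named erbB1). For any other UniProt ID, check the table above to confirm that the first entry is the intended target.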





In [None]:
# Select target (target ChEMBL ID)
# Fail early with a readable message when ChEMBL returned no target for the
# accession, instead of an opaque IndexError from iloc.
if targets.empty:
    raise ValueError(f"No ChEMBL target found for UniProt ID {uniprot_id!r}")
target = targets.iloc[0]   # first row of the targets table
target

**Get bioactivity data**
*   Now, we want to query bioactivity data for the target of interest.

**Fetch bioactivity data for the target from ChEMBL**
*   In this step, we fetch the bioactivity data and filter it to only consider:
    *   human proteins (implied by the selected target; the query itself has no organism filter),
    *   bioactivity type IC50,
    *   exact measurements (relation '='), and
    *   binding data (assay type 'B').



In [None]:
# ChEMBL ID of the selected target; used to query its bioactivities
chembl_id = target["target_chembl_id"]
print(f"The target ChEMBL ID is {chembl_id}")
# NBVAL_CHECK_OUTPUT

In [None]:
# Query criteria: IC50 values with relation "=" from assays of type "B"
# measured on the selected target
activity_filters = {
    "target_chembl_id": chembl_id,
    "type": "IC50",
    "relation": "=",
    "assay_type": "B",
}

# Fields requested for every bioactivity record
activity_fields = (
    "activity_id",
    "assay_chembl_id",
    "assay_description",
    "assay_type",
    "molecule_chembl_id",
    "type",
    "standard_units",
    "relation",
    "standard_value",
    "target_chembl_id",
    "target_organism",
)

bioactivities = bioactivities_api.filter(**activity_filters).only(*activity_fields)

print(f"Length and type of bioactivities object: {len(bioactivities)}, {type(bioactivities)}")

In [None]:
# Inspect a single record: number of fields, Python type, and content
first_activity = bioactivities[0]
print(f"Length and type of first element: {len(first_activity)}, {type(first_activity)}")
first_activity

In [None]:
# Download bioactivity data from ChEMBL and load the records into a DataFrame
bioactivities_df = pd.DataFrame.from_dict(bioactivities)
print(f"DataFrame shape: {bioactivities_df.shape}")
bioactivities_df.head()

In [None]:
# "units" was not among the fields requested in the query above;
# list its distinct values before the column is dropped in the next cell
bioactivities_df["units"].unique()

In [None]:
# Remove the "units" and "value" columns; "standard_units" and "standard_value" are kept
bioactivities_df.drop(columns=["units", "value"], inplace=True)
bioactivities_df.head()

**Preprocess and filter bioactivity data**

*   Convert standard_value’s datatype from object to float
*   Delete entries with missing values
*   Keep only entries with `standard_units == nM`
*   Delete duplicate molecules
*   Reset DataFrame index
*   Rename columns

In [None]:
# Inspect the column dtypes before converting "standard_value" from "object" to "float"
bioactivities_df.dtypes

In [None]:
# Cast the measured values to float64 and confirm the resulting dtypes
dtype_by_column = {"standard_value": "float64"}
bioactivities_df = bioactivities_df.astype(dtype_by_column)
bioactivities_df.dtypes

In [None]:
# Delete entries with missing values
# (dropna defaults: row-wise, a row is removed if any of its fields is missing)
bioactivities_df.dropna(inplace=True)
print(f"DataFrame shape: {bioactivities_df.shape}")

In [None]:
# Keep only entries with "standard_units == nM":
# first report which units occur and how many rows are not in nM
units_present = bioactivities_df["standard_units"].unique()
non_nm_rows = bioactivities_df[bioactivities_df["standard_units"] != "nM"]
print(f"Units in downloaded data: {units_present}")
print(f"Number of non-nM entries:    {len(non_nm_rows)}")

In [None]:
# Retain only the rows whose unit is nM, then confirm the remaining units
is_nm = bioactivities_df["standard_units"] == "nM"
bioactivities_df = bioactivities_df[is_nm]
print(f"Units after filtering: {bioactivities_df['standard_units'].unique()}")

In [None]:
# Rows and columns remaining after the unit filter
print(f"DataFrame shape: {bioactivities_df.shape}")

In [None]:
# Delete duplicate molecules: keep only the first record per molecule ID
# (keeping the first occurrence is the drop_duplicates default)
bioactivities_df.drop_duplicates(subset="molecule_chembl_id", inplace=True)
print(f"DataFrame shape: {bioactivities_df.shape}")

In [None]:
# Reset the DataFrame index to 0..n-1; drop=True discards the old index
# instead of keeping it as a column
bioactivities_df.reset_index(drop=True, inplace=True)
bioactivities_df.head()

In [None]:
# Rename columns: standard_value -> IC50, standard_units -> units
column_renames = {"standard_value": "IC50", "standard_units": "units"}
bioactivities_df.rename(columns=column_renames, inplace=True)
bioactivities_df.head()

In [None]:
# Final size of the filtered bioactivity table
print(f"DataFrame shape: {bioactivities_df.shape}")

In [None]:
# Fetch compound data from ChEMBL:
# query every molecule left in the bioactivity table, requesting only
# its ID and its structure record
molecule_ids = bioactivities_df["molecule_chembl_id"].tolist()
compounds_provider = compounds_api.filter(molecule_chembl_id__in=molecule_ids).only(
    "molecule_chembl_id", "molecule_structures"
)

In [None]:
# Download compound data from ChEMBL; tqdm displays progress while the
# query result is iterated into a list
compounds = list(tqdm(compounds_provider))

In [None]:
# Tabulate the downloaded compound records
compounds_df = pd.DataFrame.from_records(compounds)
print(f"DataFrame shape: {compounds_df.shape}")

In [None]:
# Preview the first compound records
compounds_df.head()

In [None]:
# Remove entries with missing molecule structure entry
# (dropna defaults: row-wise, a row is removed if any of its fields is missing)
compounds_df.dropna(inplace=True)
print(f"DataFrame shape: {compounds_df.shape}")

In [None]:
# Delete duplicate molecules: keep only the first record per molecule ID
# (keeping the first occurrence is the drop_duplicates default)
compounds_df.drop_duplicates(subset="molecule_chembl_id", inplace=True)
print(f"DataFrame shape: {compounds_df.shape}")

In [None]:
# Get molecules with canonical SMILES:
# list which representations the first structure record offers
first_structures = compounds_df["molecule_structures"].iloc[0]
first_structures.keys()

In [None]:
# Pull the canonical SMILES string out of each "molecule_structures" record.
# Records that lack the key, or that are not dicts at all, yield None so that
# they can be removed by the dropna step that follows.
# The column is iterated directly (not row by row), so the `compounds` list
# downloaded earlier is not overwritten by a loop variable and the cell that
# builds compounds_df from it stays re-runnable.
compounds_df["smiles"] = [
    structures.get("canonical_smiles") if isinstance(structures, dict) else None
    for structures in compounds_df["molecule_structures"]
]
compounds_df.drop("molecule_structures", axis=1, inplace=True)
print(f"DataFrame shape: {compounds_df.shape}")

In [None]:
# Discard compounds for which no value is available in every column
# (dropna defaults: row-wise, a row is removed if any of its fields is missing)
compounds_df.dropna(inplace=True)
print(f"DataFrame shape: {compounds_df.shape}")

In [None]:
# Summary of compound and bioactivity data:
# number of bioactivity rows left after filtering, and the available columns
print(f"Bioactivities filtered: {bioactivities_df.shape[0]}")
bioactivities_df.columns

In [None]:
# Number of compound rows left after filtering, and the available columns
print(f"Compounds filtered: {compounds_df.shape[0]}")
compounds_df.columns

**Merge both datasets**

---


Merge values of interest from `bioactivities_df` and `compounds_df` in an `output_df`
based on the compounds’ ChEMBL IDs (`molecule_chembl_id`), keeping the following columns:

* ChEMBL IDs: molecule_chembl_id
* SMILES: smiles
* units: units
* IC50: IC50

In [None]:
# Merge DataFrames: join the selected bioactivity columns with the compound
# table on the shared molecule ID (default join type of merge)
activity_columns = ["molecule_chembl_id", "IC50", "units"]
output_df = bioactivities_df[activity_columns].merge(compounds_df, on="molecule_chembl_id")

# Reset row indices
output_df.reset_index(drop=True, inplace=True)

print(f"Dataset with {output_df.shape[0]} entries.")

In [None]:
# Check the column dtypes of the merged table
output_df.dtypes

In [None]:
# Preview the first ten merged rows
output_df.head(10)

In [None]:
# Add pIC50 values
def convert_ic50_to_pic50(IC50_value):
    """
    Convert an IC50 value given in nM to its pIC50 value.

    pIC50 = -log10(IC50 in M) = 9 - log10(IC50 in nM).

    Parameters
    ----------
    IC50_value : float
        IC50 value in nanomolar; must be positive.

    Returns
    -------
    float
        The pIC50 value. A NaN input yields NaN.

    Raises
    ------
    ValueError
        If IC50_value is zero or negative, where the logarithm is undefined.
    """
    # Explicit check so the error names the offending value instead of the
    # generic message raised by math.log10.
    if IC50_value <= 0:
        raise ValueError(
            f"IC50 must be a positive concentration in nM, got {IC50_value!r}"
        )
    pIC50_value = 9 - math.log10(IC50_value)
    return pIC50_value

In [None]:
# Apply the conversion to every IC50 value of the merged DataFrame.
# Only one column is needed, so map over that column directly instead of
# building a row object for every entry with DataFrame.apply(axis=1).
output_df["pIC50"] = output_df["IC50"].apply(convert_ic50_to_pic50)

In [None]:
# Preview the table with the new pIC50 column
output_df.head()

**Draw compound data**
*   Let’s have a look at our collected data set.
*   First, we plot the pIC50 value distribution

In [None]:
# Histogram of the pIC50 values
output_df.hist(column="pIC50")

In [None]:
# Add molecule column built from the "smiles" column
# (the resulting "ROMol" column is dropped again before saving)
PandasTools.AddMoleculeColumnToFrame(output_df, smilesCol="smiles")

In [None]:
# Sort molecules by descending pIC50 and renumber the rows from zero
output_df = output_df.sort_values(by="pIC50", ascending=False).reset_index(drop=True)

In [None]:
# Show the three top-ranked rows without the "smiles" column
output_df.drop(columns="smiles").head(3)

In [None]:
# Prepare saving the dataset: remove the "ROMol" molecule column
output_df = output_df.drop(columns="ROMol")
print(f"DataFrame shape: {output_df.shape}")

In [None]:
# Save the final dataset as CSV.
# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing (otherwise a fresh run fails here).
output_path = Path("/content/Denv-2_NS5/NS5_compounds.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
output_df.to_csv(output_path)
output_df.head()

In [None]:
# Size of the saved dataset
print(f"DataFrame shape: {output_df.shape}")
# NBVAL_CHECK_OUTPUT