In [None]:
!pip install requests
!pip install pandas
!pip install numpy




# Downloading all the CHEMBL Molecules

In [None]:
import requests
import pandas as pd

BASE_URL = "https://www.ebi.ac.uk"

def get_all_records(endpoint, max_records=5000):
    url = f"https://www.ebi.ac.uk/chembl/api/data/{endpoint}.json?limit=1000"
    all_data = []

    while url and len(all_data) < max_records:
        res = requests.get(url)
        res.raise_for_status()
        data = res.json()
        all_data.extend(data[endpoint + "s"])  # e.g., 'activities', 'molecules'
        #url = data['page_meta']['next']  # next page link

        next_url = data['page_meta']['next']
        if next_url:
            # Fix: add base if it's a relative URL
            if next_url.startswith('/'):
                next_url = BASE_URL + next_url
        url = next_url

    return pd.DataFrame(all_data)

df = get_all_records("molecule", max_records=2000)
print(df.head())

In [20]:
df['molecule_properties'][0]

{'alogp': '2.11',
 'aromatic_rings': 3,
 'full_molformula': 'C17H12ClN3O3',
 'full_mwt': '341.75',
 'hba': 5,
 'hbd': 1,
 'heavy_atoms': 24,
 'mw_freebase': '341.75',
 'np_likeness_score': '-1.56',
 'num_ro5_violations': 0,
 'psa': '84.82',
 'qed_weighted': '0.74',
 'ro3_pass': 'N',
 'rtb': 3}

In [10]:
df["molecule_structures"] #.apply(lambda x: x.get("canonical_smiles") if pd.notnull(x) else None).head()

0       {'canonical_smiles': 'Cc1cc(-n2ncc(=O)[nH]c2=O...
1       {'canonical_smiles': 'Cc1cc(-n2ncc(=O)[nH]c2=O...
2       {'canonical_smiles': 'Cc1cc(-n2ncc(=O)[nH]c2=O...
3       {'canonical_smiles': 'Cc1ccc(C(=O)c2ccc(-n3ncc...
4       {'canonical_smiles': 'Cc1cc(-n2ncc(=O)[nH]c2=O...
                              ...                        
1995    {'canonical_smiles': 'Cc1cc(C)nc(NS(=O)(=O)c2c...
1996    {'canonical_smiles': 'N=C(N)NCCC[C@H](NC(=O)c1...
1997    {'canonical_smiles': 'Oc1ccc2c(c1CCCNc1ccccc1)...
1998    {'canonical_smiles': 'COc1ccc2nc(N)n3nc(-c4ccc...
1999    {'canonical_smiles': 'CCCCc1nc2ccccc2n1Cc1ccc(...
Name: molecule_structures, Length: 2000, dtype: object

# binding_free_energy_kcal_per_mol from CHEMBL

In [None]:
BASE = "https://www.ebi.ac.uk/chembl/api/data/activity.json"

response = requests.get(BASE)
data = response.json()
print(data.keys())
df_drug_properties = pd.DataFrame(data["activities"])
print(df_drug_properties[["molecule_chembl_id", "standard_type", "standard_value", "standard_units"]].head())


dict_keys(['activities', 'page_meta'])
  molecule_chembl_id standard_type standard_value standard_units
0       CHEMBL113081          IC50       100000.0             nM
1       CHEMBL324340          IC50         2500.0             nM
2       CHEMBL324340          IC50        50000.0             nM
3       CHEMBL109600          IC50         9000.0             nM
4       CHEMBL109600          IC50           None             nM


In [36]:
df_drug_properties["standard_value"] = pd.to_numeric(df_drug_properties["standard_value"], errors='coerce')

In [37]:
import numpy as np
df_drug_properties["binding_free_energy_kcal_per_mol"] = 0.593 * np.log(df_drug_properties["standard_value"])
print(df_drug_properties["binding_free_energy_kcal_per_mol"].describe())

count    17.000000
mean      4.573023
std       2.106175
min       0.651477
25%       2.462500
50%       5.158812
75%       5.980885
max       8.192598
Name: binding_free_energy_kcal_per_mol, dtype: float64


  result = getattr(ufunc, method)(*inputs, **kwargs)
