In [1]:
import pandas as pd

In [2]:
def get_data(url, timeout=60):
    """Download a ZIP archive and extract it into the current working directory.

    The archive is held in memory only; its members are listed on stdout and
    then written next to the notebook.

    Parameters
    ----------
    url : str
        Address of the ZIP archive to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 60). Without
        a timeout a stalled connection would block the notebook indefinitely.

    Returns
    -------
    str
        Name of the last member listed in the archive (for a single-file
        archive this is simply that file's name).

    Raises
    ------
    RuntimeError
        If the server responds with a status other than 200, or if the
        archive contains no members.
    """
    import requests
    import zipfile
    import io

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        # Fail loudly here instead of reaching the return statement with no
        # file name to report.
        raise RuntimeError(
            f"Failed to download file. Response: {response.status_code}"
        )

    # Reading zip file from requests response; the context manager closes it
    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
        # Printing file content in zip
        list_of_files = zip_file.namelist()
        print("ZIP file content:")
        for file_name in list_of_files:
            print(file_name)
        if not list_of_files:
            raise RuntimeError(f"ZIP archive downloaded from {url} is empty.")
        # Extracting file from zip
        zip_file.extractall()
    print("Successfully extracted zip file.")
    return list_of_files[-1]

In [5]:
# Download links for the ChEMBL CSV exports; each one points at a ZIP archive.
# Every link repeats its job token twice, so only the tokens are listed here.
# TODO: the urls need to be updated
urls = [
    f"https://www.ebi.ac.uk/chembl/interface_api/delayed_jobs/outputs/{job}/{job}.zip"
    for job in (
        "DOWNLOAD-TyK57H3Jm1mAnrOx9bQFst7sKvrpnaFiECEPv9KxldU=",
        "DOWNLOAD-m3ajtVTYhqY3Lf7BZNMh8CBOrmJABV8KGN0k2SAwvdY=",
        "DOWNLOAD-_1WCrliOUA5F3JVss-saOqsaJ8-pwb9NLfgygYmLUsI=",
        "DOWNLOAD-tRULel1nwmAFUjCWFEudZranAWp4kggn6bgeZyY5avw=",
        "DOWNLOAD-8Y8JMmUmdxLExJ8oxA-wt5atjTHH3HJ8uRKtlObEt0M=",
        "DOWNLOAD-keO7vY15kfNAm1u_QtGBb8oW8rc_qkmarRvRwnhbyQg=",
    )
]

In [6]:
# Download and extract each archive in turn, collecting the file name that
# get_data reports for it
file_names = [get_data(url) for url in urls]

ZIP file content:
DOWNLOAD-keO7vY15kfNAm1u_QtGBb8oW8rc_qkmarRvRwnhbyQg=.csv
Successfully extracted zip file.


In [68]:
# Alternative to downloading: rebuild `file_names` from CSV files that are
# already present in the current directory.
# If you ran the download block above, just skip this block.
from pathlib import Path

# Keep only regular .csv files whose name contains 'DOWNLOAD', and sort them.
# iterdir() yields entries in arbitrary order, and process_data() keeps the
# first occurrence of each Smiles, so an unsorted list would make the
# aggregated result depend on the file system.
file_names = sorted(
    entry.name
    for entry in Path('./').iterdir()
    if 'DOWNLOAD' in entry.name and entry.suffix == '.csv' and entry.is_file()
)

In [56]:
def process_data(files):
    """Aggregate ChEMBL CSV exports into one frame of pIC50 values.

    For every file, rows without a Smiles or Standard Value are dropped, the
    three columns of interest are kept, and the Standard Value is converted
    to pIC50. The per-file results are concatenated and de-duplicated on
    Smiles, keeping the first occurrence (so the order of ``files`` matters).

    Parameters
    ----------
    files : iterable of str or path-like
        Semicolon-separated CSV files containing at least the columns
        'Molecule ChEMBL ID', 'Smiles' and 'Standard Value'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Molecule ChEMBL ID', 'Smiles', 'Standard Value', where
        'Standard Value' now holds pIC50. Index labels are the row positions
        from each source file and may therefore repeat across files. An empty
        frame with the same columns is returned when ``files`` is empty.
    """
    import numpy as np

    # only use chembl ID, smile, and standard value columns
    columns = ['Molecule ChEMBL ID', 'Smiles', 'Standard Value']
    frames = []
    for file in files:
        data = pd.read_csv(file, sep=';')
        # drop rows if Smiles or Standard Value is null
        cleaned = data.dropna(subset=['Smiles', 'Standard Value'])
        # pIC50 is undefined for non-positive IC50 (log10 would yield inf or
        # NaN), so those rows are discarded as well. The explicit copy keeps
        # the assignment below from writing into a view of `data`.
        cleaned = cleaned.loc[cleaned['Standard Value'] > 0, columns].copy()
        # convert IC50 to pIC50: scale nM to M, then take -log10
        # NOTE(review): assumes every Standard Value is in nM; the
        # 'Standard Units' column is not checked - confirm for new exports.
        cleaned['Standard Value'] = -np.log10(cleaned['Standard Value'] * 10**-9)
        frames.append(cleaned)

    if not frames:
        return pd.DataFrame({name: [] for name in columns})

    # join data once, after the loop, instead of growing a frame per file
    data_main = pd.concat(frames)
    # drop duplicates according to Smiles column
    return data_main.drop_duplicates(subset=['Smiles'])
        

In [57]:
# Aggregate every CSV listed in file_names and store the processed result in df_main
df_main = process_data(file_names)

In [58]:
# Display the aggregated frame (ChEMBL ID, Smiles, converted Standard Value)
df_main

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Value
0,CHEMBL3925236,CNC(=O)c1ccc(OC)c(-c2cc3c(n2C(C)C)C(c2ccc(Cl)c...,6.321027
1,CHEMBL3906706,CNC(=O)c1ccc(OC)c(-c2cc3c(n2C(C)C)C(c2ccc(Cl)c...,5.624208
2,CHEMBL3775808,O=C(Nc1ccccc1S)c1ccc(Cl)cc1,5.468521
3,CHEMBL3962130,COc1ccncc1-c1nc2c(n1C(C)C)C(c1ccc(Cl)cc1)N(c1c...,4.852633
4,CHEMBL3775883,CC(C)C(=O)Nc1ccccc1OC(=O)C(C)C,4.602060
...,...,...,...
1611,CHEMBL5191434,CCC[C@H]1N(C(=O)c2cnccc2C)CCC[C@@]1(Oc1csc(C(F...,7.468521
1614,CHEMBL5191746,CCC[C@H]1N(C(=O)c2cnccc2C(F)(F)F)CCC[C@@]1(Oc1...,7.522879
1615,CHEMBL5179820,CCC[C@H]1N(C(=O)c2c(C(F)(F)F)ccnc2O)CCC[C@@]1(...,7.958607
1616,CHEMBL5207199,CCC[C@H]1N(C(=O)c2c(C(F)(F)F)ccnc2OCC(=O)O)CCC...,7.602060


In [60]:
# Summary statistics of the numeric column (the converted Standard Value)
df_main.describe()

Unnamed: 0,Standard Value
count,3465.0
mean,7.014044
std,1.86884
min,2.273925
25%,5.395137
50%,7.056011
75%,8.69897
max,10.39794


In [62]:
# Missing-value check: summarise the boolean isna() mask per column.
# A single unique value of False in every column means nothing is missing.
df_main.isna().describe()

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Value
count,3465,3465,3465
unique,1,1,1
top,False,False,False
freq,3465,3465,3465


In [63]:
# Save df_main to data_train.csv in the current directory.
# The index is written as the first column of the file.
df_main.to_csv('data_train.csv')

In [66]:
# Reload the saved file, using its first column as the index
data = pd.read_csv('data_train.csv', index_col=0)

In [67]:
# Preview the first rows of the reloaded data
data.head()

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Value
0,CHEMBL3925236,CNC(=O)c1ccc(OC)c(-c2cc3c(n2C(C)C)C(c2ccc(Cl)c...,6.321027
1,CHEMBL3906706,CNC(=O)c1ccc(OC)c(-c2cc3c(n2C(C)C)C(c2ccc(Cl)c...,5.624208
2,CHEMBL3775808,O=C(Nc1ccccc1S)c1ccc(Cl)cc1,5.468521
3,CHEMBL3962130,COc1ccncc1-c1nc2c(n1C(C)C)C(c1ccc(Cl)cc1)N(c1c...,4.852633
4,CHEMBL3775883,CC(C)C(=O)Nc1ccccc1OC(=O)C(C)C,4.60206
