## 1. Installing libraries

In [5]:
! pip install chembl_webresource_client



## 2. Importing libraries

In [6]:
import pandas as pd
from chembl_webresource_client.new_client import new_client

## 3. Search for target protein (COVID19)

### 3.1 Target search for Coronavirus

In [7]:
# Run a free-text search against the ChEMBL target endpoint and tabulate the hits.
# NOTE(review): the hits recorded for 'covid-19' look like matches on "19"
# (cell lines, MMP-19, ...) rather than coronavirus proteins — confirm that this
# search term is the intended one.
target = new_client.target
target_query = target.search('covid-19')
targets = pd.DataFrame(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,"[{'xref_id': 'Q712U5', 'xref_name': None, 'xre...",Rattus norvegicus,cyclic AMP phosphoprotein,15.0,False,CHEMBL2170,"[{'accession': 'Q712U5', 'component_descriptio...",SINGLE PROTEIN,10116
1,[],Homo sapiens,KU-19-19,15.0,False,CHEMBL1075483,[],CELL-LINE,9606
2,[],Escherichia coli,Metallo-beta-lactamase VIM-19,15.0,False,CHEMBL3309038,"[{'accession': 'D2D9J0', 'component_descriptio...",SINGLE PROTEIN,562
3,[],Homo sapiens,Ubiquitin carboxyl-terminal hydrolase 19,14.0,False,CHEMBL4523156,"[{'accession': 'O94966', 'component_descriptio...",SINGLE PROTEIN,9606
4,[],Homo sapiens,SNB-19,13.0,False,CHEMBL614164,[],CELL-LINE,9606
5,[],Homo sapiens,HOP-19,13.0,False,CHEMBL614832,[],CELL-LINE,9606
6,"[{'xref_id': 'A8QUY6', 'xref_name': None, 'xre...",Aeromonas caviae,IMP-19,13.0,False,CHEMBL5438,"[{'accession': 'A8QUY6', 'component_descriptio...",SINGLE PROTEIN,648
7,[],Homo sapiens,EFM-19,13.0,False,CHEMBL1075439,[],CELL-LINE,9606
8,[],Homo sapiens,Matrix metalloproteinase-19,13.0,False,CHEMBL1938214,"[{'accession': 'Q99542', 'component_descriptio...",SINGLE PROTEIN,9606
9,[],Homo sapiens,ARPE-19,13.0,False,CHEMBL4296399,[],CELL-LINE,9606


In [8]:
# Pin the target by its ChEMBL ID rather than by row position in the search
# results. A positional lookup depends on how many hits come back and in what
# order; the table displayed for the search lists only rows 0-9, so position 15
# cannot be reproduced from it. 'CHEMBL1824' is the value this cell returned
# when the notebook was recorded.
selected_target = 'CHEMBL1824'
selected_target

'CHEMBL1824'

In [9]:
# Query the ChEMBL activity endpoint for the selected target, restricted to
# records whose standard_type is IC50.
activity = new_client.activity
res = activity.filter(
    target_chembl_id=selected_target,
    standard_type="IC50",
)

In [96]:
# Materialise the activity query result as a DataFrame (one row per record).
df1 = pd.DataFrame(res)
df1

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,32264,[],CHEMBL845865,Inhibition of autophosphorylation of human Her...,F,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,uM,UO_0000065,,0.3
1,,32266,[],CHEMBL615491,Inhibition of ligand induced proliferation in ...,F,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,uM,UO_0000065,,2.5
2,,32271,[],CHEMBL683802,Inhibition of autophosphorylation of human Her...,F,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,uM,UO_0000065,,0.4
3,,32273,[],CHEMBL615491,Inhibition of ligand induced proliferation in ...,F,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,uM,UO_0000065,,1.21
4,,47937,[],CHEMBL683802,Inhibition of autophosphorylation of human Her...,F,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,uM,UO_0000065,,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3940,,22950598,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4766165,Inhibition of human recombinant ERBB2 G778D mu...,B,P04626,G778D,BAO_0000190,BAO_0000357,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,uM,UO_0000065,,0.165
3941,,22981501,[],CHEMBL4773005,Inhibition of HER2 (unknown origin),B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,uM,UO_0000065,,2.0
3942,,23236818,[],CHEMBL4828269,Inhibition of ERBB2 (unknown origin) by mobili...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,nM,UO_0000065,,7491.0
3943,,23285156,[],CHEMBL4838201,Inhibition of ERBB2 (unknown origin) by ELISA,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,nM,UO_0000065,,1000.0


In [98]:
# Persist the unfiltered bioactivity records; the index is not written.
df1.to_csv(
    path_or_buf='COVID19_01_bioactivity_data_raw.csv',
    index=False,
)

## 4. Handling missing data

In [99]:
# Drop every record that is missing either its standard_value or its
# canonical_smiles.
#
# Both columns are checked in a single pass over df1. The previous two-step
# version indexed the already-filtered df2 with a boolean mask computed from
# df1; the mask and the frame no longer shared an index, so pandas had to
# reindex the mask and emitted a UserWarning.
df2 = df1.dropna(subset=['standard_value', 'canonical_smiles'])
df2

  after removing the cwd from sys.path.


Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,32264,[],CHEMBL845865,Inhibition of autophosphorylation of human Her...,F,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,uM,UO_0000065,,0.3
1,,32266,[],CHEMBL615491,Inhibition of ligand induced proliferation in ...,F,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,uM,UO_0000065,,2.5
2,,32271,[],CHEMBL683802,Inhibition of autophosphorylation of human Her...,F,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,uM,UO_0000065,,0.4
3,,32273,[],CHEMBL615491,Inhibition of ligand induced proliferation in ...,F,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,uM,UO_0000065,,1.21
4,,47937,[],CHEMBL683802,Inhibition of autophosphorylation of human Her...,F,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,uM,UO_0000065,,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3940,,22950598,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4766165,Inhibition of human recombinant ERBB2 G778D mu...,B,P04626,G778D,BAO_0000190,BAO_0000357,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,uM,UO_0000065,,0.165
3941,,22981501,[],CHEMBL4773005,Inhibition of HER2 (unknown origin),B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,uM,UO_0000065,,2.0
3942,,23236818,[],CHEMBL4828269,Inhibition of ERBB2 (unknown origin) by mobili...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,nM,UO_0000065,,7491.0
3943,,23285156,[],CHEMBL4838201,Inhibition of ERBB2 (unknown origin) by ELISA,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,nM,UO_0000065,,1000.0


In [100]:
# Number of distinct canonical_smiles values (a missing value, if any, counts as one).
df2['canonical_smiles'].nunique(dropna=False)

2622

In [103]:
# Collapse records that share a canonical_smiles string, keeping the first
# occurrence of each structure in row order.
df3 = df2.drop_duplicates(subset='canonical_smiles', keep='first')
df3

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,32264,[],CHEMBL845865,Inhibition of autophosphorylation of human Her...,F,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,uM,UO_0000065,,0.3
2,,32271,[],CHEMBL683802,Inhibition of autophosphorylation of human Her...,F,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,uM,UO_0000065,,0.4
4,,47937,[],CHEMBL683802,Inhibition of autophosphorylation of human Her...,F,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,uM,UO_0000065,,0.1
6,,49330,[],CHEMBL683802,Inhibition of autophosphorylation of human Her...,F,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,uM,UO_0000065,,5.0
8,,50672,[],CHEMBL683802,Inhibition of autophosphorylation of human Her...,F,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,uM,UO_0000065,,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3940,,22950598,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4766165,Inhibition of human recombinant ERBB2 G778D mu...,B,P04626,G778D,BAO_0000190,BAO_0000357,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,uM,UO_0000065,,0.165
3941,,22981501,[],CHEMBL4773005,Inhibition of HER2 (unknown origin),B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,uM,UO_0000065,,2.0
3942,,23236818,[],CHEMBL4828269,Inhibition of ERBB2 (unknown origin) by mobili...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,nM,UO_0000065,,7491.0
3943,,23285156,[],CHEMBL4838201,Inhibition of ERBB2 (unknown origin) by ELISA,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Receptor protein-tyrosine kinase erbB-2,9606,,,IC50,nM,UO_0000065,,1000.0


## 5. Data pre-processing of the biological data

In [104]:
# Keep only the three columns used from here on: the molecule identifier,
# its SMILES string and the measured standard_value.
selection = [
    'molecule_chembl_id',
    'canonical_smiles',
    'standard_value',
]
df4 = df3.loc[:, selection]
df4

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL68920,Cc1cc(C)c(/C=C2\C(=O)Nc3ncnc(Nc4ccc(F)c(Cl)c4)...,300.0
2,CHEMBL69960,Cc1cc(C(=O)N2CCOCC2)[nH]c1/C=C1\C(=O)Nc2ncnc(N...,400.0
4,CHEMBL67057,Cc1cc(C(=O)N2CCOCC2)[nH]c1/C=C1\C(=O)Nc2ncnc(N...,100.0
6,CHEMBL65848,Cc1cc(C(=O)N2CCOCC2)[nH]c1/C=C1\C(=O)Nc2ncnc(N...,5000.0
8,CHEMBL69629,Cc1cc(C(=O)NCCN2CCOCC2)[nH]c1/C=C1\C(=O)Nc2ncn...,100.0
...,...,...,...
3940,CHEMBL4798527,Cc1ccc(NC(=O)c2ccc(C)c(C(F)(F)F)c2)cc1C#Cc1nn(...,165.0
3941,CHEMBL4799738,Cn1cc(-c2ccc3c(Nc4cccc(NC(=O)Nc5cccc(F)c5)c4)n...,2000.0
3942,CHEMBL4869634,CC(C)Oc1cc(F)ccc1Nc1ncnc2ccc(-c3ccc(C(=O)N4CCN...,7491.0
3943,CHEMBL4871361,CC(C)n1cc(C(=O)Nc2cc(F)c(-c3c[nH]c4ncnc(N)c34)...,1000.0


In [105]:
# Write the three-column table to disk; the index is not written.
df4.to_csv(
    path_or_buf='COVID19_02_bioactivity_data_preprocessed.csv',
    index=False,
)

### 5.1 Labeling compounds as active, inactive or intermediate

In [109]:
# Reload the preprocessed table from the CSV written in the previous step.
df5 = pd.read_csv(
    filepath_or_buffer='COVID19_02_bioactivity_data_preprocessed.csv',
)

In [112]:
# Potency classes assigned from standard_value (boundaries are inclusive):
#   active:       standard_value <= 1,000nM
#   inactive:     standard_value >= 10,000nM
#   intermediate: 1,000nM < standard_value < 10,000nM

def classify_bioactivity(standard_value, active_max=1000, inactive_min=10000):
    """Return the potency class label for a single standard_value.

    Parameters
    ----------
    standard_value : number or numeric string
        The measurement to classify; it is converted with ``float()``.
    active_max : number, default 1000
        Values less than or equal to this are labelled "active".
    inactive_min : number, default 10000
        Values greater than or equal to this are labelled "inactive".

    Returns
    -------
    str
        "inactive", "active" or "intermediate".

    Raises
    ------
    ValueError, TypeError
        If ``standard_value`` cannot be converted by ``float()``.

    Notes
    -----
    The "inactive" test runs first. A value that satisfies neither comparison
    (this includes NaN) is labelled "intermediate".
    """
    value = float(standard_value)
    if value >= inactive_min:
        return "inactive"
    if value <= active_max:
        return "active"
    return "intermediate"


# One label per row of df5, in row order.
bioactivity_threshold = [classify_bioactivity(v) for v in df5.standard_value]

In [114]:
# Wrap the labels in a Series named 'class' and attach it to df5 as an extra
# column (aligned on the index).
bioactivity_class = pd.Series(data=bioactivity_threshold, name='class')
df6 = pd.concat(objs=[df5, bioactivity_class], axis='columns')
df6

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,class
0,CHEMBL68920,Cc1cc(C)c(/C=C2\C(=O)Nc3ncnc(Nc4ccc(F)c(Cl)c4)...,300.0,active
1,CHEMBL69960,Cc1cc(C(=O)N2CCOCC2)[nH]c1/C=C1\C(=O)Nc2ncnc(N...,400.0,active
2,CHEMBL67057,Cc1cc(C(=O)N2CCOCC2)[nH]c1/C=C1\C(=O)Nc2ncnc(N...,100.0,active
3,CHEMBL65848,Cc1cc(C(=O)N2CCOCC2)[nH]c1/C=C1\C(=O)Nc2ncnc(N...,5000.0,intermediate
4,CHEMBL69629,Cc1cc(C(=O)NCCN2CCOCC2)[nH]c1/C=C1\C(=O)Nc2ncn...,100.0,active
...,...,...,...,...
2617,CHEMBL4798527,Cc1ccc(NC(=O)c2ccc(C)c(C(F)(F)F)c2)cc1C#Cc1nn(...,165.0,active
2618,CHEMBL4799738,Cn1cc(-c2ccc3c(Nc4cccc(NC(=O)Nc5cccc(F)c5)c4)n...,2000.0,intermediate
2619,CHEMBL4869634,CC(C)Oc1cc(F)ccc1Nc1ncnc2ccc(-c3ccc(C(=O)N4CCN...,7491.0,intermediate
2620,CHEMBL4871361,CC(C)n1cc(C(=O)Nc2cc(F)c(-c3c[nH]c4ncnc(N)c34)...,1000.0,active


In [116]:
# Write the labelled table to disk; the index is not written.
df6.to_csv(
    path_or_buf='COVID19_03_bioactivity_data_curated.csv',
    index=False,
)