## 1. Installing libraries

! pip install chembl_webresource_client

## 2. Importing libraries

In [4]:
import pandas as pd
from chembl_webresource_client.new_client import new_client

## 3. Search for target protein (COVID19)

### 3.1 Target search for Coronavirus

In [5]:
target = new_client.target
target_query = target.search('coronavirus')
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Coronavirus,Coronavirus,17.0,False,CHEMBL613732,[],ORGANISM,11119
1,[],SARS coronavirus,SARS coronavirus,15.0,False,CHEMBL612575,[],ORGANISM,227859
2,[],Feline coronavirus,Feline coronavirus,15.0,False,CHEMBL612744,[],ORGANISM,12663
3,[],Human coronavirus 229E,Human coronavirus 229E,13.0,False,CHEMBL613837,[],ORGANISM,11137
4,"[{'xref_id': 'P0C6U8', 'xref_name': None, 'xre...",SARS coronavirus,SARS coronavirus 3C-like proteinase,10.0,False,CHEMBL3927,"[{'accession': 'P0C6U8', 'component_descriptio...",SINGLE PROTEIN,227859
5,[],Middle East respiratory syndrome-related coron...,Middle East respiratory syndrome-related coron...,9.0,False,CHEMBL4296578,[],ORGANISM,1335626
6,"[{'xref_id': 'P0C6X7', 'xref_name': None, 'xre...",SARS coronavirus,Replicase polyprotein 1ab,4.0,False,CHEMBL5118,"[{'accession': 'P0C6X7', 'component_descriptio...",SINGLE PROTEIN,227859
7,[],Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,4.0,False,CHEMBL4523582,"[{'accession': 'P0DTD1', 'component_descriptio...",SINGLE PROTEIN,2697049


In [7]:
selected_target = targets.target_chembl_id[6]
selected_target

'CHEMBL5118'

In [8]:
# Retrieve bioactivity data from ChEMBL db i.e. IC50
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")

In [13]:
df = pd.DataFrame.from_dict(res)
df

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,1988091,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,870.0
1,,1988092,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,200.0
2,,1988093,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,300.0
3,,1988094,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,15.0
4,,1988095,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,,18548176,[],CHEMBL4198706,Inhibition of SARS coronavirus 3CL protease us...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,4.3
211,,18548177,[],CHEMBL4198706,Inhibition of SARS coronavirus 3CL protease us...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,5.5
212,,18548178,[],CHEMBL4198706,Inhibition of SARS coronavirus 3CL protease us...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,4.1
213,,18548179,[],CHEMBL4198706,Inhibition of SARS coronavirus 3CL protease us...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,3.2


In [14]:
df.standard_type.unique()

array(['IC50'], dtype=object)

In [15]:
df.to_csv('1_bioactivity_data_Data_Collection_part.csv', index=False)

## 4. Handling missing data

In [16]:
# if any column has missing value for the standard_value, drop that row(compound)

df2 = df[df.standard_value.notna()]
df2

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,1988091,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,870.0
1,,1988092,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,200.0
2,,1988093,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,300.0
3,,1988094,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,15.0
4,,1988095,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,,18548176,[],CHEMBL4198706,Inhibition of SARS coronavirus 3CL protease us...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,4.3
211,,18548177,[],CHEMBL4198706,Inhibition of SARS coronavirus 3CL protease us...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,5.5
212,,18548178,[],CHEMBL4198706,Inhibition of SARS coronavirus 3CL protease us...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,4.1
213,,18548179,[],CHEMBL4198706,Inhibition of SARS coronavirus 3CL protease us...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,3.2


## 5. Data pre-processing of the biological data

### 5.1 Labeling compounds as either active, inactive or intermediate

In [20]:
# 1000nM = 1uM
# compounds' IC50 <= 1,000nM --> ACTIVE
# compounds' IC50 >= 10,000nM --> INACTIVE
# compounds' IC50 > 1,000nM && IC50 < 10,000nM --> INTERMEDIATE 

bioactivity_class = []

for i in df2.standard_value:
    if float(i) <= 1000:
        bioactivity_class.append("active")
    if float(i) >= 10000:
        bioactivity_class.append("inactive")
    else:
        bioactivity_class.append("intermediate")


### 5.2 Extract the columns: 'molecule_chembl_id', 'canonical_smiles', 'standard_value'

In [26]:
selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
df3 = df2[selection]
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL194398,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@@H...,870000.0
1,CHEMBL393608,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...,200000.0
2,CHEMBL238216,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...,300000.0
3,CHEMBL235873,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...,15000.0
4,CHEMBL397154,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...,10000.0
...,...,...,...
210,CHEMBL4208764,CC(C)C[C@H](NC(=O)OC1(Cc2ccccc2)CCN(S(C)(=O)=O...,4300.0
211,CHEMBL4212620,CCC1(OC(=O)N[C@@H](CC(C)C)C(=O)N[C@H](C=O)C[C@...,5500.0
212,CHEMBL4216101,CCC1(OC(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](C[C@@H]...,4100.0
213,CHEMBL4217568,CCOC(=O)N1CCC(OC(=O)N[C@@H](CC(C)C)C(=O)N[C@H]...,3200.0


### 5.3 Combine the 4 lists into a dataframe

In [38]:
df4 = pd.concat([df3, pd.Series(bioactivity_class)], axis=1)
df4

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,0
0,CHEMBL194398,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@@H...,870000.0,inactive
1,CHEMBL393608,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...,200000.0,inactive
2,CHEMBL238216,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...,300000.0,inactive
3,CHEMBL235873,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...,15000.0,inactive
4,CHEMBL397154,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...,10000.0,inactive
...,...,...,...,...
214,CHEMBL4204431,CCOC(=O)N1CCC(OC(=O)N[C@@H](CC(C)C)C(=O)N[C@@H...,8800.0,intermediate
215,,,,intermediate
216,,,,intermediate
217,,,,intermediate


In [39]:
# Drop the column if molecule_chembl_id is nan
df5 = df4[df4.molecule_chembl_id.notna()]
df5

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,0
0,CHEMBL194398,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@@H...,870000.0,inactive
1,CHEMBL393608,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...,200000.0,inactive
2,CHEMBL238216,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...,300000.0,inactive
3,CHEMBL235873,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...,15000.0,inactive
4,CHEMBL397154,CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...,10000.0,inactive
...,...,...,...,...
210,CHEMBL4208764,CC(C)C[C@H](NC(=O)OC1(Cc2ccccc2)CCN(S(C)(=O)=O...,4300.0,inactive
211,CHEMBL4212620,CCC1(OC(=O)N[C@@H](CC(C)C)C(=O)N[C@H](C=O)C[C@...,5500.0,intermediate
212,CHEMBL4216101,CCC1(OC(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](C[C@@H]...,4100.0,intermediate
213,CHEMBL4217568,CCOC(=O)N1CCC(OC(=O)N[C@@H](CC(C)C)C(=O)N[C@H]...,3200.0,intermediate


In [40]:
df5.to_csv('2_bioactivity_data_PROCESSED_Data_Collection_part.csv', index=False)