## **Installing libraries**

Install the ChEMBL web service package so that we can retrieve bioactivity data from the ChEMBL Database.

In [None]:
! pip install chembl_webresource_client



## **Importing libraries**

In [None]:
# Import necessary libraries
import pandas as pd
from chembl_webresource_client.new_client import new_client

## **Search for Target protein**

### **Target search for coronavirus**

In [None]:
# Target search for coronavirus
# `new_client.target` is the ChEMBL client's target resource; `search` runs a
# free-text query against it.
target = new_client.target
target_query = target.search('coronavirus')
# Turn the query results into a dataframe so the candidate targets can be
# inspected as a table.
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Coronavirus,Coronavirus,17.0,False,CHEMBL613732,[],ORGANISM,11119
1,[],Feline coronavirus,Feline coronavirus,14.0,False,CHEMBL612744,[],ORGANISM,12663
2,[],Murine coronavirus,Murine coronavirus,14.0,False,CHEMBL5209664,[],ORGANISM,694005
3,[],Canine coronavirus,Canine coronavirus,14.0,False,CHEMBL5291668,[],ORGANISM,11153
4,[],Human coronavirus 229E,Human coronavirus 229E,13.0,False,CHEMBL613837,[],ORGANISM,11137
5,[],Human coronavirus OC43,Human coronavirus OC43,13.0,False,CHEMBL5209665,[],ORGANISM,31631
6,"[{'xref_id': 'P0C6U8', 'xref_name': None, 'xre...",SARS coronavirus,SARS coronavirus 3C-like proteinase,10.0,False,CHEMBL3927,"[{'accession': 'P0C6U8', 'component_descriptio...",SINGLE PROTEIN,227859
7,[],Middle East respiratory syndrome-related coron...,Middle East respiratory syndrome-related coron...,9.0,False,CHEMBL4296578,[],ORGANISM,1335626
8,"[{'xref_id': 'P0C6X7', 'xref_name': None, 'xre...",SARS coronavirus,Replicase polyprotein 1ab,4.0,False,CHEMBL5118,"[{'accession': 'P0C6X7', 'component_descriptio...",SINGLE PROTEIN,227859
9,[],Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,4.0,False,CHEMBL4523582,"[{'accession': 'P0DTD1', 'component_descriptio...",SINGLE PROTEIN,2697049


### **Select and retrieve bioactivity data for *SARS coronavirus 3C-like proteinase* (row index 6)**

We will assign the ChEMBL ID of the target protein, *SARS coronavirus 3C-like proteinase* (row index 6 in the table above), to the ***selected_target*** variable.

In [None]:
# Pick the target by its preferred name instead of by row position: the row
# position depends on how the search results happen to be ranked, so a fixed
# index can silently select a different target.
target_name = 'SARS coronavirus 3C-like proteinase'
name_matches = targets.loc[targets.pref_name == target_name, 'target_chembl_id']
if name_matches.empty:
  # Fail loudly rather than continue with the wrong (or no) target.
  raise ValueError("No target named '" + target_name + "' in the search results")
selected_target = name_matches.iloc[0]
selected_target

'CHEMBL3927'

Here, we will retrieve only bioactivity data for *coronavirus 3C-like proteinase* (CHEMBL3927) that are reported as IC$_{50}$ values in nM (nanomolar) unit.

In [None]:
# `new_client.activity` is the ChEMBL client's bioactivity resource.
activity = new_client.activity
# Restrict the records to the selected target, then to IC50 measurements only.
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")

In [None]:
# Load the filtered activity records into a dataframe (one row per record).
df = pd.DataFrame.from_dict(res)

In [None]:
# Show the retrieved bioactivity records.
df

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,1480935,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,7.2
1,,,1480936,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,9.4
2,,,1481061,[],CHEMBL830868,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,13.5
3,,,1481065,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,13.11
4,,,1481066,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,,,12041507,[],CHEMBL2150313,Inhibition of SARS-CoV PLpro expressed in Esch...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,10.6
129,,,12041508,[],CHEMBL2150313,Inhibition of SARS-CoV PLpro expressed in Esch...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,10.1
130,,,12041509,[],CHEMBL2150313,Inhibition of SARS-CoV PLpro expressed in Esch...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,11.5
131,,,12041510,[],CHEMBL2150313,Inhibition of SARS-CoV PLpro expressed in Esch...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,10.7


In [None]:
# Sanity check: list the distinct measurement types present after filtering.
df.standard_type.unique()

array(['IC50'], dtype=object)

Finally we will save the resulting bioactivity data to a CSV file **bioactivity_data.csv**.

In [None]:
# Write the raw bioactivity table to disk; index=False leaves the row index
# out of the file.
df.to_csv('bioactivity_data.csv', index=False)

## **Handling missing data**
If any compound has a missing value in the **standard_value** column, drop it.

In [None]:
# Build a boolean mask of rows that actually carry a standard_value, then keep
# only those rows.
has_standard_value = df['standard_value'].notna()
df2 = df[has_standard_value]
df2

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,1480935,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,7.2
1,,,1480936,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,9.4
2,,,1481061,[],CHEMBL830868,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,13.5
3,,,1481065,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,13.11
4,,,1481066,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,,,12041507,[],CHEMBL2150313,Inhibition of SARS-CoV PLpro expressed in Esch...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,10.6
129,,,12041508,[],CHEMBL2150313,Inhibition of SARS-CoV PLpro expressed in Esch...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,10.1
130,,,12041509,[],CHEMBL2150313,Inhibition of SARS-CoV PLpro expressed in Esch...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,11.5
131,,,12041510,[],CHEMBL2150313,Inhibition of SARS-CoV PLpro expressed in Esch...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,10.7


Apparently, this dataset has no missing data, but the code cell above can be reused for the bioactivity data of other target proteins.

## **Data pre-processing of the bioactivity data**

### **Labeling compounds as either being active, inactive or intermediate**
The bioactivity data are reported as IC$_{50}$ values. Compounds with an IC$_{50}$ of 1,000 nM or less are considered **active**, while those with an IC$_{50}$ of 10,000 nM or more are considered **inactive**. Compounds with values between 1,000 and 10,000 nM are referred to as **intermediate**.

In [None]:
# Label every compound by potency using the standardised IC50 in nM
# (`standard_value`). The raw `value` column is expressed in each record's own
# `units`, so applying fixed cut-offs to it mislabels some records (for example
# a standard_value of 301995.17 nM was previously labelled "intermediate").
bioactivity_class = []
for i in df2.standard_value:
  # standard_value arrives as text, hence the float() conversion.
  if float(i) >= 10000:
    # 10,000 nM or weaker
    bioactivity_class.append("inactive")
  elif float(i) <= 1000:
    # 1,000 nM or more potent
    bioactivity_class.append("active")
  else:
    bioactivity_class.append("intermediate")

In [None]:
# Show the class label assigned to each record, in row order.
bioactivity_class

['intermediate',
 'intermediate',
 'inactive',
 'inactive',
 'intermediate',
 'active',
 'intermediate',
 'active',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'active',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'intermediate',
 'inactive',
 'intermediate',
 'intermediate',
 'intermediate',
 'inactive',
 'intermediate',
 'inactive',
 'intermediate',
 'inactive',
 'intermediate',
 'inactive',
 'intermediate',
 'inactive',
 'intermediate',
 'inactive',
 'intermediate',
 'inactive',
 'intermediate',
 'inactive',
 'intermediate',
 'inactive',
 'intermediate',
 'inactive',
 'intermediate',
 'inactive',
 'intermediate',
 'inactive',
 'intermediate',
 'inactive',
 'intermediate',
 'inactive',
 'intermediate',
 'inactive',
 'intermediate',
 'inactive',
 'intermediate',
 'inactive',
 'intermediate',
 'inactive',
 'intermediate',
 'inactive',
 'intermediate',
 'inactive',
 'intermediate',
 'inactive',
 'intermediate',
 'inactive',
 'inte

### **Collect the *molecule_chembl_id* values into a list**

In [None]:
# Collect the compound identifiers. This must read `molecule_chembl_id`, not
# `assay_chembl_id`: the list becomes the 'molecule_chembl_id' column of the
# pre-processed dataframe, and assay IDs repeat across unrelated compounds.
mol_cid = list(df2.molecule_chembl_id)

In [None]:
# Show the collected identifiers, in row order.
mol_cid

['CHEMBL829584',
 'CHEMBL829584',
 'CHEMBL830868',
 'CHEMBL829584',
 'CHEMBL829584',
 'CHEMBL828143',
 'CHEMBL829584',
 'CHEMBL829584',
 'CHEMBL829584',
 'CHEMBL829584',
 'CHEMBL829584',
 'CHEMBL829584',
 'CHEMBL880249',
 'CHEMBL880249',
 'CHEMBL868157',
 'CHEMBL868157',
 'CHEMBL868157',
 'CHEMBL868157',
 'CHEMBL868157',
 'CHEMBL868157',
 'CHEMBL868157',
 'CHEMBL868157',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248',
 'CHEMBL871248

### **Collect the *canonical_smiles* values into a list**

In [None]:
# Gather the SMILES string of every record into a plain Python list,
# preserving row order.
canonical_smiles = [smiles for smiles in df2.canonical_smiles]

In [None]:
# Show the collected SMILES strings, in row order.
canonical_smiles

['Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21',
 'O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21',
 'O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21',
 'O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21',
 'O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-]',
 'O=C1C(=O)N(Cc2cc3ccccc3s2)c2c(Br)cccc21',
 'O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccc(F)cc21',
 'O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccc(I)cc21',
 'O=C1C(=O)N(Cc2cc3ccccc3s2)c2cccc(Cl)c21',
 'O=C1C(=O)N(C/C=C/c2cc3ccccc3s2)c2ccc(I)cc21',
 'O=C(Nc1ccc(Cl)cc1)c1ccc(CN2C(=O)C(=O)c3cc(I)ccc32)s1',
 'O=C1C(=O)N(Cc2ccc(C(=O)N3CCCCC3)s2)c2ccc(I)cc21',
 'CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@@H](CC(=O)[C@@H](NC(=O)c1cc(C)on1)C(C)C)Cc1ccccc1',
 'CCOC(=O)/C=C/[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H](CC=C(C)C)CC(=O)[C@@H](NC(=O)c1cc(C)on1)C(C)C',
 'CCCCN1C(=O)C(=O)c2cc(I)ccc21',
 'NC(=O)c1ccc2c(c1)C(=O)C(=O)N2Cc1ccc2ccccc2c1',
 'NC(=O)c1ccc2c(c1)C(=O)C(=O)N2Cc1ccccc1',
 'CCCCN1C(=O)C(=O)c2cc(C(N)=O)ccc21',
 'CCCN1C(=O)C(=O)c2cc(C(N)=O)ccc21',
 'CN1C(=O)C(=O)c2cc(C(N)=O)ccc21',
 'O=C1C(=O)N(Cc2cc

### **Collect the *standard_value* values into a list**

In [None]:
# Gather the standard_value of every record into a plain Python list,
# preserving row order; the entries are kept exactly as stored in the column.
standard_value = [measurement for measurement in df2.standard_value]

In [None]:
# Show the collected standard values, in row order.
standard_value

['7200.0',
 '9400.0',
 '13500.0',
 '13110.0',
 '2000.0',
 '980.0',
 '4820.0',
 '950.0',
 '11200.0',
 '23500.0',
 '12570.0',
 '17500.0',
 '45000.0',
 '70000.0',
 '66000.0',
 '370.0',
 '12500.0',
 '19000.0',
 '25000.0',
 '71000.0',
 '1100.0',
 '50000.0',
 '3019.95',
 '3000.0',
 '301995.17',
 '300000.0',
 '251188.64',
 '250000.0',
 '204173.79',
 '200000.0',
 '100000.0',
 '100000.0',
 '60255.96',
 '60000.0',
 '45708.82',
 '45000.0',
 '40738.03',
 '40000.0',
 '15135.61',
 '15000.0',
 '15135.61',
 '15000.0',
 '12022.64',
 '12000.0',
 '1000000.0',
 '1000000.0',
 '501187.23',
 '500000.0',
 '407380.28',
 '400000.0',
 '354813.39',
 '350000.0',
 '301995.17',
 '300000.0',
 '301995.17',
 '300000.0',
 '204173.79',
 '200000.0',
 '204173.79',
 '200000.0',
 '204173.79',
 '200000.0',
 '204173.79',
 '200000.0',
 '60255.96',
 '60000.0',
 '40738.03',
 '40000.0',
 '30199.52',
 '30000.0',
 '15135.61',
 '15000.0',
 '14125.38',
 '14000.0',
 '11220.18',
 '11000.0',
 '10000.0',
 '10000.0',
 '900.0',
 '6000.0',
 

### **Combine the 4 lists into a dataframe**

In [None]:
# Pair up the four parallel lists position by position (one tuple per record),
# then build the pre-processed dataframe with explicit column names.
data_tuples = [
  record
  for record in zip(mol_cid, canonical_smiles, bioactivity_class, standard_value)
]
df3 = pd.DataFrame(
  data_tuples,
  columns=['molecule_chembl_id', 'canonical_smiles', 'bioactivity_class', 'standard_value'],
)

In [None]:
# Show the combined, pre-processed table.
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity_class,standard_value
0,CHEMBL829584,Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21,intermediate,7200.0
1,CHEMBL829584,O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21,intermediate,9400.0
2,CHEMBL830868,O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21,inactive,13500.0
3,CHEMBL829584,O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21,inactive,13110.0
4,CHEMBL829584,O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-],intermediate,2000.0
...,...,...,...,...
128,CHEMBL2150313,COC(=O)[C@@]1(C)CCCc2c1ccc1c2C(=O)C(=O)c2c(C)c...,inactive,10600.0
129,CHEMBL2150313,C[C@H]1COC2=C1C(=O)C(=O)c1c2ccc2c1CCCC2(C)C,inactive,10100.0
130,CHEMBL2150313,Cc1coc2c1C(=O)C(=O)c1c-2ccc2c(C)cccc12,inactive,11500.0
131,CHEMBL2150313,Cc1cccc2c3c(ccc12)C1=C(C(=O)C3=O)[C@@H](C)CO1,inactive,10700.0


### **Alternative method**

Save the pre-processed dataframe to a CSV file.

In [None]:
# Write the pre-processed table to disk; index=False leaves the row index out
# of the file.
df3.to_csv('bioactivity_preprocessed_data.csv', index=False)

Let's copy the file to Google Drive.

---