## Data Collection

ChEMBL Database

**Installing libraries:**

In [1]:
pip install chembl_webresource_client

Note: you may need to restart the kernel to use updated packages.


__Importing libraries__

In [2]:
# Import necessary libraries
import pandas as pd
from chembl_webresource_client.new_client import new_client

  __version__ = __import__('pkg_resources').get_distribution('chembl_webresource_client').version


## **Search for Target protein**

**Target search for coronavirus**

In [3]:
# Search the client's target resource for 'coronavirus' and load the hits
# into a DataFrame so they can be inspected as a table.
target = new_client.target
target_query = target.search('coronavirus')
targets = pd.DataFrame(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Coronavirus,Coronavirus,17.0,False,CHEMBL613732,[],ORGANISM,11119
1,[],Feline coronavirus,Feline coronavirus,14.0,False,CHEMBL612744,[],ORGANISM,12663
2,[],Murine coronavirus,Murine coronavirus,14.0,False,CHEMBL5209664,[],ORGANISM,694005
3,[],Canine coronavirus,Canine coronavirus,14.0,False,CHEMBL5291668,[],ORGANISM,11153
4,[],Human coronavirus 229E,Human coronavirus 229E,13.0,False,CHEMBL613837,[],ORGANISM,11137
5,[],Human coronavirus OC43,Human coronavirus OC43,13.0,False,CHEMBL5209665,[],ORGANISM,31631
6,[],Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,10.0,False,CHEMBL3927,"[{'accession': 'P0C6U8', 'component_descriptio...",SINGLE PROTEIN,694009
7,[],Middle East respiratory syndrome-related coron...,Middle East respiratory syndrome-related coron...,9.0,False,CHEMBL4296578,[],ORGANISM,1335626
8,[],Severe acute respiratory syndrome-related coro...,Replicase polyprotein 1ab,4.0,False,CHEMBL5118,"[{'accession': 'P0C6X7', 'component_descriptio...",SINGLE PROTEIN,694009
9,[],Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,4.0,False,CHEMBL4523582,"[{'accession': 'P0DTD1', 'component_descriptio...",SINGLE PROTEIN,2697049


### Select and retrieve bioactivity data for SARS coronavirus 3C-like proteinase (7th entry, index 6)

We will assign the 7th entry (index 6), which corresponds to the target protein, SARS coronavirus 3C-like proteinase, to the __selected_target__ variable.

In [4]:
# Pick the target by its preferred name instead of by row position: the order
# of live search results is not guaranteed, so a fixed index could silently
# select a different target on a later run.
TARGET_PREF_NAME = 'SARS coronavirus 3C-like proteinase'
target_matches = targets.loc[targets['pref_name'] == TARGET_PREF_NAME, 'target_chembl_id']
if target_matches.empty:
    # Fail loudly rather than continue with the wrong (or no) target.
    raise LookupError(f"No target named {TARGET_PREF_NAME!r} in the search results")
selected_target = target_matches.iloc[0]
selected_target

'CHEMBL3927'

Here, we will retrieve only the bioactivity data for SARS coronavirus 3C-like proteinase (CHEMBL3927) that are reported as IC50 values.


In [5]:
# Query the activity resource for the selected target, then narrow the
# result to records whose standard_type is IC50.
activity = new_client.activity
target_activities = activity.filter(target_chembl_id=selected_target)
res = target_activities.filter(standard_type="IC50")

In [6]:
# Materialise the activity query results as a DataFrame.
df = pd.DataFrame(res)

In [7]:
# Display the DataFrame built from the activity query results.
df

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,1480935,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,7.2
1,,,1480936,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,9.4
2,,,1481061,[],CHEMBL830868,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,13.5
3,,,1481065,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,13.11
4,,,1481066,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242,"{'action_type': 'INHIBITOR', 'description': 'N...",,25516041,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5336133,Inhibition of SARS-CoV N-terminal 6His-SUMO-ta...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,pIC50,,UO_0000065,,6.42
243,"{'action_type': 'INHIBITOR', 'description': 'N...",,25516042,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5336133,Inhibition of SARS-CoV N-terminal 6His-SUMO-ta...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,0.165
244,"{'action_type': 'INHIBITOR', 'description': 'N...",,25516043,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5336133,Inhibition of SARS-CoV N-terminal 6His-SUMO-ta...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,0.161
245,"{'action_type': 'INHIBITOR', 'description': 'N...",,25516044,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5336133,Inhibition of SARS-CoV N-terminal 6His-SUMO-ta...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,pIC50,,UO_0000065,,6.78


In [8]:
# Preview only the first three rows of the activity table.
df.head(3)

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,1480935,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,7.2
1,,,1480936,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,9.4
2,,,1481061,[],CHEMBL830868,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,13.5


The __standard_value__ column reflects the concentration of the drug needed to inhibit the target (the IC50): the lower the concentration required, the more potent the compound.

In [9]:
# Write the unfiltered activity table to disk, without the index column.
raw_csv_path = 'bioactivity_data.csv'
df.to_csv(raw_csv_path, index=False)

## Handling missing Data

If any compound has a missing value in the __standard_value__ column, drop it.

In [10]:
# Keep only the rows that actually have a standard_value.
df2 = df.dropna(subset=['standard_value'])
df2

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,1480935,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,7.2
1,,,1480936,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,9.4
2,,,1481061,[],CHEMBL830868,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,13.5
3,,,1481065,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,13.11
4,,,1481066,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242,"{'action_type': 'INHIBITOR', 'description': 'N...",,25516041,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5336133,Inhibition of SARS-CoV N-terminal 6His-SUMO-ta...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,pIC50,,UO_0000065,,6.42
243,"{'action_type': 'INHIBITOR', 'description': 'N...",,25516042,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5336133,Inhibition of SARS-CoV N-terminal 6His-SUMO-ta...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,0.165
244,"{'action_type': 'INHIBITOR', 'description': 'N...",,25516043,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5336133,Inhibition of SARS-CoV N-terminal 6His-SUMO-ta...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,IC50,uM,UO_0000065,,0.161
245,"{'action_type': 'INHIBITOR', 'description': 'N...",,25516044,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5336133,Inhibition of SARS-CoV N-terminal 6His-SUMO-ta...,B,,,BAO_0000190,...,Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,694009,,,pIC50,,UO_0000065,,6.78


## Data pre-processing of the bioactivity data

### Labeling compounds as active, inactive, or intermediate

The bioactivity data are IC50 values in nM. Compounds with values of 1,000 nM or less will be considered __active__, while those with values of 10,000 nM or greater will be considered __inactive__. Values between 1,000 and 10,000 nM will be referred to as __intermediate__.

In [11]:
# Label every row from its standard_value:
#   10000 or more -> "inactive", 1000 or less -> "active",
#   anything in between -> "intermediate".
# Values are passed through float() before comparing.
bioactivity_class = [
    "inactive" if float(value) >= 10000
    else "active" if float(value) <= 1000
    else "intermediate"
    for value in df2.standard_value
]

A compound is a molecule - which is a chemical structure that produces a modulatory activity (it exerts some effect on the target protein) in order to produce the desired biological effect (cures the symptoms).

Each molecule is identified with a *molecule_chembl_id*

If multiple rows share the same *molecule_chembl_id*, I will keep only one for simplicity

### Collect the molecule_chembl_id values into a list

In [12]:
# Copy the molecule_chembl_id column into a plain Python list, in row order.
mol_cid = list(df2.molecule_chembl_id)

### Collect the canonical_smiles values into a list

SMILES are a way to represent molecules using ASCII strings

In [13]:
# Copy the canonical_smiles column into a plain Python list, in row order.
canonical_smiles = list(df2.canonical_smiles)

### Collect the standard_value values into a list

In [14]:
# Copy the standard_value column into a plain Python list, in row order.
standard_value = list(df2.standard_value)

### Better Approach:

In [15]:
# Take the three columns of interest straight from the filtered frame
# instead of building separate lists.
selection = [
    'molecule_chembl_id',
    'canonical_smiles',
    'standard_value',
]
df3 = df2.loc[:, selection]

In [16]:
# Display the three-column selection.
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL187579,Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21,7200.0
1,CHEMBL188487,O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21,9400.0
2,CHEMBL185698,O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21,13500.0
3,CHEMBL426082,O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21,13110.0
4,CHEMBL187717,O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-],2000.0
...,...,...,...
242,CHEMBL4590273,Cc1cccc2nc(CSC(=S)NCc3cccnc3)cn12,380.19
243,CHEMBL5436771,S=C([S-])NCc1cccnc1.[K+],165.0
244,CHEMBL2365410,CC(C)C[C@H](NC(=O)OCc1ccccc1)C(=O)N[C@@H](CC1C...,161.0
245,CHEMBL5436771,S=C([S-])NCc1cccnc1.[K+],165.96


In [18]:
# Rebuild the three-column frame from df2 and attach the labels computed
# earlier as a 'bioactivity_class' column.
# NOTE(review): the narrative says rows sharing a molecule_chembl_id are
# reduced to one, but no de-duplication is applied here - confirm whether
# that happens downstream.
wanted_columns = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
df3 = df2[wanted_columns].assign(bioactivity_class=bioactivity_class)

### Save dataframe to CSV file

In [19]:
# Write the labelled, three-column table to disk, without the index column.
preprocessed_csv_path = 'bioactivity_preprocessed_data.csv'
df3.to_csv(preprocessed_csv_path, index=False)

In [20]:
# List the working directory (shell escape) to check that the CSV files exist.
! ls -l

total 696
-rw-r--r--@ 1 rubyzhou  staff   77273 Jul 22 10:15 1_data_collection.ipynb
-rw-r--r--@ 1 rubyzhou  staff   52885 Jul 22 10:10 2_data_cleaning_eda.ipynb
-rw-r--r--@ 1 rubyzhou  staff  138803 Jul 22 10:18 bioactivity_data.csv
-rw-r--r--@ 1 rubyzhou  staff   20699 Jul 22 10:19 bioactivity_preprocessed_data.csv


In [21]:
# Display the final frame, now including the bioactivity_class column.
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class
0,CHEMBL187579,Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21,7200.0,intermediate
1,CHEMBL188487,O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21,9400.0,intermediate
2,CHEMBL185698,O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21,13500.0,inactive
3,CHEMBL426082,O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21,13110.0,inactive
4,CHEMBL187717,O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-],2000.0,intermediate
...,...,...,...,...
242,CHEMBL4590273,Cc1cccc2nc(CSC(=S)NCc3cccnc3)cn12,380.19,active
243,CHEMBL5436771,S=C([S-])NCc1cccnc1.[K+],165.0,active
244,CHEMBL2365410,CC(C)C[C@H](NC(=O)OCc1ccccc1)C(=O)N[C@@H](CC1C...,161.0,active
245,CHEMBL5436771,S=C([S-])NCc1cccnc1.[K+],165.96,active
