<a href="https://colab.research.google.com/github/samservo09/bioinformatics-bipolar-drug-discovery/blob/main/1-data-collection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bioinformatics: Drug discovery on Norepinephrine transporter protein

## Data Collection

### Libraries Required

In [None]:
# install ChEMBL web service package to retrieve the biological data
! pip install -q chembl_webresource_client

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/55.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.2/55.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/61.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/66.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.4/66.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# import necessary libraries
import pandas as pd
from chembl_webresource_client.new_client import new_client

### Searching for the target protein

In [61]:
# target search for Norepinephrine
target = new_client.target
target_query = target.search('Norepinephrine')
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,"[{'xref_id': 'P23975', 'xref_name': None, 'xre...",Homo sapiens,Norepinephrine transporter,15.0,False,CHEMBL222,"[{'accession': 'P23975', 'component_descriptio...",SINGLE PROTEIN,9606
1,"[{'xref_id': 'NBK22978', 'xref_name': 'Brain n...",Mus musculus,Norepinephrine transporter,15.0,False,CHEMBL2370,"[{'accession': 'O55192', 'component_descriptio...",SINGLE PROTEIN,10090
2,"[{'xref_id': 'Q9WTR4', 'xref_name': None, 'xre...",Rattus norvegicus,Norepinephrine transporter,15.0,False,CHEMBL304,"[{'accession': 'Q9WTR4', 'component_descriptio...",SINGLE PROTEIN,10116
3,[],Rattus norvegicus,Monoamine transporters; Norepinephrine & serot...,12.0,False,CHEMBL2096672,"[{'accession': 'P31652', 'component_descriptio...",SELECTIVITY GROUP,10116
4,[],Homo sapiens,Monoamine transporters; Norepinephrine & dopamine,12.0,False,CHEMBL2096990,"[{'accession': 'Q01959', 'component_descriptio...",SELECTIVITY GROUP,9606
...,...,...,...,...,...,...,...,...,...
69,[],Cavia porcellus,Alpha-1A adrenergic receptor,4.0,False,CHEMBL2150843,"[{'accession': 'Q9WU25', 'component_descriptio...",SINGLE PROTEIN,10141
70,[],Macaca mulatta,Beta-3 adrenergic receptor,4.0,False,CHEMBL3124732,"[{'accession': 'Q28524', 'component_descriptio...",SINGLE PROTEIN,9544
71,[],Cricetulus griseus,Alpha-1B adrenergic receptor,4.0,False,CHEMBL3988626,"[{'accession': 'G3HDX1', 'component_descriptio...",SINGLE PROTEIN,10029
72,[],Homo sapiens,Mu opioid receptor/Alpha-2A adrenergic receptor,3.0,False,CHEMBL3883321,"[{'accession': 'P08913', 'component_descriptio...",PROTEIN COMPLEX,9606


In [62]:
# select and retrieve bioactivity data for CaM-kinase kinase beta (first entry)
selected_target = targets.target_chembl_id[0]
selected_target

'CHEMBL222'

In [63]:
# retrieve only bioactivity data for CHEMBL222
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")

In [64]:
df = pd.DataFrame.from_dict(res)

In [66]:
df.shape

(3458, 46)

In [67]:
df.head(3)

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,391050,[],CHEMBL751372,Inhibition of [3H]norepinephrine uptake in HEK...,B,,,BAO_0000190,...,Homo sapiens,Norepinephrine transporter,9606,,,IC50,nM,UO_0000065,,173.0
1,,,391572,[],CHEMBL756262,Inhibition of NE uptake in HEK cells expressin...,F,,,BAO_0000190,...,Homo sapiens,Norepinephrine transporter,9606,,,IC50,nM,UO_0000065,,0.62
2,,,392730,[],CHEMBL752511,Inhibition of [3H]NE reuptake by human norepin...,B,,,BAO_0000190,...,Homo sapiens,Norepinephrine transporter,9606,,,IC50,uM,UO_0000065,,0.2


In [68]:
# check that all of the standard types are IC50
df.standard_type.unique()

array(['IC50'], dtype=object)

Target: low as possible for standard value

In [69]:
# standard value represents potency; lower = better potency of the drug
df.standard_value.sort_values().head(10)

Unnamed: 0,standard_value
1647,0.0
652,0.05
2978,0.065
2977,0.079
2976,0.08
2975,0.1
2974,0.13
1313,0.1585
2983,0.21
1315,0.2512


## Save the file

In [73]:
# save the data to CSV file
df.to_csv('Norepinephrine_bioactivity_data.csv', index=False)

In [74]:
# copy the file to google drive to be accessible within colab
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [75]:
# create a data folder in the colab notebook folder
! mkdir -p "/content/gdrive/My Drive/Colab Notebooks/data/"

In [76]:
# copy the file to the data folder
! cp Norepinephrine_bioactivity_data.csv "/content/gdrive/My Drive/Colab Notebooks/data/"

In [77]:
# list the files in data folder
! ls "/content/gdrive/My Drive/Colab Notebooks/data/"

CaMKK2_bioactivity_data_2class_pIC50.csv  CaMKK2_preprocessed_data.csv
CaMKK2_bioactivity_data_3class_pIC50.csv  Norepinephrine_bioactivity_data.csv
CaMKK2_bioactivity_data.csv


In [78]:
# check the content of the file
! head "/content/gdrive/My Drive/Colab Notebooks/data/Norepinephrine_bioactivity_data.csv"

action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
,,391050,[],CHEMBL751372,Inhibition of [3H]norepinephrine uptake in HEK cells expressing human NET,B,,,BAO_0000190,BAO_0000219,cell-based format,COC(=O)C1C(c2ccc(O)c(O)c2)CC2CCC1N2C,,,CHEMBL1136450,Bioorg Med Chem Lett,2003.0,"{'bei': '23.21', 'le': '0.44', 'lle': '4.92', 'sei': '9.66'}",CHEMBL132683,,CHEMBL1

## Data Pre-Processing

### Handling missing data

If any compounds has missing value for the **standard_value** column then drop it.

In [79]:
df.shape

(3458, 46)

In [80]:
df2 = df[df.standard_value.notna()]
df2

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,391050,[],CHEMBL751372,Inhibition of [3H]norepinephrine uptake in HEK...,B,,,BAO_0000190,...,Homo sapiens,Norepinephrine transporter,9606,,,IC50,nM,UO_0000065,,173.0
1,,,391572,[],CHEMBL756262,Inhibition of NE uptake in HEK cells expressin...,F,,,BAO_0000190,...,Homo sapiens,Norepinephrine transporter,9606,,,IC50,nM,UO_0000065,,0.62
2,,,392730,[],CHEMBL752511,Inhibition of [3H]NE reuptake by human norepin...,B,,,BAO_0000190,...,Homo sapiens,Norepinephrine transporter,9606,,,IC50,uM,UO_0000065,,0.2
3,,,392740,[],CHEMBL752511,Inhibition of [3H]NE reuptake by human norepin...,B,,,BAO_0000190,...,Homo sapiens,Norepinephrine transporter,9606,,,IC50,uM,UO_0000065,,0.165
4,,,392890,[],CHEMBL756262,Inhibition of NE uptake in HEK cells expressin...,F,,,BAO_0000190,...,Homo sapiens,Norepinephrine transporter,9606,,,IC50,nM,UO_0000065,,233.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3453,"{'action_type': 'INHIBITOR', 'description': 'N...",,24965239,[],CHEMBL5216985,Inhibition of norepinephrine transporter (unkn...,B,,,BAO_0000190,...,Homo sapiens,Norepinephrine transporter,9606,,,IC50,uM,UO_0000065,,1.0
3454,"{'action_type': 'INHIBITOR', 'description': 'N...",,24987117,[],CHEMBL5228412,Inhibition of NET (unknown origin),B,,,BAO_0000190,...,Homo sapiens,Norepinephrine transporter,9606,,,IC50,nM,UO_0000065,,7.0
3455,"{'action_type': 'BINDING AGENT', 'description'...",,25402993,[],CHEMBL5303857,Cross screening panel,B,,,BAO_0000190,...,Homo sapiens,Norepinephrine transporter,9606,,,pIC50,,UO_0000065,,4.0
3456,"{'action_type': 'BINDING AGENT', 'description'...",,25403929,[],CHEMBL5303906,Cross screening panel,B,,,BAO_0000190,...,Homo sapiens,Norepinephrine transporter,9606,,,pIC50,,UO_0000065,,4.7


There is one row removed which doesn't have a standard value.

### Labeling compounds as either active, inactive, or intermediate

**IC50** - "half-maximal inhibitory concentration" <br>
- to quantify potency of a substance in inhibiting a specific biological or biochemical function.
- if we want to treat a disease/disorder with a drug, knowing which compound can inhibit the protein related to this will help
- tells us how well a drug works at stopping a disease, and a lower IC50 means the drug can do its job with less effort (more effective)

The bioactivity data is in the IC50 unit. Compounds having values LESS than 1000 nM will be considered to be **active** while those GREATER than 10,000 nM will be considered to be **inactive**. As for those values in BETWEEN 1,000 and 10,000 nM will be referred to as **intermediate**. - *From Data Professor*

In simpler terms <br>
1. **Active** - the drug works very well at a small amount. <br>
2. **Inactive** - the drug doesn't work at all, even at a large amount. <br>
3. **Intermediate** - the drug works, but not as effective as an active drug.

In [81]:
bioactivity_class = []
for i in df2.standard_value:
  if float(i) >= 10000:
    bioactivity_class.append("inactive")
  elif float(i) <= 1000:
    bioactivity_class.append("active")
  else:
    bioactivity_class.append("intermediate")

### Iterate the molecule_chembl_id, canonical_smiles, and standard_value columns to a list

**molecule_chembl_id** - unique identification number of each molecule

In [82]:
df2.molecule_chembl_id

Unnamed: 0,molecule_chembl_id
0,CHEMBL132683
1,CHEMBL544370
2,CHEMBL417049
3,CHEMBL26320
4,CHEMBL370805
...,...
3453,CHEMBL5219762
3454,CHEMBL4745144
3455,CHEMBL4635134
3456,CHEMBL4639128


**canonical_smiles** - SMILES (Simplified Molecular Input Line Entry System) <br>
- standardized way of representing chemical structures as text strings <br>
- notation for describing molecules <br>
- "canonical" part ensures each unique molecule has only one possible SMILES representation for easier comparison

**Note:** In drug discovery, we aim to identify compounds with lowest IC50 value against the target protein. A lower IC50 indicates that a smaller amount of the compound is needed to achieve a significant inhibitory effect, which can be advantageous in terms of dosage and potential side effects.

In [83]:
selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
df3 = df2[selection]
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL132683,COC(=O)C1C(c2ccc(O)c(O)c2)CC2CCC1N2C,173.0
1,CHEMBL544370,Cl.OC1(c2ccc3c(c2)OCO3)c2ccccc2C2=NCCCN21,0.62
2,CHEMBL417049,CN(CCOC(c1ccccc1)c1ccccc1)C1CCN(CCCc2ccccc2)CC1,200.0
3,CHEMBL26320,c1ccc(CCCN2CCN(CCOC(c3ccccc3)c3ccccc3)CC2)cc1,165.0
4,CHEMBL370805,COC(=O)[C@H]1[C@@H](OC(=O)c2ccccc2)C[C@@H]2CC[...,233.0
...,...,...,...
3453,CHEMBL5219762,[2H]C([2H])([2H])C(N[C@@]([2H])(C)C(=O)c1cccc(...,1000.0
3454,CHEMBL4745144,CNCC[C@H](Oc1cccc(N2CCN(C)c3nc(N)ncc3C2=O)c1)c...,7.0
3455,CHEMBL4635134,CNC(=O)c1cc(C(=O)NC2CC2)cn(Cc2ccccc2)c1=O,100000.0
3456,CHEMBL4639128,COCc1nc2cnc3cc(-c4c(C)noc4C)c(OC[C@H]4CCNC4)cc...,19952.62


In [84]:
# combine bioactivity column with the created dataframe
df3['bioactivity_class'] = bioactivity_class
df3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['bioactivity_class'] = bioactivity_class


Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class
0,CHEMBL132683,COC(=O)C1C(c2ccc(O)c(O)c2)CC2CCC1N2C,173.0,active
1,CHEMBL544370,Cl.OC1(c2ccc3c(c2)OCO3)c2ccccc2C2=NCCCN21,0.62,active
2,CHEMBL417049,CN(CCOC(c1ccccc1)c1ccccc1)C1CCN(CCCc2ccccc2)CC1,200.0,active
3,CHEMBL26320,c1ccc(CCCN2CCN(CCOC(c3ccccc3)c3ccccc3)CC2)cc1,165.0,active
4,CHEMBL370805,COC(=O)[C@H]1[C@@H](OC(=O)c2ccccc2)C[C@@H]2CC[...,233.0,active
...,...,...,...,...
3453,CHEMBL5219762,[2H]C([2H])([2H])C(N[C@@]([2H])(C)C(=O)c1cccc(...,1000.0,active
3454,CHEMBL4745144,CNCC[C@H](Oc1cccc(N2CCN(C)c3nc(N)ncc3C2=O)c1)c...,7.0,active
3455,CHEMBL4635134,CNC(=O)c1cc(C(=O)NC2CC2)cn(Cc2ccccc2)c1=O,100000.0,inactive
3456,CHEMBL4639128,COCc1nc2cnc3cc(-c4c(C)noc4C)c(OC[C@H]4CCNC4)cc...,19952.62,inactive


### Save the preprocessed file

In [85]:
# save the preprocessed dataframe to a CSV file
df3.to_csv('Norepinephrine_preprocessed_data.csv', index=False)

In [86]:
# add it to the google drive folder
! cp Norepinephrine_preprocessed_data.csv "/content/gdrive/My Drive/Colab Notebooks/data/"

In [87]:
# check if the preprocessed file is in the data
! ls "/content/gdrive/My Drive/Colab Notebooks/data/"

CaMKK2_bioactivity_data_2class_pIC50.csv  CaMKK2_preprocessed_data.csv
CaMKK2_bioactivity_data_3class_pIC50.csv  Norepinephrine_bioactivity_data.csv
CaMKK2_bioactivity_data.csv		  Norepinephrine_preprocessed_data.csv
