# Bioinformatics: Drug discovery on CaM-kinase kinase beta protein

## Data Collection

### Libraries Required

In [1]:
# install ChEMBL web service package to retrieve the biological data
! pip install -q chembl_webresource_client

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/55.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.2/55.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/61.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/66.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.4/66.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# import necessary libraries
import pandas as pd
from chembl_webresource_client.new_client import new_client

### Searching for the target protein

In [3]:
# target search for CaMKK2
target = new_client.target
target_query = target.search('CaMKK2')
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,"[{'xref_id': 'CAMKK2', 'xref_name': None, 'xre...",Homo sapiens,CaM-kinase kinase beta,12.0,False,CHEMBL5284,"[{'accession': 'Q96RR4', 'component_descriptio...",SINGLE PROTEIN,9606
1,"[{'xref_id': 'O88831', 'xref_name': None, 'xre...",Rattus norvegicus,Calcium/calmodulin-dependent protein kinase ki...,11.0,False,CHEMBL1795115,"[{'accession': 'O88831', 'component_descriptio...",SINGLE PROTEIN,10116
2,[],Mus musculus,Calcium/calmodulin-dependent protein kinase ki...,11.0,False,CHEMBL4295888,"[{'accession': 'Q8C078', 'component_descriptio...",SINGLE PROTEIN,10090


In [4]:
# select and retrieve bioactivity data for CaM-kinase kinase beta (first entry)
selected_target = targets.target_chembl_id[0]
selected_target

'CHEMBL5284'

In [5]:
# retrieve only bioactivity data for CHEMBL5284
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")

In [6]:
df = pd.DataFrame.from_dict(res)

In [7]:
df.head(3)

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,2137123,[],CHEMBL940168,Inhibition of CaM-KKbeta,B,,,BAO_0000190,...,Homo sapiens,CaM-kinase kinase beta,9606,,,IC50,nM,UO_0000065,,200.0
1,,,2137359,[],CHEMBL940168,Inhibition of CaM-KKbeta,B,,,BAO_0000190,...,Homo sapiens,CaM-kinase kinase beta,9606,,,IC50,ng/ml,UO_0000274,,40.0
2,"{'action_type': 'ANTAGONIST', 'description': '...",Antagonist,2897276,[],CHEMBL1051267,Inhibition of CAMKKbeta in the presence of 20u...,B,,,BAO_0000190,...,Homo sapiens,CaM-kinase kinase beta,9606,,,IC50,uM,UO_0000065,,0.01


In [8]:
# check that all of the standard types are IC50
df.standard_type.unique()

array(['IC50'], dtype=object)

Target: low as possible for standard value

In [10]:
# standard value represents potency; lower = better potency of the drug
df.standard_value.sort_values().head(10)

Unnamed: 0,standard_value
1,0.04
32,1.585
28,1.995
45,1.995
2,10.0
55,100.0
20,100.0
128,1000.0
129,10000.0
57,10000.0


## Save the file

In [11]:
# save the data to CSV file
df.to_csv('CaMKK2_bioactivity_data.csv', index=False)

In [12]:
# copy the file to google drive to be accessible within colab
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [13]:
# create a data folder in the colab notebook folder
! mkdir -p "/content/gdrive/My Drive/Colab Notebooks/data/"

In [14]:
# copy the file to the data folder
! cp CaMKK2_bioactivity_data.csv "/content/gdrive/My Drive/Colab Notebooks/data/"

In [17]:
# list the files in data folder
! ls "/content/gdrive/My Drive/Colab Notebooks/data/"

CaMKK2_bioactivity_data.csv


In [18]:
# check the content of the file
! head "/content/gdrive/My Drive/Colab Notebooks/data/CaMKK2_bioactivity_data.csv"

action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
,,2137123,[],CHEMBL940168,Inhibition of CaM-KKbeta,B,,,BAO_0000190,BAO_0000357,single protein format,O=C(O)c1cc(NCc2cc(O)ccc2O)ccc1O,,,CHEMBL1145498,Proc Natl Acad Sci U S A,2007,"{'bei': '24.34', 'le': '0.46', 'lle': '4.59', 'sei': '6.09'}",CHEMBL319620,LAVENDUSTIN C,CHEMBL319620,6.70,0,http://www.openphacts

## Data Pre-Processing

### Handling missing data

If any compounds has missing value for the **standard_value** column then drop it.

In [21]:
df.shape

(134, 46)

In [20]:
df2 = df[df.standard_value.notna()]
df2

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,2137123,[],CHEMBL940168,Inhibition of CaM-KKbeta,B,,,BAO_0000190,...,Homo sapiens,CaM-kinase kinase beta,9606,,,IC50,nM,UO_0000065,,200.0
1,,,2137359,[],CHEMBL940168,Inhibition of CaM-KKbeta,B,,,BAO_0000190,...,Homo sapiens,CaM-kinase kinase beta,9606,,,IC50,ng/ml,UO_0000274,,40.0
2,"{'action_type': 'ANTAGONIST', 'description': '...",Antagonist,2897276,[],CHEMBL1051267,Inhibition of CAMKKbeta in the presence of 20u...,B,,,BAO_0000190,...,Homo sapiens,CaM-kinase kinase beta,9606,,,IC50,uM,UO_0000065,,0.01
3,,,6274565,[],CHEMBL1806284,Inhibition of CAMKKbeta,B,,,BAO_0000190,...,Homo sapiens,CaM-kinase kinase beta,9606,,,IC50,nmol/L,UO_0000065,,2450.0
4,,,12183914,[],CHEMBL2211051,Inhibition of CAMKK2,B,,,BAO_0000190,...,Homo sapiens,CaM-kinase kinase beta,9606,,,pIC50,,UO_0000065,,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129,,,24953854,"[{'comments': None, 'relation': None, 'result_...",CHEMBL5210020,Affinity Phenotypic Cellular interaction (West...,B,,,BAO_0000190,...,Homo sapiens,CaM-kinase kinase beta,9606,,,IC50,nM,UO_0000065,,10000.0
130,"{'action_type': 'INHIBITOR', 'description': 'N...",,24953858,"[{'comments': None, 'relation': None, 'result_...",CHEMBL5210024,Affinity Phenotypic Cellular interaction (West...,B,,,BAO_0000190,...,Homo sapiens,CaM-kinase kinase beta,9606,,,IC50,nM,UO_0000065,,1600.0
131,,,24954346,"[{'comments': None, 'relation': None, 'result_...",CHEMBL5210464,Affinity Biochemical interaction (Enzymatic ac...,B,,,BAO_0000190,...,Homo sapiens,CaM-kinase kinase beta,9606,,,IC50,nM,UO_0000065,,27000.0
132,"{'action_type': 'INHIBITOR', 'description': 'N...",,24954353,"[{'comments': None, 'relation': None, 'result_...",CHEMBL5210471,Affinity Biochemical interaction (Enzymatic ac...,B,,,BAO_0000190,...,Homo sapiens,CaM-kinase kinase beta,9606,,,IC50,nM,UO_0000065,,30.0


There is one row removed which doesn't have a standard value.

### Labeling compounds as either active, inactive, or intermediate

**IC50** - "half-maximal inhibitory concentration" <br>
- to quantify potency of a substance in inhibiting a specific biological or biochemical function.
- if we want to treat a disease/disorder with a drug, knowing which compound can inhibit the protein related to this will help
- tells us how well a drug works at stopping a disease, and a lower IC50 means the drug can do its job with less effort (more effective)

The bioactivity data is in the IC50 unit. Compounds having values LESS than 1000 nM will be considered to be **active** while those GREATER than 10,000 nM will be considered to be **inactive**. As for those values in BETWEEN 1,000 and 10,000 nM will be referred to as **intermediate**. - *From Data Professor*

In simpler terms <br>
1. **Active** - the drug works very well at a small amount. <br>
2. **Inactive** - the drug doesn't work at all, even at a large amount. <br>
3. **Intermediate** - the drug works, but not as effective as an active drug.

In [22]:
bioactivity_class = []
for i in df2.standard_value:
  if float(i) >= 10000:
    bioactivity_class.append("inactive")
  elif float(i) <= 1000:
    bioactivity_class.append("active")
  else:
    bioactivity_class.append("intermediate")

### Iterate the molecule_chembl_id, canonical_smiles, and standard_value columns to a list

**molecule_chembl_id** - unique identification number of each molecule

In [25]:
df2.molecule_chembl_id

Unnamed: 0,molecule_chembl_id
0,CHEMBL319620
1,CHEMBL265470
2,CHEMBL265470
3,CHEMBL1234833
4,CHEMBL2205766
...,...
129,CHEMBL4787282
130,CHEMBL4745471
131,CHEMBL4787282
132,CHEMBL4745471


**canonical_smiles** - SMILES (Simplified Molecular Input Line Entry System) <br>
- standardized way of representing chemical structures as text strings <br>
- notation for describing molecules <br>
- "canonical" part ensures each unique molecule has only one possible SMILES representation for easier comparison

**Note:** In drug discovery, we aim to identify compounds with lowest IC50 value against the target protein. A lower IC50 indicates that a smaller amount of the compound is needed to achieve a significant inhibitory effect, which can be advantageous in terms of dosage and potential side effects.

In [29]:
selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
df3 = df2[selection]
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL319620,O=C(O)c1cc(NCc2cc(O)ccc2O)ccc1O,200.0
1,CHEMBL265470,CC(=O)O.O=C(O)c1ccc2c3c1cccc3c(=O)n1c3ccccc3nc21,0.04
2,CHEMBL265470,CC(=O)O.O=C(O)c1ccc2c3c1cccc3c(=O)n1c3ccccc3nc21,10.0
3,CHEMBL1234833,CC(C)c1cnn2c(NCc3ccccc3)cc(N[C@@H](CO)[C@H](O)...,2450.0
4,CHEMBL2205766,CC(C)(C)NS(=O)(=O)c1cncc(-c2ccn3nc(N)nc3c2)c1,10000.0
...,...,...,...
129,CHEMBL4787282,O=C(O)c1ccc(-c2coc3ncc(-c4ccccc4)cc23)cc1Cl,10000.0
130,CHEMBL4745471,Cc1cccc(-c2cnc3occ(-c4ccc(C(=O)O)c(C5CCCC5)c4)...,1600.0
131,CHEMBL4787282,O=C(O)c1ccc(-c2coc3ncc(-c4ccccc4)cc23)cc1Cl,27000.0
132,CHEMBL4745471,Cc1cccc(-c2cnc3occ(-c4ccc(C(=O)O)c(C5CCCC5)c4)...,30.0


In [30]:
# combine bioactivity column with the created dataframe
df3['bioactivity_class'] = bioactivity_class
df3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['bioactivity_class'] = bioactivity_class


Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class
0,CHEMBL319620,O=C(O)c1cc(NCc2cc(O)ccc2O)ccc1O,200.0,active
1,CHEMBL265470,CC(=O)O.O=C(O)c1ccc2c3c1cccc3c(=O)n1c3ccccc3nc21,0.04,active
2,CHEMBL265470,CC(=O)O.O=C(O)c1ccc2c3c1cccc3c(=O)n1c3ccccc3nc21,10.0,active
3,CHEMBL1234833,CC(C)c1cnn2c(NCc3ccccc3)cc(N[C@@H](CO)[C@H](O)...,2450.0,intermediate
4,CHEMBL2205766,CC(C)(C)NS(=O)(=O)c1cncc(-c2ccn3nc(N)nc3c2)c1,10000.0,inactive
...,...,...,...,...
129,CHEMBL4787282,O=C(O)c1ccc(-c2coc3ncc(-c4ccccc4)cc23)cc1Cl,10000.0,inactive
130,CHEMBL4745471,Cc1cccc(-c2cnc3occ(-c4ccc(C(=O)O)c(C5CCCC5)c4)...,1600.0,intermediate
131,CHEMBL4787282,O=C(O)c1ccc(-c2coc3ncc(-c4ccccc4)cc23)cc1Cl,27000.0,inactive
132,CHEMBL4745471,Cc1cccc(-c2cnc3occ(-c4ccc(C(=O)O)c(C5CCCC5)c4)...,30.0,active


### Save the preprocessed file

In [32]:
# save the preprocessed dataframe to a CSV file
df3.to_csv('CaMKK2_preprocessed_data.csv', index=False)

In [33]:
# add it to the google drive folder
! cp CaMKK2_preprocessed_data.csv "/content/gdrive/My Drive/Colab Notebooks/data/"

In [35]:
# check if the preprocessed file is in the data
! ls "/content/gdrive/My Drive/Colab Notebooks/data/"

CaMKK2_bioactivity_data.csv  CaMKK2_preprocessed_data.csv
