<a href="https://colab.research.google.com/github/pranavirohit/cancer-drug-discovery/blob/main/input_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; force_remount re-mounts
# even if a mount from an earlier run is still present.
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


**This notebook prepares the original ChEMBL cancer drug dataset for use in machine learning. A new dataset is created that contains only the SMILES code of each molecule (which represents the chemical structure of the molecule), together with the cancer drug status of each compound (`True` if it is a cancer drug, `False` otherwise).**

## ChEMBL

### Dataset 1 `chembl_all_1`

#### Adjustments
*   Added column to `chembl_all_1` dataset indicating whether compound was a cancer drug or not
*   Replaced the string "None" with NaN and dropped the rows containing missing values (ignored for now)



In [None]:
# Inspect which fields are available in the full ChEMBL compound table.
chembl_all_1.columns # Printing all column names.

Index(['ChEMBL ID', 'Name', 'Synonyms', 'Type', 'Max Phase',
       'Molecular Weight', 'Targets', 'Bioactivities', 'AlogP',
       'Polar Surface Area', 'HBA', 'HBD', '#RO5 Violations',
       '#Rotatable Bonds', 'Passes Ro3', 'QED Weighted', 'CX Acidic pKa',
       'CX Basic pKa', 'CX LogP', 'CX LogD', 'Aromatic Rings',
       'Structure Type', 'Inorganic Flag', 'Heavy Atoms', 'HBA (Lipinski)',
       'HBD (Lipinski)', '#RO5 Violations (Lipinski)',
       'Molecular Weight (Monoisotopic)', 'Molecular Species',
       'Molecular Formula', 'Smiles', 'Inchi Key'],
      dtype='object')

In [None]:
# Collect the compound IDs from the ChEMBL cancer dataset as a plain Python list.
# The list is used to flag which rows of chembl_all_1 (all ChEMBL compounds) are cancer drugs.
cancer_ids = chembl_cancer_1['ChEMBL ID'].tolist()

In [None]:
# Add a boolean column to chembl_all_1: True when the compound's ChEMBL ID appears in
# cancer_ids (i.e. it is a cancer drug), False otherwise.
# Series.isin performs a vectorized, hash-based membership test, so the Python list is not
# re-scanned for every one of the rows as `x in cancer_ids` inside `.apply` would do.
chembl_all_1['cancer_status'] = chembl_all_1['ChEMBL ID'].isin(cancer_ids)

#### New Files (1)
Created and saved file incorporating `cancer_status`.

##### File 1 `chembl_all_1_2`
Saved a new file that appended `cancer_status` to original chembl_all_1 dataset in Data folder, named `chembl_all_1_2.csv`
> File used for SMILES Embedding in [this Colab Notebook](https://colab.research.google.com/drive/16w1wCKgcfH3po46Eusn9eQ2d9b1vspIh?usp=share_link).

In [None]:
# Persist the labelled dataset (original columns + cancer_status) to Drive, then read it back
# so chembl_all_1_2 holds exactly what is stored on disk.
# DataFrame.to_csv returns None when it is given a path, so its result is not assigned.
chembl_all_1_2_path = '/content/drive/MyDrive/Data/chembl_all_1_2.csv'
chembl_all_1.to_csv(chembl_all_1_2_path, index = False)
chembl_all_1_2 = pd.read_csv(chembl_all_1_2_path)

# NOTE: the Drive direct-download URL below is kept for reference only. Reading it with
# pd.read_csv returns Google's "can't scan this file for viruses" HTML page instead of the
# CSV because the file is too large to be scanned, and to_csv cannot write to such a URL.
# chembl_all_1_2 = pd.read_csv('https://drive.google.com/uc?export=download&id=1K3X-VUOa2wHrpPspQr1u0CjvO9ffIbl4')

In [None]:
# Preview the frame that was read back from chembl_all_1_2.csv.
display(chembl_all_1_2)

Unnamed: 0,"<!DOCTYPE html><html><head><title>Google Drive - Virus scan warning</title><meta http-equiv=""content-type"" content=""text/html; charset=utf-8""/><style nonce=""Ri-wEjn3ssgcehrh2nJS5w"">.goog-inline-block{position:relative;display:-moz-inline-box;display:inline-block}* html .goog-inline-block{display:inline}*:first-child+html .goog-inline-block{display:inline}.goog-link-button{position:relative;color:#15c;text-decoration:underline;cursor:pointer}.goog-link-button-disabled{color:#ccc;text-decoration:none;cursor:default}body{color:#222;font:normal 13px/1.4 arial",sans-serif;margin:0}.grecaptcha-badge{visibility:hidden}.uc-main{padding-top:50px;text-align:center}#uc-dl-icon{display:inline-block;margin-top:16px;padding-right:1em;vertical-align:top}#uc-text{display:inline-block;max-width:68ex;text-align:left}.uc-error-caption,".uc-warning-caption{color:#222;font-size:16px}#uc-download-link{text-decoration:none}.uc-name-size a{color:#15c;text-decoration:none}.uc-name-size a:visited{color:#61c;text-decoration:none}.uc-name-size a:active{color:#d14836;text-decoration:none}.uc-footer{color:#777;font-size:11px;padding-bottom:5ex;padding-top:5ex;text-align:center}.uc-footer a{color:#15c}.uc-footer a:visited{color:#61c}.uc-footer a:active{color:#d14836}.uc-footer-divider{color:#ccc;width:100%}sentinel{}</style><link rel=""icon"" href=""//ssl.gstatic.com/docs/doclist/images/drive_2022q3_32dp.png""/></head><body><div class=""uc-main""><div id=""uc-dl-icon"" class=""image-container""><div class=""drive-sprite-aux-download-file""></div></div><div id=""uc-text""><p class=""uc-warning-caption"">Google Drive can't scan this file for viruses.</p><p class=""uc-warning-subcaption""><span class=""uc-name-size""><a href=""/open?id=1K3X-VUOa2wHrpPspQr1u0CjvO9ffIbl4"">chembl_all_1_2.csv</a> (513M)</span> is too large for Google to scan for viruses. 
Would you still like to download this file?</p><form id=""download-form"" action=""https://drive.google.com/uc?export=download&amp;id=1K3X-VUOa2wHrpPspQr1u0CjvO9ffIbl4&amp;confirm=t&amp;uuid=64feb4ba-ffd8-4269-b151-fb8041c18c85"" method=""post""><input type=""submit"" id=""uc-download-link"" class=""goog-inline-block jfk-button jfk-button-action"" value=""Download anyway""/></form></div></div><div class=""uc-footer""><hr class=""uc-footer-divider""></div></body></html>"


**Adjustments (Cont'd)**

In [None]:
# Show a handful of columns side by side to see how their values are stored.
chembl_all_1.loc[
    :,
    ['Aromatic Rings', 'Targets', 'Max Phase', 'Molecular Species'],
]

Unnamed: 0,Aromatic Rings,Targets,Max Phase,Molecular Species
0,3,,0,ACID
1,2,1.0,0,NEUTRAL
2,3,2.0,0,ACID
3,2,4.0,0,NEUTRAL
4,4,,0,BASE
...,...,...,...,...
2331695,3,3.0,0,NEUTRAL
2331696,3,1.0,0,NEUTRAL
2331697,4,6.0,0,NEUTRAL
2331698,2,2.0,0,NEUTRAL


In [None]:
# Columns kept for the numeric machine-learning table, plus the cancer_status label.
col_list_1 = [
    'Molecular Weight', 'Bioactivities',
    'HBA', 'HBD', '#RO5 Violations',
    '#Rotatable Bonds', 'QED Weighted', 'CX Acidic pKa',
    'CX Basic pKa', 'CX LogP', 'CX LogD', 'Aromatic Rings', 'Heavy Atoms', 'HBA (Lipinski)',
    'HBD (Lipinski)', '#RO5 Violations (Lipinski)',
    'Molecular Weight (Monoisotopic)', 'cancer_status',
]

# Select the columns, turn the placeholder string 'None' into NaN, and drop every row that
# has a missing value in any selected column (missing data is ignored for now).
chembl_all_1_nn = (
    chembl_all_1[col_list_1]
    .replace('None', np.nan)
    .dropna()
)

# Report each column's dtype as it is before conversion, then coerce string/object
# columns to numeric so the whole table ends up numeric (or bool for the label).
for column_name in col_list_1:
    column_dtype = chembl_all_1_nn[column_name].dtype
    print(column_name, " - ", column_dtype)
    if column_dtype in [str, object]:
        chembl_all_1_nn[column_name] = pd.to_numeric(chembl_all_1_nn[column_name])

Molecular Weight  -  float64
Bioactivities  -  float64
HBA  -  object
HBD  -  object
#RO5 Violations  -  object
#Rotatable Bonds  -  object
QED Weighted  -  object
CX Acidic pKa  -  object
CX Basic pKa  -  object
CX LogP  -  object
CX LogD  -  object
Aromatic Rings  -  object
Heavy Atoms  -  object
HBA (Lipinski)  -  object
HBD (Lipinski)  -  object
#RO5 Violations (Lipinski)  -  object
Molecular Weight (Monoisotopic)  -  object
cancer_status  -  bool


In [None]:
# Disabled variant of the col_list_1 cell that would keep every column plus cancer_status.
# The code is wrapped in a string literal, so none of it runs; the cell only echoes the string.
# NOTE(review): if re-enabled, pd.to_numeric would be applied to every object column,
# including text columns such as 'Name' and 'Smiles' — presumably that fails; confirm before use.
'''
col_list_2 = ['ChEMBL ID', 'Name', 'Synonyms', 'Type', 'Max Phase',
       'Molecular Weight', 'Targets', 'Bioactivities', 'AlogP',
       'Polar Surface Area', 'HBA', 'HBD', '#RO5 Violations',
       '#Rotatable Bonds', 'Passes Ro3', 'QED Weighted', 'CX Acidic pKa',
       'CX Basic pKa', 'CX LogP', 'CX LogD', 'Aromatic Rings',
       'Structure Type', 'Inorganic Flag', 'Heavy Atoms', 'HBA (Lipinski)',
       'HBD (Lipinski)', '#RO5 Violations (Lipinski)',
       'Molecular Weight (Monoisotopic)', 'Molecular Species',
       'Molecular Formula', 'Smiles', 'Inchi Key', 'cancer_status']

chembl_all_1_nn_2 = chembl_all_1[col_list_2]
# Replacing None with NaN.
chembl_all_1_nn_2 = chembl_all_1_nn_2.replace('None', np.nan)
# Ignoring NaN values for now, but will use in the future (when utilizing Smiles).
chembl_all_1_nn_2 = chembl_all_1_nn_2.dropna()

for i in col_list_2: # Iterating through all column titles, lists what type of data each column contains.
  print(i," - " , chembl_all_1_nn_2[i].dtype)
  if chembl_all_1_nn_2[i].dtype in [str,object]:
    chembl_all_1_nn_2[i] = pd.to_numeric(chembl_all_1_nn_2[i])
'''

'\ncol_list_2 = [\'ChEMBL ID\', \'Name\', \'Synonyms\', \'Type\', \'Max Phase\',\n       \'Molecular Weight\', \'Targets\', \'Bioactivities\', \'AlogP\',\n       \'Polar Surface Area\', \'HBA\', \'HBD\', \'#RO5 Violations\',\n       \'#Rotatable Bonds\', \'Passes Ro3\', \'QED Weighted\', \'CX Acidic pKa\',\n       \'CX Basic pKa\', \'CX LogP\', \'CX LogD\', \'Aromatic Rings\',\n       \'Structure Type\', \'Inorganic Flag\', \'Heavy Atoms\', \'HBA (Lipinski)\',\n       \'HBD (Lipinski)\', \'#RO5 Violations (Lipinski)\',\n       \'Molecular Weight (Monoisotopic)\', \'Molecular Species\',\n       \'Molecular Formula\', \'Smiles\', \'Inchi Key\', \'cancer_status\']\n\nchembl_all_1_nn_2 = chembl_all_1[col_list_2]\n# Replacing None with NaN.\nchembl_all_1_nn_2 = chembl_all_1_nn_2.replace(\'None\', np.nan)\n# Ignoring NaN values for now, but will use in the future (when utilizing Smiles).\nchembl_all_1_nn_2 = chembl_all_1_nn_2.dropna()\n\nfor i in col_list_2: # Iterating through all column ti

#### New Files (2)
Created and saved two more new files: Files 2 and 3 both stem from `col_list_1`.









##### File 2 `chembl_all_1_upd`
Saved a new file based on changes to the original chembl_all_1 dataset in the Data folder, named `chembl_all_1_prepared.csv`; it has no column headings and no index.
> File Juliane created and used for machine learning

In [None]:
# Save the cleaned numeric table (derived from chembl_all_1) to Drive.
# index = False and header = False write the values only: no row labels and no column names.
chembl_all_1_nn.to_csv('/content/drive/MyDrive/Data/chembl_all_1_prepared.csv', index = False, header = False)

In [None]:
# Read the header-less prepared file back; header=None makes pandas number the columns
# instead of treating the first data row as column names.
chembl_all_1_upd = pd.read_csv(
    '/content/drive/MyDrive/Data/chembl_all_1_prepared.csv',
    sep = ",",
    header = None,
)
display(chembl_all_1_upd)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,286.79,1.0,6,2,0,5,0.63,13.84,3.64,2.57,2.57,2,17,5,3,0,250.0888,False
1,712.85,1.0,10,11,2,16,0.07,4.08,10.49,-6.88,-8.95,0,50,19,14,3,712.4232,False
2,422.48,4.0,6,2,1,10,0.31,4.59,7.99,2.49,2.42,2,31,7,2,1,422.1842,False
3,454.05,60.0,3,1,1,8,0.60,13.88,8.48,6.34,5.22,2,31,3,1,1,417.2668,False
4,375.47,3.0,4,2,0,2,0.73,9.52,3.73,3.92,3.91,2,28,4,2,0,375.1834,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
736565,456.52,3.0,8,1,0,8,0.59,3.99,1.90,2.14,1.20,2,32,9,1,0,456.1467,False
736566,540.05,3.0,6,4,1,8,0.22,5.02,11.48,-0.75,-0.78,4,36,9,5,1,503.1627,False
736567,504.50,4.0,8,3,1,10,0.28,6.59,4.37,2.17,1.33,3,37,11,3,2,504.1645,False
736568,312.35,3.0,6,1,0,4,0.80,8.13,3.49,2.17,2.10,3,22,6,1,0,312.0681,False


##### File 3 `chembl_all_1_1`
Saved a new file based on changes to the original chembl_all_1 dataset in the Data folder, named `chembl_all_1_1.csv`; it has column headings and no index.
> File I created to understand data better/see context of data



In [None]:
# Save the same cleaned numeric table (derived from chembl_all_1) to Drive, this time
# keeping the column names (header = True) but still omitting the row index.
chembl_all_1_nn.to_csv('/content/drive/MyDrive/Data/chembl_all_1_1.csv', index = False, header = True)

In [None]:
# Read the file with column headings back from Drive and preview it.
chembl_all_1_1 = pd.read_csv(
    '/content/drive/MyDrive/Data/chembl_all_1_1.csv',
    sep = ",",
)
display(chembl_all_1_1)

Unnamed: 0,Molecular Weight,Bioactivities,HBA,HBD,#RO5 Violations,#Rotatable Bonds,QED Weighted,CX Acidic pKa,CX Basic pKa,CX LogP,CX LogD,Aromatic Rings,Heavy Atoms,HBA (Lipinski),HBD (Lipinski),#RO5 Violations (Lipinski),Molecular Weight (Monoisotopic),cancer_status
0,286.79,1.0,6,2,0,5,0.63,13.84,3.64,2.57,2.57,2,17,5,3,0,250.0888,False
1,712.85,1.0,10,11,2,16,0.07,4.08,10.49,-6.88,-8.95,0,50,19,14,3,712.4232,False
2,422.48,4.0,6,2,1,10,0.31,4.59,7.99,2.49,2.42,2,31,7,2,1,422.1842,False
3,454.05,60.0,3,1,1,8,0.60,13.88,8.48,6.34,5.22,2,31,3,1,1,417.2668,False
4,375.47,3.0,4,2,0,2,0.73,9.52,3.73,3.92,3.91,2,28,4,2,0,375.1834,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
736565,456.52,3.0,8,1,0,8,0.59,3.99,1.90,2.14,1.20,2,32,9,1,0,456.1467,False
736566,540.05,3.0,6,4,1,8,0.22,5.02,11.48,-0.75,-0.78,4,36,9,5,1,503.1627,False
736567,504.50,4.0,8,3,1,10,0.28,6.59,4.37,2.17,1.33,3,37,11,3,2,504.1645,False
736568,312.35,3.0,6,1,0,4,0.80,8.13,3.49,2.17,2.10,3,22,6,1,0,312.0681,False


#### Statistical Analysis

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of chembl_all_1.
chembl_all_1.describe()

Unnamed: 0,Max Phase,Molecular Weight,Targets,Bioactivities,Inorganic Flag
count,2331700.0,2308451.0,2235477.0,2235477.0,2331700.0
mean,0.0118107,432.9135,5.795968,8.848389,-0.9177454
std,0.1976898,264.795,13.47577,46.42491,0.2750596
min,0.0,4.0,1.0,1.0,-1.0
25%,0.0,323.77,1.0,2.0,-1.0
50%,0.0,391.46,3.0,4.0,-1.0
75%,0.0,473.55,6.0,8.0,-1.0
max,4.0,12546.32,1334.0,17911.0,1.0


#### Visualizations (1)

###### Molecular Weight

In [None]:
# sns.histplot(data = chembl_all_1, x = 'Molecular Weight')

In [None]:
# sns.histplot(data = chembl_all_1, x = 'Molecular Weight', hue = 'HBD')
# plt.xlim(-1000, 2000)

In [None]:
# sns.scatterplot(data = chembl_all_1, x = 'Molecular Weight')

###### Max Phase

In [None]:
# sns.histplot(data = chembl_all_1, x = 'Max Phase')

###### Targets

In [None]:
# sns.histplot(data = chembl_all_1, x = 'Targets')
# plt.xlim(0, 100)
# plt.ylim(0, 50000)

###### Bioactivities

In [None]:
# sns.violinplot(data = chembl_all_1, y = "Bioactivities", x = "cancer_status")
# plt.ylim(-1000, 5000)

###### Polar Surface Area

In [None]:
# sns.histplot(data = chembl_all_1, x = 'Polar Surface Area')

In [None]:
# # Plot the correlation between columns and cancer status
# print(chembl_all_1.shape)
# chembl_all_1_temp = chembl_all_1.dropna()
# print(chembl_all_1_temp.shape)
# corr, _ = stats.pearsonr(chembl_all_1_temp['Bioactivities'], chembl_all_1_temp['cancer_status'])
# print(corr)

### Dataset 2 `chembl_cancer_1`

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of chembl_cancer_1.
chembl_cancer_1.describe()

Unnamed: 0,Max Phase,Molecular Weight,Targets,Bioactivities,Inorganic Flag,Molecular Weight (Monoisotopic)
count,1577.0,1139.0,1001.0,1001.0,1577.0,1139.0
mean,2.96449,479.910272,154.095904,662.547453,0.015219,466.247447
std,1.063949,392.791286,233.16168,1282.650288,0.122461,375.746856
min,0.0,12.01,1.0,1.0,0.0,12.0
25%,2.0,302.305,10.0,29.0,0.0,296.05645
50%,3.0,414.74,42.0,158.0,0.0,408.1719
75%,4.0,526.065,212.0,805.0,0.0,510.75085
max,4.0,8203.91,1334.0,17911.0,1.0,7692.664
