<a href="https://colab.research.google.com/github/pranavirohit/cancer-drug-discovery/blob/main/data_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so the dataset files read below are reachable (Colab only).
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

**This notebook loads multiple datasets from public chemical libraries. Each dataset is labeled and loaded into a Pandas dataframe, and some of its features are explored through statistical analysis/plotting.**

## ChEMBL
All datasets are publicly available and directly from the ChEMBL visualization website (not the prepared download files).
*   `chembl_all_1` is from ChEMBL, selecting all compounds and all column properties
*   `chembl_cancer_1` is from ChEMBL, searching for cancer, then selecting all compounds and all column properties
*   `chembl_cancer_2` is from a publicly available dataset isolating cancer compounds from ChEMBL



### Dataset 1 `chembl_all_1`
Access the dataset [here](https://www.ebi.ac.uk/chembl/g/#search_results/all).

In [None]:
# Load the full ChEMBL compound export (semicolon-delimited) from Drive.
# low_memory = False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning this large
# export otherwise triggers on load.
chembl_all_1 = pd.read_csv(
    '/content/drive/MyDrive/Data/chembl_all_1.csv',
    sep = ";",
    low_memory = False,
)
display(chembl_all_1)

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,ChEMBL ID,Name,Synonyms,Type,Max Phase,Molecular Weight,Targets,Bioactivities,AlogP,Polar Surface Area,...,Inorganic Flag,Heavy Atoms,HBA (Lipinski),HBD (Lipinski),#RO5 Violations (Lipinski),Molecular Weight (Monoisotopic),Molecular Species,Molecular Formula,Smiles,Inchi Key
0,CHEMBL1206185,,,Small molecule,0,607.88,,,9.46,89.62,...,-1,42,5,3,2,607.279,ACID,C35H45NO4S2,CCCCCCCCCCC#CC(N)c1ccccc1-c1ccc(Sc2ccc(OCCCC)c...,UFBLKYIDZFRLPR-UHFFFAOYSA-N
1,CHEMBL539070,,,Small molecule,0,286.79,1.0,1.0,2.28,73.06,...,-1,17,5,3,0,250.0888,NEUTRAL,C11H15ClN4OS,CCCOc1ccccc1-c1nnc(NN)s1.Cl,WPEWNRKLKLNLSO-UHFFFAOYSA-N
2,CHEMBL3335528,,,Small molecule,0,842.80,2.0,6.0,0.18,269.57,...,-1,60,19,5,2,842.2633,ACID,C41H46O19,COC(=O)[C@H](O[C@@H]1O[C@@H](C)[C@@H](O)[C@@H]...,KGUJQZWYZPYYRZ-LWEWUKDVSA-N
3,CHEMBL2419030,,,Small molecule,0,359.33,4.0,4.0,3.94,85.13,...,-1,24,6,1,0,359.0551,NEUTRAL,C14H12F3N3O3S,O=c1nc(NC2CCCC2)sc2c([N+](=O)[O-])cc(C(F)(F)F)...,QGDMYSDFCXOKML-UHFFFAOYSA-N
4,CHEMBL4301448,,,Small molecule,0,465.55,,,5.09,105.28,...,-1,33,7,5,1,465.1635,BASE,C24H24FN5O2S,N=C(N)NCCCOc1ccc(CNc2nc3ccc(Oc4ccc(F)cc4)cc3s2...,RXTJPHLPHOZLFS-UHFFFAOYSA-N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2331695,CHEMBL2017916,,,Small molecule,0,312.35,3.0,3.0,2.86,77.00,...,-1,22,6,1,0,312.0681,NEUTRAL,C15H12N4O2S,COc1ccc(-c2nnc(NC(=O)c3cccnc3)s2)cc1,XIZUJGDKNPVNQA-UHFFFAOYSA-N
2331696,CHEMBL374652,,,Small molecule,0,403.83,1.0,1.0,5.98,36.02,...,-1,26,2,2,1,403.0421,NEUTRAL,C18H14ClF4NOS,CC(O)(CSc1ccc(F)cc1)c1cc2cc(Cl)c(C(F)(F)F)cc2[...,CRPQTBRTHURKII-UHFFFAOYSA-N
2331697,CHEMBL1416264,,,Small molecule,0,380.41,6.0,8.0,3.06,85.07,...,-1,27,7,1,0,380.0856,NEUTRAL,C18H13FN6OS,O=C(CSc1ccc2nnc(-c3cccnc3)n2n1)Nc1ccc(F)cc1,QVYIEKHEJKFNAT-UHFFFAOYSA-N
2331698,CHEMBL213734,,,Small molecule,0,288.26,2.0,3.0,2.32,101.70,...,-1,21,7,2,0,288.0746,NEUTRAL,C14H12N2O5,O=C(COc1ccccc1)Nc1ccc([N+](=O)[O-])cc1O,PZTWAHGBGTWVEB-UHFFFAOYSA-N


In [None]:
# Print every column name, then show only four descriptive columns as the cell output.
print(chembl_all_1.columns)
chembl_all_1.loc[:, ['Bioactivities', 'Type', 'Inorganic Flag', 'Molecular Species']]

Index(['ChEMBL ID', 'Name', 'Synonyms', 'Type', 'Max Phase',
       'Molecular Weight', 'Targets', 'Bioactivities', 'AlogP',
       'Polar Surface Area', 'HBA', 'HBD', '#RO5 Violations',
       '#Rotatable Bonds', 'Passes Ro3', 'QED Weighted', 'CX Acidic pKa',
       'CX Basic pKa', 'CX LogP', 'CX LogD', 'Aromatic Rings',
       'Structure Type', 'Inorganic Flag', 'Heavy Atoms', 'HBA (Lipinski)',
       'HBD (Lipinski)', '#RO5 Violations (Lipinski)',
       'Molecular Weight (Monoisotopic)', 'Molecular Species',
       'Molecular Formula', 'Smiles', 'Inchi Key'],
      dtype='object')


Unnamed: 0,Bioactivities,Type,Inorganic Flag,Molecular Species
0,,Small molecule,-1,ACID
1,1.0,Small molecule,-1,NEUTRAL
2,6.0,Small molecule,-1,ACID
3,4.0,Small molecule,-1,NEUTRAL
4,,Small molecule,-1,BASE
...,...,...,...,...
2331695,3.0,Small molecule,-1,NEUTRAL
2331696,1.0,Small molecule,-1,NEUTRAL
2331697,8.0,Small molecule,-1,NEUTRAL
2331698,3.0,Small molecule,-1,NEUTRAL


### Dataset 2 `chembl_cancer_1`
Access the dataset [here](https://www.ebi.ac.uk/chembl/g/#search_results/compounds/query=cancer).

In [None]:
# Read the semicolon-delimited ChEMBL "cancer" search export stored on Drive.
chembl_cancer_1 = pd.read_csv(
    '/content/drive/MyDrive/Data/chembl_cancer_1.csv',
    sep = ";",
)
display(chembl_cancer_1)

Unnamed: 0,ChEMBL ID,Name,Synonyms,Type,Max Phase,Molecular Weight,Targets,Bioactivities,AlogP,Polar Surface Area,...,Inorganic Flag,Heavy Atoms,HBA (Lipinski),HBD (Lipinski),#RO5 Violations (Lipinski),Molecular Weight (Monoisotopic),Molecular Species,Molecular Formula,Smiles,Inchi Key
0,CHEMBL1200675,TOREMIFENE CITRATE,FC-1157A|Fareston|NSC-613680|TOREMIFENE CITRATE,Small molecule,4,598.09,80.0,115.0,6.22,12.47,...,0,29,2,0,1,405.1859,BASE,C32H36ClNO8,CN(C)CCOc1ccc(/C(=C(/CCCl)c2ccccc2)c2ccccc2)cc...,IWEQQRMGNVVKQW-OQKDUQJOSA-N
1,CHEMBL506871,VELIPARIB,ABT-888|PARP-1 INHIBITOR ABT-888|VELIPARIB,Small molecule,3,244.30,614.0,724.0,1.26,83.80,...,0,18,5,4,0,244.1324,BASE,C13H16N4O,C[C@]1(c2nc3cccc(C(N)=O)c3[nH]2)CCCN1,JNAHVYVRKWKWKQ-CYBMUJFWSA-N
2,CHEMBL1096882,FLUDARABINE PHOSPHATE,2-F-ARA-AMP|2-F-ARAA|2-FLUORO ARA-A|2-FLUOROVI...,Small molecule,4,365.21,21.0,38.0,-1.72,186.07,...,0,24,12,6,2,365.0537,ACID,C10H13FN5O7P,Nc1nc(F)nc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)O)[C@...,GIUYCYHIANZCFB-FJFJXFQQSA-N
3,CHEMBL1200622,PARICALCITOL,COMPOUND 49510|COMPOUND-49510|PARICALCITOL|Zem...,Small molecule,4,416.65,10.0,42.0,5.56,60.69,...,0,30,3,3,1,416.3290,NEUTRAL,C27H44O3,C[C@@H](/C=C/[C@@H](C)[C@H]1CC[C@H]2/C(=C/C=C3...,BPKAHTKRCLCHEA-UBFJEZKGSA-N
4,CHEMBL1637,GEMCITABINE HYDROCHLORIDE,GEMCITABINE (AS HYDROCHLORIDE)|GEMCITABINE HCL...,Small molecule,4,299.66,20.0,57.0,-1.29,110.60,...,0,18,7,4,0,263.0718,NEUTRAL,C9H12ClF2N3O4,Cl.Nc1ccn([C@@H]2O[C@H](CO)[C@@H](O)C2(F)F)c(=...,OKKDEIYWILRZIA-OSZBKLCCSA-N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1572,CHEMBL1195136,SNX 5422,PF-04929113|PF-4929113|SNX 5422|SNX-5422,Small molecule,2,521.54,3.0,7.0,3.37,142.33,...,0,37,9,5,1,521.2250,NEUTRAL,C25H30F3N5O4,CC1(C)CC(=O)c2c(C(F)(F)F)nn(-c3ccc(C(N)=O)c(N[...,AVDSOVJPJZVBTC-CTYIDZIISA-N
1573,CHEMBL1680,OCTREOTIDE,OCTREOTIDE|Octreotide|SMS 201-995|SMS-201-995|...,Protein,4,1019.26,5.0,10.0,,,...,0,,,,,1018.4405,,C49H66N10O10S2,C[C@@H](O)[C@@H]1NC(=O)[C@H](CCCCN)NC(=O)[C@@H...,DEQANNDTNATYII-OULOTJBUSA-N
1574,CHEMBL4297181,SAMOTOLISIB,LY 3023414|LY-3023414|LY3023414|SAMOTOLISIB,Small molecule,2,406.49,2.0,4.0,3.21,82.17,...,0,30,7,1,0,406.2005,NEUTRAL,C23H26N4O3,CO[C@@H](C)Cn1c(=O)n(C)c2cnc3ccc(-c4cncc(C(C)(...,ACCFLVVUVBJNGT-AWEZNQCLSA-N
1575,CHEMBL4297142,CASPOFUNGIN ACETATE,CASPOFUNGIN ACETATE|CASPOFUNGIN DIACETATE|Canc...,Small molecule,4,1213.43,2.0,4.0,,,...,0,,,,,1092.6431,,C56H96N10O19,CC(=O)O.CC(=O)O.CCC(C)CC(C)CCCCCCCCC(=O)N[C@H]...,OGUJBRYAAJYXQP-LLXMLGLCSA-N


In [None]:
# List the column names of the ChEMBL cancer search export.
print(chembl_cancer_1.columns)

Index(['ChEMBL ID', 'Name', 'Synonyms', 'Type', 'Max Phase',
       'Molecular Weight', 'Targets', 'Bioactivities', 'AlogP',
       'Polar Surface Area', 'HBA', 'HBD', '#RO5 Violations',
       '#Rotatable Bonds', 'Passes Ro3', 'QED Weighted', 'CX Acidic pKa',
       'CX Basic pKa', 'CX LogP', 'CX LogD', 'Aromatic Rings',
       'Structure Type', 'Inorganic Flag', 'Heavy Atoms', 'HBA (Lipinski)',
       'HBD (Lipinski)', '#RO5 Violations (Lipinski)',
       'Molecular Weight (Monoisotopic)', 'Molecular Species',
       'Molecular Formula', 'Smiles', 'Inchi Key'],
      dtype='object')


### Dataset 3 `chembl_cancer_2`
Access the dataset [here](https://www.anticancerfund.org/en/redo-db).

In [None]:
# Read the tab-delimited export from Drive, decoding it as ISO-8859-1.
# NOTE(review): the preview shows an empty trailing `Unnamed: 16` column in every
# displayed row — presumably each line ends with a trailing tab; confirm before dropping it.
chembl_cancer_2 = pd.read_csv(
    '/content/drive/MyDrive/Data/chembl_cancer_2.txt',
    encoding = 'ISO-8859-1',
    sep = "\t",
)
display(chembl_cancer_2)

Unnamed: 0,Drug,Synonym,Main Indications,WHO,Off-Patent,Vitro,Vivo,Cases,Obs.,Trials,Human,Targets,Date Update,DrugBank,PubChem,PubMed,Unnamed: 16
0,Abacavir,1592U89,Anti-retroviral,Yes,Yes,Yes,No,Yes,No,Yes,Yes,pol; HLA-B;,25/10/2021,https://www.drugbank.ca/drugs/DB01048,https://pubchem.ncbi.nlm.nih.gov/compound/1364...,https://www.ncbi.nlm.nih.gov/pubmed/?term=(Aba...,
1,Acarbose,,Diabetes,No,Yes,No,Yes,No,No,Yes,Yes,MGAM; SI; AMY2A; GAA;,09/08/2022,https://www.drugbank.ca/drugs/DB00284,https://pubchem.ncbi.nlm.nih.gov/compound/5618...,https://www.ncbi.nlm.nih.gov/pubmed/?term=(Aca...,
2,Acetaminophen,Paracetamol,Analgesia,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,PTGES3; PTGS2; PTGS1; TRPV1;,13/07/2017,https://www.drugbank.ca/drugs/DB00316,https://pubchem.ncbi.nlm.nih.gov/compound/103-...,https://www.ncbi.nlm.nih.gov/pubmed/?term=(Ace...,
3,Acetazolamide,,"Glaucoma, diuretic, epilepsy",Yes,Yes,Yes,Yes,No,No,Yes,Yes,CA1; CA2; CA4; CA12; CA14; CA3; CA7; AQP1;,12/07/2017,https://www.drugbank.ca/drugs/DB00819,https://pubchem.ncbi.nlm.nih.gov/compound/59-66-5,https://www.ncbi.nlm.nih.gov/pubmed/?term=(Ace...,
4,Acetylsalicylic Acid,Aspirin,"Analgesia, swelling, prophylaxis of venous emb...",Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,PTGS1; PTGS2; AKR1C1; PRKAA1; EDNRA; TP53; HSP...,23/08/2017,https://www.drugbank.ca/drugs/DB00945,https://pubchem.ncbi.nlm.nih.gov/compound/50-78-2,https://www.ncbi.nlm.nih.gov/pubmed/?term=(Ace...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364,Warfarin,,"Prophylaxis of systemic embolism, of venous th...",Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,VKORC1; NR1I2;,04/01/2018,https://www.drugbank.ca/drugs/DB00682,https://pubchem.ncbi.nlm.nih.gov/compound/81-81-2,https://www.ncbi.nlm.nih.gov/pubmed/?term=(War...,
365,Yellow Fever vaccine,,Active immunization against yellow fever infec...,Yes,,Yes,Yes,No,Yes,No,Yes,,24/06/2021,https://www.drugbank.ca/drugs/DB10805,Not found in PubChem,https://www.ncbi.nlm.nih.gov/pubmed/?term=(Yel...,
366,Zidovudine,Azidothymidine,Anti-retroviral,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,pol; TERT;,22/02/2018,https://www.drugbank.ca/drugs/DB00495,https://pubchem.ncbi.nlm.nih.gov/compound/3051...,https://www.ncbi.nlm.nih.gov/pubmed/?term=(Zid...,
367,Zileuton,,Asthma prophylaxis,No,Yes,Yes,Yes,No,No,Yes,Yes,ALOX5;,22/01/2019,https://www.drugbank.ca/drugs/DB00744,https://pubchem.ncbi.nlm.nih.gov/compound/1114...,https://www.ncbi.nlm.nih.gov/pubmed/?term=(Zil...,


In [None]:
# List the column names of the `chembl_cancer_2` frame.
print(chembl_cancer_2.columns)

Index(['Drug', 'Synonym', 'Main Indications', 'WHO', 'Off-Patent', 'Vitro',
       'Vivo', 'Cases', 'Obs.', 'Trials', 'Human', 'Targets', 'Date Update',
       'DrugBank', 'PubChem', 'PubMed', 'Unnamed: 16'],
      dtype='object')


## Drug Repurposing Hub
All datasets are publicly available and were downloaded from the Drug Repurposing Hub visualization website or from its prepared download files.

*   `drug_rp_sample_1` is from Drug Repurposing Hub downloads page, downloading the linked "sample information" file
*   `drug_rp_compound_1` is from Drug Repurposing Hub downloads page, downloading the linked "drug information" file
> This is identical to selecting all clinical phases (for compounds) and all column properties, resulting in viewing 6798/6798 compounds

*   `drug_rp_cancer_3` is from Drug Repurposing Hub, searching for cancer, then selecting all compounds and all column properties
> Named `drug_rp_cancer_3` and not `drug_rp_cancer_1` or `drug_rp_cancer_2` due to issues with uploading the file (`drug_rp_cancer_1` is the CSV file, same content) and not selecting cancer drugs exclusively

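*   `drug_rp_noncancer_1` is loaded below as Dataset 4 (TODO: describe how this file was exported from the Drug Repurposing Hub)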

### Dataset 1 `drug_rp_sample_1`
Access the dataset [here](https://clue.io/repurposing#download-data).

In [None]:
# Read the tab-delimited sample file from Drive. header = 9 takes the column names
# from row index 9 and ignores the rows above it (presumably a metadata preamble — confirm).
drug_rp_sample_1 = pd.read_csv(
    '/content/drive/MyDrive/Data/drug_rp_sample_1.txt',
    sep = "\t",
    header = 9,
)
display(drug_rp_sample_1)

Unnamed: 0,broad_id,pert_iname,qc_incompatible,purity,vendor,catalog_no,vendor_name,expected_mass,smiles,InChIKey,pubchem_cid,deprecated_broad_id
0,BRD-K76022557-003-28-9,(R)-(-)-apomorphine,0,98.90,MedChemEx,HY-12723A,Apomorphine (hydrochloride hemihydrate),267.126,CN1CCc2cccc-3c2[C@H]1Cc1ccc(O)c(O)c-31,VMWNQDUVQKEIOC-CYBMUJFWSA-N,6005.0,
1,BRD-K76022557-003-02-7,(R)-(-)-apomorphine,0,97.34,Tocris,2073,(R)-(-)-Apomorphine hydrochloride,267.126,CN1CCc2cccc-3c2[C@H]1Cc1ccc(O)c(O)c-31,VMWNQDUVQKEIOC-CYBMUJFWSA-N,6005.0,
2,BRD-K76022557-003-29-9,(R)-(-)-apomorphine,0,97.36,Tocris,2073,(R)-(-)-Apomorphine hydrochloride,267.126,CN1CCc2cccc-3c2[C@H]1Cc1ccc(O)c(O)c-31,VMWNQDUVQKEIOC-CYBMUJFWSA-N,6005.0,
3,BRD-K76022557-001-03-9,(R)-(-)-apomorphine,0,95.80,Selleck,S4350,R-(-)-Apomorphine HCl Hemihydrate,267.126,CN1CCc2cccc-3c2[C@H]1Cc1ccc(O)c(O)c-31,VMWNQDUVQKEIOC-CYBMUJFWSA-N,6005.0,
4,BRD-K75516118-001-04-1,(R)-(-)-rolipram,0,93.92,Tocris,1349,(R)-(-)-Rolipram,275.152,COc1ccc(cc1OC1CCCC1)[C@@H]1CNC(=O)C1,HJORMJIFDVBMOB-LBPRGKRZSA-N,448055.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
13548,BRD-K63068307-001-10-9,ZSTK-474,0,96.15,MedChemEx,HY-50847,ZSTK474,417.172,FC(F)c1nc2ccccc2n1-c1nc(nc(n1)N1CCOCC1)N1CCOCC1,HGVNLRPZOWWDKD-UHFFFAOYSA-N,11647372.0,
13549,BRD-K63068307-001-08-9,ZSTK-474,0,98.66,Selleck,S1072,ZSTK474,417.172,FC(F)c1nc2ccccc2n1-c1nc(nc(n1)N1CCOCC1)N1CCOCC1,HGVNLRPZOWWDKD-UHFFFAOYSA-N,11647372.0,
13550,BRD-K28761384-001-10-9,zuclopenthixol,0,94.95,MedChemEx,HY-A0163,Zuclopenthixol,400.138,OCCN1CCN(CC\C=C2\c3ccccc3Sc3ccc(Cl)cc23)CC1,WFPIAZLQTJBIFN-DVZOWYKESA-N,5311507.0,
13551,BRD-K28761384-300-01-0,zuclopenthixol,0,97.19,Prestwick,Prestw-998,Zuclopenthixol dihydrochloride,400.138,OCCN1CCN(CC\C=C2\c3ccccc3Sc3ccc(Cl)cc23)CC1,WFPIAZLQTJBIFN-DVZOWYKESA-N,5311507.0,


In [None]:
# List the column names of the `drug_rp_sample_1` frame.
print(drug_rp_sample_1.columns)

Index(['broad_id', 'pert_iname', 'qc_incompatible', 'purity', 'vendor',
       'catalog_no', 'vendor_name', 'expected_mass', 'smiles', 'InChIKey',
       'pubchem_cid', 'deprecated_broad_id'],
      dtype='object')


### Dataset 2 `drug_rp_compound_1`
Access the dataset [here](https://clue.io/repurposing#download-data).

In [None]:
# Read the tab-delimited compound file from Drive. header = 9 takes the column names
# from row index 9 and ignores the rows above it (presumably a metadata preamble — confirm).
drug_rp_compound_1 = pd.read_csv(
    '/content/drive/MyDrive/Data/drug_rp_compound_1.txt',
    sep = "\t",
    header = 9,
)
display(drug_rp_compound_1)

Unnamed: 0,pert_iname,clinical_phase,moa,target,disease_area,indication
0,(R)-(-)-apomorphine,Launched,dopamine receptor agonist,ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|...,neurology/psychiatry,Parkinson's Disease
1,(R)-(-)-rolipram,Phase 1,phosphodiesterase inhibitor,PDE4A|PDE4B|PDE4C|PDE4D|PDE5A,,
2,(R)-baclofen,Phase 3,benzodiazepine receptor agonist,GABBR1|GABBR2,,
3,(S)-(+)-rolipram,Phase 1,phosphodiesterase inhibitor,PDE4B|PDE4D,,
4,"[sar9,met(o2)11]-substance-p",Preclinical,tachykinin antagonist,TACR1,,
...,...,...,...,...,...,...
6793,8-M-PDOT,Preclinical,melatonin receptor agonist,MTNR1A|MTNR1B,,
6794,80841-78-7,Preclinical,,,,
6795,9-aminoacridine,Preclinical,,,,
6796,9-aminocamptothecin,Phase 2,topoisomerase inhibitor,TOP1,,


### Dataset 3 `drug_rp_cancer_3`
Access the dataset [here](https://clue.io/repurposing-app).

In [None]:
# Read the tab-delimited cancer export from Drive. The frame is named
# `drug_rp_cancer_3` even though the file on Drive is `drug_rp_cancer_1.txt`.
drug_rp_cancer_3 = pd.read_csv(
    '/content/drive/MyDrive/Data/drug_rp_cancer_1.txt',
    sep = '\t',
)
display(drug_rp_cancer_3)

Unnamed: 0,Name,MOA,Target,Disease Area,Indication,Vendor,Purity,Id,SMILES,InChIKey,Phase,Deprecated ID
0,abemaciclib,CDK inhibitor,"CDK4, CDK6",oncology,breast cancer,"Selleck, Selleck, Selleck, Selleck, Selleck, S...",94.43,"BRD-K33622447-066-01-9, BRD-K33622447-066-01-9...",CCN1CCN(Cc2ccc(Nc3ncc(F)c(n3)-c3cc(F)c4nc(C)n(...,"UZWDCWONPYILKI-UHFFFAOYSA-N, UZWDCWONPYILKI-UH...",Launched,
1,abiraterone,androgen biosynthesis inhibitor,"CYP11B1, CYP17A1",oncology,prostate cancer,"MedChemEx, Selleck, Selleck",92.23,"BRD-K50071428-001-04-9, BRD-K50071428-001-03-3...",C[C@]12CC[C@H]3[C@@H](CC=C4C[C@@H](O)CC[C@]34C...,"GZOSMCIZMLWJML-VJLLXTKPSA-N, GZOSMCIZMLWJML-VJ...",Launched,"BRD-K00111504-001-01-9, BRD-K00111504-001-01-9"
2,abiraterone-acetate,androgen biosynthesis inhibitor,CYP17A1,oncology,prostate cancer,"MedChemEx, Selleck, Selleck, Selleck, Selleck",98.65,"BRD-K24048528-001-03-9, BRD-K24048528-001-01-7...",CC(=O)O[C@H]1CC[C@]2(C)[C@H]3CC[C@@]4(C)[C@@H]...,"UVIQSJCZCSLXRZ-UBUQANBQSA-N, UVIQSJCZCSLXRZ-UB...",Launched,"BRD-K16133773-001-01-9, BRD-K16133773-001-01-9"
3,acelarin,anticancer agent,,,,MedChemEx,97.41,BRD-K00003531-001-01-9,C[C@H](N[P@@](=O)(OC[C@H]1O[C@@H](n2ccc(N)nc2=...,NHTKGYOMICWFQZ-LHFSRKHSSA-N,Phase 2/Phase 3,
4,afatinib,EGFR inhibitor,"EGFR, ERBB2, ERBB4",oncology,non-small cell lung cancer (NSCLC),"MedChemEx, MedChemEx, MedChemEx, Selleck, Sell...",99.28,"BRD-K66175015-332-01-6, BRD-K66175015-332-01-6...",CN(C)C\C=C\C(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2...,"ULXXDDBFHOBEHA-CWDCEQMOSA-N, ULXXDDBFHOBEHA-CW...",Launched,
...,...,...,...,...,...,...,...,...,...,...,...,...
115,vindesine,tubulin polymerization inhibitor,"TUBB, TUBB1",oncology,"breast cancer, non-small cell lung cancer (NSC...","AvaChem, AvaChem, AvaChem, AvaChem",78.54,"BRD-K59753975-001-02-6, BRD-K59753975-001-02-6...",CC[C@]1(O)C[C@H]2CN(C1)CCc1c([nH]c3ccccc13)[C@...,"HHJUWIANJFBDHT-KOTLKJBCSA-N, HHJUWIANJFBDHT-KO...",Launched,
116,vinflunine,microtubule inhibitor,,oncology,bladder cancer,"Selleck, Selleck, AvaChem, AvaChem",95.66,"BRD-K64120610-046-01-9, BRD-K64120610-046-01-9...",CC[C@@]12C=CCN3CC[C@@]4([C@H]13)[C@@H](N(C)c1c...,"NOSAJPUYIASWEH-USRBKURZSA-N, NOSAJPUYIASWEH-US...",Launched,"BRD-A52946717-045-01-7, BRD-A52946717-045-01-7..."
117,vinorelbine,tubulin polymerization inhibitor,"TUBA1A, TUBA1B, TUBA1C, TUBA3C, TUBA3D, TUBA3E...",oncology,non-small cell lung cancer (NSCLC),"MedChemEx, MedChemEx",27.23,"BRD-K04269837-346-01-8, BRD-K04269837-346-03-9",CCC1=C[C@@H]2C[N@](C1)Cc1c([nH]c3ccccc13)[C@@]...,"GBABOYUKABKIAF-IELIFDKJSA-N, GBABOYUKABKIAF-IE...",Launched,
118,2-deoxy-2-([methyl(nitroso)carbamoyl]amino)hexose,,,"oncology, endocrinology","pancreatic cancer, hypoglycemia",Labotest,0.00,BRD-A55902763-001-01-3,CN([NH2+][O-])C(=O)N[C@@H](C=O)[C@@H](O)[C@@H]...,VUVHIMLGTHRGQZ-YTLHQDLWSA-N,Preclinical,


### Dataset 4 `drug_rp_noncancer_1`
Access the dataset [here](https://clue.io/repurposing-app).


In [None]:
# Read the tab-delimited non-cancer export from Drive.
# NOTE(review): the rows previewed for this frame match the rows previewed for the
# cancer export (e.g. abemaciclib / oncology) — confirm the Drive file really holds
# the non-cancer selection.
drug_rp_noncancer_1 = pd.read_csv(
    '/content/drive/MyDrive/Data/drug_rp_noncancer_1.txt',
    sep = '\t',
)
display(drug_rp_noncancer_1)

Unnamed: 0,Name,MOA,Target,Disease Area,Indication,Vendor,Purity,Id,SMILES,InChIKey,Phase,Deprecated ID
0,abemaciclib,CDK inhibitor,"CDK4, CDK6",oncology,breast cancer,"Selleck, Selleck, Selleck, Selleck, Selleck, S...",94.43,"BRD-K33622447-066-01-9, BRD-K33622447-066-01-9...",CCN1CCN(Cc2ccc(Nc3ncc(F)c(n3)-c3cc(F)c4nc(C)n(...,"UZWDCWONPYILKI-UHFFFAOYSA-N, UZWDCWONPYILKI-UH...",Launched,
1,abiraterone,androgen biosynthesis inhibitor,"CYP11B1, CYP17A1",oncology,prostate cancer,"MedChemEx, Selleck, Selleck",92.23,"BRD-K50071428-001-04-9, BRD-K50071428-001-03-3...",C[C@]12CC[C@H]3[C@@H](CC=C4C[C@@H](O)CC[C@]34C...,"GZOSMCIZMLWJML-VJLLXTKPSA-N, GZOSMCIZMLWJML-VJ...",Launched,"BRD-K00111504-001-01-9, BRD-K00111504-001-01-9"
2,abiraterone-acetate,androgen biosynthesis inhibitor,CYP17A1,oncology,prostate cancer,"MedChemEx, Selleck, Selleck, Selleck, Selleck",98.65,"BRD-K24048528-001-03-9, BRD-K24048528-001-01-7...",CC(=O)O[C@H]1CC[C@]2(C)[C@H]3CC[C@@]4(C)[C@@H]...,"UVIQSJCZCSLXRZ-UBUQANBQSA-N, UVIQSJCZCSLXRZ-UB...",Launched,"BRD-K16133773-001-01-9, BRD-K16133773-001-01-9"
3,acelarin,anticancer agent,,,,MedChemEx,97.41,BRD-K00003531-001-01-9,C[C@H](N[P@@](=O)(OC[C@H]1O[C@@H](n2ccc(N)nc2=...,NHTKGYOMICWFQZ-LHFSRKHSSA-N,Phase 2/Phase 3,
4,afatinib,EGFR inhibitor,"EGFR, ERBB2, ERBB4",oncology,non-small cell lung cancer (NSCLC),"MedChemEx, MedChemEx, MedChemEx, Selleck, Sell...",99.28,"BRD-K66175015-332-01-6, BRD-K66175015-332-01-6...",CN(C)C\C=C\C(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2...,"ULXXDDBFHOBEHA-CWDCEQMOSA-N, ULXXDDBFHOBEHA-CW...",Launched,
...,...,...,...,...,...,...,...,...,...,...,...,...
115,vindesine,tubulin polymerization inhibitor,"TUBB, TUBB1",oncology,"breast cancer, non-small cell lung cancer (NSC...","AvaChem, AvaChem, AvaChem, AvaChem",78.54,"BRD-K59753975-001-02-6, BRD-K59753975-001-02-6...",CC[C@]1(O)C[C@H]2CN(C1)CCc1c([nH]c3ccccc13)[C@...,"HHJUWIANJFBDHT-KOTLKJBCSA-N, HHJUWIANJFBDHT-KO...",Launched,
116,vinflunine,microtubule inhibitor,,oncology,bladder cancer,"Selleck, Selleck, AvaChem, AvaChem",95.66,"BRD-K64120610-046-01-9, BRD-K64120610-046-01-9...",CC[C@@]12C=CCN3CC[C@@]4([C@H]13)[C@@H](N(C)c1c...,"NOSAJPUYIASWEH-USRBKURZSA-N, NOSAJPUYIASWEH-US...",Launched,"BRD-A52946717-045-01-7, BRD-A52946717-045-01-7..."
117,vinorelbine,tubulin polymerization inhibitor,"TUBA1A, TUBA1B, TUBA1C, TUBA3C, TUBA3D, TUBA3E...",oncology,non-small cell lung cancer (NSCLC),"MedChemEx, MedChemEx",27.23,"BRD-K04269837-346-01-8, BRD-K04269837-346-03-9",CCC1=C[C@@H]2C[N@](C1)Cc1c([nH]c3ccccc13)[C@@]...,"GBABOYUKABKIAF-IELIFDKJSA-N, GBABOYUKABKIAF-IE...",Launched,
118,2-deoxy-2-([methyl(nitroso)carbamoyl]amino)hexose,,,"oncology, endocrinology","pancreatic cancer, hypoglycemia",Labotest,0.00,BRD-A55902763-001-01-3,CN([NH2+][O-])C(=O)N[C@@H](C=O)[C@@H](O)[C@@H]...,VUVHIMLGTHRGQZ-YTLHQDLWSA-N,Preclinical,


## PubChem
All datasets are publicly available and were downloaded from the PubChem visualization website or from its prepared download files.

*   `pubchem_cancer_1` is from PubChem, searching for cancer, selecting compounds and exporting the dataset through the provided service

### Dataset 1 `pubchem_cancer_1`
Access this dataset [here](https://pubchem.ncbi.nlm.nih.gov/#query=cancer).

In [None]:
# Read the comma-delimited PubChem "cancer" query export stored on Drive.
pubchem_cancer_1 = pd.read_csv(
    '/content/drive/MyDrive/Data/pubchem_cancer_1.csv',
    sep = ',',
)
display(pubchem_cancer_1)

Unnamed: 0,cid,cmpdname,cmpdsynonym,mw,mf,polararea,complexity,xlogp,heavycnt,hbonddonor,...,inchikey,iupacname,meshheadings,annothits,annothitcnt,aids,cidcdate,sidsrcname,depcatg,annotation
0,2719,Chloroquine,chloroquine|54-05-7|Aralen|Chlorochin|Chloraqu...,319.90,C18H26ClN3,28.2,309.0,4.6,22,1,...,WHTVZRBIWZFKQO-UHFFFAOYSA-N,"4-N-(7-chloroquinolin-4-yl)-1-N,1-N-diethylpen...",Chloroquine,Biological Test Results|Chemical and Physical ...,12,179|248|256|1332|2660|2666|2667|2668|8665|1746...,20050325,001Chemical|3B Scientific (Wuhan) Corp|3WAY PH...,Chemical Vendors|Curation Efforts|Governmental...,"COVID-19, COVID19, Coronavirus, Corona-virus, ..."
1,447043,Azithromycin,azithromycin|Zithromax|83905-01-5|Sumamed|Hemo...,749.00,C38H72N2O12,180.0,1150.0,4.0,52,5,...,MQTOSJVFKKJCRP-BICOPXKESA-N,"(2R,3S,4R,5R,8R,10R,11R,12S,13S,14R)-11-[(2S,3...",Azithromycin,Biological Test Results|Chemical and Physical ...,12,421|426|427|433|434|435|445|530|540|541|542|54...,20050624,3B Scientific (Wuhan) Corp|A2B Chem|AA BLOCKS|...,Chemical Vendors|Curation Efforts|Governmental...,"COVID-19, COVID19, Coronavirus, Corona-virus, ..."
2,5743,Dexamethasone,dexamethasone|50-02-2|Decadron|Maxidex|Dexamet...,392.50,C22H29FO5,94.8,805.0,1.9,28,3,...,UREBDLICKHMUKA-CXSFZGCWSA-N,"(8S,9R,10S,11S,13S,14S,16R,17R)-9-fluoro-11,17...",Dexamethasone,Biological Test Results|Chemical and Physical ...,13,1|3|5|9|13|15|17|19|21|23|25|27|29|31|33|35|37...,20050326,"3WAY PHARM INC|A&J Pharmtech CO., LTD.|A2B Che...",Chemical Vendors|Curation Efforts|Governmental...,"COVID-19, COVID19, Coronavirus, Corona-virus, ..."
3,6741,Methylprednisolone,methylprednisolone|83-43-2|Medrol|Metilprednis...,374.50,C22H30O5,94.8,754.0,1.9,27,3,...,VHRSUDSXCMQTMA-PJHHCJLFSA-N,"(6S,8S,9S,10R,11S,13S,14S,17R)-11,17-dihydroxy...",Methylprednisolone,Biological Test Results|Chemical and Physical ...,12,1|3|7|9|11|13|15|19|23|25|27|29|31|33|35|37|39...,20050326,3B Scientific (Wuhan) Corp|A2B Chem|AA BLOCKS|...,Chemical Vendors|Curation Efforts|Governmental...,"COVID-19, COVID19, Coronavirus, Corona-virus, ..."
4,3652,Hydroxychloroquine,hydroxychloroquine|118-42-3|Plaquenil|Oxichlor...,335.90,C18H26ClN3O,48.4,331.0,3.6,23,2,...,XXSMGPRMXLTPCZ-UHFFFAOYSA-N,2-[4-[(7-chloroquinolin-4-yl)amino]pentyl-ethy...,Hydroxychloroquine,Biological Test Results|Chemical and Physical ...,11,880|881|894|1030|1195|1379|1452|1457|1460|1463...,20050325,001Chemical|3B Scientific (Wuhan) Corp|3WAY PH...,Chemical Vendors|Curation Efforts|Governmental...,"COVID-19, COVID19, Coronavirus, Corona-virus, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55943,135935333,"3-phenyl-6H,7H-[1,2]thiazolo[4,5-d]pyrimidin-7...","3-phenyl-6H,7H-[1,2]thiazolo[4,5-d]pyrimidin-7...",229.26,C11H7N3OS,82.6,314.0,1.8,16,1,...,PYERYCKBJGGKEV-UHFFFAOYSA-N,"3-phenyl-6H-[1,2]thiazolo[4,5-d]pyrimidin-7-one",,Biological Test Results|Patents,2,1645853|1645854|1645855|1645856|1645857|164585...,20190119,AbaChemScene|AKos Consulting & Solutions|Ambin...,Chemical Vendors|Curation Efforts|Governmental...,
55944,139587882,Rhizoxin S1,Rhizoxin S1,613.70,C34H47NO9,163.0,1140.0,4.8,44,4,...,PXJPQCKYDGVLKQ-RSIKLJMGSA-N,"2-[(1R,2S,4S,7E,10R,12R,13R,14E,16R)-2,12-dihy...",,Biological Test Results|Classification|Literat...,4,1645853|1645854|1645855|1645856|1645857|164585...,20191104,"Buhrlage Lab, Dana-Farber Cancer Institute and...",Curation Efforts|Research and Development,
55945,139589220,5-[[5-Amino-1-[[10-(3-amino-3-oxopropyl)-4-but...,SNA-60-367-23,1489.80,C75H116N12O19,493.0,2870.0,5.1,106,15,...,UUXTVWJALWTUCH-UHFFFAOYSA-N,5-[[5-amino-1-[[10-(3-amino-3-oxopropyl)-4-but...,,Biological Test Results|Classification|Literat...,4,1645853|1645854|1645855|1645856|1645857|164585...,20191104,"Buhrlage Lab, Dana-Farber Cancer Institute and...",Curation Efforts|Research and Development,
55946,155900628,"(E)-2-hydroxy-N-[(4E,8E)-3-hydroxy-9-methyl-1-...",BS-1096,726.00,C41H75NO9,169.0,942.0,9.9,51,7,...,SUBYBSQARMSYNW-WXRURVHTSA-N,"(E)-2-hydroxy-N-[(4E,8E)-3-hydroxy-9-methyl-1-...",,Biological Test Results,1,1645853|1645854|1645855|1645856|1645857|164585...,20210428,"Buhrlage Lab, Dana-Farber Cancer Institute and...",Chemical Vendors|Research and Development,


**Column Key** `pubchem_cancer_1`
1.   Compound ID `cid`
2.   Primary Compound Name `cmpdname`
3.   Secondary Compound Names `cmpdsynonym`
4.   Molecular Weight `mw`
5.   Molecular Formula `mf`
6.   Polar Area `polararea`
7.   Complexity `complexity`
8.   Computed Octanol-Water Partition Coefficient (XLogP) `xlogp`
9.   Heavy Atom Count `heavycnt`
10.   Hydrogen Bond Donors `hbonddonor`
11.   Hydrogen Bond Acceptors `hbondacc`
12.   Rotatable Bonds `rotbonds`
13.   InChI Identifier `inchi`
14.   Isomeric SMILES `isosmiles`
15.   TODO: describe the columns from `inchikey` onward (see the column list below)

 'cmpdname',
 'cmpdsynonym',
 'mw',
 'mf',
 'polararea',
 'complexity',
 'xlogp',
 'heavycnt',
 'hbonddonor',
 'hbondacc',
 'rotbonds',
 'inchi',
 'isosmiles',
 'inchikey',
 'iupacname',
 'meshheadings',
 'annothits',
 'annothitcnt',
 'aids',
 'cidcdate',
 'sidsrcname',
 'depcatg',
 'annotation'




In [None]:
pubchem_cancer_1.columns.to_list() # Returns the column names as a plain Python list; shown as the cell output (nothing is printed).

['cid',
 'cmpdname',
 'cmpdsynonym',
 'mw',
 'mf',
 'polararea',
 'complexity',
 'xlogp',
 'heavycnt',
 'hbonddonor',
 'hbondacc',
 'rotbonds',
 'inchi',
 'isosmiles',
 'inchikey',
 'iupacname',
 'meshheadings',
 'annothits',
 'annothitcnt',
 'aids',
 'cidcdate',
 'sidsrcname',
 'depcatg',
 'annotation']

# Modeling

## SMILES Embedding

Reference: https://towardsdatascience.com/basic-molecular-representation-for-machine-learning-b6be52e9ff76

In [None]:
# define SMILES characters ----------------------------------------------------
# The blank ' ' sits at index 0, so an all-zero (padding) row decodes to a space.
SMILES_CHARS = [' ',
                '#', '%', '(', ')', '+', '-', '.', '/',
                '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                '=', '@',
                'A', 'B', 'C', 'F', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P',
                'R', 'S', 'T', 'V', 'X', 'Z',
                '[', '\\', ']',
                'a', 'b', 'c', 'e', 'g', 'i', 'l', 'n', 'o', 'p', 'r', 's',
                't', 'u']

# define encoder and decoder --------------------------------------------------
smi2index = {c: i for i, c in enumerate(SMILES_CHARS)}
index2smi = {i: c for i, c in enumerate(SMILES_CHARS)}

def smiles_encoder(smiles, maxlen=120):
    """One-hot encode a SMILES string.

    Args:
        smiles: iterable of single characters (normally a ``str``); every
            character must appear in ``SMILES_CHARS``.
        maxlen: number of rows in the output; positions past the end of
            ``smiles`` are left as all-zero padding rows.

    Returns:
        A float array of shape ``(maxlen, len(SMILES_CHARS))`` in which row
        ``i`` has a single 1 in the column of the ``i``-th character.

    Raises:
        IndexError: if ``smiles`` has more than ``maxlen`` characters.
        KeyError: if ``smiles`` contains a character outside ``SMILES_CHARS``.
    """
    chars = list(smiles)

    # Fail before allocating anything, and say what is wrong, instead of
    # surfacing a bare out-of-bounds / missing-key error from inside the fill.
    if len(chars) > maxlen:
        raise IndexError(
            f"SMILES has {len(chars)} characters but maxlen is {maxlen}"
        )
    unknown = sorted({c for c in chars if c not in smi2index})
    if unknown:
        raise KeyError(f"characters not in SMILES_CHARS: {unknown}")

    X = np.zeros((maxlen, len(SMILES_CHARS)))
    # Explicit integer dtype keeps the fancy index valid for an empty string.
    columns = np.array([smi2index[c] for c in chars], dtype=np.intp)
    X[np.arange(len(chars)), columns] = 1
    return X

def smiles_decoder( X ):
    """Decode a one-hot matrix back into a string.

    Each row is mapped to the character at its argmax column. All-zero padding
    rows have argmax 0, which is the blank, so the result keeps one trailing
    space per padding row.

    Args:
        X: 2-D array of shape ``(length, len(SMILES_CHARS))``.

    Returns:
        The decoded string, ``length`` characters long.
    """
    return ''.join(index2smi[i] for i in X.argmax(axis=-1))

In [None]:
# Subset dataframe just for now so things compute faster
display(chembl_all_1)

N_NON_CANCER_SAMPLE = 10000  # non-cancer rows kept in the working subset
SAMPLE_SEED = 0              # fixed seed so a fresh run draws the same subset

# NOTE(review): no cell visible above this one creates `cancer_status` — confirm the
# cell that adds it runs first, otherwise this cell only works from leftover kernel state.
if 'cancer_status' not in chembl_all_1.columns:
    raise KeyError("chembl_all_1 has no 'cancer_status' column; run the cell that creates it first")

# Keep every cancer row plus a reproducible random sample of the non-cancer rows.
cas1 = chembl_all_1[chembl_all_1.cancer_status == False].sample(N_NON_CANCER_SAMPLE, random_state = SAMPLE_SEED)
cas2 = chembl_all_1[chembl_all_1.cancer_status == True]
chembl_all_1_smiles = pd.concat([cas1, cas2])

# Treat the literal string 'nan' as missing, then drop rows without a SMILES string
# and keep only the label and the SMILES column.
chembl_all_1_smiles['Smiles'] = chembl_all_1_smiles['Smiles'].replace('nan', np.nan)
chembl_all_1_smiles = chembl_all_1_smiles.loc[
    chembl_all_1_smiles['Smiles'].notna(), ['cancer_status', 'Smiles']
]

chembl_all_1_smiles.to_csv('/content/drive/MyDrive/Data/chembl_all_1_smiles.csv', index = False)

# Last expression so the subset's shape is actually shown as the cell output.
chembl_all_1_smiles.shape

Unnamed: 0,ChEMBL ID,Name,Synonyms,Type,Max Phase,Molecular Weight,Targets,Bioactivities,AlogP,Polar Surface Area,...,Heavy Atoms,HBA (Lipinski),HBD (Lipinski),#RO5 Violations (Lipinski),Molecular Weight (Monoisotopic),Molecular Species,Molecular Formula,Smiles,Inchi Key,cancer_status
0,CHEMBL1206185,,,Small molecule,0,607.88,,,9.46,89.62,...,42,5,3,2,607.279,ACID,C35H45NO4S2,CCCCCCCCCCC#CC(N)c1ccccc1-c1ccc(Sc2ccc(OCCCC)c...,UFBLKYIDZFRLPR-UHFFFAOYSA-N,False
1,CHEMBL539070,,,Small molecule,0,286.79,1.0,1.0,2.28,73.06,...,17,5,3,0,250.0888,NEUTRAL,C11H15ClN4OS,CCCOc1ccccc1-c1nnc(NN)s1.Cl,WPEWNRKLKLNLSO-UHFFFAOYSA-N,False
2,CHEMBL3335528,,,Small molecule,0,842.80,2.0,6.0,0.18,269.57,...,60,19,5,2,842.2633,ACID,C41H46O19,COC(=O)[C@H](O[C@@H]1O[C@@H](C)[C@@H](O)[C@@H]...,KGUJQZWYZPYYRZ-LWEWUKDVSA-N,False
3,CHEMBL2419030,,,Small molecule,0,359.33,4.0,4.0,3.94,85.13,...,24,6,1,0,359.0551,NEUTRAL,C14H12F3N3O3S,O=c1nc(NC2CCCC2)sc2c([N+](=O)[O-])cc(C(F)(F)F)...,QGDMYSDFCXOKML-UHFFFAOYSA-N,False
4,CHEMBL4301448,,,Small molecule,0,465.55,,,5.09,105.28,...,33,7,5,1,465.1635,BASE,C24H24FN5O2S,N=C(N)NCCCOc1ccc(CNc2nc3ccc(Oc4ccc(F)cc4)cc3s2...,RXTJPHLPHOZLFS-UHFFFAOYSA-N,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2331695,CHEMBL2017916,,,Small molecule,0,312.35,3.0,3.0,2.86,77.00,...,22,6,1,0,312.0681,NEUTRAL,C15H12N4O2S,COc1ccc(-c2nnc(NC(=O)c3cccnc3)s2)cc1,XIZUJGDKNPVNQA-UHFFFAOYSA-N,False
2331696,CHEMBL374652,,,Small molecule,0,403.83,1.0,1.0,5.98,36.02,...,26,2,2,1,403.0421,NEUTRAL,C18H14ClF4NOS,CC(O)(CSc1ccc(F)cc1)c1cc2cc(Cl)c(C(F)(F)F)cc2[...,CRPQTBRTHURKII-UHFFFAOYSA-N,False
2331697,CHEMBL1416264,,,Small molecule,0,380.41,6.0,8.0,3.06,85.07,...,27,7,1,0,380.0856,NEUTRAL,C18H13FN6OS,O=C(CSc1ccc2nnc(-c3cccnc3)n2n1)Nc1ccc(F)cc1,QVYIEKHEJKFNAT-UHFFFAOYSA-N,False
2331698,CHEMBL213734,,,Small molecule,0,288.26,2.0,3.0,2.32,101.70,...,21,7,2,0,288.0746,NEUTRAL,C14H12N2O5,O=C(COc1ccccc1)Nc1ccc([N+](=O)[O-])cc1O,PZTWAHGBGTWVEB-UHFFFAOYSA-N,False


In [None]:
# Sanity check: encode the two-character SMILES "CN" and print its 2-row one-hot matrix.
print(smiles_encoder("CN", maxlen =  2))

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]]


In [None]:
# Reload the saved SMILES subset CSV from Drive.
chembl_all_1_smiles = pd.read_csv(
    '/content/drive/MyDrive/Data/chembl_all_1_smiles.csv'
)

In [None]:
# Length in characters of the longest SMILES entry; used as the padded length when encoding.
max_length = max(len(str(smiles)) for smiles in chembl_all_1_smiles.Smiles)

In [None]:
# One-hot encode every SMILES string with `max_length` rows, then flatten each
# (max_length, len(SMILES_CHARS)) matrix into a single 1-D vector per compound.
chembl_all_1_smiles_flat = chembl_all_1_smiles['Smiles'].apply(lambda x:  smiles_encoder(x, maxlen = max_length).flatten())

In [None]:
# chembl_all_1_smiles_flat  # this would be the input for the neural network: arrays

In [None]:
# The target for each encoded input is its cancer status. This relies on the row
# order of `chembl_all_1_smiles` being the same as the order of the encoded inputs.
targets = chembl_all_1_smiles['cancer_status']
# targets

##### Train a model

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the CNN model
class CNN(nn.Module):
    """1-D convolutional classifier.

    Five Conv1d + ReLU layers (kernel size 3, padding 1, so the sequence length
    is preserved) are followed by global max pooling over the length axis,
    dropout, and a Linear + Sigmoid output head.

    Args:
        input_size: number of input channels of the first convolution.
        output_size: number of outputs produced by the final linear layer.
        hidden_sizes: sequence with at least five entries; entries 0-4 are the
            output channel counts of conv1..conv5.
        dropout: dropout probability applied before the output head.

    Shapes:
        forward() takes a tensor of shape (batch, input_size, length) and
        returns a tensor of shape (batch, output_size) with values in [0, 1].
    """

    def __init__(self, input_size, output_size, hidden_sizes, dropout=0.5):
        super().__init__()

        # Convolutional stack; attribute names are kept so saved state_dicts still load.
        self.conv1 = nn.Conv1d(input_size, hidden_sizes[0], kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(hidden_sizes[0], hidden_sizes[1], kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(hidden_sizes[1], hidden_sizes[2], kernel_size=3, padding=1)
        self.conv4 = nn.Conv1d(hidden_sizes[2], hidden_sizes[3], kernel_size=3, padding=1)
        self.conv5 = nn.Conv1d(hidden_sizes[3], hidden_sizes[4], kernel_size=3, padding=1)

        # Shared activation function
        self.relu = nn.ReLU()

        # Collapse the length axis to a single value per channel so the linear
        # head sees hidden_sizes[4] features regardless of the input length.
        # (Parameter-free, so it adds nothing to the state_dict.)
        self.pool = nn.AdaptiveMaxPool1d(1)

        # Dropout layer
        self.dropout = nn.Dropout(dropout)

        # Output head. Its input width is the channel count of conv5
        # (hidden_sizes[4]), which is what actually reaches it after pooling.
        self.fc = nn.Sequential(
            nn.Linear(hidden_sizes[4], output_size),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Run the network on a (batch, input_size, length) tensor."""
        # Convolutional feature extraction
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        x = self.relu(self.conv3(x))
        x = self.relu(self.conv4(x))
        x = self.relu(self.conv5(x))

        # (batch, channels, length) -> (batch, channels, 1); identity when length == 1
        x = self.pool(x)

        # Apply dropout and flatten to (batch, channels)
        x = self.dropout(x)
        x = x.view(x.size(0), -1)

        # Apply the output layer
        x = self.fc(x)

        return x

## PyTorch MLP

In [None]:
# Importing the PyTorch classes and supporting libraries used below.
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn import ReLU
from torch.nn import Sigmoid

import sklearn as sk
from sklearn.preprocessing import LabelEncoder

from numpy import vstack

In [None]:
set_seed = 1 # Seed used by the random splits in later cells (random_split generators, train_test_split) so results are reproducible.

In [None]:
class CSVDataset(Dataset): # "Dataset" comes from torch.utils.data.
    """Dataset backed by a header-less CSV file.

    Every column except the last is treated as an input feature; the last
    column is the label.

    Attributes:
        X: float32 array holding the feature columns, one row per sample.
        y: float32 column vector (n_rows, 1) holding the label-encoded targets.
    """

    def __init__(self, path):
        """Load the CSV at `path` and prepare the feature and label arrays."""
        # header=None: the file has no header row, so the first line is data.
        df = pd.read_csv(path, header = None)

        values = df.values
        # Inputs: all columns but the last, cast to float32.
        self.X = values[:, :-1].astype('float32')

        # Outputs: last column, mapped to integer class ids by LabelEncoder,
        # then cast to float32 and reshaped into a column vector.
        labels = LabelEncoder().fit_transform(values[:, -1])
        self.y = labels.astype('float32').reshape((len(labels), 1))

    def __len__(self):
        """Return the number of rows (samples) in the dataset."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the [features, label] pair stored at row `idx`."""
        return [self.X[idx], self.y[idx]]

    def get_splits(self, n_test = 0.33):
        """Randomly split the dataset into train and test subsets.

        Args:
            n_test: fraction of the rows to allocate to the test subset.

        Returns:
            The [train, test] subsets produced by torch's random_split.
        """
        # Imported here because the notebook's import cell only brings in
        # Dataset and DataLoader from torch.utils.data.
        from torch.utils.data import random_split

        test_size = round(n_test * len(self.X)) # Number of test rows, rounded to an integer.
        train_size = len(self.X) - test_size
        return random_split(self, [train_size, test_size])

`LabelEncoder` converts the labels of a dataset into numeric form so that they can be used in machine learning.
*   This is an important pre-processing step for a structured dataset in supervised learning



In [None]:
# Preparing the input data: location of the prepared CSV on the mounted Google Drive.
path = '/content/drive/MyDrive/Data/chembl_all_1_prepared.csv'
# Kept in a variable so the long string does not have to be repeated wherever the file is referenced.

dataset = CSVDataset(path) # Load the CSV through the CSVDataset class.
print(dataset) # Prints the object's default repr (class name and memory address), not the data itself.

<__main__.CSVDataset object at 0x7fa2c41a1250>



*   A PyTorch dataset class is a class that represents a dataset in PyTorch
*   The dataset class is responsible for loading and preprocessing data for use in training a machine learning model.

In [None]:
# Hold out 20% of the dataset as the final test set (only to be used once the model is final).
train_all, test_final = torch.utils.data.random_split(
    dataset,
    lengths=[0.8, 0.2],
    generator=torch.Generator().manual_seed(set_seed),
)

# Split the remaining 80% once more (again 80/20) to obtain a set for checking
# how the model performs at the moment, without touching the final test set.
train, test = torch.utils.data.random_split(
    train_all,
    lengths=[0.8, 0.2],
    generator=torch.Generator().manual_seed(set_seed),
)

NameError: ignored

In [None]:
# Wrap the two splits in data loaders: small shuffled batches for training,
# large unshuffled batches for evaluation.
train_dl = DataLoader(dataset=train, batch_size=32, shuffle=True)
test_dl = DataLoader(dataset=test, batch_size=1024, shuffle=False)

# Number of samples behind each loader.
print(len(train_dl.dataset), len(test_dl.dataset))

In [None]:
## Define the model
# Here, you can define more

# model definition
class MLP(torch.nn.Module):
    """Small fully connected binary classifier: n_inputs -> 10 -> 8 -> 1.

    The two hidden layers use ReLU and Kaiming-uniform weight initialisation;
    the output layer uses a sigmoid and Xavier-uniform weight initialisation.
    """

    def __init__(self, n_inputs):
        super().__init__()
        # Each layer is created and re-initialised before the next one is
        # built, so the random number stream is consumed in a fixed order.

        # first hidden layer: n_inputs -> 10
        self.hidden1 = torch.nn.Linear(in_features=n_inputs, out_features=10)
        torch.nn.init.kaiming_uniform_(self.hidden1.weight, nonlinearity='relu')
        self.act1 = ReLU()

        # second hidden layer: 10 -> 8
        self.hidden2 = torch.nn.Linear(in_features=10, out_features=8)
        torch.nn.init.kaiming_uniform_(self.hidden2.weight, nonlinearity='relu')
        self.act2 = ReLU()

        # output layer: 8 -> 1
        self.hidden3 = torch.nn.Linear(in_features=8, out_features=1)
        torch.nn.init.xavier_uniform_(self.hidden3.weight)
        self.act3 = Sigmoid()

    def forward(self, X):
        """Propagate X through the three layers and return the sigmoid output."""
        first = self.act1(self.hidden1(X))
        second = self.act2(self.hidden2(first))
        return self.act3(self.hidden3(second))

In [None]:
## Train the model

# Call the model. The argument is the number of input features per sample;
# it must match the width of the feature rows yielded by train_dl.
model = MLP(17)

# Define the loss function and the optimizer
criterion =  torch.nn.BCELoss() # MSELoss: mean squared loss for regression, BCELoss: Binary cross-entropy loss for binary classification, CrossEntropyLoss: Categorical cross-entropy loss for multi-class classification
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01, momentum = 0.9)

# Make sure the model is in training mode before optimising
model.train()

# Loop through the training epochs and make mini-batches for the stochastic gradient descent
for epoch in range(10): # usually, > 100, but likely around 1000x
    # Report progress by epoch number
    print(epoch)
    # Iterate over mini batches. The label batch gets its own name so the
    # loop does not overwrite the notebook-level `targets` variable.
    for inputs, batch_targets in train_dl:
        # clear the gradients
        optimizer.zero_grad()
        # compute the model output
        yhat = model(inputs)
        # calculate loss
        loss = criterion(yhat, batch_targets)
        # credit assignment
        loss.backward()
        # update model weights
        optimizer.step()

In [None]:
## Evaluate the model on the test set
from sklearn.metrics import accuracy_score
def evaluate_model(test_dl, model):
    """Compute the classification accuracy of `model` over `test_dl`.

    Args:
        test_dl: iterable yielding (inputs, targets) batches of tensors.
        model: callable mapping an inputs batch to a tensor of predicted
            probabilities; these are rounded to class values.

    Returns:
        The accuracy returned by sklearn's accuracy_score over all batches.

    Notes:
        If `model` is a torch.nn.Module it is switched to eval mode for the
        duration of the call (so layers such as dropout are inactive) and its
        previous mode is restored afterwards. Gradient tracking is disabled.
    """
    predictions, actuals = [], []

    is_module = isinstance(model, torch.nn.Module)
    was_training = is_module and model.training
    if is_module:
        model.eval()
    try:
        with torch.no_grad():
            for inputs, targets in test_dl:
                # model output as a numpy array, rounded to class values
                yhat = model(inputs).detach().numpy().round()
                # true labels as a column vector
                actual = targets.numpy()
                actual = actual.reshape((len(actual), 1))
                # store
                predictions.append(yhat)
                actuals.append(actual)
    finally:
        # leave the model in the mode the caller had it in
        if is_module:
            model.train(was_training)

    predictions, actuals = np.vstack(predictions), np.vstack(actuals)
    # calculate accuracy
    return accuracy_score(actuals, predictions)

# Accuracy of the trained model on the split wrapped by test_dl, printed with three decimals.
acc = evaluate_model(test_dl, model)
print('Accuracy: %.3f' % acc)

In [None]:
## Make predictions

## Sklearn

In [None]:
import xgboost
import sklearn
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
def call_model(name, x = None, y = None, do_fit = False):
    """Build (and optionally fit) one of the regressors defined in this cell.

    Args:
        name: one of 'L1', 'L2', 'RandomForest', 'XGBoost' or 'MLP'.
        x: training features, passed to the model's fit() when do_fit is True.
        y: training targets, passed to the model's fit() when do_fit is True.
        do_fit: when True the returned model has been fitted on (x, y).

    Returns:
        The constructed model.

    Raises:
        ValueError: if `name` is not one of the supported model names.
    """
    if name == 'L1':
        model = l1_model(x, y, do_fit = do_fit)
    elif name == 'L2':
        model = l2_model(x, y, do_fit = do_fit)
    elif name == 'RandomForest':
        model = random_forest(x, y, do_fit = do_fit)
    elif name == 'XGBoost':
        model = xgb(x, y, do_fit = do_fit)
    elif name == 'MLP':
        model = mlp(x, y, do_fit = do_fit)
    else:
        raise ValueError(
            "Unknown model name %r; expected one of "
            "'L1', 'L2', 'RandomForest', 'XGBoost', 'MLP'" % (name,)
        )
    return model


def l1_model(x, y, do_fit=False):
    """Build a Lasso (L1-regularised) linear model with alpha=0.01; fit it on (x, y) if do_fit."""
    # Defined at module level so that call_model('L1') can actually reach it.
    model = linear_model.Lasso(alpha=0.01)
    if do_fit:
        model.fit(x, y)
    return model

def l2_model(x, y, do_fit=False):
    """Build a Ridge (L2-regularised) linear model with alpha=0.01; fit it on (x, y) if do_fit."""
    ridge = linear_model.Ridge(alpha=0.01)
    if not do_fit:
        return ridge
    ridge.fit(x, y)
    return ridge

def mlp(x, y, do_fit=False):
    """Build an MLPRegressor with hidden layers (1000, 100) and alpha=1; fit it on (x, y) if do_fit."""
    settings = {
        'alpha': 1,
        'hidden_layer_sizes': (1000, 100),
    }
    regressor = MLPRegressor(**settings)
    if not do_fit:
        return regressor
    regressor.fit(x, y)
    return regressor

def xgb(x, y, do_fit=False):
    """Build an XGBRegressor with the fixed hyper-parameters below; fit it on (x, y) if do_fit."""
    settings = {
        'n_estimators': 100,
        'max_depth': 4,
        'learning_rate': 0.1,
        'reg_alpha': 0.00001,
        'reg_lambda': 0.1,
        'colsample_bytree': 1,
        'gamma': 0.1,
        'objective': 'reg:squarederror',
    }
    booster = xgboost.XGBRegressor(**settings)
    if not do_fit:
        return booster
    booster.fit(x, y)
    return booster

def random_forest(x, y, do_fit=False):
    """Build a RandomForestRegressor with the fixed hyper-parameters below; fit it on (x, y) if do_fit."""
    settings = {
        'max_depth': None,
        'random_state': 0,
        'n_estimators': 1000,
        'min_samples_leaf': 5,
    }
    forest = RandomForestRegressor(**settings)
    if not do_fit:
        return forest
    forest.fit(x, y)
    return forest

In [None]:
# Show the column names of chembl_all_1_nn; the next cell selects its feature columns from these.
chembl_all_1_nn.columns

In [None]:
# Feature columns used as model inputs
X_col_names = ['Molecular Weight', 'Bioactivities', 'HBA', 'HBD', '#RO5 Violations',
       '#Rotatable Bonds', 'QED Weighted', 'CX Acidic pKa', 'CX Basic pKa',
       'CX LogP', 'CX LogD', 'Aromatic Rings', 'Heavy Atoms', 'HBA (Lipinski)',
       'HBD (Lipinski)', '#RO5 Violations (Lipinski)',
       'Molecular Weight (Monoisotopic)'] # todo: Check that all quantitative columns are type float and not string

# Hold out 33% of the rows as a final test set. It gets its own names
# (X_test / y_test) so the second split below does not overwrite it.
X_train0, X_test, y_train0, y_test = train_test_split(chembl_all_1_nn[X_col_names], chembl_all_1_nn['cancer_status'], test_size=0.33, random_state=set_seed)

# Split the remaining rows into a training set and a validation set
X_train, X_val, y_train, y_val = train_test_split(X_train0, y_train0, test_size=0.33, random_state=set_seed)

# Call the model
model = call_model('XGBoost', x = X_train, y = y_train, do_fit = True)

# Evaluate the model on the validation set: Pearson correlation between predictions and targets.
# The p-value is bound to a real name because `_` is IPython's last-output variable.
predictions = model.predict(X_val)
corr, p_value = stats.pearsonr(predictions, y_val)
print(corr)