# Drug Annotation

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
import os,sys
from scipy import stats
from collections import Counter
plt.style.use('seaborn-white')

## GDSC1

In [2]:
gdsc1_cid=pd.read_csv('../data/GDSC1_drug_cid_table_manually_curated.csv')
gdsc1_info_sel=gdsc1_cid[~gdsc1_cid['pubchem'].isin(['-','None','none'])]

## GDSC2

In [3]:
gdsc2_cid=pd.read_csv('../data/GDSC2_drug_cid_table_manually_curated.csv')
gdsc2_cid['Drug_name_id']=['GDSC2_'+x+'_'+str(y) for x,y in zip(gdsc2_cid['drug_name'],gdsc2_cid['drug_id'])]
gdsc2_info_sel=gdsc2_cid[~gdsc2_cid['pubchem'].isin(['-','None','none'])]

## CTRP2

In [4]:
ctrp2_info=pd.read_csv('../../CellLine_HRD_DrugRes_dev/data/CTRPv2.0._INFORMER_SET.csv')
ctrp2_info_sel=ctrp2_info[ctrp2_info['cpd_status'].isin(['clinical','FDA'])]
ctrp2_info_sel=ctrp2_info_sel[~ctrp2_info_sel['master_cpd_id'].isna()]
dict_cpdid_name_ctrp2={k:v for k,v in zip(ctrp2_info_sel['master_cpd_id'],ctrp2_info_sel['cpd_name'])}

In [5]:
ctrp2_anno=pd.read_csv('../../CellLine_HRD_DrugRes_dev/data/CTRP2_pharmacoDB_drug_info_manually_curated.txt',sep='\t')
ctrp2_add=ctrp2_anno[['master_cpd_id','cid']].fillna(-1).astype(int)

In [6]:
ctrp2_info_sel=ctrp2_info_sel.merge(ctrp2_add,left_on='master_cpd_id',right_on='master_cpd_id',how='left')

## CTRP1

In [7]:
ctrp1_info=pd.read_csv('../../CellLine_HRD_DrugRes_dev/data/CTRPv1.0._INFORMER_SET.csv')
ctrp1_info_sel=ctrp1_info[ctrp1_info['cpd_status'].isin(['clinical candidate', 'FDA-approved'])]
ctrp1_info_sel=ctrp1_info_sel[~ctrp1_info_sel['master_cpd_id'].isna()]
dict_cpdname_did_ctrp1={v:k for k,v in zip(ctrp1_info_sel['master_cpd_id'],ctrp1_info_sel['cpd_name'])}

In [8]:
ctrp1_info_sel[ctrp1_info_sel['pubchem_cid'].isna()]

Unnamed: 0,cpd_name,cpd_synonym,cpd_status,target_or_activity_of_compound,gene_symbol_of_protein_target,top_test_conc_umol,percent_cpd_purity,cpd_smiles,pubchem_cid,broad_cpd_id,master_cpd_id
308,sirolimus,rapamycin;Rapamune,FDA-approved,inhibitor of mTOR via FRB domain (in complex w...,MTOR,37.0,,CO[C@@H]1C[C@H](C[C@@H](C)[C@@H]2CC(=O)[C@H](C...,,BRD-A23770159,411830
316,tacrolimus,FK-506;fujimycin;Prograf;Advagraf;Protopic,FDA-approved,inhibitor of calcineurin (in complex with FKBP),PPP3CA;PPP3CB;PPP3CC;PPP3R1;PPP3R2,74.0,,CO[C@@H]1CC(CC[C@H]1O)\C=C(/C)\[C@H]2OC(=O)[C@...,,BRD-K08845546,55450


In [9]:
ctrp1_info_sel.loc[308,'pubchem_cid']=5284616
ctrp1_info_sel.loc[316,'pubchem_cid']=445643

## PRISM

In [10]:
prism_anno=pd.read_csv('../../CellLine_HRD_DrugRes_dev/data/PRISM_pharmacoDB_drug_info_manually_curated.txt',sep='\t')
prism_anno['disease.area']=prism_anno['disease.area'].fillna('NoData')

In [11]:
flag=prism_anno['disease.area'].str.contains('oncology')|prism_anno['disease.area'].str.contains('malignancy')
prism_anno=prism_anno[flag]
prism_anno=prism_anno[~prism_anno['cid'].isna()]
prism_anno['cid']=prism_anno['cid'].astype(int)                    

In [12]:
prism_anno_dup=prism_anno[prism_anno['broad_id'].str.contains('///')]
prism_anno_ndup=prism_anno[~prism_anno['broad_id'].str.contains('///')]

In [13]:
dupids=prism_anno_dup['broad_id']

In [14]:
adds=[]
for dupid in dupids:
    bids=dupid.split('///')
    add=prism_anno_dup[prism_anno_dup['broad_id']==dupid]
    add=pd.concat([add,add])
    add.loc[:,'broad_id']=bids
    adds.append(add)
prism_anno_dup_c=pd.concat(adds)
prism_anno_dup_c

Unnamed: 0.1,Unnamed: 0,PRISM.drugid,broad_id,compound_plate,moa,target,disease.area,indication,smiles,phase,drugid,cid,inchikey
33,Abiraterone-acetate,abiraterone-acetate,BRD-K50071428-001-03-3,PREP020,androgen biosynthesis inhibitor,"CYP11B1, CYP17A1///CYP17A1",oncology,prostate cancer,CC(=O)O[C@H]1CC[C@]2(C)[C@H]3CC[C@@]4(C)[C@@H]...,Launched,Abiraterone,9821849,GZOSMCIZMLWJML-VJLLXTKPSA-N
33,Abiraterone-acetate,abiraterone-acetate,BRD-K24048528-001-02-5,PREP020,androgen biosynthesis inhibitor,"CYP11B1, CYP17A1///CYP17A1",oncology,prostate cancer,CC(=O)O[C@H]1CC[C@]2(C)[C@H]3CC[C@@]4(C)[C@@H]...,Launched,Abiraterone,9821849,GZOSMCIZMLWJML-VJLLXTKPSA-N
54,Afatinib,afatinib,BRD-K66175015-001-12-4,PROS003_PR500///PREP021,EGFR inhibitor,"EGFR, ERBB2, ERBB4",oncology,non-small cell lung cancer (NSCLC),CN(C)C\C=C\C(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2...,Launched,Afatinib,10184653,ULXXDDBFHOBEHA-CWDCEQMOSA-N
54,Afatinib,afatinib,BRD-K66175015-001-09-0,PROS003_PR500///PREP021,EGFR inhibitor,"EGFR, ERBB2, ERBB4",oncology,non-small cell lung cancer (NSCLC),CN(C)C\C=C\C(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2...,Launched,Afatinib,10184653,ULXXDDBFHOBEHA-CWDCEQMOSA-N
59,Alectinib,alectinib,BRD-K11267252-001-04-4,PREP020///PROS003_PR500,ALK tyrosine kinase receptor inhibitor,"ALK, MET",oncology,non-small cell lung cancer (NSCLC),CCc1cc2C(=O)c3c([nH]c4cc(ccc34)C#N)C(C)(C)c2cc...,Launched,Alectinib,49806720,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1379,Vemurafenib,vemurafenib,BRD-K56343971-001-10-6,PROS004_PR500///PREP022,RAF inhibitor,"BRAF, RAF1",oncology,melanoma,CCCS(=O)(=O)Nc1ccc(F)c(C(=O)c2c[nH]c3ncc(cc23)...,Launched,Vemurafenib,42611257,GPXBXXGIAQBQNI-UHFFFAOYSA-N
1380,Venetoclax,venetoclax,BRD-K62391742-001-09-7,PROS001_PR500///PREP045,BCL inhibitor,BCL2,hematologic malignancy,chronic lymphocytic leukemia (CLL),CC1(C)CCC(CN2CCN(CC2)c2ccc(C(=O)NS(=O)(=O)c3cc...,Launched,Venetoclax,49846579,
1380,Venetoclax,venetoclax,BRD-K62391742-001-03-0,PROS001_PR500///PREP045,BCL inhibitor,BCL2,hematologic malignancy,chronic lymphocytic leukemia (CLL),CC1(C)CCC(CN2CCN(CC2)c2ccc(C(=O)NS(=O)(=O)c3cc...,Launched,Venetoclax,49846579,
1394,Vorinostat,vorinostat,BRD-K81418486-001-47-5,PREP048///PROS001_PR500,HDAC inhibitor,"HDAC1, HDAC10, HDAC11, HDAC2, HDAC3, HDAC5, HD...",hematologic malignancy,cutaneous T-cell lymphoma (CTCL),"ONC(=O)CCCCCCC(=O)Nc1ccccc1, ONC(=O)CCCCCCC(=O...",Launched,Vorinostat,5311,WAEXFXRVDQXREF-UHFFFAOYSA-N


In [15]:
prism_anno_r=pd.concat([prism_anno_ndup,prism_anno_dup_c])
bid_dup=prism_anno_r[prism_anno_r['broad_id'].duplicated()]['broad_id'].values[0]
prism_anno_r[prism_anno_r['broad_id']==bid_dup]

Unnamed: 0.1,Unnamed: 0,PRISM.drugid,broad_id,compound_plate,moa,target,disease.area,indication,smiles,phase,drugid,cid,inchikey
32,Abiraterone,abiraterone,BRD-K50071428-001-03-3,PREP031,androgen biosynthesis inhibitor,"CYP11B1, CYP17A1///CYP17A1",oncology,prostate cancer,C[C@]12CC[C@H]3[C@@H](CC=C4C[C@@H](O)CC[C@]34C...,Launched,Abiraterone,132971,GZOSMCIZMLWJML-VJLLXTKPSA-N
33,Abiraterone-acetate,abiraterone-acetate,BRD-K50071428-001-03-3,PREP020,androgen biosynthesis inhibitor,"CYP11B1, CYP17A1///CYP17A1",oncology,prostate cancer,CC(=O)O[C@H]1CC[C@]2(C)[C@H]3CC[C@@]4(C)[C@@H]...,Launched,Abiraterone,9821849,GZOSMCIZMLWJML-VJLLXTKPSA-N


In [16]:
did_dup=prism_anno_r[prism_anno_r['PRISM.drugid'].duplicated()]['PRISM.drugid']
prism_anno_r[prism_anno_r['PRISM.drugid'].isin(did_dup)]

Unnamed: 0.1,Unnamed: 0,PRISM.drugid,broad_id,compound_plate,moa,target,disease.area,indication,smiles,phase,drugid,cid,inchikey
33,Abiraterone-acetate,abiraterone-acetate,BRD-K50071428-001-03-3,PREP020,androgen biosynthesis inhibitor,"CYP11B1, CYP17A1///CYP17A1",oncology,prostate cancer,CC(=O)O[C@H]1CC[C@]2(C)[C@H]3CC[C@@]4(C)[C@@H]...,Launched,Abiraterone,9821849,GZOSMCIZMLWJML-VJLLXTKPSA-N
33,Abiraterone-acetate,abiraterone-acetate,BRD-K24048528-001-02-5,PREP020,androgen biosynthesis inhibitor,"CYP11B1, CYP17A1///CYP17A1",oncology,prostate cancer,CC(=O)O[C@H]1CC[C@]2(C)[C@H]3CC[C@@]4(C)[C@@H]...,Launched,Abiraterone,9821849,GZOSMCIZMLWJML-VJLLXTKPSA-N
54,Afatinib,afatinib,BRD-K66175015-001-12-4,PROS003_PR500///PREP021,EGFR inhibitor,"EGFR, ERBB2, ERBB4",oncology,non-small cell lung cancer (NSCLC),CN(C)C\C=C\C(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2...,Launched,Afatinib,10184653,ULXXDDBFHOBEHA-CWDCEQMOSA-N
54,Afatinib,afatinib,BRD-K66175015-001-09-0,PROS003_PR500///PREP021,EGFR inhibitor,"EGFR, ERBB2, ERBB4",oncology,non-small cell lung cancer (NSCLC),CN(C)C\C=C\C(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2...,Launched,Afatinib,10184653,ULXXDDBFHOBEHA-CWDCEQMOSA-N
59,Alectinib,alectinib,BRD-K11267252-001-04-4,PREP020///PROS003_PR500,ALK tyrosine kinase receptor inhibitor,"ALK, MET",oncology,non-small cell lung cancer (NSCLC),CCc1cc2C(=O)c3c([nH]c4cc(ccc34)C#N)C(C)(C)c2cc...,Launched,Alectinib,49806720,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1379,Vemurafenib,vemurafenib,BRD-K56343971-001-10-6,PROS004_PR500///PREP022,RAF inhibitor,"BRAF, RAF1",oncology,melanoma,CCCS(=O)(=O)Nc1ccc(F)c(C(=O)c2c[nH]c3ncc(cc23)...,Launched,Vemurafenib,42611257,GPXBXXGIAQBQNI-UHFFFAOYSA-N
1380,Venetoclax,venetoclax,BRD-K62391742-001-09-7,PROS001_PR500///PREP045,BCL inhibitor,BCL2,hematologic malignancy,chronic lymphocytic leukemia (CLL),CC1(C)CCC(CN2CCN(CC2)c2ccc(C(=O)NS(=O)(=O)c3cc...,Launched,Venetoclax,49846579,
1380,Venetoclax,venetoclax,BRD-K62391742-001-03-0,PROS001_PR500///PREP045,BCL inhibitor,BCL2,hematologic malignancy,chronic lymphocytic leukemia (CLL),CC1(C)CCC(CN2CCN(CC2)c2ccc(C(=O)NS(=O)(=O)c3cc...,Launched,Venetoclax,49846579,
1394,Vorinostat,vorinostat,BRD-K81418486-001-47-5,PREP048///PROS001_PR500,HDAC inhibitor,"HDAC1, HDAC10, HDAC11, HDAC2, HDAC3, HDAC5, HD...",hematologic malignancy,cutaneous T-cell lymphoma (CTCL),"ONC(=O)CCCCCCC(=O)Nc1ccccc1, ONC(=O)CCCCCCC(=O...",Launched,Vorinostat,5311,WAEXFXRVDQXREF-UHFFFAOYSA-N


In [17]:
prism_info_sel=prism_anno_r[~prism_anno_r['broad_id'].duplicated()]
prism_info_sel=prism_info_sel.reset_index(drop=True)

## gCSI

In [18]:
gcsi_anno=pd.read_csv('../../CellLine_HRD_DrugRes_dev/data/gCSI2019_drug_info.txt'
                     ,sep='\t',index_col=0)
gcsi_info_sel=gcsi_anno[~gcsi_anno['cid'].isna()]
gcsi_info_sel['cid']=gcsi_info_sel['cid'].astype(int).astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gcsi_info_sel['cid']=gcsi_info_sel['cid'].astype(int).astype(str)


In [19]:
cids= set(gdsc1_info_sel['pubchem'].astype(int))|set(gdsc2_info_sel['pubchem'].astype(int))|set(ctrp2_info_sel['cid'].astype(int))|\
set(prism_info_sel['cid'].astype(int))|set(gcsi_info_sel['cid'].astype(int))|set(ctrp1_info_sel['pubchem_cid'].astype(int))
cids=sorted(cids)[1:]
with open('../processed_data/'+'All_cids.csv','w') as fw:
    fw.write( '\n'.join(list(map(str,cids)))+'\n' )

* this file is converted by Pubchem website, https://pubchem.ncbi.nlm.nih.gov/#upload=true
to "All_cids_from_pubchem.csv"

In [20]:
pubchem=pd.read_csv('../processed_data/All_cids_converted_from_pubchem.csv')
dict_cid_name={k:v for k,v in zip(pubchem['cid'],pubchem['cmpdname'])}
pubchem['cmpdname'].duplicated().sum()

0

# Drug categorization

## GDSC

In [21]:
Counter(gdsc1_info_sel['pathway_name']).most_common()

[('Other', 32),
 ('Other, kinases', 27),
 ('RTK signaling', 26),
 ('PI3K/MTOR signaling', 23),
 ('ERK MAPK signaling', 17),
 ('DNA replication', 12),
 ('Cell cycle', 12),
 ('Mitosis', 11),
 ('Chromatin histone acetylation', 10),
 ('Protein stability and degradation', 8),
 ('Genome integrity', 8),
 ('EGFR signaling', 7),
 ('Cytoskeleton', 7),
 ('Chromatin other', 6),
 ('JNK and p38 signaling', 6),
 ('Apoptosis regulation', 6),
 ('Metabolism', 5),
 ('WNT signaling', 4),
 ('Hormone-related', 3),
 ('IGF1R signaling', 3),
 ('Chromatin histone methylation', 3),
 ('p53 pathway', 3),
 ('ABL signaling', 2)]

In [22]:
Counter(gdsc2_info_sel['pathway_name']).most_common()

[('PI3K/MTOR signaling', 21),
 ('DNA replication', 20),
 ('Other', 17),
 ('Other, kinases', 14),
 ('Genome integrity', 13),
 ('ERK MAPK signaling', 13),
 ('Apoptosis regulation', 12),
 ('RTK signaling', 11),
 ('Cell cycle', 10),
 ('WNT signaling', 9),
 ('Protein stability and degradation', 9),
 ('Mitosis', 8),
 ('EGFR signaling', 7),
 ('Chromatin histone methylation', 7),
 ('Chromatin other', 7),
 ('Hormone-related', 5),
 ('Metabolism', 4),
 ('IGF1R signaling', 4),
 ('Chromatin histone acetylation', 4),
 ('p53 pathway', 4),
 ('-', 2),
 ('JNK and p38 signaling', 2),
 ('Cytoskeleton', 2),
 ('ABL signaling', 1)]

In [23]:
Counter(gdsc1_info_sel['targets']).most_common()[:50]

[('MEK1, MEK2', 7),
 ('BRAF', 6),
 ('HSP90', 5),
 ('PARP1, PARP2', 5),
 ('AKT1, AKT2, AKT3', 4),
 ('EGFR', 4),
 ('IGF1R, IR', 4),
 ('HDAC1', 4),
 ('EGFR, ERBB2', 3),
 ('Microtubule stabiliser', 3),
 ('GSK3A, GSK3B', 3),
 ('PI3K (class 1)', 3),
 ('ROCK1, ROCK2', 3),
 ('AR', 2),
 ('Antimetabolite', 2),
 ('dsDNA break induction', 2),
 ('CDC7', 2),
 ('MTORC1, MTORC2', 2),
 ('G9a and GLP methyltransferases', 2),
 ('Microtubule destabiliser', 2),
 ('MTOR', 2),
 ('PDK1 (PDPK1)', 2),
 ('Amyloid beta20, Amyloid beta40', 2),
 ('Farnesyl-transferase (FNTA)', 2),
 ('MDM2', 2),
 ('SMO', 2),
 ('ATM', 2),
 ('PI3K (class 1), MTORC1, MTORC2', 2),
 ('ALK', 2),
 ('BRD2, BRD3, BRD4, BRDT', 2),
 ('DNA crosslinker', 2),
 ('PI3Kbeta', 2),
 ('SRC, ABL, TEC', 1),
 ('PDGFR, KIT, VEGFR', 1),
 ('MET', 1),
 ('PKC, PPK, FLT1, c-FGR, others', 1),
 ('PB1, SMARCA4, SMARCA2', 1),
 ('JNK2, JNK3', 1),
 ('IGF1R', 1),
 ('BTK', 1),
 ('IKK1, IKK2', 1),
 ('JAK1, JAK2', 1),
 ('JNK1, JNK2, JNK2', 1),
 ('PI3Kgamma', 1),
 ('ERK5,

In [24]:
drug_anno=[]
for idx in gdsc1_info_sel.index:
    name=gdsc1_info_sel.loc[idx,'drug_name']
    pathway=gdsc1_info_sel.loc[idx,'pathway_name']
    target=gdsc1_info_sel.loc[idx,'targets']
    if 'platin' in name:
        drug_anno.append('Platinum')
    elif 'parib' in name:
        drug_anno.append('PARP inhibitor')
    elif 'Microtubule' in target:
        drug_anno.append('Antimicrotubule')
    elif 'Pyrimidine synthesis inhibitor' in target:
        drug_anno.append('Antimetabolite')
    elif 'Alkylating agent' in target:
        drug_anno.append('DNA alkylator')
    elif 'Anthracycline'  in target:
        drug_anno.append('DNA inhibitor')
    elif 'Pyrimidine antimetabolite' in target:
        drug_anno.append('Antimetabolite')
    elif 'Antimetabolite' in target:
        drug_anno.append('Antimetabolite')
    elif 'Anti-metabolite' in target:
        drug_anno.append('Antimetabolite')
    elif 'Dihydrofolate reductase (DHFR)' in target:
        drug_anno.append('Antimetabolite')
    elif 'DNA crosslinker' in target:
        drug_anno.append('DNA alkylator')
    elif 'dsDNA break induction' in target:
        drug_anno.append('DNA inhibitor')
    elif 'DNA replication' in pathway:
        drug_anno.append('DNA inhibitor')
    elif 'DNA alkylating agent' in target:
        drug_anno.append('DNA alkylator')
    elif 'PARP' in target:
        drug_anno.append('PARP inhibitor')
    elif 'TOP' in target:
        drug_anno.append('Topoisomerase inhibitor') 
    elif name in ['Fludarabine','Nelarabine','Dactinomycin','Mitoxantrone',
                 'Teniposide']:
        drug_anno.append('DNA inhibitor')
    elif name in ['Carmustine']:
        drug_anno.append('DNA alkylator')
    elif pathway in ['ERK MAPK signaling','RTK signaling', 'PI3K/MTOR signaling','Cell cycle',
                    'WNT signaling','Protein stability and degradation','WNT signaling',
                     'EGFR signaling','JNK and p38 signaling','p53 pathway','Hormone-related',
                    'Apoptosis regulation','Cytoskeleton','Metabolism','Metabolism',
                     'IGF1R signaling','Genome integrity']:
        drug_anno.append(pathway)
    elif pathway in ['Chromatin histone acetylation','Chromatin histone methylation',
                     'Chromatin other',]:
        drug_anno.append('Chromatin-related')
    elif pathway in ['ABL signaling','Mitosis','IGF1R signaling',]:
        drug_anno.append('Others')   
    else:
        drug_anno.append('Others')
Counter(drug_anno).most_common()

[('Others', 64),
 ('RTK signaling', 26),
 ('PI3K/MTOR signaling', 23),
 ('Chromatin-related', 19),
 ('ERK MAPK signaling', 17),
 ('Cell cycle', 12),
 ('Protein stability and degradation', 8),
 ('DNA inhibitor', 8),
 ('EGFR signaling', 7),
 ('Cytoskeleton', 7),
 ('JNK and p38 signaling', 6),
 ('Apoptosis regulation', 6),
 ('PARP inhibitor', 5),
 ('Antimetabolite', 5),
 ('Antimicrotubule', 5),
 ('Metabolism', 5),
 ('WNT signaling', 4),
 ('Hormone-related', 3),
 ('IGF1R signaling', 3),
 ('Genome integrity', 3),
 ('p53 pathway', 3),
 ('Platinum', 1),
 ('DNA alkylator', 1)]

In [25]:
gdsc1_info_sel['drug_anno']=drug_anno

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdsc1_info_sel['drug_anno']=drug_anno


In [26]:
drug_anno=[]
for idx in gdsc2_info_sel.index:
    name=gdsc2_info_sel.loc[idx,'drug_name']
    pathway=gdsc2_info_sel.loc[idx,'pathway_name']
    target=gdsc2_info_sel.loc[idx,'targets']
    if 'platin' in name:
        drug_anno.append('Platinum')
    elif 'parib' in name:
        drug_anno.append('PARP inhibitor')
    elif name in ['Irinotecan']:
        drug_anno.append('Topoisomerase inhibitor')
    elif 'Microtubule' in target:
        drug_anno.append('Antimicrotubule')
    elif 'Pyrimidine synthesis inhibitor' in target:
        drug_anno.append('Antimetabolite')
    elif 'Alkylating agent' in target:
        drug_anno.append('DNA alkylator')
    elif 'Anthracycline'  in target:
        drug_anno.append('DNA inhibitor')
    elif 'Pyrimidine antimetabolite' in target:
        drug_anno.append('Antimetabolite')
    elif 'Antimetabolite' in target:
        drug_anno.append('Antimetabolite')
    elif 'Anti-metabolite' in target:
        drug_anno.append('Antimetabolite')
    elif 'Dihydrofolate reductase (DHFR)' in target:
        drug_anno.append('Antimetabolite')
    elif 'DNA crosslinker' in target:
        drug_anno.append('DNA alkylator')
    elif 'dsDNA break induction' in target:
        drug_anno.append('DNA inhibitor')
    elif 'DNA replication' in pathway:
        drug_anno.append('DNA inhibitor')
    elif 'DNA alkylating agent' in target:
        drug_anno.append('DNA alkylator')
    elif 'PARP' in target:
        drug_anno.append('PARP inhibitor')
    elif name in ['Fludarabine','Nelarabine','Dactinomycin','Mitoxantrone',
                 'Teniposide']:
        drug_anno.append('DNA inhibitor')
    elif name in ['Carmustine']:
        drug_anno.append('DNA alkylator')
    elif pathway in ['ERK MAPK signaling','RTK signaling', 'PI3K/MTOR signaling','Cell cycle',
                    'WNT signaling','Protein stability and degradation','WNT signaling',
                     'EGFR signaling','JNK and p38 signaling','p53 pathway','Hormone-related',
                    'Apoptosis regulation','Cytoskeleton','Metabolism','Metabolism',
                     'IGF1R signaling','Genome integrity']:
        drug_anno.append(pathway)
    elif pathway in ['Chromatin histone acetylation','Chromatin histone methylation',
                     'Chromatin other',]:
        drug_anno.append('Chromatin-related')
    elif pathway in ['ABL signaling','Mitosis','IGF1R signaling',]:
        drug_anno.append('Others')   
    else:
        drug_anno.append('Others')
Counter(drug_anno).most_common()

[('Others', 32),
 ('PI3K/MTOR signaling', 21),
 ('Chromatin-related', 18),
 ('DNA inhibitor', 13),
 ('ERK MAPK signaling', 13),
 ('Apoptosis regulation', 12),
 ('RTK signaling', 11),
 ('Cell cycle', 10),
 ('WNT signaling', 9),
 ('Protein stability and degradation', 9),
 ('Genome integrity', 8),
 ('EGFR signaling', 7),
 ('Antimetabolite', 6),
 ('Hormone-related', 5),
 ('Antimicrotubule', 5),
 ('PARP inhibitor', 5),
 ('Metabolism', 4),
 ('IGF1R signaling', 4),
 ('p53 pathway', 4),
 ('Platinum', 3),
 ('DNA alkylator', 2),
 ('JNK and p38 signaling', 2),
 ('Cytoskeleton', 2),
 ('Topoisomerase inhibitor', 1)]

In [27]:
gdsc2_info_sel['drug_anno']=drug_anno

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdsc2_info_sel['drug_anno']=drug_anno


In [28]:
dict_cid_anno={int(k):v for k,v in zip(gdsc1_info_sel['pubchem'],gdsc1_info_sel['drug_anno'])}
for idx in gdsc2_info_sel.index:
    k=int(gdsc2_info_sel.loc[idx,'pubchem'])
    v=gdsc2_info_sel.loc[idx,'drug_anno']
    dict_cid_anno[k]=v

## CTRP2

In [29]:
drug_anno=[]
for idx in ctrp2_info_sel.index:
    name=ctrp2_info_sel.loc[idx,'cpd_name']
    target= ctrp2_info_sel.loc[idx,'target_or_activity_of_compound']
    gene=ctrp2_info_sel.loc[idx,'gene_symbol_of_protein_target']
    cid=gene=int(ctrp2_info_sel.loc[idx,'cid'])
    if cid in dict_cid_anno:
        drug_anno.append(dict_cid_anno[cid])
    elif 'platin' in name:
        drug_anno.append('Platinum')
    elif 'parib' in name:
        drug_anno.append('PARP inhibitor')
    elif 'topoisomerase' in target:
        drug_anno.append('Topoisomerase inhibitor')
    elif 'tubule' in target :
        drug_anno.append('Antimicrotubule')
    elif 'DNA alkylator' in target:
        drug_anno.append('DNA alkylator')
    elif 'inducer of DNA damage' in target:
        drug_anno.append('DNA inhibitor')
    elif 'DNA crosslinker' in target:
        drug_anno.append('DNA alkylator')
    elif 'inhibitor of DNA synthesis' in target:
        drug_anno.append('Antimetabolite')
    elif 'inhibitor of DNA replication' in target:
        drug_anno.append('DNA inhibitor')
    elif 'inhibitor of thymidylate synthase' in target:
        drug_anno.append('Antimetabolite')
    elif 'inhibitor of cyclin-dependent kinases' in target:
        drug_anno.append('Cell cycle')
    elif 'inhibitor of PI3K and mTOR kinase activity' in target:
        drug_anno.append('PI3K/MTOR signaling')
    elif 'inhibitor of mTOR and PI3K kinase acitivities' in target:
        drug_anno.append('PI3K/MTOR signaling')
    elif 'inhibitor of mTOR'  in target:
        drug_anno.append('PI3K/MTOR signaling')
    elif 'inhibitor of p53-MDM2 interaction' in target:
        drug_anno.append('p53 pathway')
    elif 'inhibitor of HDAC1, HDAC2, HDAC3, HDAC6, and HDAC8' in target:
        drug_anno.append('Chromatin-related')
    elif 'modulator of estrogen receptors'  in target:
        drug_anno.append('Hormone-related')
    else:
        drug_anno.append('Others')
Counter(drug_anno)

Counter({'Others': 91,
         'DNA alkylator': 7,
         'Antimetabolite': 2,
         'Antimicrotubule': 4,
         'Hormone-related': 2,
         'Platinum': 2,
         'DNA inhibitor': 9,
         'Topoisomerase inhibitor': 2,
         'Protein stability and degradation': 2,
         'EGFR signaling': 3,
         'Chromatin-related': 5,
         'PI3K/MTOR signaling': 13,
         'Cell cycle': 7,
         'RTK signaling': 11,
         'ERK MAPK signaling': 3,
         'Apoptosis regulation': 4,
         'p53 pathway': 2,
         'PARP inhibitor': 2,
         'Metabolism': 1,
         'IGF1R signaling': 1})

In [30]:
ctrp2_info_sel['drug_anno']=drug_anno

## CTRP1

In [31]:
drug_anno=[]
for idx in ctrp1_info_sel.index:
    name=ctrp1_info_sel.loc[idx,'cpd_name']
    target= ctrp1_info_sel.loc[idx,'target_or_activity_of_compound']
    gene=ctrp1_info_sel.loc[idx,'gene_symbol_of_protein_target']
    cid=gene=int(ctrp1_info_sel.loc[idx,'pubchem_cid'])
    if cid in dict_cid_anno:
        drug_anno.append(dict_cid_anno[cid])
    elif 'platin' in name:
        drug_anno.append('Platinum')
    elif 'parib' in name:
        drug_anno.append('PARP inhibitor')
    elif 'topoisomerase' in target:
        drug_anno.append('Topoisomerase inhibitor')
    elif 'tubule' in target :
        drug_anno.append('Antimicrotubule')
    elif 'tubulin' in target :
        drug_anno.append('Antimicrotubule')
    elif 'DNA alkylator' in target:
        drug_anno.append('DNA alkylator')
    elif 'inducer of DNA damage' in target:
        drug_anno.append('DNA inhibitor')
    elif 'DNA crosslinker' in target:
        drug_anno.append('DNA alkylator')
    elif 'inhibitor of DNA synthesis' in target:
        drug_anno.append('Antimetabolite')
    elif 'thymidylate' in target:
        drug_anno.append('Antimetabolite')
    elif 'inhibitor of DNA replication' in target:
        drug_anno.append('DNA inhibitor')
    elif 'inhibitor of thymidylate synthase' in target:
        drug_anno.append('Antimetabolite')
    elif 'inhibitor of cyclin-dependent kinases' in target:
        drug_anno.append('Cell cycle')
    elif 'inhibitor of PI3K and mTOR kinase activity' in target:
        drug_anno.append('PI3K/MTOR signaling')
    elif 'inhibitor of mTOR and PI3K kinase acitivities' in target:
        drug_anno.append('PI3K/MTOR signaling')
    elif 'inhibitor of mTOR'  in target:
        drug_anno.append('PI3K/MTOR signaling')
    elif 'inhibitor of p53-MDM2 interaction' in target:
        drug_anno.append('p53 pathway')
    elif 'inhibitor of HDAC1, HDAC2, HDAC3, HDAC6, and HDAC8' in target:
        drug_anno.append('Chromatin-related')
    elif 'modulator of estrogen receptors'  in target:
        drug_anno.append('Hormone-related')
    elif 'antagonist of androgen receptor'  in target:
        drug_anno.append('Hormone-related')
    else:
        drug_anno.append('Others')
Counter(drug_anno)

Counter({'Others': 61,
         'Apoptosis regulation': 2,
         'Antimicrotubule': 2,
         'PI3K/MTOR signaling': 2,
         'Cell cycle': 1,
         'Hormone-related': 4,
         'RTK signaling': 2,
         'Protein stability and degradation': 3,
         'DNA inhibitor': 2,
         'Metabolism': 1,
         'Antimetabolite': 2,
         'Topoisomerase inhibitor': 1,
         'IGF1R signaling': 1,
         'DNA alkylator': 1,
         'PARP inhibitor': 3,
         'ERK MAPK signaling': 1,
         'Chromatin-related': 1})

In [32]:
ctrp1_info_sel['drug_anno']=drug_anno

## PRISM

In [33]:
prism_info_sel['moa']=prism_info_sel['moa'].fillna('NoData')

In [34]:
drug_anno=[]
for idx in prism_info_sel.index:
    name=prism_info_sel.loc[idx,'PRISM.drugid']
    cid=int(prism_info_sel.loc[idx,'cid'])
    moa=prism_info_sel.loc[idx,'moa']
    if cid in dict_cid_anno:
        drug_anno.append(dict_cid_anno[cid])
    elif 'platin' in name:
        drug_anno.append('Platinum')
    elif 'parib' in name:
        drug_anno.append('PARP inhibitor')
    elif 'topoisomerase inhibitor' in moa:
        drug_anno.append('Topoisomerase inhibitor')
    elif 'tubul' in moa :
        drug_anno.append('Antimicrotubule')
    elif 'DNA alkylating agent' in moa:
        drug_anno.append('DNA alkylator')
    elif 'DNA synthesis inhibitor' in moa:
        drug_anno.append('Antimetabolite')
    elif 'ribonucleotide reductase inhibitor' in moa:
        drug_anno.append('Antimetabolite')
    elif 'thymidylate synthase inhibitor' in moa:
        drug_anno.append('Antimetabolite')
    elif 'adenosine deaminase inhibitor' in moa:
        drug_anno.append('Antimetabolite')
    elif 'dihydrofolate reductase inhibitor	DHFR' in moa:
        drug_anno.append('Antimetabolite')
    elif 'dihydrofolate reductase inhibitor' in moa:
        drug_anno.append('Antimetabolite')
    elif name in ['azacitidine']:
        drug_anno.append('Antimetabolite')
    elif 'PARP' in moa:
        drug_anno.append('PARP inhibitor')
    elif 'thymidylate synthase inhibitor' in moa:
        drug_anno.append('Antimetabolite')
    elif  'DNA intercalating agent' in moa:
        drug_anno.append('other DNA inhibitor')
    elif 'purine antagonist' in moa:
        drug_anno.append('Antimetabolite')
    elif 'dihydropyrimidine dehydrogenase inhibitor' in moa:
        drug_anno.append('Antimetabolite')
    elif 'DNA damage inducer' in moa:
        drug_anno.append('DNA inhibitor')
    elif 'DNA inhibitor' in moa:
        drug_anno.append('DNA inhibitor')
    elif moa in ['aromatase inhibitor','androgen receptor antagonist',
                'estrogen receptor antagonist, selective estrogen receptor modulator (SERM)',
                'androgen biosynthesis inhibitor']:
        drug_anno.append('Hormone-related')
    elif moa in ['EGFR inhibitor']:
        drug_anno.append('EGFR signaling')
    elif moa in ['proteasome inhibitor']:
        drug_anno.append('Protein stability and degradation')    
    elif moa in ['HDAC inhibitor']:
        drug_anno.append('Chromatin-related')    
    elif moa in ['CDK inhibitor']:
        drug_anno.append('Cell cycle') 
    elif moa in ['MEK inhibitor']:
        drug_anno.append('ERK MAPK signaling') 
    elif moa in ['mTOR inhibitor']:
        drug_anno.append('PI3K/MTOR signaling') 
    else:
        drug_anno.append('Others')
Counter(drug_anno).most_common()

[('Others', 43),
 ('Antimetabolite', 23),
 ('Hormone-related', 15),
 ('DNA inhibitor', 12),
 ('Antimicrotubule', 12),
 ('Topoisomerase inhibitor', 10),
 ('EGFR signaling', 10),
 ('DNA alkylator', 8),
 ('RTK signaling', 6),
 ('Protein stability and degradation', 6),
 ('Chromatin-related', 5),
 ('ERK MAPK signaling', 5),
 ('Cell cycle', 4),
 ('PI3K/MTOR signaling', 4),
 ('PARP inhibitor', 4),
 ('Platinum', 3),
 ('Apoptosis regulation', 2)]

In [35]:
prism_info_sel['drug_anno']=drug_anno

## gCSI

In [36]:
drug_anno=[]
for idx in gcsi_info_sel.index:
    name=gcsi_info_sel.loc[idx,'drugid']
    cid=int(gcsi_info_sel.loc[idx,'cid'])
    if cid in dict_cid_anno:
        drug_anno.append(dict_cid_anno[cid])
    elif 'platin' in name:
        drug_anno.append('Platinum')
    elif name in ['Vincristine','Vincaleukoblastine']:
        drug_anno.append('Antimicrotubule')
    elif name in ['5-Fluorouracil','Gemcitabine']:
        drug_anno.append('Antimetabolite')
    elif name in ['Erlotinib']:
        drug_anno.append('EGFR signaling')
    elif name in ['Crizotinib']:
        drug_anno.append('RTK signaling')
    else:
        drug_anno.append('Others')
Counter(drug_anno)

Counter({'Antimetabolite': 3,
         'ERK MAPK signaling': 3,
         'Others': 14,
         'Cell cycle': 2,
         'PI3K/MTOR signaling': 6,
         'Protein stability and degradation': 3,
         'WNT signaling': 1,
         'Platinum': 1,
         'RTK signaling': 1,
         'Antimicrotubule': 3,
         'EGFR signaling': 3,
         'DNA inhibitor': 1,
         'Genome integrity': 1,
         'Chromatin-related': 1})

In [37]:
gcsi_info_sel['drug_anno']=drug_anno

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gcsi_info_sel['drug_anno']=drug_anno


## Write out

In [38]:
gdsc1_info_sel['drug_name_common']=['GDSC1_'+x+'_'+str(y) for x,y in zip(gdsc1_info_sel['drug_name'],gdsc1_info_sel['drug_id']) ]
gdsc2_info_sel['drug_name_common']=['GDSC2_'+x+'_'+str(y) for x,y in zip(gdsc2_info_sel['drug_name'],gdsc2_info_sel['drug_id']) ]
ctrp1_info_sel['drug_name_common']=['CTRP1_'+x+'_'+str(y) for x,y in zip(ctrp1_info_sel['cpd_name'],ctrp1_info_sel['master_cpd_id']) ]
ctrp2_info_sel['drug_name_common']=['CTRP2_'+x+'_'+str(y) for x,y in zip(ctrp2_info_sel['cpd_name'],ctrp2_info_sel['master_cpd_id']) ]
prism_info_sel['drug_name_common']=['PRISM_'+x+'_'+str(y) for x,y in zip(prism_info_sel['PRISM.drugid'],prism_info_sel['broad_id']) ]
gcsi_info_sel['drug_name_common']=['gCSI_'+x+'_CID'+str(y) for x,y in zip(gcsi_info_sel.index,gcsi_info_sel['cid']) ]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdsc1_info_sel['drug_name_common']=['GDSC1_'+x+'_'+str(y) for x,y in zip(gdsc1_info_sel['drug_name'],gdsc1_info_sel['drug_id']) ]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdsc2_info_sel['drug_name_common']=['GDSC2_'+x+'_'+str(y) for x,y in zip(gdsc2_info_sel['drug_name'],gdsc2_info_sel['drug_id']) ]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/use

In [39]:
gdsc1_info_sel.to_csv('../processed_data/GDSC1_drug_category.txt',sep='\t')
gdsc2_info_sel.to_csv('../processed_data/GDSC2_drug_category.txt',sep='\t')
ctrp1_info_sel.to_csv('../processed_data/CTRP1_drug_category.txt',sep='\t')
ctrp2_info_sel.to_csv('../processed_data/CTRP2_drug_category.txt',sep='\t')
prism_info_sel.to_csv('../processed_data/PRISM_drug_category.txt',sep='\t')
gcsi_info_sel.to_csv('../processed_data/gCSI_drug_category.txt',sep='\t')

In [41]:
# Relation to the primary information
gdsc1_info_sel['drug_anno_primary']=gdsc1_info_sel['pathway_name']
gdsc2_info_sel['drug_anno_primary']=gdsc2_info_sel['pathway_name']
ctrp2_info_sel['drug_anno_primary']=ctrp2_info_sel['target_or_activity_of_compound']
ctrp1_info_sel['drug_anno_primary']=ctrp1_info_sel['target_or_activity_of_compound']
prism_info_sel['drug_anno_primary']=prism_info_sel['moa']
gcsi_info_sel['drug_anno_primary']="None"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdsc1_info_sel['drug_anno_primary']=gdsc1_info_sel['pathway_name']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdsc2_info_sel['drug_anno_primary']=gdsc2_info_sel['pathway_name']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gcsi_info_sel['drug_anno_primary']="None"


In [42]:
add1=gdsc1_info_sel[['drug_name_common','drug_anno','drug_anno_primary','pubchem']].rename(columns={'pubchem':'cid'})
add2=gdsc2_info_sel[['drug_name_common','drug_anno','drug_anno_primary','pubchem']].rename(columns={'pubchem':'cid'})
add3=ctrp1_info_sel[['drug_name_common','drug_anno','drug_anno_primary','pubchem_cid']].rename(columns={'pubchem_cid':'cid'})
add4=ctrp2_info_sel[['drug_name_common','drug_anno','drug_anno_primary','cid']]
add5=prism_info_sel[['drug_name_common','drug_anno','drug_anno_primary','cid']]
add6=gcsi_info_sel[['drug_name_common','drug_anno','drug_anno_primary','cid']]

In [43]:
dfw=pd.concat([add1,add2,add3,add4,add5,add6])

In [44]:
Counter(dfw['drug_anno']).most_common()

[('Others', 305),
 ('PI3K/MTOR signaling', 69),
 ('RTK signaling', 57),
 ('Chromatin-related', 49),
 ('DNA inhibitor', 45),
 ('ERK MAPK signaling', 42),
 ('Antimetabolite', 41),
 ('Cell cycle', 36),
 ('Protein stability and degradation', 31),
 ('Antimicrotubule', 31),
 ('EGFR signaling', 30),
 ('Hormone-related', 29),
 ('Apoptosis regulation', 26),
 ('PARP inhibitor', 19),
 ('DNA alkylator', 19),
 ('WNT signaling', 14),
 ('Topoisomerase inhibitor', 14),
 ('Genome integrity', 12),
 ('Metabolism', 11),
 ('Platinum', 10),
 ('IGF1R signaling', 9),
 ('Cytoskeleton', 9),
 ('p53 pathway', 9),
 ('JNK and p38 signaling', 8)]

In [45]:
dfw.to_csv('../processed_data/Drug_annotaion_integrated.txt',sep='\t',index=False)