# Create and visualize basic data

In [3]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import os
#!pip install combat
from combat.pycombat import pycombat
import matplotlib.pyplot as plt
import utils as utils

    
%pylab inline
%load_ext autoreload
%autoreload 2

Populating the interactive namespace from numpy and matplotlib


In [2]:
# GDSC
gdsc_data_path = 'data/GDSC1_fitted_dose_response_25Feb20.xlsx'
gdsc_rma_proc_basalexp_path = 'data/Cell_line_RMA_proc_basalExp.txt'

# Beat-AML
beat_aml_rnaseq_path = 'data/beat_aml_rnaseq.csv'
beat_aml_auc_path = 'data/beat_aml_aucs.csv'

## GDSC data

In [3]:
gdsc_data = pd.read_excel(gdsc_data_path)
gdsc_data.head()

Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,WEBRELEASE,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE
0,GDSC1,281,12974350,683665,MC-CAR,SIDM00636,MM,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,2.395685,0.982114,0.022521,-0.189576
1,GDSC1,281,12975300,684055,ES3,SIDM00265,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,3.140923,0.984816,0.03184,0.508635
2,GDSC1,281,12975647,684057,ES5,SIDM00263,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,3.968757,0.985693,0.026052,1.284229
3,GDSC1,281,12975980,684059,ES7,SIDM00269,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,2.692768,0.972699,0.110056,0.08876
4,GDSC1,281,12976330,684062,EW-11,SIDM00203,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,2.478678,0.944462,0.087011,-0.11182


In [5]:
gdsc_rma_proc_basalexp = pd.read_csv(gdsc_rma_proc_basalexp_path,sep='\t')
gdsc_rma_proc_basalexp.head()

Unnamed: 0,GENE_SYMBOLS,GENE_title,DATA.906826,DATA.687983,DATA.910927,DATA.1240138,DATA.1240139,DATA.906792,DATA.910688,DATA.1240135,...,DATA.753584,DATA.907044,DATA.998184,DATA.908145,DATA.1659787,DATA.1298157,DATA.1480372,DATA.1298533,DATA.930299,DATA.905954.1
0,TSPAN6,tetraspanin 6 [Source:HGNC Symbol;Acc:11858],7.632023,7.548671,8.712338,7.797142,7.729268,7.074533,3.285198,6.961606,...,7.105637,3.236503,3.038892,8.373223,6.932178,8.441628,8.422922,8.089255,3.112333,7.153127
1,TNMD,tenomodulin [Source:HGNC Symbol;Acc:17757],2.964585,2.777716,2.643508,2.817923,2.957739,2.889677,2.828203,2.874751,...,2.798847,2.745137,2.976406,2.852552,2.62263,2.639276,2.87989,2.521169,2.870468,2.834285
2,DPM1,dolichyl-phosphate mannosyltransferase polypep...,10.379553,11.807341,9.880733,9.883471,10.41884,9.773987,10.264385,10.205931,...,10.486486,10.442951,10.311962,10.45483,10.418475,11.463742,10.557777,10.79275,9.873902,10.788218
3,SCYL3,SCY1-like 3 (S. cerevisiae) [Source:HGNC Symbo...,3.614794,4.066887,3.95623,4.063701,4.3415,4.270903,5.968168,3.715033,...,3.696835,4.624013,4.348524,3.858121,3.947561,4.425849,3.55039,4.443337,4.266828,4.100493
4,C1orf112,chromosome 1 open reading frame 112 [Source:HG...,3.380681,3.732485,3.23662,3.558414,3.840373,3.815055,3.011867,3.268449,...,3.726833,3.947744,3.806584,3.196988,3.814831,4.384732,4.247189,3.071359,3.230197,3.435795


### Extract inchi, inchi-key, smiles for compounds

In [39]:
unique_compounds = list(np.unique(gdsc_data['DRUG_NAME']))
use_fields = ['inchi','inchikey','canonical_smiles']
save_path = 'data/gdsc_compound_inchi_smiles.csv'
flag_redo = False
if not os.path.exists(save_path) or flag_redo:
    compound_dict = {'DRUG_NAME':[]}
    for i in tqdm(np.arange(len(unique_compounds))):
        cur_compound = unique_compounds[i]
        tmp_dict = utils.get_prop_dict_for_compound(cur_compound)
        for field in use_fields:
            if field not in compound_dict:
                compound_dict[field] = []
            try:
                cur_val = tmp_dict[field]
                compound_dict[field].append(cur_val)
            except:
                compound_dict[field].append(np.nan)
        compound_dict['DRUG_NAME'].append(cur_compound)
    pd.DataFrame(compound_dict).to_csv(save_path)

In [49]:
gdsc_smiles_data_path = 'data/gdsc_compound_inchi_smiles.csv'
gdsc_smiles_data = pd.read_csv(gdsc_smiles_data_path)
not_nan_instances = np.sum(np.array(gdsc_smiles_data['canonical_smiles'],dtype=np.str) != 'nan')
print('data for ' + str(not_nan_instances) + ' compounds [of ' +\
      str(gdsc_smiles_data.shape[0]) + ']')
gdsc_smiles_data.head()

data for 282 compounds [of 345]


Unnamed: 0.1,Unnamed: 0,DRUG_NAME,inchi,inchikey,canonical_smiles
0,0,(5Z)-7-Oxozeaenol,InChI=1S/C19H22O7/c1-11-5-3-7-14(20)18(23)15(2...,NEQZWEXWOFPKOT-BYRRXHGESA-N,CC1CC=CC(=O)C(C(CC=CC2=C(C(=CC(=C2)OC)O)C(=O)O...
1,1,5-Fluorouracil,"InChI=1S/C4H3FN2O2/c5-2-1-6-4(9)7-3(2)8/h1H,(H...",GHASVSINZRGABV-UHFFFAOYSA-N,C1=C(C(=O)NC(=O)N1)F
2,2,A-443654,InChI=1S/C24H23N5O/c1-15-22-10-16(6-7-24(22)29...,YWTBGJGMTBHQTM-IBGZPJMESA-N,CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CNC5=...
3,3,A-770041,InChI=1S/C34H39N9O3/c1-21(44)41-14-16-42(17-15...,ZMNWFTYYYCSSTF-UHFFFAOYSA-N,CC(=O)N1CCN(CC1)C2CCC(CC2)N3C4=NC=NC(=C4C(=N3)...
4,4,A-83-01,InChI=1S/C25H19N5S/c1-17-8-7-13-23(27-17)24-21...,HIJMSZGHKQPPJS-UHFFFAOYSA-N,CC1=NC(=CC=C1)C2=NN(C=C2C3=CC=NC4=CC=CC=C34)C(...


## Beat-AML data

In [6]:
beat_aml_rnaseq = pd.read_csv(beat_aml_rnaseq_path)
beat_aml_rnaseq.head()

Unnamed: 0,Gene,Symbol,13-00098,13-00118,13-00149,13-00157,13-00160,13-00165,13-00166,13-00186,...,16-01191,16-01201,16-01216,16-01219,16-01223,16-01225,16-01237,16-01254,16-01262,16-01270
0,ENSG00000000003,TSPAN6,-4.283015,-2.579843,-2.038435,-4.283015,0.52485,-1.702019,-1.988387,-2.825933,...,-1.0785,0.176421,-0.515411,-2.872676,0.104199,-1.255918,-1.73568,-3.0481,-2.370723,-2.306173
1,ENSG00000000005,TNMD,-4.283015,-4.283015,-4.283015,-4.283015,-4.283015,-4.283015,-4.283015,-4.283015,...,-4.283015,-4.283015,-4.283015,-4.283015,-4.283015,-4.283015,-4.283015,-4.283015,-4.283015,-4.283015
2,ENSG00000000419,DPM1,5.293654,4.592634,4.898245,5.029179,5.113583,4.76674,4.935838,4.830074,...,4.436039,4.955578,5.098456,4.839901,4.926395,5.169932,5.278429,4.686827,5.076239,4.054189
3,ENSG00000000457,SCYL3,3.953188,4.675118,4.524586,5.279625,3.076162,4.989021,5.354993,4.60761,...,4.224279,4.739571,3.756682,4.382689,4.399412,4.572803,2.866336,4.428773,4.044699,4.022405
4,ENSG00000000460,C1orf112,4.557584,3.644984,4.723479,3.101024,3.697294,3.642136,2.988082,3.838717,...,3.41551,3.516894,2.527958,4.08017,2.709617,4.04473,2.2909,3.405635,3.120103,2.805191


In [7]:
beat_aml_auc = pd.read_csv(beat_aml_auc_path)
beat_aml_auc.head()

Unnamed: 0,lab_id,inhibitor,auc
0,13-00098,17-AAG (Tanespimycin),230.223782
1,13-00118,17-AAG (Tanespimycin),217.469453
2,13-00149,17-AAG (Tanespimycin),206.326341
3,13-00157,17-AAG (Tanespimycin),140.603252
4,13-00160,17-AAG (Tanespimycin),138.379558


### Extract inchi, inchi-key, smiles for compounds

In [44]:
unique_compounds = list(np.unique(beat_aml_auc['inhibitor']))
use_fields = ['inchi','inchikey','canonical_smiles']
save_path = 'data/beat_aml_inhibitor_inchi_smiles.csv'
flag_redo = False
if not os.path.exists(save_path) or flag_redo:
    compound_beat_aml_dict = {'inhibitor':[]}
    for i in tqdm(np.arange(len(unique_compounds))):
        cur_compound = unique_compounds[i]
        tmp_dict = utils.get_prop_dict_for_compound(cur_compound)
        for field in use_fields:
            if field not in compound_beat_aml_dict:
                compound_beat_aml_dict[field] = []
            try:
                cur_val = tmp_dict[field]
                compound_beat_aml_dict[field].append(cur_val)
            except:
                compound_beat_aml_dict[field].append(np.nan)
        compound_beat_aml_dict['inhibitor'].append(cur_compound)
    pd.DataFrame(compound_beat_aml_dict).to_csv(save_path)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=122.0), HTML(value='')))




In [50]:
beat_aml_smiles_data_path = 'data/beat_aml_inhibitor_inchi_smiles.csv'
beat_aml_smiles_data = pd.read_csv(beat_aml_smiles_data_path)
not_nan_instances = np.sum(np.array(beat_aml_smiles_data['canonical_smiles'],dtype=np.str) != 'nan')
print('data for ' + str(not_nan_instances) + ' compounds [of ' +\
      str(beat_aml_smiles_data.shape[0]) + ']')
beat_aml_smiles_data.head()

data for 109 compounds [of 122]


Unnamed: 0.1,Unnamed: 0,inhibitor,inchi,inchikey,canonical_smiles
0,0,17-AAG (Tanespimycin),InChI=1S/C31H43N3O8/c1-8-12-33-26-21-13-17(2)1...,AYUNIORJHRXIBJ-TXHRRWQRSA-N,CC1CC(C(C(C=C(C(C(C=CC=C(C(=O)NC2=CC(=O)C(=C(C...
1,1,A-674563,InChI=1S/C22H22N4O/c1-15-21-11-17(7-8-22(21)26...,BPNUQXPIQBZCMR-IBGZPJMESA-N,CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CC=CC...
2,2,ABT-737,InChI=1S/C42H45ClN6O5S2/c1-46(2)23-22-35(30-55...,HPLNQCPCUACXLM-PGUFJCEWSA-N,CN(C)CCC(CSC1=CC=CC=C1)NC2=C(C=C(C=C2)S(=O)(=O...
3,3,AT7519,InChI=1S/C16H17Cl2N5O2/c17-10-2-1-3-11(18)13(1...,OVPNQJVDAFNBDN-UHFFFAOYSA-N,C1CNCCC1NC(=O)C2=C(C=NN2)NC(=O)C3=C(C=CC=C3Cl)Cl
4,4,AZD1480,InChI=1S/C14H14ClFN8/c1-7-3-11(24-23-7)21-13-1...,PDOQBOJDRPLBQU-QMMMGPOBSA-N,CC1=CC(=NN1)NC2=NC(=NC=C2Cl)NC(C)C3=NC=C(C=N3)F


## Pancreas data

In [10]:
pancreas_drug_data_chemo = pd.read_excel('data/organoid_pancreas/S4_199398_2_supp_4775187_p95dln.xlsx',
                                  sheet_name = 1, header = 1)
print(pancreas_drug_data_chemo.shape)
pancreas_drug_data_chemo.head()

(66, 7)


Unnamed: 0,Sample ID,Gemcitabine,Paclitaxel,SN-38,5-FU,Oxaliplatin,Insensitive
0,hF2,0.554358,0.712962,0.49995,0.782922,0.887246,
1,hF23,0.523995,0.486711,0.54786,0.781999,0.820843,
2,hF24,0.609942,0.742389,0.549167,0.671272,0.809528,
3,hF27,0.677162,0.636766,0.567865,0.838669,0.852603,YES
4,hF28,0.68501,0.662779,0.651442,0.861745,0.829213,YES


In [103]:
pancreas_id_organoid_mapping_data = pd.read_excel('data/organoid_pancreas/S1_199398_2_supp_4775186_p95dln.xlsx',
                                                 sheet_name = 1, header = 1)
number_organoid_dict = dict()
nrs = list(pancreas_id_organoid_mapping_data['#'])
organoids = list(pancreas_id_organoid_mapping_data['Organoid'])
for i in range(len(nrs)):
    number_organoid_dict[nrs[i]] = organoids[i]
pancreas_id_organoid_mapping_data.head()

Unnamed: 0,#,Patient number,Organoid,Site,Type of Sample,Sex,Age,Race,Stage,Tumor,...,WES passage,WGS,WGS passage,RNAseq,RNAseq passage,Chemo,Chemo passage,Targeted,Targeted passage,Treatment prior to organoid
0,1,1,hT1,Pancreas,Resection,M,65.0,W,2B,YES,...,25.0,NO,,YES,17,YES,"20, 22",YES,"20, 22",NO
1,2,2,hT3,Pancreas,Resection,M,35.0,W,4,YES,...,22.0,NO,,YES,"2, 12",YES,"5, 8",YES,8,YES
2,3,3,hT25,Pancreas,Resection,F,61.0,W,2B,YES,...,16.0,NO,,YES,9,YES,"6, 9",YES,15,NO
3,4,4,hT30,Pancreas,Resection,M,67.0,W,2B,YES,...,32.0,YES,5.0,YES,31,YES,29,YES,35,NO
4,5,5,hT44,Pancreas,Resection,M,68.0,W,2B,YES,...,18.0,NO,,NO,,YES,9,YES,17,NO


In [104]:
sample_path = 'data/organoid_pancreas/sample.tsv'
sample_data = pd.read_csv(sample_path,sep='\t')
case_ids = list(sample_data['case_id'])
case_sub_ids = list(sample_data['case_submitter_id'])
case_id_sub_id_mapping = dict()
for i in range(len(case_ids)):
    case_id_sub_id_mapping[case_ids[i]] = case_sub_ids[i]
sample_data.head()

Unnamed: 0,project_id,case_id,case_submitter_id,sample_id,sample_submitter_id,biospecimen_anatomic_site,biospecimen_laterality,catalog_reference,composition,current_weight,...,sample_type_id,shortest_dimension,state,time_between_clamping_and_freezing,time_between_excision_and_freezing,tissue_collection_type,tissue_type,tumor_code,tumor_code_id,tumor_descriptor
0,ORGANOID-PANCREATIC,aaece851-ced6-4816-9cf2-2fa506f2bc79,64,21646f4b-040f-4b44-82b4-2f621d7da457,S104,'--,'--,'--,3D Organoid,'--,...,'--,'--,released,'--,'--,'--,Normal,'--,'--,'--
1,ORGANOID-PANCREATIC,aaece851-ced6-4816-9cf2-2fa506f2bc79,64,7d47c77f-d2a1-4950-b6b9-336d211fa67e,S191,'--,'--,'--,3D Organoid,'--,...,'--,'--,released,'--,'--,'--,Normal,'--,'--,'--
2,ORGANOID-PANCREATIC,d277d0dd-4be9-484f-ba57-2d9ac16c6736,6,a3f6c436-04c0-41eb-acb6-d320a2a321a8,S184,'--,'--,'--,3D Organoid,'--,...,'--,'--,released,'--,'--,'--,Tumor,'--,'--,'--
3,ORGANOID-PANCREATIC,d277d0dd-4be9-484f-ba57-2d9ac16c6736,6,98faa1da-4b3b-41d2-b9eb-8ca30aa79dae,S133,'--,'--,'--,3D Organoid,'--,...,'--,'--,released,'--,'--,'--,Tumor,'--,'--,'--
4,ORGANOID-PANCREATIC,1c49e2cd-e5f3-4fbd-a8f0-4714ed8fb818,44,544b8fb0-a8f8-4323-9c4c-da9ab460516f,S78,'--,'--,'--,3D Organoid,'--,...,'--,'--,released,'--,'--,'--,Tumor,'--,'--,'--


In [101]:
file_id_to_sample_path = 'data/organoid_pancreas/file_ids_to_sample_ids.txt'
file_id_to_sample_df = pd.read_csv(file_id_to_sample_path,sep='\t')
print(file_id_to_sample_df.shape)
file_id_to_sample_df.head()

(220, 4)


Unnamed: 0,cases.0.case_id,file_id,file_name,id
0,d7f95afc-d9d9-497e-b7ba-cdaec4d300b4,cf80f48f-7ecb-41e5-8aae-726c69344ef8,84bba63e-1ed1-4832-9559-d7752cc741b2.htseq_cou...,cf80f48f-7ecb-41e5-8aae-726c69344ef8
1,08f63445-b236-4f97-b76a-2e5f4b3c5ff5,1ba65919-7c51-4dd8-8c34-c404fb6fc325,40c3d223-a76c-4433-a18a-ca45d03072fa.FPKM-UQ.t...,1ba65919-7c51-4dd8-8c34-c404fb6fc325
2,51ef4f43-6fb7-45c2-86fc-40c359af4ea8,18f62c21-1343-4e18-8d48-cd69a3300344,9d26edab-7937-423c-8b5a-ac333211a29b.FPKM.txt.gz,18f62c21-1343-4e18-8d48-cd69a3300344
3,eee1adb4-f965-43a2-8037-ff706c37e0cc,0a0d2eee-27c9-4d14-9c25-899acde4a743,cef42a2e-ed4c-488e-aba4-41aad7bc52f9.rna_seq.s...,0a0d2eee-27c9-4d14-9c25-899acde4a743
4,eee1adb4-f965-43a2-8037-ff706c37e0cc,54ec2e9f-d716-4b67-b10f-e85bc841e62e,cdd8996e-fb92-4499-83fa-ef393afbe86c.FPKM-UQ.t...,54ec2e9f-d716-4b67-b10f-e85bc841e62e


In [109]:
file_name_organid_dict = dict()
filenames = list(file_id_to_sample_df['file_name'])
ids       = list(file_id_to_sample_df['cases.0.case_id'])
for i in range(len(filenames)):
    file_name_organid_dict[filenames[i]] = number_organoid_dict[case_id_sub_id_mapping[ids[i]]]

In [110]:
file_name_organid_dict

{'84bba63e-1ed1-4832-9559-d7752cc741b2.htseq_counts.txt.gz': 'hT96',
 '40c3d223-a76c-4433-a18a-ca45d03072fa.FPKM-UQ.txt.gz': 'hM17D',
 '9d26edab-7937-423c-8b5a-ac333211a29b.FPKM.txt.gz': 'hT108',
 'cef42a2e-ed4c-488e-aba4-41aad7bc52f9.rna_seq.star_gene_counts.tsv.gz': 'hM1A',
 'cdd8996e-fb92-4499-83fa-ef393afbe86c.FPKM-UQ.txt.gz': 'hM1A',
 '955f9541-b394-4b28-90de-9101d24f72ad.FPKM.txt.gz': 'hF31',
 'ec21019e-472d-4f1f-8cfc-bf4dd8bee7ab.FPKM-UQ.txt.gz': 'hM21F',
 'f888eea4-4fda-4f9f-a8cb-66e8219ce268.FPKM.txt.gz': 'hM19C',
 'f7b2476f-2087-4e10-8571-bd8e1fcbb835.htseq_counts.txt.gz': 'hM1A',
 '955f9541-b394-4b28-90de-9101d24f72ad.FPKM-UQ.txt.gz': 'hF31',
 '97512c95-1685-46a9-88db-071766889490.FPKM-UQ.txt.gz': 'hN31',
 'd4f8538e-93d1-40ba-9c89-ad59ab96c1f7.FPKM.txt.gz': 'hF39',
 '4584bfa8-d55e-4bdc-a045-5b036c653a2a.htseq_counts.txt.gz': 'hM1F',
 '0cb86557-2800-425b-9593-10dbafc99f02.rna_seq.star_gene_counts.tsv.gz': 'hF28',
 '1c75df78-5aca-4280-a22b-044670ba86ab.htseq_counts.txt.gz': 'h

In [115]:
file_name_organid_save_path = 'data/organoid_pancreas/file_name_to_organoid_mapping.tsv'
save_filenames = []
save_organoids = []
for key in file_name_organid_dict:
    save_filenames.append(key)
    save_organoids.append(file_name_organid_dict[key])
pd.DataFrame({'filename':save_filenames,
              'organoid':save_organoids}).to_csv(file_name_organid_save_path,index=None,sep='\t')

In [15]:
drug_list = list(pancreas_drug_data_chemo.keys())

In [11]:
pancreas_drug_data_targeted = pd.read_excel('data/organoid_pancreas/S4_199398_2_supp_4775187_p95dln.xlsx',
                                  sheet_name = 2, header = 1)
print(pancreas_drug_data_targeted.shape)
pancreas_drug_data_targeted.head()

(66, 22)


Unnamed: 0,Sample ID,Selumetinib,Afatinib,Bortezomib,Lapatinib,Sunitinib,Olaparib,Nutlin-3,MK-2206,KU-55933,...,Ruxolitinib,SB5225334,SGI-1776,OSI-420,TPCA-1,LY2874455,SF1670,K-ras(G12C) Inhibitor 9,WIKI4,Disulfuram
0,hF2,0.728,0.772,0.25,0.75,0.74,0.716,0.8,0.756,0.812,...,0.86,0.786,0.788,0.806,0.784,0.654,0.734,0.824,0.698,0.538
1,hF23,0.692,0.548,0.232,0.762,0.814,0.856,0.928,0.752,0.876,...,0.848,0.748,0.844,0.7,0.764,0.63,0.776,0.85,0.838,0.516
2,hF24,0.758,0.632,0.238,0.862,0.916,0.854,0.91,0.78,0.842,...,0.9,0.838,0.862,0.798,0.838,0.678,0.854,0.83,0.858,0.578
3,hF27,0.816,0.828,0.274,0.816,0.78,0.854,0.872,0.798,0.786,...,0.862,0.828,0.854,0.794,0.74,0.722,0.696,0.818,0.798,0.572
4,hF28,0.7,0.668,0.236,0.842,0.844,0.838,0.856,0.768,0.866,...,0.886,0.668,0.794,0.838,0.824,0.57,0.766,0.826,0.818,0.47


In [22]:
drug_list += list(pancreas_drug_data_targeted.keys())

In [12]:
pancreas_drug_data_targeted_for_chem_refractory = pd.read_excel('data/organoid_pancreas/S4_199398_2_supp_4775187_p95dln.xlsx',
                                  sheet_name = 3, header = 1)
print(pancreas_drug_data_targeted_for_chem_refractory.shape)
pancreas_drug_data_targeted_for_chem_refractory.head()

(22, 29)


Unnamed: 0,Sample ID,Basal-like?,Gemcitabine,Paclitaxel,SN-38,5-FU,Oxaliplatin,Selumetinib,Afatinib,Bortezomib,...,SB5225334,SGI-1776,OSI-420,TPCA-1,LY2874455,SF1670,K-ras(G12C) Inhibitor 9,WIKI4,Disulfuram,Alternative?
0,hF27,,0.677162,0.636766,0.567865,0.838669,0.852603,0.816,0.828,0.274,...,0.828,0.854,0.794,0.74,0.722,0.696,0.818,0.798,0.572,YES
1,hF28,,0.68501,0.662779,0.651442,0.861745,0.829213,0.7,0.668,0.236,...,0.668,0.794,0.838,0.824,0.57,0.766,0.826,0.818,0.47,YES
2,hF45,,0.653679,0.680984,0.715255,0.855193,0.918398,0.814,0.698,0.256,...,0.842,0.77,0.828,0.812,0.698,0.794,0.792,0.846,0.554,NO
3,hF70,,0.917493,0.759485,0.880286,0.971617,0.98154,0.802,0.844,0.274,...,0.768,0.798,0.826,0.882,0.764,0.798,0.832,0.882,0.564,YES
4,hF77,,0.743855,0.596951,0.658731,0.764028,0.893167,0.786,0.722,0.264,...,0.666,0.768,0.824,0.84,0.662,0.894,0.816,0.746,0.534,YES


In [23]:
drug_list += list(pancreas_drug_data_targeted_for_chem_refractory.keys())

In [24]:
print(list(set(drug_list)))

['Insensitive', 'Paclitaxel', 'Bortezomib', 'WIKI4', 'Disulfuram', 'SF1670', 'Selumetinib', 'Sunitinib', 'Everolimus', 'Sample ID', 'Lapatinib', 'Nutlin-3', 'LY2874455', 'TPCA-1', 'Gemcitabine', 'Afatinib', 'Basal-like?', '5-FU', 'MK-2206', 'Olaparib', 'Ruxolitinib', 'OSI-420', 'K-ras(G12C) Inhibitor 9', 'KU-55933', 'SB5225334', 'SGI-1776', 'Alternative?', 'Oxaliplatin', 'Celecoxib', 'SN-38']


### Extract inchi, inchi-key, smiles for compounds

In [25]:
unique_compounds = list(np.unique(drug_list))
use_fields = ['inchi','inchikey','canonical_smiles']
save_path = 'data/organoid_pancreas/pancreas_inhibitor_inchi_smiles.csv'
flag_redo = False
if not os.path.exists(save_path) or flag_redo:
    compound_pancreas_dict = {'inhibitor':[]}
    for i in tqdm(np.arange(len(unique_compounds))):
        cur_compound = unique_compounds[i]
        tmp_dict = utils.get_prop_dict_for_compound(cur_compound)
        for field in use_fields:
            if field not in compound_pancreas_dict:
                compound_pancreas_dict[field] = []
            try:
                cur_val = tmp_dict[field]
                compound_pancreas_dict[field].append(cur_val)
            except:
                compound_pancreas_dict[field].append(np.nan)
        compound_pancreas_dict['inhibitor'].append(cur_compound)
    pd.DataFrame(compound_pancreas_dict).to_csv(save_path)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=30.0), HTML(value='')))




In [121]:
pancreas_smiles_data_path = 'data/organoid_pancreas/pancreas_inhibitor_inchi_smiles.csv'
pancreas_smiles_data = pd.read_csv(pancreas_smiles_data_path)
not_nan_instances = np.sum(np.array(pancreas_smiles_data['canonical_smiles'],dtype=np.str) != 'nan')
print('data for ' + str(not_nan_instances) + ' compounds [of ' +\
      str(pancreas_smiles_data.shape[0]) + ']')
pancreas_smiles_data.head()

data for 25 compounds [of 30]


Unnamed: 0.1,Unnamed: 0,inhibitor,inchi,inchikey,canonical_smiles
0,0,5-FU,"InChI=1S/C4H3FN2O2/c5-2-1-6-4(9)7-3(2)8/h1H,(H...",GHASVSINZRGABV-UHFFFAOYSA-N,C1=C(C(=O)NC(=O)N1)F
1,1,Afatinib,InChI=1S/C24H25ClFN5O3/c1-31(2)8-3-4-23(32)30-...,ULXXDDBFHOBEHA-CWDCEQMOSA-N,CN(C)CC=CC(=O)NC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC(...
2,2,Alternative?,,,
3,3,Basal-like?,,,
4,4,Bortezomib,InChI=1S/C19H25BN4O4/c1-13(2)10-17(20(27)28)24...,GXJABQQUPOEUTA-RDJZCZTQSA-N,B(C(CC(C)C)NC(=O)C(CC1=CC=CC=C1)NC(=O)C2=NC=CN...


In [122]:
use_drugs = []
all_drugs = list(pancreas_smiles_data['inhibitor'])
all_inchis = list(pancreas_smiles_data['inchi'])
for i in range(len(all_inchis)):
    if str(all_inchis[i]) != 'nan':
        use_drugs.append(all_drugs[i])
print('number of used drugs: ' + str(len(use_drugs)))

number of used drugs: 25


### Create label file for data points with gene data

In [123]:
pancreas_drug_data_chemo.head()

Unnamed: 0,Sample ID,Gemcitabine,Paclitaxel,SN-38,5-FU,Oxaliplatin,Insensitive
0,hF2,0.554358,0.712962,0.49995,0.782922,0.887246,
1,hF23,0.523995,0.486711,0.54786,0.781999,0.820843,
2,hF24,0.609942,0.742389,0.549167,0.671272,0.809528,
3,hF27,0.677162,0.636766,0.567865,0.838669,0.852603,YES
4,hF28,0.68501,0.662779,0.651442,0.861745,0.829213,YES


In [140]:
# merge the date using the key
gt_data = pancreas_drug_data_targeted_for_chem_refractory
gt_data = gt_data[['Sample ID'] + use_drugs]
gt_data.head()

Unnamed: 0,Sample ID,5-FU,Afatinib,Bortezomib,Celecoxib,Disulfuram,Everolimus,Gemcitabine,K-ras(G12C) Inhibitor 9,KU-55933,...,Oxaliplatin,Paclitaxel,Ruxolitinib,SF1670,SGI-1776,SN-38,Selumetinib,Sunitinib,TPCA-1,WIKI4
0,hF27,0.838669,0.828,0.274,0.798,0.572,0.764,0.677162,0.818,0.786,...,0.852603,0.636766,0.862,0.696,0.854,0.567865,0.816,0.78,0.74,0.798
1,hF28,0.861745,0.668,0.236,0.89,0.47,0.75,0.68501,0.826,0.866,...,0.829213,0.662779,0.886,0.766,0.794,0.651442,0.7,0.844,0.824,0.818
2,hF45,0.855193,0.698,0.256,0.86,0.554,0.864,0.653679,0.792,0.872,...,0.918398,0.680984,0.82,0.794,0.77,0.715255,0.814,0.8,0.812,0.846
3,hF70,0.971617,0.844,0.274,0.746,0.564,0.724,0.917493,0.832,0.83,...,0.98154,0.759485,0.888,0.798,0.798,0.880286,0.802,0.804,0.882,0.882
4,hF77,0.764028,0.722,0.264,0.84,0.534,0.894,0.743855,0.816,0.826,...,0.893167,0.596951,0.9,0.894,0.768,0.658731,0.786,0.858,0.84,0.746


In [154]:
pancreas_value_save_path = 'data/organoid_pancreas/organoid_value.tsv'
organoids    = []
inhibitor = []
value     = []

data_dfs = [pancreas_drug_data_chemo,
            pancreas_drug_data_targeted,
            pancreas_drug_data_targeted_for_chem_refractory]
# used to store used combinations
used_combos = set()
for gt_data in data_dfs:
    organs = list(gt_data['Sample ID'])
    drugs = list(gt_data.columns[1:])
    values = np.array(gt_data.values[:,1:])
    print(values.shape)
    for i in range(len(organs)):
        cur_pat = organs[i]
        for j in range(len(drugs)):
            cur_drug = drugs[j]
            if cur_drug in use_drugs:
                cur_val = values[i,j]
                if np.isnan(cur_val):
                    continue
                cur_combo = cur_pat + '_' + cur_drug
                if cur_combo in used_combos:
                    continue
                used_combos.add(cur_combo)
                organoids.append(cur_pat)
                inhibitor.append(cur_drug)
                value.append(cur_val)

(66, 6)
(66, 21)
(22, 28)


In [155]:
print('number of organoids: ' + str(len(organoids)))
print('number of inhibitor: ' + str(len(inhibitor)))
print('number of value: ' + str(len(value)))

number of organoids: 1645
number of inhibitor: 1645
number of value: 1645


In [156]:
pancread_value_data = pd.DataFrame({'organoid':organoids,
                                'inhibitor':inhibitor,
                                'value':value})
pancread_value_data.to_csv(pancreas_value_save_path,sep='\t',index=None)

In [39]:
fpkm_path = 'data/organoid_pancreas/pancreas_organoid_fileids_fpkm.tsv'
fpkm_data = pd.read_csv(fpkm_path,sep='\t')
print(fpkm_data.shape)
fpkm_data.head()

(55, 8)


Unnamed: 0,Access,File Name,Cases,Project,Data Category,Data Format,File Size,Annotations
0,open,9d26edab-7937-423c-8b5a-ac333211a29b.FPKM.txt.gz,1,ORGANOID-PANCREATIC,Transcriptome Profiling,TXT,303.81 KB,0
1,open,955f9541-b394-4b28-90de-9101d24f72ad.FPKM.txt.gz,1,ORGANOID-PANCREATIC,Transcriptome Profiling,TXT,306.34 KB,0
2,open,f888eea4-4fda-4f9f-a8cb-66e8219ce268.FPKM.txt.gz,1,ORGANOID-PANCREATIC,Transcriptome Profiling,TXT,301.9 KB,0
3,open,d4f8538e-93d1-40ba-9c89-ad59ab96c1f7.FPKM.txt.gz,1,ORGANOID-PANCREATIC,Transcriptome Profiling,TXT,304.1 KB,0
4,open,dec5e5a1-09c1-404c-8f50-ff351412dc31.FPKM.txt.gz,1,ORGANOID-PANCREATIC,Transcriptome Profiling,TXT,310.21 KB,0


In [158]:
fpkm_data = pd.read_csv('data/organoid_pancreas/organoid_pancreas_fpkm.txt',sep='\t')
fpkm_data

Unnamed: 0,id_0,11c215d3-3a8d-43e5-8227-f8e993e1e021.FPKM.txt,9d26edab-7937-423c-8b5a-ac333211a29b.FPKM.txt,6b2583f1-bbea-4020-ad73-8a59589b8cc3.FPKM.txt,044f8701-5e9f-4300-beb2-41fef8096685.FPKM.txt,30520ecd-c6bd-4039-9b1a-d3f999235598.FPKM.txt,97512c95-1685-46a9-88db-071766889490.FPKM.txt,8a84b3e2-92a0-4c2b-986b-0efa8a22a8f3.FPKM.txt,dec5e5a1-09c1-404c-8f50-ff351412dc31.FPKM.txt,30471596-3380-456a-8e55-2925b108e017.FPKM.txt,...,d7c7d4b8-19a5-4926-b215-067bd1d16c5a.FPKM.txt,80b9fcf4-185c-4e52-bcff-8ab52ab4bfa5.FPKM.txt,33aea564-8086-4b2d-a28d-eccc379ecfa6.FPKM.txt,8236bc9d-2156-4991-a5eb-223e02229b3a.FPKM.txt,d4f8538e-93d1-40ba-9c89-ad59ab96c1f7.FPKM.txt,719f7392-0c7b-4529-99c3-b3db6c42b382.FPKM.txt,dd3f3d1f-12b8-4cc7-9f4c-105faef04a51.FPKM.txt,38b05dca-6d3f-4999-aded-0870bfd8f7e7.FPKM.txt,7a33674a-4f33-452b-9ac5-cf692bd50229.FPKM.txt,fc29320e-ef34-49e4-97a8-642487dcc95a.FPKM.txt
0,ENSG00000000003.13,9.0100,5.6222,5.8812,12.9486,16.2632,15.9627,15.2014,14.3398,9.7621,...,7.5498,14.5059,9.5317,18.7515,9.0659,5.8760,10.0711,15.5382,8.1557,6.9453
1,ENSG00000000005.5,0.0000,0.0000,0.0000,0.0000,0.0304,0.0000,0.0000,0.0000,0.0149,...,0.0136,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0146,0.0000,0.0000
2,ENSG00000000419.11,48.2772,41.1670,61.5613,81.6943,36.3870,45.3314,74.0640,90.4289,39.7451,...,50.8754,37.8825,61.9678,85.2760,64.1764,49.1493,38.6678,40.0392,64.8625,77.9880
3,ENSG00000000457.12,4.2398,3.3293,2.8488,5.2604,2.5943,3.8542,2.8813,5.0461,3.5407,...,4.9141,2.6460,4.3635,3.1361,2.7565,3.4693,3.8800,2.2515,5.8210,3.3545
4,ENSG00000000460.15,3.5345,3.5172,2.5072,7.0305,0.8210,2.7092,4.8934,6.4371,1.7642,...,4.0045,2.0781,5.6139,5.4499,2.6231,3.2286,1.7451,1.2433,2.2674,5.2872
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60478,ENSGR0000275287.3,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
60479,ENSGR0000276543.3,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
60480,ENSGR0000277120.3,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
60481,ENSGR0000280767.1,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000


In [159]:
import mygene as mygene
mg = mygene.MyGeneInfo()

In [161]:
from tqdm.notebook import tqdm
genes = list(fpkm_data['id_0'])
names = []
symbols = []
for i in tqdm(np.arange(len(genes))):
    cur_gene = genes[i]
    try:
        cur_gene_dict = mg.getgene(cur_gene.split('.')[0],fields='name,symbol')
        cur_symbol = cur_gene_dict['symbol']
        cur_name = cur_gene_dict['name']
        names.append(cur_name)
        symbols.append(cur_symbol)
    except:
        names.append(np.nan)
        symbols.append(np.nan)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=60483.0), HTML(value='')))

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)






In [162]:
save_path = 'data/organoid_pancreas/gene_name_symbol_mapping.tsv'
save_data = pd.DataFrame({'gene':genes[0:len(names)],
                          'symbol':symbols,
                          'name':names}).to_csv(save_path,index=None,sep='\t')

## Xenografts data

In [42]:
xenografts_data = pd.read_csv('data/lung_cancer_xenografts/sample_info.txt',sep='\t')
print(xenografts_data.shape)
xenografts_data.head()

(53, 31)


Unnamed: 0,sample_name,file_name,group,patient,pat_id,input,9mb_filename,histology,age,sex,...,stage2,stage3,stage4,stage5,treatment,etp_chs,carpl_chs,gem_chs,paltx_chs,erlo_chs
0,3T,3T_MRS447_ACTTGA_L002_bwa_hg19_unique_sorted.bam,Tumour,Pat3,7166.0,,/project/43/SOLiD/PREDICT/bioscope/7166LuCa3TU...,LCC,70.0,M,...,pN2,cM0,G3,R0,No,++,++,+,-,-
1,4T,4T_MRS449_GGCTAC_L002_bwa_hg19_unique_sorted.bam,Tumour,Pat4,7177.0,mpimg_L5312-1_Pat4-7177-tumor_CGATGT_L008_bwa_...,/project/43/SOLiD/PREDICT/bioscope/7177LuCa4TU...,SQC,42.0,M,...,ypN2,cM0,G3,R0,5 cycles carboplatin/paclitaxel,-,++,+++,-,++
2,5T,5T_MRS403_ACTTGA_L001_bwa_hg19_unique_sorted.bam,Tumour,Pat5,7187.0,mpimg_L5316-1_Pat5-7187-tumor_GCCAAT_L008_bwa_...,/project/43/SOLiD/PREDICT/bioscope/7187LuCa5TU...,PLC,38.0,F,...,pN0,cM0,G3,R0,No,-,++,+++,-,-
3,6T,6T_MRS403_GGCTAC_L001_bwa_hg19_unique_sorted.bam,Tumour,Pat6,7198.0,mpimg_L5318-1_Pat6-7198-tumor_ACTTGA_L008_bwa_...,/project/43/SOLiD/PREDICT/bioscope/7198LuCa6TU...,ADC,60.0,F,...,pN0,cM0,G3,R0,No,-,+,+,+,-
4,7T,7T_MRS475_ACTTGA_L001_bwa_hg19_unique_sorted.bam,Tumour,Pat7,7298.0,,/project/43/SOLiD/PREDICT/bioscope/7298LuCa7TU...,SQC,73.0,M,...,pN2,cM0,G3,R1,No,+,+,++++,-,


In [72]:
np.unique(xenografts_data['pat_id'])

array([7166., 7177., 7187., 7198., 7298., 7336., 7343., 7406., 7414.,
       7433., 7462., 7466., 7506., 7530., 7558., 7612., 7668., 7700.,
       7747., 7766., 7860., 7913., 9643.,   nan,   nan,   nan,   nan])

In [62]:
gene_data_path = 'data/lung_cancer_xenografts/normalized_expression.remoatFiltered.geneLevel.txt'
gene_data = pd.read_csv(gene_data_path,sep='\t')
gene_data.head()

Unnamed: 0,HGNC,Entrez,7177.N_9347469042_C.intensity,7177.N_9347469042_C.detectionPvalue,7177.X_9347469042_D.intensity,7177.X_9347469042_D.detectionPvalue,7187.N_9347469042_E.intensity,7187.N_9347469042_E.detectionPvalue,7187.X_9347469042_F.intensity,7187.X_9347469042_F.detectionPvalue,...,10711.X_9481417016_H.intensity,10711.X_9481417016_H.detectionPvalue,10684.X_9481417016_I.intensity,10684.X_9481417016_I.detectionPvalue,10855.X_9481417016_J.intensity,10855.X_9481417016_J.detectionPvalue,10872.X_9481417016_K.intensity,10872.X_9481417016_K.detectionPvalue,11187.X_9481417016_L.intensity,11187.X_9481417016_L.detectionPvalue
0,7A5,346389,132.667209,0.287013,129.242318,0.474026,140.573763,0.051948,143.664824,0.025974,...,146.627596,0.000427,126.215411,0.831135,125.411244,0.880294,154.101187,0.000496,134.972453,0.18076
1,A1BG,1,130.056266,0.409091,127.412775,0.601299,137.783727,0.088312,129.688301,0.448052,...,133.145629,0.302001,129.843623,0.497602,130.534919,0.441108,131.158108,0.419154,125.174744,0.937832
2,A1CF,29974,127.681633,0.555844,129.255691,0.481818,128.692008,0.478355,130.113771,0.444156,...,133.79317,0.275176,132.84554,0.373295,123.976642,0.980969,129.734518,0.567463,133.081947,0.433837
3,A26C3,23784,131.0842,0.475758,129.942883,0.435065,126.868342,0.607792,122.097278,0.867532,...,132.033661,0.361921,128.01457,0.662429,129.431746,0.452032,131.062658,0.501986,133.528947,0.446474
4,A2BP1,54715,127.507817,0.565584,129.514538,0.459091,128.363275,0.509416,127.956717,0.566558,...,126.420781,0.775223,131.994498,0.372284,124.248804,0.950678,133.418111,0.333986,127.600072,0.616958


In [65]:
patient_column = ['pat_id']
drug_names = ['Carboplatin','Gemcitabin','Etoposid','Paclitaxel','Erlotinib','Cetuximab','Bevacizumab']
save_drug_response_path = 'data/lung_cancer_xenografts/drug_response.tsv'
save_data = xenografts_data[patient_column + drug_names]
save_data.head()
save_data.to_csv(save_drug_response_path,sep='\t',index=None)

In [74]:
drug_list = list(drug_names)

#### save data in format pat_id, drug, value

In [92]:
drugs    = list(save_data.columns)[1:]
patients = list(np.array(save_data['pat_id']))
values   = np.array(save_data.values[:,1:])
print(len(drugs))
print(len(patients))
print(values.shape)

7
53
(53, 7)


In [97]:
pat_id    = []
inhibitor = []
value     = []
for i in range(len(patients)):
    cur_pat = patients[i]
    for j in range(len(drugs)):
        cur_drug = drugs[j]
        cur_val = values[i,j]
        if np.isnan(cur_pat):
            continue
        pat_id.append(cur_pat)
        inhibitor.append(cur_drug)
        value.append(cur_val)
xeno_value_data = pd.DataFrame({'pat_id':pat_id,
                                'inhibitor':inhibitor,
                                'value':value})
save_xenografts_response_path = 'data/lung_cancer_xenografts/xenografts_value.tsv'
xeno_value_data.to_csv(save_xenografts_response_path,sep='\t',index=None)

### Extract inchi, inchi-key, smiles for compounds

In [56]:
unique_compounds = list(np.unique(drug_list))
use_fields = ['inchi','inchikey','canonical_smiles']
save_path = 'data/lung_cancer_xenografts/xenografts_inhibitor_inchi_smiles.csv'
flag_redo = False
if not os.path.exists(save_path) or flag_redo:
    compound_xenografts_dict = {'inhibitor':[]}
    for i in tqdm(np.arange(len(unique_compounds))):
        cur_compound = unique_compounds[i]
        tmp_dict = utils.get_prop_dict_for_compound(cur_compound)
        for field in use_fields:
            if field not in compound_xenografts_dict:
                compound_xenografts_dict[field] = []
            try:
                cur_val = tmp_dict[field]
                compound_xenografts_dict[field].append(cur_val)
            except:
                compound_xenografts_dict[field].append(np.nan)
        compound_xenografts_dict['inhibitor'].append(cur_compound)
    pd.DataFrame(compound_xenografts_dict).to_csv(save_path)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=7.0), HTML(value='')))




In [98]:
xenografts_smiles_data_path = 'data/lung_cancer_xenografts/xenografts_inhibitor_inchi_smiles.csv'
xenografts_smiles_data = pd.read_csv(xenografts_smiles_data_path)
not_nan_instances = np.sum(np.array(xenografts_smiles_data['canonical_smiles'],dtype=np.str) != 'nan')
print('data for ' + str(not_nan_instances) + ' compounds [of ' +\
      str(xenografts_smiles_data.shape[0]) + ']')
xenografts_smiles_data.head()

data for 3 compounds [of 7]


Unnamed: 0.1,Unnamed: 0,inhibitor,inchi,inchikey,canonical_smiles
0,0,Bevacizumab,,,
1,1,Carboplatin,InChI=1S/C6H8O4.2H2N.Pt/c7-4(8)6(5(9)10)2-1-3-...,VSRXQHXAPYXROS-UHFFFAOYSA-N,C1CC(C1)(C(=O)O)C(=O)O.[NH2-].[NH2-].[Pt+2]
2,2,Cetuximab,,,
3,3,Erlotinib,InChI=1S/C22H23N3O4/c1-4-16-6-5-7-17(12-16)25-...,AAKJLRGGTJKAMG-UHFFFAOYSA-N,COCCOC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC=CC(=C3)C#C...
4,4,Etoposid,,,


## CCLE data

In [4]:
ccle_cell_line_annotations = pd.read_csv('data/ccle/CCLE_sample_info_file_2012-10-18.txt',sep='\t')
print(ccle_cell_line_annotations.shape)
ccle_cell_line_annotations.head()

(1046, 13)


Unnamed: 0,CCLE name,Cell line primary name,Cell line aliases,Gender,Site Primary,Histology,Hist Subtype1,Notes,Source,Expression arrays,SNP arrays,Oncomap,Hybrid Capture Sequencing
0,1321N1_CENTRAL_NERVOUS_SYSTEM,1321N1,,M,central_nervous_system,glioma,astrocytoma,"Identical lines: U-118 MG, U-138 MG and 1321N1...",ECACC,NIECE_p_NCLE_RNA3_HG-U133_Plus_2_B06_296024,HONEY_p_NCLE_DNAAffy3_S_GenomeWideSNP_6_E09_29...,yes,
1,143B_BONE,143B,,F,bone,osteosarcoma,NS,"Identical lines: HTK-, HOS and 143B share high...",ATCC,MAKER_p_NCLE_RNA7_HG-U133_Plus_2_F09_454702,BOWER_p_NCLE_DNAAffy8_GenomeWideSNP_6_D02_464552,yes,
2,22RV1_PROSTATE,22Rv1,,M,prostate,carcinoma,NS,,ATCC,NIECE_p_NCLE_RNA3_HG-U133_Plus_2_F06_296120,LIMPS_p_NCLE_DNA2N_GenomeWideSNP_6_C09_246674,yes,yes
3,2313287_STOMACH,23132/87,,M,stomach,carcinoma,adenocarcinoma,,DSMZ,WATCH_p_NCLE_RNA8_HG-U133_Plus_2_E11_474718,CHARY_p_NCLE_DNAAffy9_GenomeWideSNP_6_D06_490336,yes,yes
4,253JBV_URINARY_TRACT,253J-BV,,U,urinary_tract,carcinoma,transitional_cell_carcinoma,Identical lines: 253J and 253J-BV share high S...,KCLB,CRAZY_p_NCLE_RNA10_HG-U133_Plus_2_A05_569490,,yes,yes


In [6]:
pharm_drug_data = pd.read_csv('data/ccle/CCLE_NP24.2009_Drug_data_2015.02.24.csv',sep=',')
print(pharm_drug_data.shape)
pharm_drug_data.head()

(11670, 13)


Unnamed: 0,CCLE Cell Line Name,Primary Cell Line Name,Compound,Target,Doses (uM),Activity Data (median),Activity SD,Num Data,FitType,EC50 (uM),IC50 (uM),Amax,ActArea
0,1321N1_CENTRAL_NERVOUS_SYSTEM,1321N1,AEW541,IGF1R,".0025,.0080,.025,.080,.25,.80,2.53,8","8.67,11.0,2.16,.27,-10,-13,-26,-43","3.31,3.72,5.36,4.67,13.1,.18,2.42,7.51",8,Sigmoid,8.717774,8.0,-42.558014,0.7124
1,22RV1_PROSTATE,22Rv1,AEW541,IGF1R,".0025,.0080,.025,.080,.25,.80,2.53,8",".94,12.5,-14,4.16,-25,-32,-52,-71","1.95,13.3,6.98,21.8,16.0,18.8,4.84,7.93",8,Sigmoid,8.165164,2.329924,-71.58934,1.6723
2,42MGBA_CENTRAL_NERVOUS_SYSTEM,42-MG-BA,AEW541,IGF1R,".0025,.0080,.025,.080,.25,.80,2.53,8","8.91,8.39,-3.5,12.4,-.55,-6.2,-48,-63","13.7,7.70,11.1,6.43,24.0,9.57,9.57,10.4",8,Sigmoid,1.514508,2.68213,-63.491371,1.1852
3,5637_URINARY_TRACT,5637,AEW541,IGF1R,".0025,.0080,.025,.080,.25,.80,2.53,8","2.15,9.91,-3.5,.056,-2.1,-14,-30,-62","4.05,9.75,12.7,4.36,11.0,10.0,24.6,.14",8,Sigmoid,8.006595,5.002314,-62.352776,0.9948
4,639V_URINARY_TRACT,639-V,AEW541,IGF1R,".0025,.0080,.025,.080,.25,.80,2.53,8","11.8,-7.3,-9.4,-15,-11,-21,-53,-50",".95,5.67,11.1,.68,31.6,22.3,1.45,3.08",8,Sigmoid,0.931196,1.736181,-51.959808,1.5436


In [13]:
pat_id    = []
inhibitor = []
value     = []
patients  = list(pharm_drug_data['CCLE Cell Line Name'])
drugs     = list(pharm_drug_data['Compound'])
values    = list(pharm_drug_data['IC50 (uM)'])
for i in range(len(patients)):
    cur_pat = patients[i]
    cur_drug = drugs[i]
    cur_val = values[i]
    pat_id.append(cur_pat)
    inhibitor.append(cur_drug)
    value.append(cur_val)
ccle_value_data = pd.DataFrame({'pat_id':pat_id,
                                'inhibitor':inhibitor,
                                'value':value})
save_ccle_response_path = 'data/ccle/ccle_value.tsv'
ccle_value_data.to_csv(save_ccle_response_path,sep='\t',index=None)

### Extract inchi, inchi-key, smiles for compounds

In [11]:
drug_list = list(pharm_drug_data['Compound'])
unique_compounds = list(np.unique(drug_list))
use_fields = ['inchi','inchikey','canonical_smiles']
save_path = 'data/ccle/ccle_inhibitor_inchi_smiles.csv'
flag_redo = False
if not os.path.exists(save_path) or flag_redo:
    compound_ccle_dict = {'inhibitor':[]}
    for i in tqdm(np.arange(len(unique_compounds))):
        cur_compound = unique_compounds[i]
        tmp_dict = utils.get_prop_dict_for_compound(cur_compound)
        for field in use_fields:
            if field not in compound_ccle_dict:
                compound_ccle_dict[field] = []
            try:
                cur_val = tmp_dict[field]
                compound_ccle_dict[field].append(cur_val)
            except:
                compound_ccle_dict[field].append(np.nan)
        compound_ccle_dict['inhibitor'].append(cur_compound)
    pd.DataFrame(compound_ccle_dict).to_csv(save_path)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=24.0), HTML(value='')))




In [12]:
pd.DataFrame(compound_ccle_dict).head()

Unnamed: 0,inhibitor,inchi,inchikey,canonical_smiles
0,17-AAG,InChI=1S/C31H43N3O8/c1-8-12-33-26-21-13-17(2)1...,AYUNIORJHRXIBJ-TXHRRWQRSA-N,CC1CC(C(C(C=C(C(C(C=CC=C(C(=O)NC2=CC(=O)C(=C(C...
1,AEW541,InChI=1S/C27H29N5O/c28-26-25-24(21-8-4-9-23(14...,AECDBHGVIIRMOI-UHFFFAOYSA-N,C1CN(C1)CC2CC(C2)N3C=C(C4=C(N=CN=C43)N)C5=CC(=...
2,AZD0530,InChI=1S/C27H32ClN5O5/c1-32-6-8-33(9-7-32)10-1...,OUKYUETWWIPKQR-UHFFFAOYSA-N,CN1CCN(CC1)CCOC2=CC3=C(C(=C2)OC4CCOCC4)C(=NC=N...
3,AZD6244,InChI=1S/C17H15BrClFN4O3/c1-24-8-21-16-13(24)7...,CYOHGALHFOKKQC-UHFFFAOYSA-N,CN1C=NC2=C1C=C(C(=C2F)NC3=C(C=C(C=C3)Br)Cl)C(=...
4,Erlotinib,InChI=1S/C22H23N3O4/c1-4-16-6-5-7-17(12-16)25-...,AAKJLRGGTJKAMG-UHFFFAOYSA-N,COCCOC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC=CC(=C3)C#C...


In [7]:
drug_profiling_data = pd.read_csv('data/ccle/CCLE_NP24.2009_profiling_2012.02.20.csv',sep=',')
print(drug_profiling_data.shape)
drug_profiling_data.head()

(24, 7)


Unnamed: 0,Compound (code or generic name),Compound (brand name),Target(s),Mechanism of action,Class,Highest Phase,Organization
0,Erlotinib,Tarceva,EGFR,EGFR Inhibitor,Kinase inhibitor,Launched-2004,Genentech
1,Lapatinib,Tykerb,"EGFR, HER2",EGFR and HER2 Inhibitor,Kinase inhibitor,Launched-2007,GlaxoSmithKline
2,PHA-665752,,c-MET,c-MET Inhibitor,Kinase inhibitor,Preclinical,Pfizer
3,PF-2341066,Crizotinib,"c-MET, ALK",c-MET and ALK Inhibitor,Kinase inhibitor,Launched-2011,Pfizer
4,TAE684,,ALK,ALK Inhibitor,Kinase inhibitor,Preclinical,Novartis


In [8]:
rpkm_gene_data = pd.read_csv('data/ccle/CCLE_DepMap_18q3_RNAseq_RPKM_20180718.gct', sep='\t',skiprows=2)
print(rpkm_gene_data.shape)
rpkm_gene_data.head()

(56318, 1158)


Unnamed: 0,Name,Description,22RV1_PROSTATE (ACH-000956),2313287_STOMACH (ACH-000948),253JBV_URINARY_TRACT (ACH-000026),253J_URINARY_TRACT (ACH-000011),42MGBA_CENTRAL_NERVOUS_SYSTEM (ACH-000323),5637_URINARY_TRACT (ACH-000905),59M_OVARY (ACH-000520),639V_URINARY_TRACT (ACH-000973),...,UMUC16_URINARY_TRACT (ACH-001409),UMUC4_URINARY_TRACT (ACH-001410),UMUC5_URINARY_TRACT (ACH-001411),UMUC6_URINARY_TRACT (ACH-001414),UMUC7_URINARY_TRACT (ACH-001415),UMUC9_URINARY_TRACT (ACH-001416),UPCISCC152_UPPER_AERODIGESTIVE_TRACT (ACH-001228),UW228_CENTRAL_NERVOUS_SYSTEM (ACH-001232),Y79_AUTONOMIC_GANGLIA (ACH-001295),YAMATO_SOFT_TISSUE (ACH-001277)
0,ENSG00000223972.4,DDX11L1,0.0,0.03755,0.0,0.06507,0.0,0.01378,0.01463,0.03085,...,0.0,0.02977,0.02999,0.03532,0.03831,0.03236,0.0,0.03799,0.02907,0.0
1,ENSG00000227232.4,WASH7P,12.63011,10.14155,6.20657,6.55835,7.61752,6.19356,4.62277,6.00767,...,6.42321,8.17443,7.75149,9.97036,8.1708,8.70551,15.23348,6.47732,8.45568,11.81688
2,ENSG00000243485.2,MIR1302-11,0.04289,0.01037,0.16955,0.23364,0.08765,0.0,0.00808,0.09373,...,0.12367,0.19183,0.10492,0.18537,0.07408,0.07747,0.07448,0.01049,0.39607,0.17586
3,ENSG00000237613.2,FAM138A,0.0,0.00869,0.22423,0.16558,0.0,0.00637,0.0,0.01427,...,0.01554,0.00459,0.0185,0.04903,0.0133,0.0,0.02674,0.00879,0.26449,0.02266
4,ENSG00000268020.2,OR4G4P,0.0,0.0,0.02886,0.03875,0.0,0.0,0.0,0.0,...,0.00667,0.01182,0.01786,0.01052,0.01141,0.00642,0.0,0.0,0.0,0.01459


In [20]:
cols = rpkm_gene_data.columns
use_cols = []
for i in range(len(cols)):
    if '(ACH' in cols[i]:
        use_cols.append(cols[i].split('(ACH-')[0].strip())
    else:
        use_cols.append(cols[i])

In [22]:
rpkm_gene_data.columns = use_cols
rpkm_gene_data.head()

Unnamed: 0,Name,Description,22RV1_PROSTATE,2313287_STOMACH,253JBV_URINARY_TRACT,253J_URINARY_TRACT,42MGBA_CENTRAL_NERVOUS_SYSTEM,5637_URINARY_TRACT,59M_OVARY,639V_URINARY_TRACT,...,UMUC16_URINARY_TRACT,UMUC4_URINARY_TRACT,UMUC5_URINARY_TRACT,UMUC6_URINARY_TRACT,UMUC7_URINARY_TRACT,UMUC9_URINARY_TRACT,UPCISCC152_UPPER_AERODIGESTIVE_TRACT,UW228_CENTRAL_NERVOUS_SYSTEM,Y79_AUTONOMIC_GANGLIA,YAMATO_SOFT_TISSUE
0,ENSG00000223972.4,DDX11L1,0.0,0.03755,0.0,0.06507,0.0,0.01378,0.01463,0.03085,...,0.0,0.02977,0.02999,0.03532,0.03831,0.03236,0.0,0.03799,0.02907,0.0
1,ENSG00000227232.4,WASH7P,12.63011,10.14155,6.20657,6.55835,7.61752,6.19356,4.62277,6.00767,...,6.42321,8.17443,7.75149,9.97036,8.1708,8.70551,15.23348,6.47732,8.45568,11.81688
2,ENSG00000243485.2,MIR1302-11,0.04289,0.01037,0.16955,0.23364,0.08765,0.0,0.00808,0.09373,...,0.12367,0.19183,0.10492,0.18537,0.07408,0.07747,0.07448,0.01049,0.39607,0.17586
3,ENSG00000237613.2,FAM138A,0.0,0.00869,0.22423,0.16558,0.0,0.00637,0.0,0.01427,...,0.01554,0.00459,0.0185,0.04903,0.0133,0.0,0.02674,0.00879,0.26449,0.02266
4,ENSG00000268020.2,OR4G4P,0.0,0.0,0.02886,0.03875,0.0,0.0,0.0,0.0,...,0.00667,0.01182,0.01786,0.01052,0.01141,0.00642,0.0,0.0,0.0,0.01459


In [24]:
rpkm_gene_data.to_csv('data/ccle/rpkm_gene_data.csv',sep='\t',index=None)