# LINCS Data from Phase II - GSE70138

L1000 Connectivity Map perturbational profiles from Broad Institute LINCS Center for Transcriptomics LINCS PHASE *II* (n=354,123; updated March 30, 2017).

Guía para el uso de los datos: https://clue.io/GEO-guide (documento de Word)

Link de GEO a descarga completa: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE70138

Tutorial sobre el manejo de archivos .gctx: https://github.com/cmap/cmapPy/blob/master/tutorials/cmapPy_pandasGEXpress_tutorial.ipynb


In [1]:
# paquete que se utiliza para procesar los datos
pip install cmapPy

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Users\Andrea\Downloads\gene_data_expression\Scripts\python.exe -m pip install --upgrade pip' command.


In [5]:
import pandas as pd
import gzip, shutil
import os
import pkg_resources

from cmapPy.pandasGEXpress.parse import parse

In [6]:
# Metadata tables. Every file is tab-separated and is read with otherwise
# default pandas settings. The names on the left are paired positionally with
# the paths listed below, so both sequences must stay in the same order.
# NOTE(review): gene_info and cell_info are read from GSE92742-prefixed files,
# while the other four tables use GSE70138-prefixed files — confirm that this
# mix is intended.

gene_info, cell_info, pert_info, inst_info, sig_info, sig_metrics = (
    pd.read_csv(metadata_path, sep = '\t')
    for metadata_path in (
        'data/GSE70138_clue/GSE92742_Broad_LINCS_gene_info.txt',
        'data/GSE70138_clue/GSE92742_Broad_LINCS_cell_info.txt',
        'data/GSE70138_clue/GSE70138_Broad_LINCS_pert_info.txt',
        'data/GSE70138_clue/GSE70138_Broad_LINCS_inst_info.txt',
        'data/GSE70138_clue/GSE70138_Broad_LINCS_sig_info.txt',
        'data/GSE70138_clue/GSE70138_Broad_LINCS_sig_metrics.txt',
    )
)

In [16]:
# Display the cell metadata table loaded from the cell_info file.
cell_info

Unnamed: 0,cell_id,cell_type,base_cell_id,precursor_cell_id,modification,sample_type,primary_site,subtype,original_growth_pattern,provider_catalog_id,original_source_vendor,donor_age,donor_sex,donor_ethnicity
0,A375,cell line,A375,-666,-666,tumor,skin,malignant melanoma,adherent,CRL-1619,ATCC,54,F,-666
1,A375.311,cell line,A375,A375,genetically modified to stably express Cas9 pr...,tumor,skin,malignant melanoma,adherent,CRL-1619,ATCC,54,F,-666
2,A549,cell line,A549,-666,-666,tumor,lung,non small cell lung cancer| carcinoma,adherent,CCL-185,ATCC,58,M,Caucasian
3,A549.311,cell line,A549,A549,genetically modified to stably express Cas9 p...,tumor,lung,non small cell lung cancer| carcinoma,adherent,CCL-185,ATCC,58,M,Caucasian
4,A673,cell line,A673,-666,-666,tumor,bone,ewing's sarcoma,adherent,CRL-1598,ATCC,-666,F,-666
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,CD34,primary,CD34,-666,-666,normal,bone,bone marrow,suspension,-666,-666,-666,-666,-666
94,PHH,primary,PHH,-666,-666,primary,liver,normal primary liver,-666,-666,CellzDirect,-666,-666,-666
95,SKB,primary,SKB,-666,-666,normal,muscle,myoblast,-666,CC-2580,Lonza,-666,-666,-666
96,SKL,primary,SKL,-666,-666,primary,muscle,normal primary skeletal muscle cells,adherent,CC-2561,LONZA,-666,-666,-666


In [8]:
# Rows of cell_info that are cell lines AND whose subtype is neuroblastoma.
cell_info.loc[
    cell_info['cell_type'].eq('cell line')
    & cell_info['subtype'].eq('neuroblastoma')
]

Unnamed: 0,cell_id,cell_type,base_cell_id,precursor_cell_id,modification,sample_type,primary_site,subtype,original_growth_pattern,provider_catalog_id,original_source_vendor,donor_age,donor_sex,donor_ethnicity
61,SHSY5Y,cell line,SHSY5Y,-666,-666,tumor,autonomic ganglia,neuroblastoma,mix,CRL-2266,ATCC,-666,F,-666


In [9]:
# Metadata row(s) for the SHSY5Y cell id.
cell_info.loc[cell_info['cell_id'].eq('SHSY5Y')]

Unnamed: 0,cell_id,cell_type,base_cell_id,precursor_cell_id,modification,sample_type,primary_site,subtype,original_growth_pattern,provider_catalog_id,original_source_vendor,donor_age,donor_sex,donor_ethnicity
61,SHSY5Y,cell line,SHSY5Y,-666,-666,tumor,autonomic ganglia,neuroblastoma,mix,CRL-2266,ATCC,-666,F,-666


In [17]:
# All signatures recorded for the A375 cell id.
sig_info.loc[sig_info['cell_id'].eq('A375')]

Unnamed: 0,sig_id,pert_id,pert_iname,pert_type,cell_id,pert_idose,pert_itime,distil_id
0,LJP005_A375_24H:A03,DMSO,DMSO,ctl_vehicle,A375,-666,24 h,LJP005_A375_24H_X1_B19:A03|LJP005_A375_24H_X2_...
1,LJP005_A375_24H:A04,DMSO,DMSO,ctl_vehicle,A375,-666,24 h,LJP005_A375_24H_X1_B19:A04|LJP005_A375_24H_X2_...
2,LJP005_A375_24H:A05,DMSO,DMSO,ctl_vehicle,A375,-666,24 h,LJP005_A375_24H_X1_B19:A05|LJP005_A375_24H_X2_...
3,LJP005_A375_24H:A06,DMSO,DMSO,ctl_vehicle,A375,-666,24 h,LJP005_A375_24H_X1_B19:A06|LJP005_A375_24H_X2_...
4,LJP005_A375_24H:A07,BRD-K76908866,CP-724714,trt_cp,A375,10.0 um,24 h,LJP005_A375_24H_X1_B19:A07|LJP005_A375_24H_X2_...
...,...,...,...,...,...,...,...,...
111588,REP.A028_A375_24H:K08,BRD-K60230970,MG-132,trt_cp,A375,20.0 um,24 h,REP.A028_A375_24H_X2_B25:K08|REP.A028_A375_24H...
111589,REP.A028_A375_24H:K11,BRD-K60230970,MG-132,trt_cp,A375,20.0 um,24 h,REP.A028_A375_24H_X2_B25:K11|REP.A028_A375_24H...
111590,REP.A028_A375_24H:O02,BRD-K60230970,MG-132,trt_cp,A375,20.0 um,24 h,REP.A028_A375_24H_X2_B25:O02|REP.A028_A375_24H...
111591,REP.A028_A375_24H:C22,BRD-K50691590,bortezomib,trt_cp,A375,20.0 um,24 h,REP.A028_A375_24H_X3_B25:C22


In [15]:
# Distinct values found in the subtype column, in order of first appearance.
pd.unique(cell_info['subtype'])

array(['malignant melanoma', 'non small cell lung cancer| carcinoma',
       "ewing's sarcoma", 'adenocarcinoma', 'carcinoma',
       'colorectal adenocarcinoma',
       'non small cell lung cancer| large cell carcinoma',
       'carcinoma| epithelial-mucinous',
       'non small cell lung cancer| adenocarcinoma',
       'mucinous papillary adenocarcinoma', 'normal kidney',
       'non small cell lung cancer| squamous cell carcinoma',
       'colorectal carcinoma', 'endometrial adenocarcinoma',
       'embryonal kidney', 'epithelial', 'hepatocellular carcinoma',
       'acute myelogenous leukemia (AML)| M3 (promyelocytic)', '-666',
       'bone marrow', 'normal endothelial cell|umbilical cord',
       'endometrial adenocarcinoma| endometrioid carcinoma',
       'acute lymphoblastic leukemia (ALL)| T-cell',
       'carcinoma| prostate', 'skin fibroblast',
       'small cell lung cancer| carcinoma',
       'non small cell lung cancer| adenosquamous carcinoma',
       'kidney epithelial',

In [7]:
# Cells whose sample_type is exactly 'normal'.
# NOTE(review): the cell_info preview also shows rows with sample_type
# 'primary' (e.g. PHH, SKL); those are not selected here — confirm this is
# the intended definition of "normal".
cell_normal = cell_info.loc[cell_info['sample_type'].eq('normal')]

# Perturbagens whose pert_type is 'trt_cp'.
pert_cp = pert_info.loc[pert_info['pert_type'].eq('trt_cp')]

In [8]:
# Keep the signatures whose pert_id appears in pert_cp AND whose cell_id
# appears in cell_normal.
sig_cp_normal = sig_info.loc[
    sig_info['pert_id'].isin(pert_cp['pert_id'])
    & sig_info['cell_id'].isin(cell_normal['cell_id'])
]

In [9]:
# Collect the distinct signature ids that passed both filters, then report
# how many there are.
sigs_ids = sig_cp_normal['sig_id'].unique()
print('Number of signatures with filters: ', sig_cp_normal['sig_id'].nunique())

Number of signatures with filters:  20586


In [10]:
# Signature ids of every sample treated with aspirin, regardless of cell
# type, dose or any other treatment condition.
aspirin_ids = sig_info.loc[sig_info["pert_iname"].eq("aspirin"), "sig_id"]

# Report how many aspirin-treated samples this data set contains.
print("number of samples treated with aspirin:")
len(aspirin_ids)

number of samples treated with aspirin:


42

In [11]:
# Parse the Level 5 matrix restricted to the columns (signatures) of the drug
# selected earlier — aspirin in this example — by passing their ids as `cid`.

level5_lincs_subset = parse(
    "data/GSE70138_clue/GSE70138_Broad_LINCS_Level5_COMPZ_n118050x12328.gctx",
    cid=aspirin_ids,
)

  meta_df = meta_df.apply(lambda x: pd.to_numeric(x, errors="ignore"))
  meta_df = meta_df.apply(lambda x: pd.to_numeric(x, errors="ignore"))


In [15]:
# Display the data matrix of the aspirin subset (rows labelled `rid`, columns labelled `cid`).
# NOTE(review): the rid values look like the pr_gene_id values in gene_info — confirm before joining.
level5_lincs_subset.data_df

cid,REP.A005_A375_24H:O19,REP.A005_A375_24H:O20,REP.A005_A375_24H:O21,REP.A005_A375_24H:O22,REP.A005_A375_24H:O23,REP.A005_A375_24H:O24,REP.A005_HA1E_24H:O19,REP.A005_HA1E_24H:O20,REP.A005_HA1E_24H:O21,REP.A005_HA1E_24H:O22,...,REP.A005_PC3_24H:O21,REP.A005_PC3_24H:O22,REP.A005_PC3_24H:O23,REP.A005_PC3_24H:O24,REP.A005_YAPC_24H:O19,REP.A005_YAPC_24H:O20,REP.A005_YAPC_24H:O21,REP.A005_YAPC_24H:O22,REP.A005_YAPC_24H:O23,REP.A005_YAPC_24H:O24
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
780,-1.066597,-0.057207,-0.103873,-0.288828,0.779908,1.350515,0.001975,-0.348126,0.291030,-0.141140,...,0.026461,0.394372,0.243405,0.616837,-0.003931,0.658367,0.127969,-0.821831,0.404308,0.185003
7849,-0.706760,-0.434623,0.386524,-0.148737,0.422625,0.212619,-0.032856,0.498379,0.965349,0.698047,...,0.252639,0.357011,-1.770535,-0.253520,-0.635555,0.097100,-0.631431,-0.532659,0.501220,-1.964956
2978,0.282225,0.332037,0.651430,-0.866460,0.072821,0.217659,0.433709,-0.097779,-0.504462,0.632869,...,-1.574813,-0.426488,-0.456232,0.399714,-0.467675,-0.657933,-0.497620,-0.613044,-0.188928,0.155048
2049,0.035275,-0.339165,-0.597136,-0.741564,0.648332,-0.036273,0.549065,-0.857887,-0.349849,0.751394,...,-0.383363,0.227270,-1.401871,1.300922,-0.007132,0.566133,0.407120,1.174043,-0.098495,0.674446
2101,-0.876039,-0.014522,-0.646597,0.524340,-0.532493,0.366084,0.376153,-0.161820,1.164897,0.293366,...,0.341900,-0.141457,0.041808,0.284998,-0.407251,0.650467,0.411282,1.389244,0.124413,1.345006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4034,-0.319974,-0.866041,0.830469,-0.523067,0.198005,0.477380,0.094949,0.084976,0.038574,-0.502701,...,-0.421446,-0.792560,-0.257680,-0.112217,0.385247,-0.760667,0.368667,0.566173,-0.516652,-0.264597
399664,0.236260,-0.648895,-0.222002,0.033785,0.226874,0.980972,-0.598188,0.050047,0.486417,-0.882917,...,-1.019533,0.593550,-0.239055,0.420942,-0.750949,0.109700,0.266753,0.461201,1.292419,0.592850
54869,-0.869802,0.492935,-0.337015,-1.059525,0.149646,-1.250934,0.013872,-0.575882,-0.329532,1.055121,...,-0.964751,-0.894256,-0.146907,1.333266,0.207185,0.563867,-0.511584,0.874438,-0.096312,-0.274042
90379,0.176941,1.672894,0.965444,-1.057914,-0.124347,0.479184,-0.143484,-1.053318,0.275087,0.861329,...,-0.196969,-0.089075,0.359849,0.199248,-0.106367,-0.534100,0.821637,0.849118,-1.030468,-1.244341


In [17]:
# Look for aspirin signatures measured in the SHSY5Y cell line.
# Iterating a DataFrame yields its *column labels*, so the signature ids have
# to be taken from the index of col_metadata_df (the `cid` axis), which holds
# names such as "REP.A005_A375_24H:O19". Iterating the frame itself never
# visits those ids and therefore always produced an empty list.
columnas_filtradas = [
    nombre
    for nombre in level5_lincs_subset.col_metadata_df.index
    if "SHSY5Y" in str(nombre)
]
print(columnas_filtradas)

# An empty list means there are no aspirin signatures for the example cell line.

[]


In [34]:
# Search for one specific signature by its exact distil_id string.
sig_info.loc[
    sig_info['distil_id'].eq('LJP009_SHSY5Y_24H_X1_B33:N06|LJP009_SHSY5Y_24H_X2_B33:N06|LJP009_SHSY5Y_24H_X3.A2_B33:N06')
]

Unnamed: 0,sig_id,pert_id,pert_iname,pert_type,cell_id,pert_idose,pert_itime,distil_id


In [62]:
# Metadata for a single signature, selected by its sig_id.
sig_info.loc[sig_info['sig_id'].eq('REP.A005_A375_24H:O19')]

Unnamed: 0,sig_id,pert_id,pert_iname,pert_type,cell_id,pert_idose,pert_itime,distil_id
50545,REP.A005_A375_24H:O19,BRD-K11433652,aspirin,trt_cp,A375,10.0 um,24 h,REP.A005_A375_24H_X1_B22:O19|REP.A005_A375_24H...


In [20]:
# Parse the complete Level 5 file; unlike the aspirin subset, no prior
# selection by drug is applied here.

level5_lincs = parse(
    "data/GSE70138_clue/GSE70138_Broad_LINCS_Level5_COMPZ_n118050x12328.gctx"
)

  meta_df = meta_df.apply(lambda x: pd.to_numeric(x, errors="ignore"))
  meta_df = meta_df.apply(lambda x: pd.to_numeric(x, errors="ignore"))


In [21]:
# Display the column metadata of the full matrix; the signature ids form its index (named `cid`).
level5_lincs.col_metadata_df

chd
cid
REP.A001_A375_24H:A03
REP.A001_A375_24H:A04
REP.A001_A375_24H:A05
REP.A001_A375_24H:A06
REP.A001_A375_24H:A07
...
LJP007_SKL_24H:O13
LJP007_SKL_24H:O14
LJP007_SKL_24H:O24
LJP007_SKL_24H:P24


In [30]:
# Column metadata of the full matrix; its index (named `cid`) holds the
# signature ids, e.g. "REP.A001_A375_24H:A03".
col = level5_lincs.col_metadata_df

# Iterating a DataFrame yields its column labels, not its index, so the search
# for SHSY5Y signatures must walk `col.index`; iterating `col` directly never
# visits the signature ids and always returned an empty list.
columnas_filtradas = [nombre for nombre in col.index if "SHSY5Y" in str(nombre)]

print(columnas_filtradas)

[]


In [24]:
# Display the data matrix of the full file (rows labelled `rid`, columns labelled `cid`).
level5_lincs.data_df

cid,REP.A001_A375_24H:A03,REP.A001_A375_24H:A04,REP.A001_A375_24H:A05,REP.A001_A375_24H:A06,REP.A001_A375_24H:A07,REP.A001_A375_24H:A08,REP.A001_A375_24H:A09,REP.A001_A375_24H:A10,REP.A001_A375_24H:A11,REP.A001_A375_24H:A12,...,LJP007_SKL_24H:P19,LJP007_SKL_24H:P20,LJP007_SKL_24H:P21,LJP007_SKL_24H:P22,LJP007_SKL_24H:E21,LJP007_SKL_24H:O13,LJP007_SKL_24H:O14,LJP007_SKL_24H:O24,LJP007_SKL_24H:P24,LJP007_SKL_24H:C19
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
780,4.264143,-0.382211,-0.571711,0.584376,0.658348,-0.004232,-0.314762,-0.049558,-0.909517,-0.850654,...,1.091158,0.264409,0.711080,0.768569,4.4460,4.4395,6.1750,8.0582,10.0000,3.0807
7849,0.057249,0.304313,-0.754999,-0.589973,-0.226854,-0.363419,-0.691129,-0.684283,0.521503,-0.640316,...,-0.493212,-0.041785,-0.606896,0.819984,6.6313,10.0000,2.8649,0.4905,9.1524,4.5834
2978,-1.012480,-0.674992,0.414515,-0.227603,0.287899,0.239820,-0.077976,0.365245,-0.349694,-0.493557,...,-0.461965,-0.651098,-0.445217,-0.177543,2.6472,2.6765,1.3496,-0.1573,3.0650,3.2772
2049,0.308898,-0.335931,-0.502323,-1.775247,-0.666601,0.080279,0.035644,-0.540970,0.503692,-1.418259,...,0.260368,0.906001,1.230669,0.448981,-1.3394,0.3803,1.6567,-0.4138,-2.9559,-2.5385
2101,-0.104070,0.324702,0.495425,-0.107543,-0.091924,0.645074,-0.035445,-0.643081,-0.050036,-0.320833,...,-0.676510,0.153707,-0.923612,0.281000,0.2792,0.2364,2.2745,-0.4215,4.9306,-0.3057
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4034,0.645438,-0.916510,0.678068,-0.466665,-0.777092,-0.048832,-0.803753,-0.490556,0.020871,0.449013,...,1.177493,0.670997,0.735648,0.862986,1.4647,0.3859,0.2262,0.7516,3.4061,-0.3348
399664,-1.011237,-0.350702,-0.548015,0.336222,0.927741,0.585799,-1.301060,-0.224759,-0.328396,0.118584,...,-0.414293,-0.927003,0.052080,-0.498292,1.1236,2.2361,0.9255,-1.4057,0.2197,1.8597
54869,-1.272611,-0.471564,-0.318550,0.585188,-0.029780,-0.728792,0.568284,0.255533,0.758844,0.213864,...,0.930385,0.458312,-0.082488,0.941404,3.4528,5.0802,2.4934,3.2928,-4.5793,2.9312
90379,-0.770175,0.012531,-0.865771,-0.187241,-0.972365,-0.118854,-0.741667,-0.766926,0.142690,1.482587,...,0.235752,-0.256095,0.628243,-0.316403,2.6078,3.6382,2.7465,2.0752,7.7701,2.5543


In [26]:
# List the column names available in the signature metadata table.
sig_info.columns

Index(['sig_id', 'pert_id', 'pert_iname', 'pert_type', 'cell_id', 'pert_idose',
       'pert_itime', 'distil_id'],
      dtype='object')

In [38]:
# List the column names available in the gene metadata table.
gene_info.columns

Index(['pr_gene_id', 'pr_gene_symbol', 'pr_gene_title', 'pr_is_lm',
       'pr_is_bing'],
      dtype='object')

In [39]:
# Preview the first five rows of the gene metadata table (5 is head's default).
gene_info.head()

Unnamed: 0,pr_gene_id,pr_gene_symbol,pr_gene_title,pr_is_lm,pr_is_bing
0,780,DDR1,discoidin domain receptor tyrosine kinase 1,1,1
1,7849,PAX8,paired box 8,1,1
2,2978,GUCA1A,guanylate cyclase activator 1A,0,0
3,2049,EPHB3,EPH receptor B3,0,1
4,2101,ESRRA,estrogen related receptor alpha,0,1


In [28]:
# Distinct perturbation types present in pert_info, in order of first appearance.
pd.unique(pert_info['pert_type'])

array(['trt_cp', 'ctl_vehicle', 'trt_xpr', 'ctl_untrt', 'ctl_vector'],
      dtype=object)