In [1]:
import pandas as pd
import numpy as np

from google.cloud import bigquery as bq

%matplotlib inline

In [2]:
# Load the probe table. The file is parsed without a header row
# (header=None), so the eleven column labels are attached afterwards.
probe_column_names = [
    'chrom', 'start', 'end', 'probe_id',
    'UCSC.RefGene_Group', 'UCSC.RefGene_Accession', 'UCSC.RefGene_Name',
    'sig-DMD', 'DMV', 'pairs', 'no_pairs',
]
probe = pd.read_csv(
    "./metadata/sig-DMD.union.includeERR.hm450probes",
    sep="\t",
    header=None,
)
probe.columns = probe_column_names

print(probe.shape)
probe.head()

(25329, 11)


Unnamed: 0,chrom,start,end,probe_id,UCSC.RefGene_Group,UCSC.RefGene_Accession,UCSC.RefGene_Name,sig-DMD,DMV,pairs,no_pairs
0,chr1,864878,864880,cg02896266,Body,NM_152486,SAMD11,sig-DMD_1,DMV_4,"nontrip_trip,normal_trip",2
1,chr1,931326,931328,cg03648020,,,,sig-DMD_2,DMV_6,"nontrip_trip,normal_nontrip,normal_trip",3
2,chr1,933305,933307,cg01729262,,,,sig-DMD_2,DMV_6,"nontrip_trip,normal_nontrip,normal_trip",3
3,chr1,933387,933389,cg15882305,,,,sig-DMD_2,DMV_6,"nontrip_trip,normal_nontrip,normal_trip",3
4,chr1,933684,933686,cg15713103,,,,sig-DMD_2,DMV_6,"nontrip_trip,normal_nontrip,normal_trip",3


In [3]:
# Count how many probes carry each distinct value of the `pairs` column.
probe.loc[:, 'pairs'].value_counts()

normal_nontrip,normal_trip                         12326
nontrip_trip,normal_nontrip,normal_trip             4568
nontrip_trip,normal_nontrip                         4007
nontrip_trip,normal_trip                            2701
ers_err,nontrip_trip,normal_nontrip,normal_trip      504
ers_err,normal_nontrip,normal_trip                   464
ers_err,nontrip_trip,normal_nontrip                  272
ers_err,normal_nontrip                               247
ers_err                                              108
ers_err,normal_trip                                   79
ers_err,nontrip_trip,normal_trip                      32
ers_err,nontrip_trip                                  21
Name: pairs, dtype: int64

In [4]:
# Read the tab-separated subtype metadata; the first line of the file is used
# as the header.
sample = pd.read_csv(
    "./metadata/subtype_metadata.tsv",
    sep="\t",
)

print(sample.shape)
sample.head()

(1282, 12)


Unnamed: 0,case_barcode,sample_barcode,sample_type,sample_type_name,subtype,ER,PR,Her2_IHC,Her2_ISH,Her2,triple_negative,subtype_sub
0,TCGA-A2-A04P,TCGA-A2-A04P-01A,1,Primary solid Tumor,Basal,Negative,Negative,,Negative,Negative,Yes,Basal-TN
1,TCGA-AR-A0TP,TCGA-AR-A0TP-01A,1,Primary solid Tumor,Basal,Positive,Negative,,,,,Basal
2,TCGA-GM-A2DF,TCGA-GM-A2DF-01A,1,Primary solid Tumor,Basal,Negative,Negative,1+,Negative,Negative,Yes,Basal-TN
3,TCGA-BH-A0C0,TCGA-BH-A0C0-01A,1,Primary solid Tumor,LumB,Positive,Positive,2+,Positive,,,LumB
4,TCGA-D8-A141,TCGA-D8-A141-01A,1,Primary solid Tumor,LumA,Positive,Positive,1+,,,,LumA


In [5]:
# Replace missing values with the explicit label 'Unknown' in the three
# columns 'subtype', 'triple_negative' and 'subtype_sub'; all other columns
# keep their NaN values.
# The filled frame is rebound to `sample` instead of using inplace=True, so the
# cell produces its result from its input without mutating the object that an
# earlier cell displayed.
unknown_fill = {
    'subtype': 'Unknown',
    'triple_negative': 'Unknown',
    'subtype_sub': 'Unknown',
}
sample = sample.fillna(value=unknown_fill)
sample.subtype_sub.value_counts()

LumA        581
LumB        219
Normal      143
Basal       101
Basal-TN     91
Her2         82
Unknown      65
Name: subtype_sub, dtype: int64

-----------------------------------------------

In [6]:
# Render the sample barcodes and probe ids as comma-separated lists of
# single-quoted literals, ready to be pasted into SQL `IN (...)` clauses.
sample_barcodes = ", ".join(
    ["'{}'".format(barcode) for barcode in sample['sample_barcode']]
)
probe_ids = ", ".join(
    ["'{}'".format(probe_id) for probe_id in probe['probe_id']]
)

In [7]:
# Table to query and the BigQuery client used to run the job.
methylation = 'isb-cgc.TCGA_hg19_data_v0.DNA_Methylation'
client = bq.Client()

# Some aliquots appear more than once, so beta values are averaged within each
# (case, sample, aliquot, probe) group. The three `{}` slots receive, in order,
# the table name, the quoted sample barcodes and the quoted probe ids.
query_template = """\
SELECT 
    case_barcode, sample_barcode, aliquot_barcode, probe_id,
    AVG(beta_value) AS avg_beta_value
FROM
    `{}`
WHERE
    platform = "HumanMethylation450" AND
    sample_barcode IN ({}) AND probe_id IN ({})
GROUP BY
    case_barcode, sample_barcode, aliquot_barcode, probe_id
"""
query = query_template.format(methylation, sample_barcodes, probe_ids)

# Run the query and pull the complete result set into a DataFrame.
query_job = client.query(query)
meth = query_job.to_dataframe()

print(meth.shape)

meth.head()

(20313452, 5)


Unnamed: 0,case_barcode,sample_barcode,aliquot_barcode,probe_id,avg_beta_value
0,TCGA-D8-A1JT,TCGA-D8-A1JT-01A,TCGA-D8-A1JT-01A-31D-A13K-05,cg21679391,0.97
1,TCGA-GM-A3NW,TCGA-GM-A3NW-01A,TCGA-GM-A3NW-01A-21D-A22B-05,cg01846046,0.95
2,TCGA-BH-A0B2,TCGA-BH-A0B2-01A,TCGA-BH-A0B2-01A-11D-A10N-05,cg12967902,0.97
3,TCGA-LL-A7T0,TCGA-LL-A7T0-01A,TCGA-LL-A7T0-01A-31D-A357-05,cg21919790,0.99
4,TCGA-AO-A0JG,TCGA-AO-A0JG-01A,TCGA-AO-A0JG-01A-31D-A10P-05,cg01263854,0.99


In [8]:
# Save the query result as a tab-separated file without the row index; later
# cells reload it from this path.
meth.to_csv(
    "./datasets/methylation.HM450.tsv",
    sep="\t",
    index=False,
)

# Subtype

In [6]:
# Reload the saved methylation table from its tab-separated file.
meth = pd.read_csv(
    "./datasets/methylation.HM450.tsv",
    sep="\t",
)

print(meth.shape)
meth.head()

(20313452, 5)


Unnamed: 0,case_barcode,sample_barcode,aliquot_barcode,probe_id,avg_beta_value
0,TCGA-D8-A1JT,TCGA-D8-A1JT-01A,TCGA-D8-A1JT-01A-31D-A13K-05,cg21679391,0.97
1,TCGA-GM-A3NW,TCGA-GM-A3NW-01A,TCGA-GM-A3NW-01A-21D-A22B-05,cg01846046,0.95
2,TCGA-BH-A0B2,TCGA-BH-A0B2-01A,TCGA-BH-A0B2-01A-11D-A10N-05,cg12967902,0.97
3,TCGA-LL-A7T0,TCGA-LL-A7T0-01A,TCGA-LL-A7T0-01A-31D-A357-05,cg21919790,0.99
4,TCGA-AO-A0JG,TCGA-AO-A0JG-01A,TCGA-AO-A0JG-01A-31D-A10P-05,cg01263854,0.99


In [7]:
# Keep every probe whose `pairs` value is not exactly "ers_err". Probes that
# list "ers_err" together with other labels are retained by this comparison.
not_only_ers_err = probe['pairs'].ne("ers_err")
probe_subtype = probe.loc[not_only_ers_err, :]

print(probe_subtype.shape)
probe_subtype.head()

(25221, 11)


Unnamed: 0,chrom,start,end,probe_id,UCSC.RefGene_Group,UCSC.RefGene_Accession,UCSC.RefGene_Name,sig-DMD,DMV,pairs,no_pairs
0,chr1,864878,864880,cg02896266,Body,NM_152486,SAMD11,sig-DMD_1,DMV_4,"nontrip_trip,normal_trip",2
1,chr1,931326,931328,cg03648020,,,,sig-DMD_2,DMV_6,"nontrip_trip,normal_nontrip,normal_trip",3
2,chr1,933305,933307,cg01729262,,,,sig-DMD_2,DMV_6,"nontrip_trip,normal_nontrip,normal_trip",3
3,chr1,933387,933389,cg15882305,,,,sig-DMD_2,DMV_6,"nontrip_trip,normal_nontrip,normal_trip",3
4,chr1,933684,933686,cg15713103,,,,sig-DMD_2,DMV_6,"nontrip_trip,normal_nontrip,normal_trip",3


In [8]:
# Count the remaining probes per distinct `pairs` value after the filter.
probe_subtype.loc[:, 'pairs'].value_counts()

normal_nontrip,normal_trip                         12326
nontrip_trip,normal_nontrip,normal_trip             4568
nontrip_trip,normal_nontrip                         4007
nontrip_trip,normal_trip                            2701
ers_err,nontrip_trip,normal_nontrip,normal_trip      504
ers_err,normal_nontrip,normal_trip                   464
ers_err,nontrip_trip,normal_nontrip                  272
ers_err,normal_nontrip                               247
ers_err,normal_trip                                   79
ers_err,nontrip_trip,normal_trip                      32
ers_err,nontrip_trip                                  21
Name: pairs, dtype: int64

In [9]:
# Restrict the methylation rows to probes present in `probe_subtype` by
# inner-joining on probe_id; only the key column of the probe table is joined,
# so no annotation columns are added.
subtype_probe_keys = probe_subtype[['probe_id']]
subtype_meth = meth.merge(subtype_probe_keys, how='inner', on="probe_id")

print(subtype_meth.shape)
subtype_meth.head()

(20220571, 5)


Unnamed: 0,case_barcode,sample_barcode,aliquot_barcode,probe_id,avg_beta_value
0,TCGA-D8-A1JT,TCGA-D8-A1JT-01A,TCGA-D8-A1JT-01A-31D-A13K-05,cg21679391,0.97
1,TCGA-LL-A6FP,TCGA-LL-A6FP-01A,TCGA-LL-A6FP-01A-11D-A31V-05,cg21679391,0.96
2,TCGA-E9-A249,TCGA-E9-A249-01A,TCGA-E9-A249-01A-11D-A16A-05,cg21679391,0.95
3,TCGA-E9-A1ND,TCGA-E9-A1ND-01A,TCGA-E9-A1ND-01A-11D-A145-05,cg21679391,0.96
4,TCGA-BH-A1FJ,TCGA-BH-A1FJ-01A,TCGA-BH-A1FJ-01A-11D-A13K-05,cg21679391,0.97


In [10]:
# Attach the sample metadata to every methylation row. Rows are matched on
# both sample_barcode and case_barcode; rows without a match on either side
# are dropped by the inner join.
subtype_meth_bigtable = subtype_meth.merge(
    sample,
    how='inner',
    on=["sample_barcode", "case_barcode"],
)

print(subtype_meth_bigtable.shape)
subtype_meth_bigtable.head()

(20220571, 15)


Unnamed: 0,case_barcode,sample_barcode,aliquot_barcode,probe_id,avg_beta_value,sample_type,sample_type_name,subtype,ER,PR,Her2_IHC,Her2_ISH,Her2,triple_negative,subtype_sub
0,TCGA-D8-A1JT,TCGA-D8-A1JT-01A,TCGA-D8-A1JT-01A-31D-A13K-05,cg21679391,0.97,1,Primary solid Tumor,LumB,Positive,Positive,2+,Negative,,Unknown,LumB
1,TCGA-D8-A1JT,TCGA-D8-A1JT-01A,TCGA-D8-A1JT-01A-31D-A13K-05,cg01846046,0.94,1,Primary solid Tumor,LumB,Positive,Positive,2+,Negative,,Unknown,LumB
2,TCGA-D8-A1JT,TCGA-D8-A1JT-01A,TCGA-D8-A1JT-01A-31D-A13K-05,cg12967902,0.94,1,Primary solid Tumor,LumB,Positive,Positive,2+,Negative,,Unknown,LumB
3,TCGA-D8-A1JT,TCGA-D8-A1JT-01A,TCGA-D8-A1JT-01A-31D-A13K-05,cg21919790,0.99,1,Primary solid Tumor,LumB,Positive,Positive,2+,Negative,,Unknown,LumB
4,TCGA-D8-A1JT,TCGA-D8-A1JT-01A,TCGA-D8-A1JT-01A-31D-A13K-05,cg01263854,0.98,1,Primary solid Tumor,LumB,Positive,Positive,2+,Negative,,Unknown,LumB


In [11]:
# Collapse the big table to one row per sample_barcode (the first occurrence
# is kept) so that the counts below are per sample rather than per probe.
df = subtype_meth_bigtable.drop_duplicates(subset=["sample_barcode"])

sample_type_counts = df['sample_type_name'].value_counts()
print("Sample types")
print(sample_type_counts)
print("\n")

# Subtype counts are restricted to rows whose sample_type equals 1.
type1_rows = df[df['sample_type'].eq(1)]
print("BRCA subtypes")
print(type1_rows['subtype_sub'].value_counts())

Sample types
Primary solid Tumor    789
Solid Tissue Normal     98
Metastatic               5
Name: sample_type_name, dtype: int64


BRCA subtypes
LumA        419
LumB        147
Basal        78
Basal-TN     57
Her2         46
Normal       34
Unknown       8
Name: subtype_sub, dtype: int64


In [12]:
# Write the merged subtype table as a tab-separated file without the row index.
subtype_meth_bigtable.to_csv(
    "./datasets/subtype_meth_bigtable.HM450.tsv",
    sep="\t",
    index=False,
)

# Hormone

In [8]:
# Build a boolean mask that is True for probes whose comma-separated `pairs`
# value lists "ers_err" as one of its labels (alone or with other labels).
# `str.split` yields a list per row, or NaN where `pairs` is missing; the
# isinstance guard turns those missing rows into False instead of letting the
# membership test raise a TypeError on a float.
pair_labels = probe['pairs'].str.split(",")
select = [
    isinstance(labels, list) and "ers_err" in labels
    for labels in pair_labels
]

probe_hormone = probe.loc[select,:]
print(probe_hormone.shape)
probe_hormone.head()

(1727, 11)


Unnamed: 0,chrom,start,end,probe_id,UCSC.RefGene_Group,UCSC.RefGene_Accession,UCSC.RefGene_Name,sig-DMD,DMV,pairs,no_pairs
11,chr1,1286916,1286918,cg21679391,,,,sig-DMD_4,DMV_9,"ers_err,normal_nontrip,normal_trip",3
12,chr1,1287258,1287260,cg21118819,,,,sig-DMD_4,DMV_9,"ers_err,normal_nontrip,normal_trip",3
13,chr1,1288585,1288587,cg11414742,3'UTR,NM_032348,MXRA8,sig-DMD_4,DMV_9,"ers_err,normal_nontrip,normal_trip",3
14,chr1,1288925,1288927,cg15472728,3'UTR,NM_032348,MXRA8,sig-DMD_4,DMV_9,"ers_err,normal_nontrip,normal_trip",3
15,chr1,1289805,1289807,cg14270725,Body,NM_032348,MXRA8,sig-DMD_4,DMV_9,"ers_err,normal_nontrip,normal_trip",3


In [9]:
# Read the tab-separated hormone metadata. NOTE: this rebinds `sample`, which
# held the subtype metadata earlier in the notebook.
sample = pd.read_csv(
    "./metadata/hormone_metadata.tsv",
    sep="\t",
)

print(sample.shape)
sample.head()

(540, 15)


Unnamed: 0,case_barcode,sample_barcode,sample_type,sample_type_name,initial_response,recurrence_status,recurrence_log,subtype,ER,PR,Her2_IHC,Her2_ISH,Her2,triple_negative,subtype_sub
0,TCGA-BH-A0C0,TCGA-BH-A0C0-01A,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ...",LumB,Positive,Positive,2+,Positive,,,LumB
1,TCGA-BH-A0C0,TCGA-BH-A0C0-11A,11,Solid Tissue Normal,Unknown,Unknown,"Alive tumor free, last follow-up within risky ...",Normal,Positive,Positive,2+,Positive,,,Normal
2,TCGA-D8-A141,TCGA-D8-A141-01A,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ...",LumA,Positive,Positive,1+,,,,LumA
3,TCGA-EW-A424,TCGA-EW-A424-01A,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ...",LumA,Positive,Positive,,,,,LumA
4,TCGA-AO-A12G,TCGA-AO-A12G-01A,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ...",LumA,Positive,Positive,2+,Negative,,,LumA


In [10]:
# Render the hormone sample barcodes and the hormone probe ids as
# comma-separated lists of single-quoted literals for SQL `IN (...)` clauses.
sample_barcodes = ", ".join(
    ["'{}'".format(barcode) for barcode in sample['sample_barcode']]
)
probe_ids = ", ".join(
    ["'{}'".format(probe_id) for probe_id in probe_hormone['probe_id']]
)

In [11]:
# Table to query and the BigQuery client used to run the job.
methylation = 'isb-cgc.TCGA_hg19_data_v0.DNA_Methylation'
client = bq.Client()

# Unlike the averaged query used for `meth`, this one returns the raw
# beta_value rows with no GROUP BY. The three `{}` slots receive, in order,
# the table name, the quoted sample barcodes and the quoted probe ids.
query_template = """\
SELECT 
    case_barcode, sample_barcode, aliquot_barcode, probe_id, beta_value
FROM
    `{}`
WHERE
    platform = "HumanMethylation450" AND
    sample_barcode IN ({}) AND probe_id IN ({})
"""
query = query_template.format(methylation, sample_barcodes, probe_ids)

# Run the query and pull the complete result set into a DataFrame.
query_job = client.query(query)
hormone_meth = query_job.to_dataframe()

print(hormone_meth.shape)

hormone_meth.head()

(625379, 5)


Unnamed: 0,case_barcode,sample_barcode,aliquot_barcode,probe_id,beta_value
0,TCGA-D8-A27V,TCGA-D8-A27V-01A,TCGA-D8-A27V-01A-12D-A17F-05,cg22251148,0.9
1,TCGA-A8-A08O,TCGA-A8-A08O-01A,TCGA-A8-A08O-01A-21D-A10P-05,cg15447017,0.86
2,TCGA-AR-A0TZ,TCGA-AR-A0TZ-01A,TCGA-AR-A0TZ-01A-12D-A10P-05,cg02476744,0.91
3,TCGA-AQ-A1H2,TCGA-AQ-A1H2-01A,TCGA-AQ-A1H2-01A-11D-A13K-05,cg20181887,0.94
4,TCGA-E2-A109,TCGA-E2-A109-01A,TCGA-E2-A109-01A-11D-A10N-05,cg17489939,0.92


In [12]:
# Attach the hormone metadata to every methylation row, matching on both
# sample_barcode and case_barcode; unmatched rows are dropped (inner join).
hormone_meth_bigtable = hormone_meth.merge(
    sample,
    how='inner',
    on=["sample_barcode", "case_barcode"],
)

print(hormone_meth_bigtable.shape)
hormone_meth_bigtable.head()

(625379, 18)


Unnamed: 0,case_barcode,sample_barcode,aliquot_barcode,probe_id,beta_value,sample_type,sample_type_name,initial_response,recurrence_status,recurrence_log,subtype,ER,PR,Her2_IHC,Her2_ISH,Her2,triple_negative,subtype_sub
0,TCGA-D8-A27V,TCGA-D8-A27V-01A,TCGA-D8-A27V-01A-12D-A17F-05,cg22251148,0.9,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ...",LumA,Positive,Positive,1+,,,,LumA
1,TCGA-D8-A27V,TCGA-D8-A27V-01A,TCGA-D8-A27V-01A-12D-A17F-05,cg07658280,0.95,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ...",LumA,Positive,Positive,1+,,,,LumA
2,TCGA-D8-A27V,TCGA-D8-A27V-01A,TCGA-D8-A27V-01A-12D-A17F-05,cg03955927,0.93,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ...",LumA,Positive,Positive,1+,,,,LumA
3,TCGA-D8-A27V,TCGA-D8-A27V-01A,TCGA-D8-A27V-01A-12D-A17F-05,cg12756504,0.88,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ...",LumA,Positive,Positive,1+,,,,LumA
4,TCGA-D8-A27V,TCGA-D8-A27V-01A,TCGA-D8-A27V-01A-12D-A17F-05,cg23657179,0.87,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ...",LumA,Positive,Positive,1+,,,,LumA


In [13]:
# Summarise the hormone table at two granularities. Each de-duplicated view
# is computed once and reused, instead of re-running drop_duplicates on the
# full table for every report.
hormone_samples = hormone_meth_bigtable.drop_duplicates(subset="sample_barcode")
hormone_cases = hormone_meth_bigtable.drop_duplicates(subset="case_barcode")

# Per-sample view: one row per sample_barcode (first occurrence kept).
print("Sample types")
print(hormone_samples.sample_type_name.value_counts())
print("\n")

# Per-case view: one row per case_barcode (first occurrence kept). The same
# heading / counts / blank-line stanza is printed for each clinical column.
case_level_reports = [
    ("Cases - Initial responses", "initial_response"),
    ("Cases - Recurrence status", "recurrence_status"),
    ("Cases - Recurrence log", "recurrence_log"),
]
for heading, column in case_level_reports:
    print(heading)
    print(hormone_cases[column].value_counts())
    print("\n")

Sample types
Primary solid Tumor    353
Solid Tissue Normal     35
Name: sample_type_name, dtype: int64


Cases - Initial responses
Unknown               313
Complete response      34
Progressive/Stable      7
Name: initial_response, dtype: int64


Cases - Recurrence status
Unknown     312
Occured      30
Low risk      7
Name: recurrence_status, dtype: int64


Cases - Recurrence log
Alive tumor free, last follow-up within risky period             279
New tumor after/during treatment                                  21
Drug start and end dates unknown                                  14
Unknown neoplasm status                                            9
Dead with tumor                                                    9
Alive tumor free after risky period                                7
Dead tumor free during risky period                                5
Reported having tumor during follow-up but unknown recurrence      3
Alive tumor free but unknown follow-up date                   

In [14]:
# Keep rows whose sample_type equals 1, then reduce to one row per
# case_barcode (first occurrence kept).
is_type1 = hormone_meth_bigtable['sample_type'].eq(1)
hormone_tumors = hormone_meth_bigtable[is_type1].drop_duplicates(subset=["case_barcode"])

# Cross-tabulate initial response against subtype, with row/column totals.
pd.crosstab(
    index=hormone_tumors['initial_response'],
    columns=hormone_tumors['subtype_sub'],
    margins=True,
)

subtype_sub,Basal,Her2,LumA,LumB,Normal,All
initial_response,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Complete response,2,0,27,4,1,34
Progressive/Stable,0,0,5,1,1,7
Unknown,6,7,215,69,10,307
All,8,7,247,74,12,348


- Complete response: 2 Basal, 1 Normal-like
- Progressive/Stable: 1 Normal-like

In [15]:
# Cross-tabulate recurrence status against subtype, with row/column totals.
pd.crosstab(
    index=hormone_tumors['recurrence_status'],
    columns=hormone_tumors['subtype_sub'],
    margins=True,
)

subtype_sub,Basal,Her2,LumA,LumB,Normal,All
recurrence_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Low risk,0,0,7,0,0,7
Occured,1,0,19,9,1,30
Unknown,7,7,216,65,11,306
All,8,7,242,74,12,343


- Occured: 1 Basal, 1 Normal-like

In [16]:
# Write the merged hormone table as a tab-separated file without the row index.
hormone_meth_bigtable.to_csv(
    "./datasets/hormone_meth_bigtable.HM450.tsv",
    sep="\t",
    index=False,
)


# Average methylation per sigDMD

In [6]:
# Reload the saved methylation table from its tab-separated file.
meth = pd.read_csv(
    "./datasets/methylation.HM450.tsv",
    sep="\t",
)

print(meth.shape)
meth.head()

(20313452, 5)


Unnamed: 0,case_barcode,sample_barcode,aliquot_barcode,probe_id,avg_beta_value
0,TCGA-D8-A1JT,TCGA-D8-A1JT-01A,TCGA-D8-A1JT-01A-31D-A13K-05,cg21679391,0.97
1,TCGA-GM-A3NW,TCGA-GM-A3NW-01A,TCGA-GM-A3NW-01A-21D-A22B-05,cg01846046,0.95
2,TCGA-BH-A0B2,TCGA-BH-A0B2-01A,TCGA-BH-A0B2-01A-11D-A10N-05,cg12967902,0.97
3,TCGA-LL-A7T0,TCGA-LL-A7T0-01A,TCGA-LL-A7T0-01A-31D-A357-05,cg21919790,0.99
4,TCGA-AO-A0JG,TCGA-AO-A0JG-01A,TCGA-AO-A0JG-01A-31D-A10P-05,cg01263854,0.99


In [7]:
# Label every methylation row with the sig-DMD of its probe (inner join on
# probe_id), then average avg_beta_value within each
# (case, sample, aliquot, sig-DMD) group. as_index=False keeps the grouping
# keys as ordinary columns in the result.
sigDMD_group_keys = ["case_barcode", "sample_barcode", 'aliquot_barcode', 'sig-DMD']
meth_sigDMD = (
    meth
    .merge(probe[['probe_id', 'sig-DMD']], how='inner', on="probe_id")
    .groupby(sigDMD_group_keys, as_index=False)['avg_beta_value']
    .mean()
)

print(meth_sigDMD.shape)
meth_sigDMD.head()

(2595325, 5)


Unnamed: 0,case_barcode,sample_barcode,aliquot_barcode,sig-DMD,avg_beta_value
0,TCGA-3C-AAAU,TCGA-3C-AAAU-01A,TCGA-3C-AAAU-01A-11D-A41Q-05,sig-DMD_1,0.77
1,TCGA-3C-AAAU,TCGA-3C-AAAU-01A,TCGA-3C-AAAU-01A-11D-A41Q-05,sig-DMD_10,0.2175
2,TCGA-3C-AAAU,TCGA-3C-AAAU-01A,TCGA-3C-AAAU-01A-11D-A41Q-05,sig-DMD_100,0.72
3,TCGA-3C-AAAU,TCGA-3C-AAAU-01A,TCGA-3C-AAAU-01A-11D-A41Q-05,sig-DMD_1000,0.046667
4,TCGA-3C-AAAU,TCGA-3C-AAAU-01A,TCGA-3C-AAAU-01A-11D-A41Q-05,sig-DMD_1001,0.28


In [8]:
# Count the rows of meth_sigDMD for every (aliquot, sig-DMD) combination:
# aliquots form the rows, sig-DMDs the columns.
df = pd.crosstab(
    index=meth_sigDMD['aliquot_barcode'],
    columns=meth_sigDMD['sig-DMD'],
)
print(df.shape)

# Average each sig-DMD column over the aliquots, then summarise the
# distribution of those column means.
df.mean(axis=0).describe()

(894, 2905)


count    2905.000000
mean        0.999328
std         0.018711
min         0.039150
25%         1.000000
50%         1.000000
75%         1.000000
max         1.000000
dtype: float64

In [9]:
# Write the per-sig-DMD averages as a tab-separated file without the row index.
meth_sigDMD.to_csv(
    "./datasets/methylation_sigDMD.HM450.tsv",
    sep="\t",
    index=False,
)