In [1]:
import pandas as pd
import numpy as np

from google.cloud import bigquery as bq

%matplotlib inline

In [2]:
# Load the probe annotation table. The file carries no header row, so
# header=None stops pandas from consuming the first record as column names.
probe = pd.read_table(
    "./metadata/sig-DMD.union.includeERR.hm450probes",
    sep="\t",
    header=None,
)

# Name the positional columns explicitly.
probe.columns = [
    'chrom', 'start', 'end', 'probe_id',
    'UCSC.RefGene_Group', 'UCSC.RefGene_Accession', 'UCSC.RefGene_Name',
    'sig-DMD', 'DMV', 'pairs', 'no_pairs',
]

print(probe.shape)
probe.head()

(25329, 11)


Unnamed: 0,chrom,start,end,probe_id,UCSC.RefGene_Group,UCSC.RefGene_Accession,UCSC.RefGene_Name,sig-DMD,DMV,pairs,no_pairs
0,chr1,864878,864880,cg02896266,Body,NM_152486,SAMD11,sig-DMD_1,DMV_4,"nontrip_trip,normal_trip",2
1,chr1,931326,931328,cg03648020,,,,sig-DMD_2,DMV_6,"nontrip_trip,normal_nontrip,normal_trip",3
2,chr1,933305,933307,cg01729262,,,,sig-DMD_2,DMV_6,"nontrip_trip,normal_nontrip,normal_trip",3
3,chr1,933387,933389,cg15882305,,,,sig-DMD_2,DMV_6,"nontrip_trip,normal_nontrip,normal_trip",3
4,chr1,933684,933686,cg15713103,,,,sig-DMD_2,DMV_6,"nontrip_trip,normal_nontrip,normal_trip",3


In [3]:
# Number of probes for each distinct value of the 'pairs' column.
probe.pairs.value_counts()

normal_nontrip,normal_trip                         12326
nontrip_trip,normal_nontrip,normal_trip             4568
nontrip_trip,normal_nontrip                         4007
nontrip_trip,normal_trip                            2701
ers_err,nontrip_trip,normal_nontrip,normal_trip      504
ers_err,normal_nontrip,normal_trip                   464
ers_err,nontrip_trip,normal_nontrip                  272
ers_err,normal_nontrip                               247
ers_err                                              108
ers_err,normal_trip                                   79
ers_err,nontrip_trip,normal_trip                      32
ers_err,nontrip_trip                                  21
Name: pairs, dtype: int64

# Subtype

In [4]:
# Drop only the probes whose 'pairs' value is exactly "ers_err". The test is a
# whole-string comparison, so probes that list ers_err together with other
# pairs are kept.
probe_subtype = probe[probe['pairs'].ne("ers_err")]
print(probe_subtype.shape)
probe_subtype.head()

(25221, 11)


Unnamed: 0,chrom,start,end,probe_id,UCSC.RefGene_Group,UCSC.RefGene_Accession,UCSC.RefGene_Name,sig-DMD,DMV,pairs,no_pairs
0,chr1,864878,864880,cg02896266,Body,NM_152486,SAMD11,sig-DMD_1,DMV_4,"nontrip_trip,normal_trip",2
1,chr1,931326,931328,cg03648020,,,,sig-DMD_2,DMV_6,"nontrip_trip,normal_nontrip,normal_trip",3
2,chr1,933305,933307,cg01729262,,,,sig-DMD_2,DMV_6,"nontrip_trip,normal_nontrip,normal_trip",3
3,chr1,933387,933389,cg15882305,,,,sig-DMD_2,DMV_6,"nontrip_trip,normal_nontrip,normal_trip",3
4,chr1,933684,933686,cg15713103,,,,sig-DMD_2,DMV_6,"nontrip_trip,normal_nontrip,normal_trip",3


In [5]:
# Number of retained probes for each distinct value of the 'pairs' column.
probe_subtype.pairs.value_counts()

normal_nontrip,normal_trip                         12326
nontrip_trip,normal_nontrip,normal_trip             4568
nontrip_trip,normal_nontrip                         4007
nontrip_trip,normal_trip                            2701
ers_err,nontrip_trip,normal_nontrip,normal_trip      504
ers_err,normal_nontrip,normal_trip                   464
ers_err,nontrip_trip,normal_nontrip                  272
ers_err,normal_nontrip                               247
ers_err,normal_trip                                   79
ers_err,nontrip_trip,normal_trip                      32
ers_err,nontrip_trip                                  21
Name: pairs, dtype: int64

In [6]:
# Per-sample metadata for the subtype analysis (tab-separated, first line is
# the header).
sample = pd.read_csv("./metadata/subtype_metadata.tsv", sep="\t")

print(sample.shape)
sample.head()

(1282, 12)


Unnamed: 0,case_barcode,sample_barcode,sample_type,sample_type_name,subtype,ER,PR,Her2_IHC,Her2_ISH,Her2,triple_negative,subtype_sub
0,TCGA-A2-A04P,TCGA-A2-A04P-01A,1,Primary solid Tumor,Basal,Negative,Negative,,Negative,Negative,Yes,Basal-TN
1,TCGA-AR-A0TP,TCGA-AR-A0TP-01A,1,Primary solid Tumor,Basal,Positive,Negative,,,,,Basal
2,TCGA-GM-A2DF,TCGA-GM-A2DF-01A,1,Primary solid Tumor,Basal,Negative,Negative,1+,Negative,Negative,Yes,Basal-TN
3,TCGA-BH-A0C0,TCGA-BH-A0C0-01A,1,Primary solid Tumor,LumB,Positive,Positive,2+,Positive,,,LumB
4,TCGA-D8-A141,TCGA-D8-A141-01A,1,Primary solid Tumor,LumA,Positive,Positive,1+,,,,LumA


In [7]:
# Replace missing values in the subtype, triple_negative and subtype_sub
# columns with the label "Unknown"; NaNs in every other column are left as-is.
# The result is rebound to `sample` instead of mutating the frame in place, so
# the cell does not silently alter an object that an earlier cell displayed.
sample = sample.fillna(
    value={
        'subtype': 'Unknown',
        'triple_negative': 'Unknown',
        'subtype_sub': 'Unknown',
    }
)
sample.subtype_sub.value_counts()

LumA        581
LumB        219
Normal      143
Basal       101
Basal-TN     91
Her2         82
Unknown      65
Name: subtype_sub, dtype: int64

-----------------------------------------------

In [8]:
# Render the sample barcodes and probe ids as comma-separated, single-quoted
# literals, ready to be placed inside a SQL IN (...) list.
sample_barcodes = ", ".join(["'{}'".format(barcode) for barcode in sample.sample_barcode])
probe_ids = ", ".join(["'{}'".format(probe_id) for probe_id in probe_subtype.probe_id])

In [9]:
# Fully-qualified BigQuery table holding the methylation beta values.
methylation = 'isb-cgc.TCGA_hg19_data_v0.DNA_Methylation'
# Client is created with no explicit project or credentials arguments.
client = bq.Client()

# Some aliquots appear more than once, so beta values are averaged per
# (case, sample, aliquot, probe). The table name and the two quoted IN-lists
# are substituted positionally into the template.
query = """\
SELECT 
    case_barcode, sample_barcode, aliquot_barcode, probe_id,
    AVG(beta_value) AS avg_beta_value
FROM
    `{}`
WHERE
    platform = "HumanMethylation450" AND
    sample_barcode IN ({}) AND probe_id IN ({})
GROUP BY
    case_barcode, sample_barcode, aliquot_barcode, probe_id
""".format(methylation, sample_barcodes, probe_ids)

# Run the query and pull the whole result set into a DataFrame.
query_job = client.query(query)
subtype_meth = query_job.to_dataframe()

print(subtype_meth.shape)

subtype_meth.head()

(20220571, 5)


Unnamed: 0,case_barcode,sample_barcode,aliquot_barcode,probe_id,avg_beta_value
0,TCGA-A7-A13E,TCGA-A7-A13E-01A,TCGA-A7-A13E-01A-11D-A12R-05,cg21064449,0.95
1,TCGA-E9-A1NA,TCGA-E9-A1NA-11A,TCGA-E9-A1NA-11A-33D-A145-05,cg08546107,0.95
2,TCGA-D8-A73X,TCGA-D8-A73X-01A,TCGA-D8-A73X-01A-11D-A32T-05,cg00007987,0.98
3,TCGA-E2-A1LS,TCGA-E2-A1LS-01A,TCGA-E2-A1LS-01A-12D-A161-05,cg08141142,0.96
4,TCGA-D8-A1JF,TCGA-D8-A1JF-01A,TCGA-D8-A1JF-01A-11D-A13K-05,cg10106095,0.99


In [10]:
# Persist the query result as a tab-separated file, without the row index.
subtype_meth.to_csv(
    "./datasets/subtype_meth.HM450.tsv",
    sep="\t",
    index=False,
)

------------------------------

In [8]:
# Read the methylation table back from the tab-separated file on disk.
subtype_meth = pd.read_csv("./datasets/subtype_meth.HM450.tsv", sep="\t")
print(subtype_meth.shape)
subtype_meth.head()

(20220571, 5)


Unnamed: 0,case_barcode,sample_barcode,aliquot_barcode,probe_id,avg_beta_value
0,TCGA-A7-A13E,TCGA-A7-A13E-01A,TCGA-A7-A13E-01A-11D-A12R-05,cg21064449,0.95
1,TCGA-E9-A1NA,TCGA-E9-A1NA-11A,TCGA-E9-A1NA-11A-33D-A145-05,cg08546107,0.95
2,TCGA-D8-A73X,TCGA-D8-A73X-01A,TCGA-D8-A73X-01A-11D-A32T-05,cg00007987,0.98
3,TCGA-E2-A1LS,TCGA-E2-A1LS-01A,TCGA-E2-A1LS-01A-12D-A161-05,cg08141142,0.96
4,TCGA-D8-A1JF,TCGA-D8-A1JF-01A,TCGA-D8-A1JF-01A-11D-A13K-05,cg10106095,0.99


In [9]:
# Attach the per-sample metadata to every methylation row. The inner join on
# both barcodes drops methylation rows that have no metadata match.
# validate="many_to_one" makes pandas raise a MergeError if a
# (sample_barcode, case_barcode) key occurs more than once in `sample`; without
# it, duplicated metadata keys would silently multiply the methylation rows.
subtype_meth_bigtable = pd.merge(
    left=subtype_meth,
    right=sample,
    how='inner',
    on=["sample_barcode", "case_barcode"],
    validate="many_to_one",
)

print(subtype_meth_bigtable.shape)
subtype_meth_bigtable.head()

(20220571, 15)


Unnamed: 0,case_barcode,sample_barcode,aliquot_barcode,probe_id,avg_beta_value,sample_type,sample_type_name,subtype,ER,PR,Her2_IHC,Her2_ISH,Her2,triple_negative,subtype_sub
0,TCGA-A7-A13E,TCGA-A7-A13E-01A,TCGA-A7-A13E-01A-11D-A12R-05,cg21064449,0.95,1,Primary solid Tumor,Basal,Positive,Negative,2+,Negative,Negative,Unknown,Basal
1,TCGA-A7-A13E,TCGA-A7-A13E-01A,TCGA-A7-A13E-01A-11D-A12R-05,cg16108964,0.95,1,Primary solid Tumor,Basal,Positive,Negative,2+,Negative,Negative,Unknown,Basal
2,TCGA-A7-A13E,TCGA-A7-A13E-01A,TCGA-A7-A13E-01A-11D-A12R-05,cg00610360,0.95,1,Primary solid Tumor,Basal,Positive,Negative,2+,Negative,Negative,Unknown,Basal
3,TCGA-A7-A13E,TCGA-A7-A13E-01A,TCGA-A7-A13E-01A-11D-A12R-05,cg09873164,0.96,1,Primary solid Tumor,Basal,Positive,Negative,2+,Negative,Negative,Unknown,Basal
4,TCGA-A7-A13E,TCGA-A7-A13E-01A,TCGA-A7-A13E-01A-11D-A12R-05,cg24997276,0.95,1,Primary solid Tumor,Basal,Positive,Negative,2+,Negative,Negative,Unknown,Basal


In [10]:
# One row per sample -> distribution of sample types.
print("Sample types")
print(subtype_meth_bigtable.drop_duplicates(subset=["sample_barcode"])["sample_type_name"].value_counts())
print("\n")

# One row per case -> distribution of subtype labels.
# NOTE(review): drop_duplicates keeps the first row it meets for each
# case_barcode, so a case with more than one sample contributes whichever
# sample's subtype_sub comes first in the table - confirm this is intended.
print("BRCA subtypes")
print(subtype_meth_bigtable.drop_duplicates(subset=["case_barcode"])["subtype_sub"].value_counts())

Sample types
Primary solid Tumor    789
Solid Tissue Normal     98
Metastatic               5
Name: sample_type_name, dtype: int64


BRCA subtypes
LumA        391
LumB        135
Normal       77
Basal        76
Basal-TN     53
Her2         45
Unknown      12
Name: subtype_sub, dtype: int64


In [11]:
# Write the merged methylation + metadata table as a tab-separated file,
# without the row index.
subtype_meth_bigtable.to_csv(
    "./datasets/subtype_meth_bigtable.HM450.tsv",
    sep="\t",
    index=False,
)

# Hormone

In [4]:
# Boolean mask, one entry per probe: True when "ers_err" is one of the
# comma-separated items in 'pairs'. Splitting first makes this a whole-item
# match rather than a substring match. The membership test already yields a
# bool, so no conditional expression is needed.
select = ["ers_err" in pair_names for pair_names in probe['pairs'].str.split(",")]

probe_hormone = probe.loc[select, :]
print(probe_hormone.shape)
probe_hormone.head()

(1727, 11)


Unnamed: 0,chrom,start,end,probe_id,UCSC.RefGene_Group,UCSC.RefGene_Accession,UCSC.RefGene_Name,sig-DMD,DMV,pairs,no_pairs
11,chr1,1286916,1286918,cg21679391,,,,sig-DMD_4,DMV_9,"ers_err,normal_nontrip,normal_trip",3
12,chr1,1287258,1287260,cg21118819,,,,sig-DMD_4,DMV_9,"ers_err,normal_nontrip,normal_trip",3
13,chr1,1288585,1288587,cg11414742,3'UTR,NM_032348,MXRA8,sig-DMD_4,DMV_9,"ers_err,normal_nontrip,normal_trip",3
14,chr1,1288925,1288927,cg15472728,3'UTR,NM_032348,MXRA8,sig-DMD_4,DMV_9,"ers_err,normal_nontrip,normal_trip",3
15,chr1,1289805,1289807,cg14270725,Body,NM_032348,MXRA8,sig-DMD_4,DMV_9,"ers_err,normal_nontrip,normal_trip",3


In [5]:
# Per-sample metadata for the hormone analysis (tab-separated, first line is
# the header). This rebinds `sample`, replacing the subtype metadata that was
# loaded under the same name earlier in the notebook.
sample = pd.read_csv("./metadata/hormone_metadata.tsv", sep="\t")

print(sample.shape)
sample.head()

(540, 15)


Unnamed: 0,case_barcode,sample_barcode,sample_type,sample_type_name,initial_response,recurrence_status,recurrence_log,subtype,ER,PR,Her2_IHC,Her2_ISH,Her2,triple_negative,subtype_sub
0,TCGA-BH-A0C0,TCGA-BH-A0C0-01A,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ...",LumB,Positive,Positive,2+,Positive,,,LumB
1,TCGA-BH-A0C0,TCGA-BH-A0C0-11A,11,Solid Tissue Normal,Unknown,Unknown,"Alive tumor free, last follow-up within risky ...",Normal,Positive,Positive,2+,Positive,,,Normal
2,TCGA-D8-A141,TCGA-D8-A141-01A,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ...",LumA,Positive,Positive,1+,,,,LumA
3,TCGA-EW-A424,TCGA-EW-A424-01A,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ...",LumA,Positive,Positive,,,,,LumA
4,TCGA-AO-A12G,TCGA-AO-A12G-01A,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ...",LumA,Positive,Positive,2+,Negative,,,LumA


In [6]:
# Render the sample barcodes and the hormone probe ids as comma-separated,
# single-quoted literals, ready to be placed inside a SQL IN (...) list.
sample_barcodes = ", ".join(["'{}'".format(barcode) for barcode in sample.sample_barcode])
probe_ids = ", ".join(["'{}'".format(probe_id) for probe_id in probe_hormone.probe_id])

In [None]:
# Fully-qualified BigQuery table holding the methylation beta values.
methylation = 'isb-cgc.TCGA_hg19_data_v0.DNA_Methylation'
# Client is created with no explicit project or credentials arguments.
client = bq.Client()

# Raw beta values for the selected samples and probes. The table name and the
# two quoted IN-lists are substituted positionally into the template.
# NOTE(review): unlike the subtype query, there is no AVG/GROUP BY here, so
# rows are returned as stored - confirm whether duplicated aliquots should be
# averaged in this query as well.
query = """\
SELECT 
    case_barcode, sample_barcode, aliquot_barcode, probe_id, beta_value
FROM
    `{}`
WHERE
    platform = "HumanMethylation450" AND
    sample_barcode IN ({}) AND probe_id IN ({})
""".format(methylation, sample_barcodes, probe_ids)

# Run the query and pull the whole result set into a DataFrame.
query_job = client.query(query)
hormone_meth = query_job.to_dataframe()

print(hormone_meth.shape)

hormone_meth.head()

In [None]:
# Attach the per-sample metadata to every methylation row. The inner join on
# both barcodes drops methylation rows that have no metadata match.
# validate="many_to_one" makes pandas raise a MergeError if a
# (sample_barcode, case_barcode) key occurs more than once in `sample`; without
# it, duplicated metadata keys would silently multiply the methylation rows.
hormone_meth_bigtable = pd.merge(
    hormone_meth,
    sample,
    how='inner',
    on=["sample_barcode", "case_barcode"],
    validate="many_to_one",
)

print(hormone_meth_bigtable.shape)
hormone_meth_bigtable.head()

In [None]:
# De-duplicate once per key and reuse the result, instead of re-scanning the
# full merged table for every summary below.
per_sample = hormone_meth_bigtable.drop_duplicates(subset="sample_barcode")
# NOTE(review): drop_duplicates keeps the first row it meets for each
# case_barcode, so a case with more than one sample is represented by
# whichever of its rows comes first in the table.
per_case = hormone_meth_bigtable.drop_duplicates(subset="case_barcode")

print("Sample types")
print(per_sample.sample_type_name.value_counts())
print("\n")

# Same report for each per-case column: heading, value counts, blank spacer.
case_reports = [
    ("Cases - Initial responses", "initial_response"),
    ("Cases - Recurrence status", "recurrence_status"),
    ("Cases - Recurrence log", "recurrence_log"),
]
for heading, column in case_reports:
    print(heading)
    print(per_case[column].value_counts())
    print("\n")

In [None]:
# Write the merged methylation + metadata table as a tab-separated file,
# without the row index.
hormone_meth_bigtable.to_csv(
    "./datasets/hormone_meth_bigtable.HM450.tsv",
    sep="\t",
    index=False,
)