In [1]:
import pandas as pd
import numpy as np

from google.cloud import bigquery as bq

%matplotlib inline

In [2]:
# Column labels for the probe table, listed in file order.
probe_columns = [
    'chrom', 'start', 'end', 'probe_id',
    'UCSC.RefGene_Group', 'UCSC.RefGene_Accession', 'UCSC.RefGene_Name',
    'sig-DMD', 'DMV', 'pairs', 'no_pairs',
]

# The file is read with header=None (no header row is consumed), so the
# columns are labelled explicitly right after loading.
probe = pd.read_table(
    "./metadata/sig-DMD.union.includeERR.hm450probes",
    sep="\t",
    header=None,
)
probe.columns = probe_columns

print(probe.shape)
probe.head()

(25329, 11)


Unnamed: 0,chrom,start,end,probe_id,UCSC.RefGene_Group,UCSC.RefGene_Accession,UCSC.RefGene_Name,sig-DMD,DMV,pairs,no_pairs
0,chr1,864878,864880,cg02896266,Body,NM_152486,SAMD11,sig-DMD_1,DMV_4,"nontrip_trip,normal_trip",2
1,chr1,931326,931328,cg03648020,,,,sig-DMD_2,DMV_6,"nontrip_trip,normal_nontrip,normal_trip",3
2,chr1,933305,933307,cg01729262,,,,sig-DMD_2,DMV_6,"nontrip_trip,normal_nontrip,normal_trip",3
3,chr1,933387,933389,cg15882305,,,,sig-DMD_2,DMV_6,"nontrip_trip,normal_nontrip,normal_trip",3
4,chr1,933684,933686,cg15713103,,,,sig-DMD_2,DMV_6,"nontrip_trip,normal_nontrip,normal_trip",3


# Subtype

In [3]:
# Load the subtype metadata table (tab-separated; first row is used as header).
subtype_metadata_path = "./metadata/subtype_metadata.tsv"
sample = pd.read_table(subtype_metadata_path, sep="\t")

print(sample.shape)
sample.head()

(1282, 12)


Unnamed: 0,case_barcode,sample_barcode,sample_type,sample_type_name,subtype,ER,PR,Her2_IHC,Her2_ISH,Her2,triple_negative,subtype_sub
0,TCGA-A2-A04P,TCGA-A2-A04P-01A,1,Primary solid Tumor,Basal,Negative,Negative,,Negative,Negative,Yes,Basal-TN
1,TCGA-AR-A0TP,TCGA-AR-A0TP-01A,1,Primary solid Tumor,Basal,Positive,Negative,,,,,Basal
2,TCGA-GM-A2DF,TCGA-GM-A2DF-01A,1,Primary solid Tumor,Basal,Negative,Negative,1+,Negative,Negative,Yes,Basal-TN
3,TCGA-BH-A0C0,TCGA-BH-A0C0-01A,1,Primary solid Tumor,LumB,Positive,Positive,2+,Positive,,,LumB
4,TCGA-D8-A141,TCGA-D8-A141-01A,1,Primary solid Tumor,LumA,Positive,Positive,1+,,,,LumA


In [4]:
# Replace NaN in the subtype, triple_negative and subtype_sub columns with
# the explicit label 'Unknown'. This mutates `sample` in place; other
# columns keep their NaN values.
unknown_labels = dict.fromkeys(['subtype', 'triple_negative', 'subtype_sub'], 'Unknown')
sample.fillna(value=unknown_labels, inplace=True)

# Number of samples per subtype_sub label.
sample['subtype_sub'].value_counts()

LumA        581
LumB        219
Normal      143
Basal       101
Basal-TN     91
Her2         82
Unknown      65
Name: subtype_sub, dtype: int64

-----------------------------------------------

In [5]:
# Render the sample barcodes and probe ids as comma-separated, single-quoted
# literals so they can be substituted into the `IN (...)` clauses of the query.
sample_barcodes = ", ".join("'{}'".format(barcode) for barcode in sample.sample_barcode)
probe_ids = ", ".join("'{}'".format(pid) for pid in probe.probe_id)

In [None]:
# BigQuery client built with default arguments, and the fully-qualified
# methylation table to query.
client = bq.Client()
methylation = 'isb-cgc.TCGA_hg19_data_v0.DNA_Methylation'

# Select beta values for the chosen samples and probes on the
# HumanMethylation450 platform. Some aliquots are duplicated, so rows that
# share the same (case, sample, aliquot, probe) key are averaged.
query_template = """\
SELECT 
    case_barcode, sample_barcode, aliquot_barcode, probe_id,
    AVG(beta_value) AS avg_beta_value
FROM
    `{}`
WHERE
    platform = "HumanMethylation450" AND
    sample_barcode IN ({}) AND probe_id IN ({})
GROUP BY
    case_barcode, sample_barcode, aliquot_barcode, probe_id
"""
# Positional substitution order: table name, sample literals, probe literals.
query = query_template.format(methylation, sample_barcodes, probe_ids)

# Run the query and materialise the full result as a DataFrame.
query_job = client.query(query)
subtype_meth = query_job.to_dataframe()

print(subtype_meth.shape)

subtype_meth.head()

In [None]:
# Save the query result as a tab-separated file, without the row index.
subtype_meth.to_csv(
    "./datasets/subtype_meth.HM450.tsv",
    sep="\t",
    index=False,
)

------------------------------

In [6]:
# Load the methylation table from the local tab-separated file
# (same path the export cell writes to).
subtype_meth_path = "./datasets/subtype_meth.HM450.tsv"
subtype_meth = pd.read_table(subtype_meth_path, sep="\t")

print(subtype_meth.shape)
subtype_meth.head()

(20313452, 5)


Unnamed: 0,case_barcode,sample_barcode,aliquot_barcode,probe_id,avg_beta_value
0,TCGA-GM-A2DM,TCGA-GM-A2DM-01A,TCGA-GM-A2DM-01A-11D-A17Z-05,cg19996418,0.98
1,TCGA-AO-A0JJ,TCGA-AO-A0JJ-01A,TCGA-AO-A0JJ-01A-11D-A10P-05,cg17427491,0.98
2,TCGA-EW-A1J1,TCGA-EW-A1J1-01A,TCGA-EW-A1J1-01A-11D-A13K-05,cg16108964,0.95
3,TCGA-AC-A62V,TCGA-AC-A62V-01A,TCGA-AC-A62V-01A-11D-A31V-05,cg26450010,0.99
4,TCGA-EW-A1PC,TCGA-EW-A1PC-01B,TCGA-EW-A1PC-01B-11D-A21R-05,cg03453744,0.95


In [7]:
# Attach the sample metadata to every methylation row.
# Inner join on (sample_barcode, case_barcode): methylation rows with no
# matching metadata row are dropped.
# validate="many_to_one" makes pandas raise MergeError if that key pair is
# not unique in `sample`; without it, duplicated metadata rows would
# silently multiply the methylation rows.
subtype_meth_bigtable = pd.merge(
    left=subtype_meth,
    right=sample,
    how='inner',
    on=["sample_barcode", "case_barcode"],
    validate="many_to_one",
)

# The row count should equal subtype_meth's when every row found its metadata.
print(subtype_meth_bigtable.shape)
subtype_meth_bigtable.head()

(20313452, 15)


Unnamed: 0,case_barcode,sample_barcode,aliquot_barcode,probe_id,avg_beta_value,sample_type,sample_type_name,subtype,ER,PR,Her2_IHC,Her2_ISH,Her2,triple_negative,subtype_sub
0,TCGA-GM-A2DM,TCGA-GM-A2DM-01A,TCGA-GM-A2DM-01A-11D-A17Z-05,cg19996418,0.98,1,Primary solid Tumor,LumA,Positive,Positive,1+,,,Unknown,LumA
1,TCGA-GM-A2DM,TCGA-GM-A2DM-01A,TCGA-GM-A2DM-01A-11D-A17Z-05,cg17657322,0.98,1,Primary solid Tumor,LumA,Positive,Positive,1+,,,Unknown,LumA
2,TCGA-GM-A2DM,TCGA-GM-A2DM-01A,TCGA-GM-A2DM-01A-11D-A17Z-05,cg00872151,0.96,1,Primary solid Tumor,LumA,Positive,Positive,1+,,,Unknown,LumA
3,TCGA-GM-A2DM,TCGA-GM-A2DM-01A,TCGA-GM-A2DM-01A-11D-A17Z-05,cg16608267,0.97,1,Primary solid Tumor,LumA,Positive,Positive,1+,,,Unknown,LumA
4,TCGA-GM-A2DM,TCGA-GM-A2DM-01A,TCGA-GM-A2DM-01A-11D-A17Z-05,cg11289296,0.98,1,Primary solid Tumor,LumA,Positive,Positive,1+,,,Unknown,LumA


In [8]:
# Collapse the long table to one row per sample / per case before counting.
# drop_duplicates keeps the first row it encounters for each key.
one_row_per_sample = subtype_meth_bigtable.drop_duplicates(subset="sample_barcode")
one_row_per_case = subtype_meth_bigtable.drop_duplicates(subset="case_barcode")

print("Sample types")
print(one_row_per_sample["sample_type_name"].value_counts())
print("\n")

# NOTE(review): for a case with several samples, the subtype_sub counted here
# is the one on whichever row appears first - confirm subtype_sub is constant
# within a case.
print("BRCA subtypes")
print(one_row_per_case["subtype_sub"].value_counts())

Sample types
Primary solid Tumor    789
Solid Tissue Normal     98
Metastatic               5
Name: sample_type_name, dtype: int64


BRCA subtypes
LumA        392
LumB        141
Basal        75
Normal       70
Basal-TN     53
Her2         44
Unknown      14
Name: subtype_sub, dtype: int64


In [9]:
# Save the merged methylation + metadata table as a tab-separated file,
# without the row index.
subtype_meth_bigtable.to_csv(
    "./datasets/subtype_meth_bigtable.HM450.tsv",
    sep="\t",
    index=False,
)

# Hormone

In [3]:
# Each 'pairs' entry is a comma-separated string of labels; split it so that
# membership is tested against whole labels rather than substrings.
pairs_splitted = probe['pairs'].str.split(",")

# One boolean per probe row: True when "ers_err" is among its labels.
# A missing 'pairs' value stays NaN (a float) after .str.split, and `in` on a
# float raises TypeError, so non-list entries are treated as "no match".
truefalse = [isinstance(labels, list) and "ers_err" in labels for labels in pairs_splitted]

# Keep only the probes flagged above (all columns retained).
probe_hormone = probe.loc[truefalse, :]
print(probe_hormone.shape)
probe_hormone.head()

(1727, 11)


Unnamed: 0,chrom,start,end,probe_id,UCSC.RefGene_Group,UCSC.RefGene_Accession,UCSC.RefGene_Name,sig-DMD,DMV,pairs,no_pairs
11,chr1,1286916,1286918,cg21679391,,,,sig-DMD_4,DMV_9,"ers_err,normal_nontrip,normal_trip",3
12,chr1,1287258,1287260,cg21118819,,,,sig-DMD_4,DMV_9,"ers_err,normal_nontrip,normal_trip",3
13,chr1,1288585,1288587,cg11414742,3'UTR,NM_032348,MXRA8,sig-DMD_4,DMV_9,"ers_err,normal_nontrip,normal_trip",3
14,chr1,1288925,1288927,cg15472728,3'UTR,NM_032348,MXRA8,sig-DMD_4,DMV_9,"ers_err,normal_nontrip,normal_trip",3
15,chr1,1289805,1289807,cg14270725,Body,NM_032348,MXRA8,sig-DMD_4,DMV_9,"ers_err,normal_nontrip,normal_trip",3


In [4]:
# Load the hormone metadata table (tab-separated; first row is used as header).
# Note: this rebinds `sample`, which the Subtype section uses for a
# different metadata table.
hormone_metadata_path = "./metadata/hormone_metadata.tsv"
sample = pd.read_table(hormone_metadata_path, sep="\t")

print(sample.shape)
sample.head()

(540, 7)


Unnamed: 0,case_barcode,sample_barcode,sample_type,sample_type_name,initial_response,recurrence_status,recurrence_log
0,TCGA-BH-A0C0,TCGA-BH-A0C0-01A,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ..."
1,TCGA-BH-A0C0,TCGA-BH-A0C0-11A,11,Solid Tissue Normal,Unknown,Unknown,"Alive tumor free, last follow-up within risky ..."
2,TCGA-D8-A141,TCGA-D8-A141-01A,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ..."
3,TCGA-EW-A424,TCGA-EW-A424-01A,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ..."
4,TCGA-AO-A12G,TCGA-AO-A12G-01A,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ..."


In [5]:
# Render the hormone sample barcodes and the hormone probe ids as
# comma-separated, single-quoted literals for the query's `IN (...)` clauses.
sample_barcodes = ", ".join("'{}'".format(barcode) for barcode in sample.sample_barcode)
probe_ids = ", ".join("'{}'".format(pid) for pid in probe_hormone.probe_id)

In [6]:
# BigQuery client built with default arguments, and the fully-qualified
# methylation table to query.
client = bq.Client()
methylation = 'isb-cgc.TCGA_hg19_data_v0.DNA_Methylation'

# Select the raw beta values for the chosen samples and probes on the
# HumanMethylation450 platform.
# NOTE(review): unlike the Subtype query in this notebook, this one has no
# GROUP BY / AVG step, so duplicated aliquot rows (if any) are returned
# as-is - confirm that is intended.
query_template = """\
SELECT 
    case_barcode, sample_barcode, aliquot_barcode, probe_id, beta_value
FROM
    `{}`
WHERE
    platform = "HumanMethylation450" AND
    sample_barcode IN ({}) AND probe_id IN ({})
"""
# Positional substitution order: table name, sample literals, probe literals.
query = query_template.format(methylation, sample_barcodes, probe_ids)

# Run the query and materialise the full result as a DataFrame.
query_job = client.query(query)
hormone_meth = query_job.to_dataframe()

print(hormone_meth.shape)

hormone_meth.head()

(625379, 5)


Unnamed: 0,case_barcode,sample_barcode,aliquot_barcode,probe_id,beta_value
0,TCGA-AQ-A1H2,TCGA-AQ-A1H2-01A,TCGA-AQ-A1H2-01A-11D-A13K-05,cg05467160,0.86
1,TCGA-AC-A5XU,TCGA-AC-A5XU-01A,TCGA-AC-A5XU-01A-11D-A28C-05,cg02046665,0.91
2,TCGA-EW-A1IX,TCGA-EW-A1IX-01A,TCGA-EW-A1IX-01A-12D-A145-05,cg14556146,0.85
3,TCGA-AO-A1KT,TCGA-AO-A1KT-01A,TCGA-AO-A1KT-01A-11D-A13K-05,cg07223632,0.92
4,TCGA-WT-AB41,TCGA-WT-AB41-01A,TCGA-WT-AB41-01A-11D-A41Q-05,cg22727572,0.88


In [7]:
# Attach the hormone metadata to every methylation row.
# Inner join on (sample_barcode, case_barcode): methylation rows with no
# matching metadata row are dropped.
# validate="many_to_one" makes pandas raise MergeError if that key pair is
# not unique in `sample`; without it, duplicated metadata rows would
# silently multiply the methylation rows.
hormone_meth_bigtable = pd.merge(
    hormone_meth,
    sample,
    how='inner',
    on=["sample_barcode", "case_barcode"],
    validate="many_to_one",
)

# The row count should equal hormone_meth's when every row found its metadata.
print(hormone_meth_bigtable.shape)
hormone_meth_bigtable.head()

(625379, 10)


Unnamed: 0,case_barcode,sample_barcode,aliquot_barcode,probe_id,beta_value,sample_type,sample_type_name,initial_response,recurrence_status,recurrence_log
0,TCGA-AQ-A1H2,TCGA-AQ-A1H2-01A,TCGA-AQ-A1H2-01A-11D-A13K-05,cg05467160,0.86,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ..."
1,TCGA-AQ-A1H2,TCGA-AQ-A1H2-01A,TCGA-AQ-A1H2-01A-11D-A13K-05,cg20181887,0.94,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ..."
2,TCGA-AQ-A1H2,TCGA-AQ-A1H2-01A,TCGA-AQ-A1H2-01A-11D-A13K-05,cg06796779,0.92,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ..."
3,TCGA-AQ-A1H2,TCGA-AQ-A1H2-01A,TCGA-AQ-A1H2-01A-11D-A13K-05,cg23399222,0.94,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ..."
4,TCGA-AQ-A1H2,TCGA-AQ-A1H2-01A,TCGA-AQ-A1H2-01A-11D-A13K-05,cg17101450,0.91,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ..."


In [8]:
# Count sample types using one row per sample.
print("Sample types")
print(hormone_meth_bigtable.drop_duplicates(subset="sample_barcode").sample_type_name.value_counts())
print("\n")

# Collapse the long table to one row per case once, then reuse it for every
# case-level summary instead of de-duplicating the full table for each one.
# drop_duplicates keeps the first row it encounters for each case.
hormone_cases = hormone_meth_bigtable.drop_duplicates(subset="case_barcode")

# (printed heading, metadata column) for each case-level summary, in print order.
case_summaries = [
    ("Cases - Initial responses", "initial_response"),
    ("Cases - Recurrence status", "recurrence_status"),
    ("Cases - Recurrence log", "recurrence_log"),
]

for heading, column in case_summaries:
    print(heading)
    # value_counts leaves out NaN entries by default.
    print(hormone_cases[column].value_counts())
    print("\n")

Sample types
Primary solid Tumor    353
Solid Tissue Normal     35
Name: sample_type_name, dtype: int64


Cases - Initial responses
Unknown               313
Complete response      34
Progressive/Stable      7
Name: initial_response, dtype: int64


Cases - Recurrence status
Unknown     312
Occured      30
Low risk      7
Name: recurrence_status, dtype: int64


Cases - Recurrence log
Alive tumor free, last follow-up within risky period             279
New tumor after/during treatment                                  21
Drug start and end dates unknown                                  14
Dead with tumor                                                    9
Unknown neoplasm status                                            9
Alive tumor free after risky period                                7
Dead tumor free during risky period                                5
Reported having tumor during follow-up but unknown recurrence      3
Alive tumor free but unknown follow-up date                   

In [9]:
# Save the merged hormone methylation + metadata table as a tab-separated
# file, without the row index.
hormone_meth_bigtable.to_csv(
    "./datasets/hormone_meth_bigtable.HM450.tsv",
    sep="\t",
    index=False,
)