In [1]:
import pandas as pd
import numpy as np

from google.cloud import bigquery as bq

%matplotlib inline

In [2]:
# Load the list of signature-associated genes from the tab-separated
# metadata file, then show its dimensions and first rows.
gene = pd.read_csv("./metadata/associated_genes_list.tsv", sep="\t")

print(gene.shape)
gene.head()

(3518, 5)


Unnamed: 0,gene_symbol,BRCA,TN-BRCA,nonTN-BRCA,ERR-BRCA
0,A1CF,Yes,No,No,Yes
1,AACS,Yes,No,No,No
2,AARS2,Yes,No,No,No
3,AASS,Yes,Yes,Yes,No
4,AATF,Yes,No,No,No


In [3]:
# Load the per-sample subtype annotations from the tab-separated metadata
# file, then show its dimensions and first rows.
subtypes = pd.read_csv("./metadata/subtype_metadata.tsv", sep="\t")

print(subtypes.shape)
subtypes.head()

(1282, 12)


Unnamed: 0,case_barcode,sample_barcode,sample_type,sample_type_name,subtype,ER,PR,Her2_IHC,Her2_ISH,Her2,triple_negative,subtype_sub
0,TCGA-A2-A04P,TCGA-A2-A04P-01A,1,Primary solid Tumor,Basal,Negative,Negative,,Negative,Negative,Yes,Basal-TN
1,TCGA-AR-A0TP,TCGA-AR-A0TP-01A,1,Primary solid Tumor,Basal,Positive,Negative,,,,,Basal
2,TCGA-GM-A2DF,TCGA-GM-A2DF-01A,1,Primary solid Tumor,Basal,Negative,Negative,1+,Negative,Negative,Yes,Basal-TN
3,TCGA-BH-A0C0,TCGA-BH-A0C0-01A,1,Primary solid Tumor,LumB,Positive,Positive,2+,Positive,,,LumB
4,TCGA-D8-A141,TCGA-D8-A141-01A,1,Primary solid Tumor,LumA,Positive,Positive,1+,,,,LumA


In [4]:
# Replace missing values in the three subtype annotation columns with the
# explicit label 'Unknown' (value_counts would otherwise skip NaN entries).
unknown_labels = {
    'subtype': 'Unknown',
    'triple_negative': 'Unknown',
    'subtype_sub': 'Unknown',
}
subtypes.fillna(value=unknown_labels, inplace=True)
subtypes['subtype_sub'].value_counts()

LumA        581
LumB        219
Normal      143
Basal       101
Basal-TN     91
Her2         82
Unknown      65
Name: subtype_sub, dtype: int64

In [5]:
# Load the hormone-therapy metadata from the tab-separated file, then show
# its dimensions and first rows.
hormone = pd.read_csv("./metadata/hormone_metadata.tsv", sep="\t")

print(hormone.shape)
hormone.head()

(540, 15)


Unnamed: 0,case_barcode,sample_barcode,sample_type,sample_type_name,initial_response,recurrence_status,recurrence_log,subtype,ER,PR,Her2_IHC,Her2_ISH,Her2,triple_negative,subtype_sub
0,TCGA-BH-A0C0,TCGA-BH-A0C0-01A,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ...",LumB,Positive,Positive,2+,Positive,,,LumB
1,TCGA-BH-A0C0,TCGA-BH-A0C0-11A,11,Solid Tissue Normal,Unknown,Unknown,"Alive tumor free, last follow-up within risky ...",Normal,Positive,Positive,2+,Positive,,,Normal
2,TCGA-D8-A141,TCGA-D8-A141-01A,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ...",LumA,Positive,Positive,1+,,,,LumA
3,TCGA-EW-A424,TCGA-EW-A424-01A,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ...",LumA,Positive,Positive,,,,,LumA
4,TCGA-AO-A12G,TCGA-AO-A12G-01A,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ...",LumA,Positive,Positive,2+,Negative,,,LumA


-----------------------------------------------

In [6]:
# Render the sample barcodes and gene symbols as comma-separated,
# single-quoted literals suitable for pasting inside a SQL `IN (...)` clause.
sample_barcodes = ", ".join(map("'{}'".format, subtypes.sample_barcode))
genes = ", ".join(map("'{}'".format, gene.gene_symbol))

In [7]:
client = bq.Client()
rnaseq = 'isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM'

# Pass the barcode / gene lists as array query parameters instead of pasting
# quoted literals into the SQL text. This keeps the query valid even if a
# value contains a quote character, avoids sending the (long) gene list twice,
# and lets us drop missing values rather than querying for the string 'nan'.
barcode_values = subtypes['sample_barcode'].dropna().unique().tolist()
gene_values = gene['gene_symbol'].dropna().unique().tolist()

# A gene is selected when either its HGNC symbol or its original symbol is in
# the gene list. Table identifiers cannot be supplied as query parameters, so
# only the table name is formatted into the SQL string.
query = """\
SELECT
    case_barcode, sample_barcode, aliquot_barcode, HGNC_gene_symbol, original_gene_symbol,
    normalized_count,
    LOG(normalized_count + 1, 2) AS transformed_count
FROM
    `{}`
WHERE
    (sample_barcode IN UNNEST(@sample_barcodes)) AND
    (HGNC_gene_symbol IN UNNEST(@genes) OR original_gene_symbol IN UNNEST(@genes))
""".format(rnaseq)

job_config = bq.QueryJobConfig()
job_config.query_parameters = [
    bq.ArrayQueryParameter('sample_barcodes', 'STRING', barcode_values),
    bq.ArrayQueryParameter('genes', 'STRING', gene_values),
]

# Run the query and materialize the full result as a DataFrame.
expression = client.query(query, job_config=job_config).to_dataframe()

print(expression.shape)

expression.head()

(4194180, 7)


Unnamed: 0,case_barcode,sample_barcode,aliquot_barcode,HGNC_gene_symbol,original_gene_symbol,normalized_count,transformed_count
0,TCGA-BH-A18P,TCGA-BH-A18P-11A,TCGA-BH-A18P-11A-43R-A12D-07,REPIN1,REPIN1,1616.041,10.659141
1,TCGA-BH-A0DG,TCGA-BH-A0DG-11A,TCGA-BH-A0DG-11A-43R-A12P-07,JMY,JMY,1500.2891,10.551986
2,TCGA-BH-A0BM,TCGA-BH-A0BM-11A,TCGA-BH-A0BM-11A-12R-A089-07,BDNF,BDNF,69.7229,6.144106
3,TCGA-BH-A0DL,TCGA-BH-A0DL-11A,TCGA-BH-A0DL-11A-13R-A115-07,FOXD1,FOXD1,29.6156,4.936195
4,TCGA-BH-A0DV,TCGA-BH-A0DV-11A,TCGA-BH-A0DV-11A-22R-A12P-07,SBNO2,SBNO2,1280.1724,10.323249


In [8]:
# Pick a single gene symbol per row: use `original_gene_symbol` when it is one
# of the genes of interest, otherwise fall back to `HGNC_gene_symbol`.
#
# The membership test must be done against the *values* of gene.gene_symbol.
# The previous `x[1] in gene.gene_symbol` used the `in` operator on a Series,
# which checks the Series index (the integer row labels), so it never matched
# a gene name and the HGNC symbol was always chosen. `isin` compares against
# the values and is vectorized, avoiding a Python-level apply over every row.
original_is_listed = expression['original_gene_symbol'].isin(gene['gene_symbol'])
expression['gene_symbol'] = expression['original_gene_symbol'].where(
    original_is_listed, expression['HGNC_gene_symbol']
)

# The two source columns are no longer needed once gene_symbol is set.
expression = expression.drop(['HGNC_gene_symbol', 'original_gene_symbol'], axis=1)

In [9]:
# Collapse rows that share the same case / sample / aliquot / gene into a
# single row by averaging the remaining (count) columns.
aliquot_gene_keys = ['case_barcode', 'sample_barcode', 'aliquot_barcode', 'gene_symbol']
expression = expression.groupby(aliquot_gene_keys, as_index=False).mean()

print(expression.shape)
expression.head()

(4191750, 6)


Unnamed: 0,case_barcode,sample_barcode,aliquot_barcode,gene_symbol,normalized_count,transformed_count
0,TCGA-3C-AAAU,TCGA-3C-AAAU-01A,TCGA-3C-AAAU-01A-11R-A41B-07,A1CF,0.0,0.0
1,TCGA-3C-AAAU,TCGA-3C-AAAU-01A,TCGA-3C-AAAU-01A-11R-A41B-07,AACS,1087.3986,10.087991
2,TCGA-3C-AAAU,TCGA-3C-AAAU-01A,TCGA-3C-AAAU-01A-11R-A41B-07,AARS2,689.8967,9.432326
3,TCGA-3C-AAAU,TCGA-3C-AAAU-01A,TCGA-3C-AAAU-01A-11R-A41B-07,AASS,45.8396,5.549657
4,TCGA-3C-AAAU,TCGA-3C-AAAU-01A,TCGA-3C-AAAU-01A-11R-A41B-07,AATF,1518.9114,10.569772


In [10]:
# Count the distinct genes present in the expression table. This is the same
# number as the row count of a gene x aliquot pivot, without having to build
# that full matrix just to read its shape.
print("Number of genes having expression data:")
expression['gene_symbol'].nunique()

Number of genes having expression data:


3450

In [11]:
# Save the aggregated expression table as a tab-separated file, without the
# DataFrame row index.
expression.to_csv(
    "./datasets/expression.IlluminaHiSeq.tsv",
    index=False,
    sep="\t",
)

----------------------------------

In [6]:
# Read the saved expression table back from disk (presumably so the sections
# below can be run without repeating the BigQuery download - confirm).
expression = pd.read_csv("./datasets/expression.IlluminaHiSeq.tsv", sep="\t")
print(expression.shape)
expression.head()

(4191750, 6)


Unnamed: 0,case_barcode,sample_barcode,aliquot_barcode,gene_symbol,normalized_count,transformed_count
0,TCGA-3C-AAAU,TCGA-3C-AAAU-01A,TCGA-3C-AAAU-01A-11R-A41B-07,A1CF,0.0,0.0
1,TCGA-3C-AAAU,TCGA-3C-AAAU-01A,TCGA-3C-AAAU-01A-11R-A41B-07,AACS,1087.3986,10.087991
2,TCGA-3C-AAAU,TCGA-3C-AAAU-01A,TCGA-3C-AAAU-01A-11R-A41B-07,AARS2,689.8967,9.432326
3,TCGA-3C-AAAU,TCGA-3C-AAAU-01A,TCGA-3C-AAAU-01A-11R-A41B-07,AASS,45.8396,5.549657
4,TCGA-3C-AAAU,TCGA-3C-AAAU-01A,TCGA-3C-AAAU-01A-11R-A41B-07,AATF,1518.9114,10.569772


# Subtype

In [7]:
# Genes flagged 'Yes' for at least one of the three subtype signatures.
subtype_flags = gene[['BRCA', 'TN-BRCA', 'nonTN-BRCA']] == 'Yes'
subtype_genes = gene.loc[subtype_flags.any(axis=1), 'gene_symbol'].tolist()
print(len(subtype_genes))

# How many of those genes actually appear in the expression table, and how
# many expression rows they account for.
has_subtype_gene = expression['gene_symbol'].isin(subtype_genes)
print(len(expression.loc[has_subtype_gene, 'gene_symbol'].unique()))

expression.loc[has_subtype_gene].shape

3470
3360


(4082400, 6)

In [8]:
# Attach the subtype annotations to every expression row of a subtype gene.
# The inner join keeps only samples present in both tables.
subtype_gene_rows = expression[expression['gene_symbol'].isin(subtype_genes)]
subtype_expression_bigtable = subtype_gene_rows.merge(
    subtypes,
    how='inner',
    on=["sample_barcode", "case_barcode"],
)

print(subtype_expression_bigtable.shape)
subtype_expression_bigtable.head()

(4082400, 16)


Unnamed: 0,case_barcode,sample_barcode,aliquot_barcode,gene_symbol,normalized_count,transformed_count,sample_type,sample_type_name,subtype,ER,PR,Her2_IHC,Her2_ISH,Her2,triple_negative,subtype_sub
0,TCGA-3C-AAAU,TCGA-3C-AAAU-01A,TCGA-3C-AAAU-01A-11R-A41B-07,A1CF,0.0,0.0,1,Primary solid Tumor,LumA,Positive,Positive,,,,Unknown,LumA
1,TCGA-3C-AAAU,TCGA-3C-AAAU-01A,TCGA-3C-AAAU-01A-11R-A41B-07,AACS,1087.3986,10.087991,1,Primary solid Tumor,LumA,Positive,Positive,,,,Unknown,LumA
2,TCGA-3C-AAAU,TCGA-3C-AAAU-01A,TCGA-3C-AAAU-01A-11R-A41B-07,AARS2,689.8967,9.432326,1,Primary solid Tumor,LumA,Positive,Positive,,,,Unknown,LumA
3,TCGA-3C-AAAU,TCGA-3C-AAAU-01A,TCGA-3C-AAAU-01A-11R-A41B-07,AASS,45.8396,5.549657,1,Primary solid Tumor,LumA,Positive,Positive,,,,Unknown,LumA
4,TCGA-3C-AAAU,TCGA-3C-AAAU-01A,TCGA-3C-AAAU-01A-11R-A41B-07,AATF,1518.9114,10.569772,1,Primary solid Tumor,LumA,Positive,Positive,,,,Unknown,LumA


In [9]:
# Keep one row per (sample, gene) pair, then summarize how many samples each
# gene is measured in.
subtype_sample_gene = subtype_expression_bigtable.drop_duplicates(
    subset=["sample_barcode", "gene_symbol"]
)
subtype_sample_gene['gene_symbol'].value_counts().describe()

count    3360.0
mean     1215.0
std         0.0
min      1215.0
25%      1215.0
50%      1215.0
75%      1215.0
max      1215.0
Name: gene_symbol, dtype: float64

==> 1215 samples; each sample has expression data for 3360 genes associated with subtype-specific signatures

In [10]:
# Sample-type counts use one row per sample; subtype counts use one row per
# case (the first row encountered for each case_barcode).
subtype_samples = subtype_expression_bigtable.drop_duplicates(subset="sample_barcode")
subtype_cases = subtype_expression_bigtable.drop_duplicates(subset="case_barcode")

print("Sample types")
print(subtype_samples['sample_type_name'].value_counts())
print("\n")

print("BRCA subtypes")
print(subtype_cases['subtype_sub'].value_counts())

Sample types
Primary solid Tumor    1095
Solid Tissue Normal     113
Metastatic                7
Name: sample_type_name, dtype: int64


BRCA subtypes
LumA        566
LumB        217
Basal       100
Basal-TN     90
Her2         82
Normal       40
Name: subtype_sub, dtype: int64


In [11]:
# Save the subtype-annotated expression table as a tab-separated file,
# without the DataFrame row index.
subtype_expression_bigtable.to_csv(
    "./datasets/subtype_expression_bigtable.IlluminaHiSeq.tsv",
    index=False,
    sep="\t",
)

# Hormone

In [12]:
# Genes flagged 'Yes' in the ERR-BRCA column.
in_err_signature = gene['ERR-BRCA'].eq('Yes')
hormone_genes = gene.loc[in_err_signature, 'gene_symbol'].tolist()
print(len(hormone_genes))

# How many of those genes actually appear in the expression table, and how
# many expression rows they account for.
has_hormone_gene = expression['gene_symbol'].isin(hormone_genes)
print(len(expression.loc[has_hormone_gene, 'gene_symbol'].unique()))

expression.loc[has_hormone_gene].shape

229
224


(272160, 6)

In [13]:
# Attach the hormone-therapy metadata to every expression row of a hormone
# gene. The inner join keeps only samples present in both tables.
hormone_gene_rows = expression[expression['gene_symbol'].isin(hormone_genes)]
hormone_expression_bigtable = hormone_gene_rows.merge(
    hormone,
    how='inner',
    on=["sample_barcode", "case_barcode"],
)

print(hormone_expression_bigtable.shape)
hormone_expression_bigtable.head()

(115136, 19)


Unnamed: 0,case_barcode,sample_barcode,aliquot_barcode,gene_symbol,normalized_count,transformed_count,sample_type,sample_type_name,initial_response,recurrence_status,recurrence_log,subtype,ER,PR,Her2_IHC,Her2_ISH,Her2,triple_negative,subtype_sub
0,TCGA-5L-AAT0,TCGA-5L-AAT0-01A,TCGA-5L-AAT0-01A-12R-A41B-07,A1CF,0.0,0.0,1,Primary solid Tumor,Unknown,,,LumA,Positive,Positive,1+,,,,LumA
1,TCGA-5L-AAT0,TCGA-5L-AAT0-01A,TCGA-5L-AAT0-01A-12R-A41B-07,AATK,54.5535,5.795806,1,Primary solid Tumor,Unknown,,,LumA,Positive,Positive,1+,,,,LumA
2,TCGA-5L-AAT0,TCGA-5L-AAT0-01A,TCGA-5L-AAT0-01A-12R-A41B-07,ADRA2C,39.8885,5.353623,1,Primary solid Tumor,Unknown,,,LumA,Positive,Positive,1+,,,,LumA
3,TCGA-5L-AAT0,TCGA-5L-AAT0-01A,TCGA-5L-AAT0-01A-12R-A41B-07,ANP32E,766.0947,9.583261,1,Primary solid Tumor,Unknown,,,LumA,Positive,Positive,1+,,,,LumA
4,TCGA-5L-AAT0,TCGA-5L-AAT0-01A,TCGA-5L-AAT0-01A-12R-A41B-07,ARHGEF37,1644.2294,10.684073,1,Primary solid Tumor,Unknown,,,LumA,Positive,Positive,1+,,,,LumA


In [14]:
# Keep one row per (sample, gene) pair, then summarize how many samples each
# gene is measured in.
hormone_sample_gene = hormone_expression_bigtable.drop_duplicates(
    subset=["sample_barcode", "gene_symbol"]
)
hormone_sample_gene['gene_symbol'].value_counts().describe()

count    224.0
mean     514.0
std        0.0
min      514.0
25%      514.0
50%      514.0
75%      514.0
max      514.0
Name: gene_symbol, dtype: float64

==> 514 samples, each sample has expression data for 224 genes associated with endocrine resistance signatures

In [15]:
# Sample-type counts use one row per sample.
hormone_samples = hormone_expression_bigtable.drop_duplicates(subset="sample_barcode")
print("Sample types")
print(hormone_samples['sample_type_name'].value_counts())
print("\n")

# The clinical summaries use one row per case (the first row encountered for
# each case_barcode); each is printed under its own heading.
hormone_cases = hormone_expression_bigtable.drop_duplicates(subset="case_barcode")
case_summaries = [
    ("Cases - Initial responses", "initial_response"),
    ("Cases - Recurrence status", "recurrence_status"),
    ("Cases - Recurrence log", "recurrence_log"),
]
for heading, column in case_summaries:
    print(heading)
    print(hormone_cases[column].value_counts())
    print("\n")

Sample types
Primary solid Tumor    480
Solid Tissue Normal     33
Metastatic               1
Name: sample_type_name, dtype: int64


Cases - Initial responses
Unknown               431
Complete response      40
Progressive/Stable      9
Name: initial_response, dtype: int64


Cases - Recurrence status
Unknown     423
Occured      42
Low risk      8
Name: recurrence_status, dtype: int64


Cases - Recurrence log
Alive tumor free, last follow-up within risky period             376
New tumor after/during treatment                                  30
Drug start and end dates unknown                                  18
Dead with tumor                                                   12
Unknown neoplasm status                                           10
Dead tumor free during risky period                                9
Reported having tumor during follow-up but unknown recurrence      8
Alive tumor free after risky period                                8
Alive tumor free but unknown follow

In [16]:
# Rows with sample_type code 6, reduced to one row per distinct
# (initial_response, recurrence_status) combination.
is_type_6 = hormone_expression_bigtable['sample_type'] == 6
hormone_expression_bigtable.loc[is_type_6].drop_duplicates(
    subset=["initial_response", "recurrence_status"]
)

Unnamed: 0,case_barcode,sample_barcode,aliquot_barcode,gene_symbol,normalized_count,transformed_count,sample_type,sample_type_name,initial_response,recurrence_status,recurrence_log,subtype,ER,PR,Her2_IHC,Her2_ISH,Her2,triple_negative,subtype_sub
85120,TCGA-E2-A15E,TCGA-E2-A15E-06A,TCGA-E2-A15E-06A-11R-A12D-07,A1CF,0.3766,0.461109,6,Metastatic,Unknown,Unknown,"Alive tumor free, last follow-up within risky ...",LumA,Positive,Positive,,Positive,,,LumA


In [18]:
# One row per case, restricted to rows with sample_type code 1, then
# cross-tabulate initial response against subtype (with row/column totals).
is_type_1 = hormone_expression_bigtable['sample_type'] == 1
hormone_tumors = hormone_expression_bigtable[is_type_1].drop_duplicates(subset="case_barcode")
pd.crosstab(
    index=hormone_tumors['initial_response'],
    columns=hormone_tumors['subtype_sub'],
    margins=True,
)

subtype_sub,Basal,Her2,LumA,LumB,Normal,All
initial_response,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Complete response,2,1,30,6,1,40
Progressive/Stable,0,1,6,1,1,9
Unknown,8,11,290,108,14,431
All,10,13,326,115,16,480


- Complete response: 2 Basal, 1 Normal-like
- Progressive/Stable: 1 Normal-like

In [19]:
# Cross-tabulate recurrence status against subtype (with row/column totals).
pd.crosstab(
    index=hormone_tumors['recurrence_status'],
    columns=hormone_tumors['subtype_sub'],
    margins=True,
)

subtype_sub,Basal,Her2,LumA,LumB,Normal,All
recurrence_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Low risk,0,0,8,0,0,8
Occured,1,1,28,11,1,42
Unknown,9,12,283,104,15,423
All,10,13,319,115,16,473


- Recurrence occurred (status `Occured`): 1 Basal, 1 Normal-like

In [20]:
# Save the hormone-annotated expression table as a tab-separated file,
# without the DataFrame row index.
hormone_expression_bigtable.to_csv(
    "./datasets/hormone_expression_bigtable.IlluminaHiSeq.tsv",
    index=False,
    sep="\t",
)