In [1]:
import pandas as pd
from google.cloud import bigquery as bq

# Hormone therapy

In [2]:
# Per-case annotation table (tab-separated); one row per case barcode is expected.
patient_tsv = "metadata/hormone_response_recurrence.tsv"
patient = pd.read_csv(patient_tsv, sep="\t")

print(patient.shape)
patient.head()

(500, 4)


Unnamed: 0,case_barcode,initial_response,recurrence_status,recurrence_log
0,TCGA-5L-AAT0,Unknown,,
1,TCGA-5L-AAT1,Unknown,,
2,TCGA-5T-A9QA,Unknown,Unknown,Unknown neoplasm status
3,TCGA-A1-A0SF,Unknown,,
4,TCGA-A1-A0SQ,Unknown,,


In [3]:
# Wrap every case barcode in single quotes, then join them into one
# comma-separated string suitable for a SQL IN (...) list.
quoted_barcodes = ["'%s'" % barcode for barcode in patient["case_barcode"]]
case_list = ", ".join(quoted_barcodes)
case_list

"'TCGA-5L-AAT0', 'TCGA-5L-AAT1', 'TCGA-5T-A9QA', 'TCGA-A1-A0SF', 'TCGA-A1-A0SQ', 'TCGA-A2-A04N', 'TCGA-A2-A04R', 'TCGA-A2-A04V', 'TCGA-A2-A04X', 'TCGA-A2-A04Y', 'TCGA-A2-A0CK', 'TCGA-A2-A0CL', 'TCGA-A2-A0CO', 'TCGA-A2-A0CP', 'TCGA-A2-A0CQ', 'TCGA-A2-A0CR', 'TCGA-A2-A0CS', 'TCGA-A2-A0CT', 'TCGA-A2-A0CU', 'TCGA-A2-A0CV', 'TCGA-A2-A0CW', 'TCGA-A2-A0CX', 'TCGA-A2-A0CY', 'TCGA-A2-A0CZ', 'TCGA-A2-A0D3', 'TCGA-A2-A0D4', 'TCGA-A2-A0EM', 'TCGA-A2-A0EN', 'TCGA-A2-A0EO', 'TCGA-A2-A0EP', 'TCGA-A2-A0ER', 'TCGA-A2-A0ES', 'TCGA-A2-A0ET', 'TCGA-A2-A0EW', 'TCGA-A2-A0EX', 'TCGA-A2-A0EY', 'TCGA-A2-A0SU', 'TCGA-A2-A0SV', 'TCGA-A2-A0SW', 'TCGA-A2-A0SY', 'TCGA-A2-A0T3', 'TCGA-A2-A0T4', 'TCGA-A2-A0T5', 'TCGA-A2-A0T6', 'TCGA-A2-A0T7', 'TCGA-A2-A0YC', 'TCGA-A2-A0YD', 'TCGA-A2-A0YF', 'TCGA-A2-A0YG', 'TCGA-A2-A0YH', 'TCGA-A2-A0YI', 'TCGA-A2-A0YK', 'TCGA-A2-A0YL', 'TCGA-A2-A0YT', 'TCGA-A2-A1FV', 'TCGA-A2-A1FW', 'TCGA-A2-A1FX', 'TCGA-A2-A1FZ', 'TCGA-A2-A1G0', 'TCGA-A2-A1G4', 'TCGA-A2-A259', 'TCGA-A2-A25A', 'TCGA-A

In [4]:
client = bq.Client()
biospecimen = 'isb-cgc.TCGA_bioclin_v0.Biospecimen'

# Sample-type codes to keep in the result.
sample_types = ['01', '02', '06', '07', '11']

# Barcodes are sent as a query parameter instead of being spliced into the SQL
# text, so a value containing a quote cannot break or alter the statement.
# Missing values and duplicates are dropped before sending.
case_barcodes = (
    patient["case_barcode"].dropna().astype(str).unique().tolist()
)

# Table identifiers cannot be passed as query parameters, so only the table
# name is formatted into the statement.
query = """\
SELECT
    case_barcode, sample_barcode, sample_type, sample_type_name
FROM
    `{}`
WHERE
    case_barcode IN UNNEST(@case_barcodes)
    AND sample_type IN UNNEST(@sample_types)
""".format(biospecimen)

job_config = bq.QueryJobConfig(
    query_parameters=[
        bq.ArrayQueryParameter("case_barcodes", "STRING", case_barcodes),
        bq.ArrayQueryParameter("sample_types", "STRING", sample_types),
    ]
)

samples = client.query(query, job_config=job_config).to_dataframe()

print(samples.shape)

samples.head()

(565, 4)


Unnamed: 0,case_barcode,sample_barcode,sample_type,sample_type_name
0,TCGA-BH-A0C0,TCGA-BH-A0C0-01A,1,Primary solid Tumor
1,TCGA-D8-A141,TCGA-D8-A141-01A,1,Primary solid Tumor
2,TCGA-EW-A424,TCGA-EW-A424-01A,1,Primary solid Tumor
3,TCGA-AO-A12G,TCGA-AO-A12G-01A,1,Primary solid Tumor
4,TCGA-AC-A3QP,TCGA-AC-A3QP-01A,1,Primary solid Tumor


In [5]:
# Tally how many returned samples fall under each sample-type name.
samples["sample_type_name"].value_counts()

Primary solid Tumor    505
Solid Tissue Normal     59
Metastatic               1
Name: sample_type_name, dtype: int64

In [8]:
# Attach the per-case annotations to each sample row. The outer join keeps
# case barcodes that appear in only one of the two tables.
merged = samples.merge(patient, on="case_barcode", how="outer")

print(merged.shape)
merged.head()

(565, 7)


Unnamed: 0,case_barcode,sample_barcode,sample_type,sample_type_name,initial_response,recurrence_status,recurrence_log
0,TCGA-BH-A0C0,TCGA-BH-A0C0-01A,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ..."
1,TCGA-BH-A0C0,TCGA-BH-A0C0-11A,11,Solid Tissue Normal,Unknown,Unknown,"Alive tumor free, last follow-up within risky ..."
2,TCGA-D8-A141,TCGA-D8-A141-01A,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ..."
3,TCGA-EW-A424,TCGA-EW-A424-01A,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ..."
4,TCGA-AO-A12G,TCGA-AO-A12G-01A,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ..."


In [9]:
# Save the combined sample/patient table as tab-separated text, omitting the row index.
merged.to_csv(path_or_buf="./metadata/hormone_metadata.tsv", index=False, sep="\t")