In [1]:
import pandas as pd
from google.cloud import bigquery as bq

# Subtype

In [2]:
# Read the tab-separated subtype annotations into a DataFrame.
patient = pd.read_csv("./metadata/subtypes.tsv", sep="\t")

# Report the table dimensions, then preview the first rows as the cell output.
print(patient.shape)
patient.head(5)

(1218, 4)


Unnamed: 0,aliquot_barcode,subtype,case_barcode,sample_barcode
0,TCGA-E2-A158-11A-22R-A12D-07,Normal,TCGA-E2-A158,TCGA-E2-A158-11A
1,TCGA-BH-A0DD-11A-23R-A12P-07,LumA,TCGA-BH-A0DD,TCGA-BH-A0DD-11A
2,TCGA-BH-A1EO-11A-31R-A137-07,LumA,TCGA-BH-A1EO,TCGA-BH-A1EO-11A
3,TCGA-BH-A0B5-11A-23R-A12P-07,LumA,TCGA-BH-A0B5,TCGA-BH-A0B5-11A
4,TCGA-A7-A13G-11A-51R-A13Q-07,LumA,TCGA-A7-A13G,TCGA-A7-A13G-11A


In [3]:
# Count distinct rows under several key combinations to see how samples,
# cases and subtypes relate to one another.
print(patient["sample_barcode"].drop_duplicates().shape)
for key_columns in (
    ["case_barcode", "sample_barcode"],
    ["case_barcode"],
    ["case_barcode", "subtype"],
):
    print(patient.drop_duplicates(subset=key_columns).shape)

(1218,)
(1218, 4)
(1097, 4)
(1199, 4)


==>>> Only 1 aliquot barcode per sample barcode.

==>>> 1 patient may have multiple samples collected, and different samples from the same patient may be assigned different subtypes. <br/>
For example, case "TCGA-BH-A0DL" has 2 samples: one is Basal and the other is Normal-like.

In [4]:
# Number of rows per subtype label.
patient["subtype"].value_counts()

LumA      581
LumB      219
Basal     193
Normal    143
Her2       82
Name: subtype, dtype: int64

In [5]:
# Build the quoted, comma-separated list of case barcodes for a SQL IN (...) clause.
# `patient` has one row per sample, so a case with several samples would be
# repeated; dict.fromkeys drops the repeats while keeping first-seen order.
case_list = ", ".join("'%s'" % w for w in dict.fromkeys(patient.case_barcode))

# Preview only the start of the string instead of dumping every barcode.
case_list[:200]

"'TCGA-E2-A158', 'TCGA-BH-A0DD', 'TCGA-BH-A1EO', 'TCGA-BH-A0B5', 'TCGA-A7-A13G', 'TCGA-E9-A1NF', 'TCGA-A7-A0D9', 'TCGA-A7-A0DB', 'TCGA-BH-A0B8', 'TCGA-E9-A1RC', 'TCGA-E9-A1RF', 'TCGA-E2-A1LS', 'TCGA-E9-A1RI', 'TCGA-GI-A2C8', 'TCGA-A8-A07B', 'TCGA-A8-A08B', 'TCGA-A8-A08P', 'TCGA-A8-A094', 'TCGA-A8-A09T', 'TCGA-AO-A03O', 'TCGA-BH-A0DZ', 'TCGA-A2-A04P', 'TCGA-A2-A04Q', 'TCGA-A2-A04T', 'TCGA-A2-A04V', 'TCGA-AN-A041', 'TCGA-AN-A046', 'TCGA-AN-A04A', 'TCGA-AN-A04C', 'TCGA-AN-A04D', 'TCGA-AO-A03T', 'TCGA-AQ-A04J', 'TCGA-A7-A0CH', 'TCGA-A8-A06T', 'TCGA-A8-A07F', 'TCGA-A8-A081', 'TCGA-A8-A08C', 'TCGA-A8-A08T', 'TCGA-AN-A0AJ', 'TCGA-AN-A0AS', 'TCGA-AO-A03P', 'TCGA-A8-A07I', 'TCGA-A8-A082', 'TCGA-A8-A08F', 'TCGA-A8-A08X', 'TCGA-A8-A09K', 'TCGA-A8-A09X', 'TCGA-BH-A0AY', 'TCGA-A8-A06X', 'TCGA-A8-A07J', 'TCGA-A8-A092', 'TCGA-A8-A09C', 'TCGA-A8-A09Q', 'TCGA-A8-A0A7', 'TCGA-A2-A04X', 'TCGA-A2-A04Y', 'TCGA-A2-A0CM', 'TCGA-A2-A0CQ', 'TCGA-A2-A0CU', 'TCGA-A8-A06Q', 'TCGA-A8-A07C', 'TCGA-A7-A0CD', 'TCGA-A

In [6]:
client = bq.Client()
biospecimen = 'isb-cgc.TCGA_bioclin_v0.Biospecimen'

# Unique case barcodes to look up. They are sent to BigQuery as an array
# query parameter instead of being pasted into the SQL text as quoted literals.
case_barcodes = patient.case_barcode.dropna().unique().tolist()

# Only the table name is formatted into the text; the barcode values are
# bound through @case_barcodes.
query = """\
SELECT 
    case_barcode, sample_barcode, sample_type, sample_type_name
FROM
    `{}`
WHERE
    case_barcode IN UNNEST(@case_barcodes)
    AND sample_type IN ('01', '02', '06', '07', '11')
""".format(biospecimen)

job_config = bq.QueryJobConfig(
    query_parameters=[
        bq.ArrayQueryParameter("case_barcodes", "STRING", case_barcodes),
    ]
)

# Run the query and pull the result set into a DataFrame.
samples = client.query(query, job_config=job_config).to_dataframe()

print(samples.shape)

samples.head()

(1282, 4)


Unnamed: 0,case_barcode,sample_barcode,sample_type,sample_type_name
0,TCGA-A2-A04P,TCGA-A2-A04P-01A,1,Primary solid Tumor
1,TCGA-AR-A0TP,TCGA-AR-A0TP-01A,1,Primary solid Tumor
2,TCGA-GM-A2DF,TCGA-GM-A2DF-01A,1,Primary solid Tumor
3,TCGA-BH-A0C0,TCGA-BH-A0C0-01A,1,Primary solid Tumor
4,TCGA-D8-A141,TCGA-D8-A141-01A,1,Primary solid Tumor


In [7]:
# Number of returned samples per sample type.
samples["sample_type_name"].value_counts()

Primary solid Tumor    1112
Solid Tissue Normal     163
Metastatic                7
Name: sample_type_name, dtype: int64

In [12]:
# Attach the subtype to every queried sample. Samples without a subtype record
# are kept (left join) and get a missing value in `subtype`.
# Columns are picked by name rather than by position so the merge does not
# depend on the column order of `patient`, and fails loudly if `patient`
# is not the subtype table.
subtype_columns = ["case_barcode", "sample_barcode", "subtype"]
merged = pd.merge(
    samples,
    patient[subtype_columns],
    how='left',
    on=['sample_barcode', 'case_barcode'],
    # Raise if a (sample, case) key appears more than once in the subtype
    # table, which would otherwise silently multiply rows.
    validate='many_to_one',
)

print(merged.shape)
merged.head()

(1282, 5)


Unnamed: 0,case_barcode,sample_barcode,sample_type,sample_type_name,subtype
0,TCGA-A2-A04P,TCGA-A2-A04P-01A,1,Primary solid Tumor,Basal
1,TCGA-AR-A0TP,TCGA-AR-A0TP-01A,1,Primary solid Tumor,Basal
2,TCGA-GM-A2DF,TCGA-GM-A2DF-01A,1,Primary solid Tumor,Basal
3,TCGA-BH-A0C0,TCGA-BH-A0C0-01A,1,Primary solid Tumor,LumB
4,TCGA-D8-A141,TCGA-D8-A141-01A,1,Primary solid Tumor,LumA


In [13]:
# Shape after removing fully duplicated rows.
deduplicated = merged.drop_duplicates()
print(deduplicated.shape)

(1282, 5)


In [14]:
# Distribution of sample types in the merged table.
sample_type_counts = merged["sample_type_name"].value_counts()
print(sample_type_counts)
print("\n")

# Distribution of subtypes, followed by how many rows do / do not have one.
subtype_column = merged["subtype"]
print(subtype_column.value_counts())
print(subtype_column.isna().value_counts())

Primary solid Tumor    1112
Solid Tissue Normal     163
Metastatic                7
Name: sample_type_name, dtype: int64


LumA      581
LumB      219
Basal     193
Normal    143
Her2       82
Name: subtype, dtype: int64
False    1218
True       64
Name: subtype, dtype: int64


In [16]:
# Sample types of the rows that did not receive a subtype in the merge.
missing_subtype = merged["subtype"].isna()
merged.loc[missing_subtype, "sample_type_name"].value_counts()

Solid Tissue Normal    49
Primary solid Tumor    15
Name: sample_type_name, dtype: int64

In [17]:
# Save the merged subtype table as a tab-separated file, without the row index.
merged.to_csv(
    "./metadata/subtype_metadata.tsv",
    index=False,
    sep="\t",
)

# Hormone therapy

In [12]:
# Read the tab-separated hormone response / recurrence table.
# Note: this rebinds `patient`, which earlier cells used for the subtype table.
patient = pd.read_csv("metadata/hormone_response_recurrence.tsv", sep="\t")

# Report the table dimensions, then preview the first rows as the cell output.
print(patient.shape)
patient.head(5)

(500, 4)


Unnamed: 0,case_barcode,initial_response,recurrence_status,recurrence_log
0,TCGA-5L-AAT0,Unknown,,
1,TCGA-5L-AAT1,Unknown,,
2,TCGA-5T-A9QA,Unknown,Unknown,Unknown neoplasm status
3,TCGA-A1-A0SF,Unknown,,
4,TCGA-A1-A0SQ,Unknown,,


In [13]:
# Unique case barcodes to look up. They are sent to BigQuery as an array
# query parameter instead of being pasted into the SQL text as quoted literals.
case_barcodes = patient.case_barcode.dropna().unique().tolist()

client = bq.Client()
biospecimen = 'isb-cgc.TCGA_bioclin_v0.Biospecimen'

# Only the table name is formatted into the text; the barcode values are
# bound through @case_barcodes.
query = """\
SELECT 
    case_barcode, sample_barcode, sample_type, sample_type_name
FROM
    `{}`
WHERE
    case_barcode IN UNNEST(@case_barcodes)
    AND sample_type IN ('01', '02', '06', '07', '11')
""".format(biospecimen)

job_config = bq.QueryJobConfig(
    query_parameters=[
        bq.ArrayQueryParameter("case_barcodes", "STRING", case_barcodes),
    ]
)

# Run the query and pull the result set into a DataFrame.
samples = client.query(query, job_config=job_config).to_dataframe()

print(samples.shape)

samples.head()

(565, 4)


Unnamed: 0,case_barcode,sample_barcode,sample_type,sample_type_name
0,TCGA-BH-A0C0,TCGA-BH-A0C0-01A,1,Primary solid Tumor
1,TCGA-D8-A141,TCGA-D8-A141-01A,1,Primary solid Tumor
2,TCGA-EW-A424,TCGA-EW-A424-01A,1,Primary solid Tumor
3,TCGA-AO-A12G,TCGA-AO-A12G-01A,1,Primary solid Tumor
4,TCGA-AC-A3QP,TCGA-AC-A3QP-01A,1,Primary solid Tumor


In [14]:
# Number of returned samples per sample type.
samples["sample_type_name"].value_counts()

Primary solid Tumor    505
Solid Tissue Normal     59
Metastatic               1
Name: sample_type_name, dtype: int64

In [16]:
# Inner-join each queried sample to the row of `patient` with the same case barcode.
merged = samples.merge(patient, on='case_barcode', how='inner')

# Report the joined table's dimensions, then preview its first rows.
print(merged.shape)
merged.head(5)

(565, 7)


Unnamed: 0,case_barcode,sample_barcode,sample_type,sample_type_name,initial_response,recurrence_status,recurrence_log
0,TCGA-BH-A0C0,TCGA-BH-A0C0-01A,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ..."
1,TCGA-BH-A0C0,TCGA-BH-A0C0-11A,11,Solid Tissue Normal,Unknown,Unknown,"Alive tumor free, last follow-up within risky ..."
2,TCGA-D8-A141,TCGA-D8-A141-01A,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ..."
3,TCGA-EW-A424,TCGA-EW-A424-01A,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ..."
4,TCGA-AO-A12G,TCGA-AO-A12G-01A,1,Primary solid Tumor,Unknown,Unknown,"Alive tumor free, last follow-up within risky ..."


In [17]:
# Save the merged hormone table as a tab-separated file, without the row index.
merged.to_csv(
    "./metadata/hormone_metadata.tsv",
    index=False,
    sep="\t",
)