```
if (!requireNamespace("BiocManager", quietly=TRUE))
    install.packages("BiocManager")
BiocManager::install("TCGAbiolinks")
```

In [71]:
library("TCGAbiolinks")
library("dplyr")
'%notin%' <- Negate('%in%')

# Subtypes

In [2]:
subtypes <- PanCancerAtlas_subtypes()

head(subtypes)

pan.samplesID,cancer.type,Subtype_mRNA,Subtype_DNAmeth,Subtype_protein,Subtype_miRNA,Subtype_CNA,Subtype_Integrative,Subtype_other,Subtype_Selected
TCGA-OR-A5J1,ACC,steroid-phenotype-high+proliferation,CIMP-high,,miRNA_1,Quiet,COC3,C1A,ACC.CIMP-high
TCGA-OR-A5J2,ACC,steroid-phenotype-high+proliferation,CIMP-low,1.0,miRNA_1,Noisy,COC3,C1A,ACC.CIMP-low
TCGA-OR-A5J3,ACC,steroid-phenotype-high,CIMP-intermediate,3.0,miRNA_6,Chromosomal,COC2,C1A,ACC.CIMP-intermediate
TCGA-OR-A5J4,ACC,,CIMP-high,,miRNA_6,Chromosomal,,,ACC.CIMP-high
TCGA-OR-A5J5,ACC,steroid-phenotype-high,CIMP-intermediate,,miRNA_2,Chromosomal,COC2,C1A,ACC.CIMP-intermediate
TCGA-OR-A5J6,ACC,steroid-phenotype-low,CIMP-low,2.0,miRNA_1,Noisy,COC1,C1B,ACC.CIMP-low


In [3]:
brca_subtypes <- subset(subtypes, cancer.type == "BRCA", select=c("pan.samplesID", "Subtype_mRNA"))
print(dim(brca_subtypes))
head(brca_subtypes)

[1] 1218    2


pan.samplesID,Subtype_mRNA
TCGA-E2-A158-11A-22R-A12D-07,Normal
TCGA-BH-A0DD-11A-23R-A12P-07,LumA
TCGA-BH-A1EO-11A-31R-A137-07,LumA
TCGA-BH-A0B5-11A-23R-A12P-07,LumA
TCGA-A7-A13G-11A-51R-A13Q-07,LumA
TCGA-E9-A1NF-11A-73R-A14D-07,LumA


In [4]:
colnames(brca_subtypes) <- c("sample_barcode", "subtype")

In [5]:
brca_subtypes$case_barcode <- substr(brca_subtypes$sample_barcode, 1, 12)
head(brca_subtypes)

sample_barcode,subtype,case_barcode
TCGA-E2-A158-11A-22R-A12D-07,Normal,TCGA-E2-A158
TCGA-BH-A0DD-11A-23R-A12P-07,LumA,TCGA-BH-A0DD
TCGA-BH-A1EO-11A-31R-A137-07,LumA,TCGA-BH-A1EO
TCGA-BH-A0B5-11A-23R-A12P-07,LumA,TCGA-BH-A0B5
TCGA-A7-A13G-11A-51R-A13Q-07,LumA,TCGA-A7-A13G
TCGA-E9-A1NF-11A-73R-A14D-07,LumA,TCGA-E9-A1NF


In [8]:
write.table(brca_subtypes, "./metadata/subtypes.tsv", sep="\t", row.names=FALSE)

------------------------
# Hormone therapy responses

In [9]:
query <- GDCquery(project = "TCGA-BRCA", 
                  data.category = "Clinical", 
                  file.type = "xml", 
                  barcode = brca_subtypes$case_barcode)
GDCdownload(query)
clinical_drug <- GDCprepare_clinic(query, clinical.info = "drug")

--------------------------------------
o GDCquery: Searching in GDC database
--------------------------------------
Genome of reference: hg38
--------------------------------------------
oo Accessing GDC. This might take a while...
--------------------------------------------
ooo Project: TCGA-BRCA
--------------------
oo Filtering results
--------------------
ooo By file.type
ooo By barcode
----------------
oo Checking data
----------------
ooo Check if there are duplicated cases
ooo Check if there results for the query
-------------------
o Preparing output
-------------------
Downloading data for project TCGA-BRCA
Of the 1171 files for download 1171 already exist.
All samples have been already downloaded




In [10]:
dim(clinical_drug)

In [11]:
colnames(clinical_drug)

In [12]:
clinical_drug <- clinical_drug[,c('bcr_patient_barcode', 'bcr_drug_barcode', 'bcr_drug_uuid', 
                 'days_to_drug_therapy_start', 'days_to_drug_therapy_end', 'therapy_types', 
                 'drug_name', 'therapy_ongoing', 'measure_of_response', 'year_of_form_completion')]

In [13]:
hormone <- subset(clinical_drug, therapy_types == "Hormone Therapy")

print(dim(hormone))
head(hormone)

[1] 729  10


Unnamed: 0,bcr_patient_barcode,bcr_drug_barcode,bcr_drug_uuid,days_to_drug_therapy_start,days_to_drug_therapy_end,therapy_types,drug_name,therapy_ongoing,measure_of_response,year_of_form_completion
15,TCGA-5L-AAT0,TCGA-5L-AAT0-D63723,BCA5B47A-BE3A-4258-8A17-692D7DFDDF87,322,,Hormone Therapy,Tamoxifen,YES,,2014
16,TCGA-5L-AAT0,TCGA-5L-AAT0-D63723,BCA5B47A-BE3A-4258-8A17-692D7DFDDF87,322,,Hormone Therapy,Tamoxifen,YES,,2014
17,TCGA-5L-AAT1,TCGA-5L-AAT1-D63716,747059AE-DC7B-409F-9178-2DD683C19A6F,46,,Hormone Therapy,Letrozol,YES,,2014
18,TCGA-5L-AAT1,TCGA-5L-AAT1-D63716,747059AE-DC7B-409F-9178-2DD683C19A6F,46,,Hormone Therapy,Letrozol,YES,,2014
20,TCGA-5T-A9QA,TCGA-5T-A9QA-D66891,906BC4D2-3C18-42A8-ABE2-BF47933ABA1F,304,,Hormone Therapy,Letrozole,YES,,2014
24,TCGA-A1-A0SF,TCGA-A1-A0SF-D12154,8B91D294-5433-4D18-A271-253430DDD871,157,918.0,Hormone Therapy,tamoxifen,NO,,2011


In [23]:
length(unique(hormone$bcr_patient_barcode)) # 520 patients received hormone therapy

In [43]:
subset(brca_subtypes, case_barcode %in% hormone$bcr_patient_barcode & subtype == "Basal") %>% dim()

In [46]:
# 20 patients were triple negative but received hormone therapy. We exclude these patients
excluded <- subset(brca_subtypes, case_barcode %in% hormone$bcr_patient_barcode & subtype == "Basal")$case_barcode
consolidated_hormone <- subset(hormone, !(bcr_patient_barcode %in% excluded))

length(unique(consolidated_hormone$bcr_patient_barcode)) # 500 patients

In [47]:
consolidated_hormone <- consolidated_hormone %>% distinct()

print(dim(consolidated_hormone))

[1] 642  10


In [48]:
# Some patients have more than 1 record
dup <- consolidated_hormone$bcr_patient_barcode[duplicated(consolidated_hormone$bcr_patient_barcode)]
consolidated_hormone[consolidated_hormone$bcr_patient_barcode %in% dup,]

Unnamed: 0,bcr_patient_barcode,bcr_drug_barcode,bcr_drug_uuid,days_to_drug_therapy_start,days_to_drug_therapy_end,therapy_types,drug_name,therapy_ongoing,measure_of_response,year_of_form_completion
7,TCGA-A2-A04R,TCGA-A2-A04R-D2556,258814a0-8e67-4ac2-84ec-1353588f2f9d,910,,Hormone Therapy,Anastrozole,YES,,2010
8,TCGA-A2-A04R,TCGA-A2-A04R-D2555,59cf62c8-9052-43df-aefa-ea940996727a,399,909,Hormone Therapy,Tamoxifen,NO,,2010
9,TCGA-A2-A04V,TCGA-A2-A04V-D2673,41160c56-81c5-42b7-83b8-cc0018073cac,1452,,Hormone Therapy,EXEMESTANE,NO,,2010
10,TCGA-A2-A04V,TCGA-A2-A04V-D2672,c1e949b8-77cf-4bbb-b6f2-a74ede20457a,107,1258,Hormone Therapy,TAMOXIFEN,NO,,2010
11,TCGA-A2-A04V,TCGA-A2-A04V-D2768,4de0e435-5d29-44a0-a877-986681b96a3b,1269,1439,Hormone Therapy,LETROZOLE,NO,,2010
13,TCGA-A2-A04Y,TCGA-A2-A04Y-D2741,d6d00a85-5494-4f08-8711-63169285c197,273,371,Hormone Therapy,Tamoxifen,NO,,2010
14,TCGA-A2-A04Y,TCGA-A2-A04Y-D2746,227dd2b0-0b22-46bd-95d4-20775978c5dd,372,,Hormone Therapy,Arimidex,YES,,2010
16,TCGA-A2-A0CL,TCGA-A2-A0CL-D7415,9ded5455-e05f-4b38-b10a-0accda9fdabb,505,,Hormone Therapy,Arimidex,YES,,2011
17,TCGA-A2-A0CL,TCGA-A2-A0CL-D7412,c002a4ce-ae2a-4c04-9941-a4fa99d93666,269,504,Hormone Therapy,Tamoxifen,NO,,2011
18,TCGA-A2-A0CO,TCGA-A2-A0CO-D33663,80B1C97B-8D60-43C3-BDCE-AC0B58A11419,84,512,Hormone Therapy,Tamoxifen,NO,Complete Response,2012


In [49]:
length(dup) # 142 patients received more than 1 type of hormone therapy

In [61]:
write.table(consolidated_hormone, "./metadata/consolidated_hormone.tsv", sep="\t", row.names=FALSE)

### Patients with Stable/Progressive during/after treatment

In [70]:
poor <- vector()
for (patient in unique(consolidated_hormone$bcr_patient_barcode)) {
    responses <- subset(consolidated_hormone, bcr_patient_barcode == patient)$measure_of_response
    if ("Stable" %in% responses | "Clinical Progressive Disease" %in% responses){
        poor <- c(poor, patient)
    }
}

print(length(poor))
poor

[1] 10


### Patients with Complete Response

In [72]:
good <- subset(consolidated_hormone, measure_of_response=="Complete Response" & 
                                    bcr_patient_barcode %notin% poor)$bcr_patient_barcode
print(length(good))
good

[1] 48


### Others

In [87]:
others <- subset(consolidated_hormone, bcr_patient_barcode %notin% good & 
                                    bcr_patient_barcode %notin% poor)$bcr_patient_barcode
print(length(others))

[1] 551


## Follow-up of patients received hormone therapy

In [57]:
follow_up <- GDCprepare_clinic(query, clinical.info = "follow_up")

We found more than one follow up version!
We will parse all and add a collumn (follow_up_version) to identify each version
Parsing follow up version: follow_up_v1.5




Parsing follow up version: follow_up_v2.1




Parsing follow up version: follow_up_v4.0




In [58]:
keep <- c('follow_up_version', 'bcr_patient_barcode', 'bcr_followup_barcode', 'bcr_followup_uuid', 
          'days_to_last_followup', 'vital_status', 'person_neoplasm_cancer_status', 
          'days_to_last_known_alive', 'days_to_death', 'days_to_new_tumor_event_after_initial_treatment', 
          'new_tumor_event_after_initial_treatment', 'new_neoplasm_event_type', 
          'new_neoplasm_event_occurrence_anatomic_site', 'new_neoplasm_occurrence_anatomic_site_text', 
          'lost_follow_up', 'new_tumor_events')
follow_up <- follow_up[,keep]

In [59]:
follow_up

follow_up_version,bcr_patient_barcode,bcr_followup_barcode,bcr_followup_uuid,days_to_last_followup,vital_status,person_neoplasm_cancer_status,days_to_last_known_alive,days_to_death,days_to_new_tumor_event_after_initial_treatment,new_tumor_event_after_initial_treatment,new_neoplasm_event_type,new_neoplasm_event_occurrence_anatomic_site,new_neoplasm_occurrence_anatomic_site_text,lost_follow_up,new_tumor_events
follow_up_v1.5,TCGA-A7-A0D9,TCGA-A7-A0D9-F2920,CC7279B1-E6EC-4C7C-B5D3-1FB6ABFAA9BC,273,Alive,TUMOR FREE,,,,,,,,,
follow_up_v1.5,TCGA-A7-A0DB,TCGA-A7-A0DB-F2931,36D2F1C0-4545-438D-A3CD-F1E072A82DB2,267,Alive,TUMOR FREE,,,,,,,,,
follow_up_v1.5,TCGA-AO-A03O,TCGA-AO-A03O-F3141,740A6308-84E3-4231-995D-82D7D5A34C86,,Dead,TUMOR FREE,,2483,,,,,,,
follow_up_v1.5,TCGA-BH-A0DZ,TCGA-BH-A0DZ-F4588,F7218F8F-0D41-4B86-B276-C020CA69412E,495,Alive,TUMOR FREE,,,,,,,,,
follow_up_v1.5,TCGA-A2-A04P,TCGA-A2-A04P-F2652,0D1BD57A-F783-438D-9F39-8A75193DDDCC,,Dead,WITH TUMOR,,548,179,,,,,,
follow_up_v1.5,TCGA-A2-A04P,TCGA-A2-A04P-F10241,1966A070-7389-434A-BE2F-01ABFB19B3AB,,Dead,WITH TUMOR,,548,180,,,,,,
follow_up_v1.5,TCGA-A2-A04Q,TCGA-A2-A04Q-F9329,B7EAC97B-63C1-4DB3-A0F0-62D03122CC88,2385,Alive,TUMOR FREE,,,,,,,,,
follow_up_v1.5,TCGA-A2-A04T,TCGA-A2-A04T-F9343,94203642-8318-4406-9B37-865EB9288015,2246,Alive,TUMOR FREE,,,,,,,,,
follow_up_v1.5,TCGA-A2-A04V,TCGA-A2-A04V-F10273,8A12F731-0D21-451D-AED9-2677F2356DE2,,Dead,WITH TUMOR,,1920,1470,,,,,,
follow_up_v1.5,TCGA-AO-A03T,TCGA-AO-A03T-F3255,25D19A8C-2173-4BC0-815E-256998CB4135,1187,Alive,TUMOR FREE,,,,,,,,,


In [62]:
follow_up_hormone <- subset(follow_up, bcr_patient_barcode %in% consolidated_hormone$bcr_patient_barcode)

In [63]:
write.table(follow_up_hormone, "./metadata/follow_up_hormone.tsv", sep="\t", row.names=FALSE)

**All either dead or alive is tumor free** ==> We classify the group by the `days_to_last_followup = f` or `days_to_death = d`.

Take `e` = `days_to_drug_therapy_end` of `max(days_to_drug_therapy_start)`, <br/>
`t` = `e` if  else `max(days_to_drug_therapy_start)`

If `!is.na(e)` <br/>
    AND the last follow-up day > 5 years after therapy end, meaning `[max(f) - e]/365 > 5` <br/>
        OR `[d - t]/365 > 5` ==> Low risk of recurrence
        
Elif `[max(f) - max(days_to_drug_therapy_start)] > 10` OR `[d - max(days_to_drug_therapy_start)] > 10` <br/>
    ==> Low risk of recurrence
    
Else ==> Unknown risk of recurrence