```
# Install the Bioconductor package manager if it is not available yet,
# then use it to install TCGAbiolinks.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
}
BiocManager::install("TCGAbiolinks")
```

In [1]:
# Packages used throughout the notebook.
library(TCGAbiolinks)
library(dplyr)

# Infix complement of %in%: TRUE where the left-hand value is absent from the
# right-hand set.
`%notin%` <- Negate(`%in%`)

# Record the R and package versions used for this run.
sessionInfo()


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



R version 3.5.1 (2018-07-02)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 16.04.6 LTS

Matrix products: default
BLAS/LAPACK: /home/singuyen/anaconda3/lib/R/lib/libRblas.so

locale:
 [1] LC_CTYPE=en_US.UTF-8    LC_NUMERIC=C            LC_TIME=vi_VN          
 [4] LC_COLLATE=en_US.UTF-8  LC_MONETARY=vi_VN       LC_MESSAGES=en_US.UTF-8
 [7] LC_PAPER=vi_VN          LC_NAME=C               LC_ADDRESS=C           
[10] LC_TELEPHONE=C          LC_MEASUREMENT=vi_VN    LC_IDENTIFICATION=C    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] dplyr_0.8.0.1       TCGAbiolinks_2.10.5

loaded via a namespace (and not attached):
  [1] uuid_0.1-2                    backports_1.1.4              
  [3] circlize_0.4.6                AnnotationHub_2.14.5         
  [5] aroma.light_3.12.0            plyr_1.8.4                   
  [7] selectr_0.4-1                 ConsensusClusterPlus_1.46.0  
  [9] repr

# Subtypes

In [2]:
# Load the PanCancerAtlas subtype table provided by TCGAbiolinks.
subtypes <- PanCancerAtlas_subtypes()

# Preview the first rows.
head(subtypes)

pan.samplesID,cancer.type,Subtype_mRNA,Subtype_DNAmeth,Subtype_protein,Subtype_miRNA,Subtype_CNA,Subtype_Integrative,Subtype_other,Subtype_Selected
TCGA-OR-A5J1,ACC,steroid-phenotype-high+proliferation,CIMP-high,,miRNA_1,Quiet,COC3,C1A,ACC.CIMP-high
TCGA-OR-A5J2,ACC,steroid-phenotype-high+proliferation,CIMP-low,1.0,miRNA_1,Noisy,COC3,C1A,ACC.CIMP-low
TCGA-OR-A5J3,ACC,steroid-phenotype-high,CIMP-intermediate,3.0,miRNA_6,Chromosomal,COC2,C1A,ACC.CIMP-intermediate
TCGA-OR-A5J4,ACC,,CIMP-high,,miRNA_6,Chromosomal,,,ACC.CIMP-high
TCGA-OR-A5J5,ACC,steroid-phenotype-high,CIMP-intermediate,,miRNA_2,Chromosomal,COC2,C1A,ACC.CIMP-intermediate
TCGA-OR-A5J6,ACC,steroid-phenotype-low,CIMP-low,2.0,miRNA_1,Noisy,COC1,C1B,ACC.CIMP-low


In [3]:
# Keep only the breast cancer (BRCA) rows and the two columns used downstream:
# the sample identifier and the mRNA-based subtype.
# Rows whose cancer type is NA are dropped, as subset() would do.
is_brca <- subtypes$cancer.type == "BRCA"
brca_subtypes <- subtypes[is_brca & !is.na(is_brca),
                          c("pan.samplesID", "Subtype_mRNA"),
                          drop = FALSE]
print(dim(brca_subtypes))
head(brca_subtypes)

[1] 1218    2


pan.samplesID,Subtype_mRNA
TCGA-E2-A158-11A-22R-A12D-07,Normal
TCGA-BH-A0DD-11A-23R-A12P-07,LumA
TCGA-BH-A1EO-11A-31R-A137-07,LumA
TCGA-BH-A0B5-11A-23R-A12P-07,LumA
TCGA-A7-A13G-11A-51R-A13Q-07,LumA
TCGA-E9-A1NF-11A-73R-A14D-07,LumA


In [4]:
# Give the two columns the names used in the rest of the notebook.
names(brca_subtypes) <- c("aliquot_barcode", "subtype")

In [5]:
# Derive shorter identifiers from the aliquot barcode by truncation:
# the first 12 characters give the case barcode, the first 16 the sample barcode.
brca_subtypes$case_barcode <- substring(brca_subtypes$aliquot_barcode,
                                        first = 1, last = 12)
brca_subtypes$sample_barcode <- substring(brca_subtypes$aliquot_barcode,
                                          first = 1, last = 16)
head(brca_subtypes)

aliquot_barcode,subtype,case_barcode,sample_barcode
TCGA-E2-A158-11A-22R-A12D-07,Normal,TCGA-E2-A158,TCGA-E2-A158-11A
TCGA-BH-A0DD-11A-23R-A12P-07,LumA,TCGA-BH-A0DD,TCGA-BH-A0DD-11A
TCGA-BH-A1EO-11A-31R-A137-07,LumA,TCGA-BH-A1EO,TCGA-BH-A1EO-11A
TCGA-BH-A0B5-11A-23R-A12P-07,LumA,TCGA-BH-A0B5,TCGA-BH-A0B5-11A
TCGA-A7-A13G-11A-51R-A13Q-07,LumA,TCGA-A7-A13G,TCGA-A7-A13G-11A
TCGA-E9-A1NF-11A-73R-A14D-07,LumA,TCGA-E9-A1NF,TCGA-E9-A1NF-11A


In [6]:
# Save the BRCA subtype table as a tab-separated file.
# write.table() cannot create missing directories, so make sure the output
# folder exists first (no-op when it is already there).
dir.create("./metadata", showWarnings = FALSE, recursive = TRUE)
write.table(brca_subtypes, file = "./metadata/subtypes.tsv", sep = "\t", row.names = FALSE)

------------------------
# Hormone therapy responses

In [7]:
# Build a GDC query for the clinical XML files of TCGA-BRCA, restricted to the
# cases present in the subtype table.
query <- GDCquery(
    project = "TCGA-BRCA",
    data.category = "Clinical",
    file.type = "xml",
    barcode = brca_subtypes$case_barcode
)
#GDCdownload(query) # Run this one time then data will be downloaded to your machine

# Parse the drug records from the clinical files selected by the query.
clinical_drug <- GDCprepare_clinic(query, clinical.info = "drug")

--------------------------------------
o GDCquery: Searching in GDC database
--------------------------------------
Genome of reference: hg38
--------------------------------------------
oo Accessing GDC. This might take a while...
--------------------------------------------
ooo Project: TCGA-BRCA
--------------------
oo Filtering results
--------------------
ooo By file.type
ooo By barcode
----------------
oo Checking data
----------------
ooo Check if there are duplicated cases
ooo Check if there results for the query
-------------------
o Preparing output
-------------------




In [8]:
# Number of rows and columns in the parsed drug table.
dim(clinical_drug)

In [9]:
# List the fields available in the drug table.
names(clinical_drug)

In [10]:
# Keep only the identifier, timing, drug and response fields of each drug record.
drug_columns <- c(
    "bcr_patient_barcode",
    "bcr_drug_barcode",
    "bcr_drug_uuid",
    "days_to_drug_therapy_start",
    "days_to_drug_therapy_end",
    "therapy_types",
    "drug_name",
    "therapy_ongoing",
    "measure_of_response",
    "year_of_form_completion"
)
clinical_drug <- clinical_drug[, drug_columns]

In [11]:
# Keep the drug records whose therapy type is "Hormone Therapy".
# Records with a missing therapy type are dropped, as subset() would do.
is_hormone <- clinical_drug$therapy_types == "Hormone Therapy"
hormone <- clinical_drug[is_hormone & !is.na(is_hormone), , drop = FALSE]

print(dim(hormone))
head(hormone)

[1] 729  10


Unnamed: 0,bcr_patient_barcode,bcr_drug_barcode,bcr_drug_uuid,days_to_drug_therapy_start,days_to_drug_therapy_end,therapy_types,drug_name,therapy_ongoing,measure_of_response,year_of_form_completion
15,TCGA-5L-AAT0,TCGA-5L-AAT0-D63723,BCA5B47A-BE3A-4258-8A17-692D7DFDDF87,322,,Hormone Therapy,Tamoxifen,YES,,2014
16,TCGA-5L-AAT0,TCGA-5L-AAT0-D63723,BCA5B47A-BE3A-4258-8A17-692D7DFDDF87,322,,Hormone Therapy,Tamoxifen,YES,,2014
17,TCGA-5L-AAT1,TCGA-5L-AAT1-D63716,747059AE-DC7B-409F-9178-2DD683C19A6F,46,,Hormone Therapy,Letrozol,YES,,2014
18,TCGA-5L-AAT1,TCGA-5L-AAT1-D63716,747059AE-DC7B-409F-9178-2DD683C19A6F,46,,Hormone Therapy,Letrozol,YES,,2014
20,TCGA-5T-A9QA,TCGA-5T-A9QA-D66891,906BC4D2-3C18-42A8-ABE2-BF47933ABA1F,304,,Hormone Therapy,Letrozole,YES,,2014
24,TCGA-A1-A0SF,TCGA-A1-A0SF-D12154,8B91D294-5433-4D18-A271-253430DDD871,157,918.0,Hormone Therapy,tamoxifen,NO,,2011


In [12]:
# Store the patient barcode and the response as plain character vectors so that
# later comparisons and membership tests operate on strings.
text_columns <- c("bcr_patient_barcode", "measure_of_response")
hormone[text_columns] <- lapply(hormone[text_columns], as.character)

str(hormone)

'data.frame':	729 obs. of  10 variables:
 $ bcr_patient_barcode       : chr  "TCGA-5L-AAT0" "TCGA-5L-AAT0" "TCGA-5L-AAT1" "TCGA-5L-AAT1" ...
 $ bcr_drug_barcode          : chr  "TCGA-5L-AAT0-D63723" "TCGA-5L-AAT0-D63723" "TCGA-5L-AAT1-D63716" "TCGA-5L-AAT1-D63716" ...
 $ bcr_drug_uuid             : chr  "BCA5B47A-BE3A-4258-8A17-692D7DFDDF87" "BCA5B47A-BE3A-4258-8A17-692D7DFDDF87" "747059AE-DC7B-409F-9178-2DD683C19A6F" "747059AE-DC7B-409F-9178-2DD683C19A6F" ...
 $ days_to_drug_therapy_start: int  322 322 46 46 304 157 299 308 424 910 ...
 $ days_to_drug_therapy_end  : int  NA NA NA NA NA 918 NA 2121 NA NA ...
 $ therapy_types             : Factor w/ 19 levels "","Ancillary",..: 5 5 5 5 5 5 5 5 5 5 ...
 $ drug_name                 : Factor w/ 204 levels "","5 fluorouracil",..: 163 163 122 122 124 162 162 30 91 25 ...
 $ therapy_ongoing           : Factor w/ 3 levels "","NO","YES": 3 3 3 3 3 2 3 2 2 3 ...
 $ measure_of_response       : chr  "" "" "" "" ...
 $ year_of_form_completion   : i

In [13]:
# Count distinct patients among the hormone-therapy records.
n_distinct(hormone$bcr_patient_barcode) # 520 patients received hormone therapy

In [14]:
# Size of the subtype table restricted to Basal samples whose case received
# hormone therapy.
dim(
    subset(
        brca_subtypes,
        case_barcode %in% hormone$bcr_patient_barcode & subtype == "Basal"
    )
)

In [15]:
# 20 patients were triple negative but received hormone therapy. We exclude these patients
# (identified here as cases with a "Basal" subtype that also appear in `hormone`).
basal_treated <- brca_subtypes$case_barcode %in% hormone$bcr_patient_barcode &
    brca_subtypes$subtype == "Basal"
excluded <- brca_subtypes$case_barcode[which(basal_treated)]
consolidated_hormone <- subset(hormone, bcr_patient_barcode %notin% excluded)

length(unique(consolidated_hormone$bcr_patient_barcode)) # 500 patients

In [16]:
# Drop exact duplicate drug records.
consolidated_hormone <- distinct(consolidated_hormone)

print(dim(consolidated_hormone))

[1] 642  10


In [17]:
# Some patients have more than 1 record.
# `dup` collects the barcode of every record after a patient's first one;
# the second statement displays all records of those patients.
dup <- with(consolidated_hormone,
            bcr_patient_barcode[duplicated(bcr_patient_barcode)])
subset(consolidated_hormone, bcr_patient_barcode %in% dup)

Unnamed: 0,bcr_patient_barcode,bcr_drug_barcode,bcr_drug_uuid,days_to_drug_therapy_start,days_to_drug_therapy_end,therapy_types,drug_name,therapy_ongoing,measure_of_response,year_of_form_completion
7,TCGA-A2-A04R,TCGA-A2-A04R-D2556,258814a0-8e67-4ac2-84ec-1353588f2f9d,910,,Hormone Therapy,Anastrozole,YES,,2010
8,TCGA-A2-A04R,TCGA-A2-A04R-D2555,59cf62c8-9052-43df-aefa-ea940996727a,399,909,Hormone Therapy,Tamoxifen,NO,,2010
9,TCGA-A2-A04V,TCGA-A2-A04V-D2673,41160c56-81c5-42b7-83b8-cc0018073cac,1452,,Hormone Therapy,EXEMESTANE,NO,,2010
10,TCGA-A2-A04V,TCGA-A2-A04V-D2672,c1e949b8-77cf-4bbb-b6f2-a74ede20457a,107,1258,Hormone Therapy,TAMOXIFEN,NO,,2010
11,TCGA-A2-A04V,TCGA-A2-A04V-D2768,4de0e435-5d29-44a0-a877-986681b96a3b,1269,1439,Hormone Therapy,LETROZOLE,NO,,2010
13,TCGA-A2-A04Y,TCGA-A2-A04Y-D2741,d6d00a85-5494-4f08-8711-63169285c197,273,371,Hormone Therapy,Tamoxifen,NO,,2010
14,TCGA-A2-A04Y,TCGA-A2-A04Y-D2746,227dd2b0-0b22-46bd-95d4-20775978c5dd,372,,Hormone Therapy,Arimidex,YES,,2010
16,TCGA-A2-A0CL,TCGA-A2-A0CL-D7415,9ded5455-e05f-4b38-b10a-0accda9fdabb,505,,Hormone Therapy,Arimidex,YES,,2011
17,TCGA-A2-A0CL,TCGA-A2-A0CL-D7412,c002a4ce-ae2a-4c04-9941-a4fa99d93666,269,504,Hormone Therapy,Tamoxifen,NO,,2011
18,TCGA-A2-A0CO,TCGA-A2-A0CO-D33663,80B1C97B-8D60-43C3-BDCE-AC0B58A11419,84,512,Hormone Therapy,Tamoxifen,NO,Complete Response,2012


In [18]:
# `dup` has one entry for every record beyond a patient's first, so its length
# counts extra records (a patient with 3 records appears twice), not patients.
print(length(dup))  # additional hormone-therapy records beyond the first per patient
length(unique(dup)) # patients with more than 1 hormone-therapy record

In [19]:
# Save the de-duplicated hormone-therapy records as a tab-separated file.
write.table(
    x = consolidated_hormone,
    file = "./metadata/consolidated_hormone.tsv",
    sep = "\t",
    row.names = FALSE
)

### Patients with stable/progressive disease during/after treatment

In [20]:
# Patients with at least one hormone-therapy record whose response is
# "Stable" or "Clinical Progressive Disease".
# Vectorised replacement for the per-patient loop: the result is always a
# character vector (also when empty) and keeps the order in which patients
# first appear in `consolidated_hormone`.
poor <- with(consolidated_hormone, {
    poor_ids <- bcr_patient_barcode[
        measure_of_response %in% c("Stable", "Clinical Progressive Disease")
    ]
    patient_ids <- unique(bcr_patient_barcode)
    patient_ids[patient_ids %in% poor_ids]
})

print(length(poor))
poor

[1] 10


### Patients with Complete Response

In [102]:
# Patients with a "Complete Response" record who are not already in `poor`.
good <- with(
    consolidated_hormone,
    unique(
        bcr_patient_barcode[measure_of_response %in% "Complete Response" &
                            bcr_patient_barcode %notin% poor]
    )
)
print(length(good))
good

[1] 40


### Others

In [103]:
# Every remaining patient: neither in `good` nor in `poor`.
others <- with(
    consolidated_hormone,
    unique(bcr_patient_barcode[!(bcr_patient_barcode %in% c(good, poor))])
)
print(length(others))

[1] 450


### Summary table

In [142]:
# Summary table: one row per patient with the initial-response group.
# The data frame is built directly instead of filling a pre-sized matrix, whose
# column assignment would silently recycle values on a length mismatch.
initial_ids <- c(poor, good, others)

# The three groups are expected to cover every hormone-therapy patient exactly once.
stopifnot(
    length(initial_ids) == length(unique(consolidated_hormone$bcr_patient_barcode))
)

initial_response <- data.frame(
    case_barcode = as.character(initial_ids),
    initial_response = rep(
        c("Progressive/Stable", "Complete response", "Unknown"),
        times = c(length(poor), length(good), length(others))
    ),
    stringsAsFactors = FALSE
)
table(initial_response$initial_response)


 Complete response Progressive/Stable            Unknown 
                40                 10                450 

## Follow-up of patients who received hormone therapy

In [104]:
# Parse the follow-up records from the same clinical files selected by `query`.
follow_up <- GDCprepare_clinic(query, clinical.info = "follow_up")

We found more than one follow up version!
We will parse all and add a collumn (follow_up_version) to identify each version
Parsing follow up version: follow_up_v1.5




Parsing follow up version: follow_up_v2.1




Parsing follow up version: follow_up_v4.0




In [105]:
# Follow-up fields used in the recurrence analysis: record identifiers,
# vital/tumor status, timing fields and new-tumor-event fields.
keep <- c(
    "follow_up_version",
    "bcr_patient_barcode",
    "bcr_followup_barcode",
    "bcr_followup_uuid",
    "days_to_last_followup",
    "vital_status",
    "person_neoplasm_cancer_status",
    "days_to_last_known_alive",
    "days_to_death",
    "days_to_new_tumor_event_after_initial_treatment",
    "new_tumor_event_after_initial_treatment",
    "new_neoplasm_event_type",
    "new_neoplasm_event_occurrence_anatomic_site",
    "new_neoplasm_occurrence_anatomic_site_text",
    "lost_follow_up",
    "new_tumor_events"
)
follow_up <- follow_up[, keep]

In [106]:
# Preview the reduced follow-up table.
head(follow_up)

follow_up_version,bcr_patient_barcode,bcr_followup_barcode,bcr_followup_uuid,days_to_last_followup,vital_status,person_neoplasm_cancer_status,days_to_last_known_alive,days_to_death,days_to_new_tumor_event_after_initial_treatment,new_tumor_event_after_initial_treatment,new_neoplasm_event_type,new_neoplasm_event_occurrence_anatomic_site,new_neoplasm_occurrence_anatomic_site_text,lost_follow_up,new_tumor_events
follow_up_v1.5,TCGA-A7-A0D9,TCGA-A7-A0D9-F2920,CC7279B1-E6EC-4C7C-B5D3-1FB6ABFAA9BC,273.0,Alive,TUMOR FREE,,,,,,,,,
follow_up_v1.5,TCGA-A7-A0DB,TCGA-A7-A0DB-F2931,36D2F1C0-4545-438D-A3CD-F1E072A82DB2,267.0,Alive,TUMOR FREE,,,,,,,,,
follow_up_v1.5,TCGA-AO-A03O,TCGA-AO-A03O-F3141,740A6308-84E3-4231-995D-82D7D5A34C86,,Dead,TUMOR FREE,,2483.0,,,,,,,
follow_up_v1.5,TCGA-BH-A0DZ,TCGA-BH-A0DZ-F4588,F7218F8F-0D41-4B86-B276-C020CA69412E,495.0,Alive,TUMOR FREE,,,,,,,,,
follow_up_v1.5,TCGA-A2-A04P,TCGA-A2-A04P-F2652,0D1BD57A-F783-438D-9F39-8A75193DDDCC,,Dead,WITH TUMOR,,548.0,179.0,,,,,,
follow_up_v1.5,TCGA-A2-A04P,TCGA-A2-A04P-F10241,1966A070-7389-434A-BE2F-01ABFB19B3AB,,Dead,WITH TUMOR,,548.0,180.0,,,,,,


In [107]:
# Follow-up records of the patients kept in `consolidated_hormone`.
hormone_patients <- unique(consolidated_hormone$bcr_patient_barcode)
follow_up_hormone <- follow_up[
    follow_up$bcr_patient_barcode %in% hormone_patients, , drop = FALSE
]

print(dim(follow_up_hormone))

# Only 493 patients have follow-up records
print(length(unique(follow_up_hormone$bcr_patient_barcode)))

[1] 719  16
[1] 493


In [27]:
# Save the follow-up records of hormone-therapy patients as a tab-separated file.
write.table(
    x = follow_up_hormone,
    file = "./metadata/follow_up_hormone.tsv",
    sep = "\t",
    row.names = FALSE
)

### Recurrence occurred

In [122]:
# Empty data frame that will collect one (patient, reason) row per patient.
log <- data.frame(stringsAsFactors = FALSE)

In [123]:
# Patients of the three response groups that have at least one follow-up record.
has_follow_up <- c(poor, good, others) %in% follow_up_hormone$bcr_patient_barcode
select <- c(poor, good, others)[has_follow_up]
length(select)

In [124]:
# Flag patients whose follow-up shows a recurrence during or after hormone therapy.
#
# Reads:  poor, good, others, follow_up_hormone, hormone, log
# Writes: select             - patients with at least one follow-up record
#         recurrence_occured - patients of `select` flagged as recurrent
#         log                - one (patient, reason) row appended per patient of `select`
#
# Reference day `t`: smallest therapy start day of the patient, or the smallest
# therapy end day when that is NA. Patients with both NA are only logged.
# A patient is flagged by the first follow-up record that satisfies one of:
#   1. a new tumor event is reported (not "NO", not empty, not NA) and the
#      record's last follow-up day is >= t;
#   2. the day of the new tumor event is > t;
#   3. vital status is "Dead" while the neoplasm status is "WITH TUMOR".
select <- c(poor, good, others)
select <- select[select %in% unique(follow_up_hormone$bcr_patient_barcode)]

# One slot per patient, combined once after the loop instead of growing a
# vector and a data frame on every iteration.
recurrence_flag <- logical(length(select))
recurrence_reason <- rep("Low risk or unknown", length(select))

for (k in seq_along(select)) {
    patient <- select[k]
    df <- subset(follow_up_hormone, bcr_patient_barcode == patient)
    drug_records <- subset(hormone, bcr_patient_barcode == patient)

    # NOTE(review): min() is called without na.rm, so one NA among a patient's
    # records makes the whole value NA. Kept as-is so results do not change.
    t <- min(drug_records$days_to_drug_therapy_start) # min_start
    if (is.na(t)) {
        t <- min(drug_records$days_to_drug_therapy_end) # min_end
    }
    if (is.na(t)) {
        recurrence_reason[k] <- "Drug start and end dates unknown"
        next
    }

    for (i in seq_len(nrow(df))) {
        new_tumor <- df$new_tumor_events[i]
        last_follow <- df$days_to_last_followup[i]
        # Full column name: the original `$days_to_new_tumor` only worked through
        # partial matching of `$` on data frames.
        days_2_new <- df$days_to_new_tumor_event_after_initial_treatment[i]
        neoplasm_status <- df$person_neoplasm_cancer_status[i]
        vital_status <- df$vital_status[i]

        new_tumor_reported <- !is.na(new_tumor) && new_tumor != "NO" && new_tumor != ""

        if ((new_tumor_reported && !is.na(last_follow) && last_follow >= t) ||
            (!is.na(days_2_new) && days_2_new > t)) {
            recurrence_reason[k] <- "New tumor after/during treatment"
        } else if (!is.na(vital_status) && !is.na(neoplasm_status) &&
                   vital_status == "Dead" && neoplasm_status == "WITH TUMOR") {
            recurrence_reason[k] <- "Dead with tumor"
        } else {
            # This record shows no evidence of recurrence; look at the next one.
            next
        }

        recurrence_flag[k] <- TRUE
        break
    }
}

recurrence_occured <- select[recurrence_flag]

new_rows <- data.frame(
    case_barcode = select,
    recurrence_log = recurrence_reason,
    stringsAsFactors = FALSE
)
# When `log` already holds two columns (cell re-run), append by position as the
# original row-by-row rbind did.
if (ncol(log) == 2L) {
    names(new_rows) <- names(log)
}
log <- rbind.data.frame(log, new_rows, stringsAsFactors = FALSE)

In [125]:
# Name the two log columns, then tabulate how many patients fall under each reason.
names(log) <- c("case_barcode", "recurrence_log")
data.frame(table(log$recurrence_log))

Var1,Freq
Dead with tumor,12
Drug start and end dates unknown,17
Low risk or unknown,431
New tumor after/during treatment,33


In [126]:
# Number of patients flagged with a recurrence, followed by their barcodes.
print(length(recurrence_occured))
recurrence_occured

[1] 45


### Low risk of recurrence

In [128]:
# Refine the patients logged as "Low risk or unknown" and collect those at low
# risk of recurrence.
#
# Reads:  log, follow_up_hormone, hormone
# Writes: select   - patients currently logged as "Low risk or unknown"
#         low_risk - patients of `select` that are tumor free beyond the risky period
#         log      - `recurrence_log` replaced by a more specific reason for each patient
#
# Reference day `t` and threshold `thres` (years):
#   largest therapy end day with thres = 5, or, when that is NA,
#   largest therapy start day with thres = 10.
# A tumor-free patient is low risk when (death day or largest last-follow-up day - t) / 365 > thres.
select <- subset(log, recurrence_log == "Low risk or unknown")$case_barcode

# One flag per patient instead of growing `low_risk` inside the loop.
low_risk_flag <- logical(length(select))

for (k in seq_along(select)) {
    patient <- select[k]
    df <- subset(follow_up_hormone, bcr_patient_barcode == patient)
    neoplasm_status <- df$person_neoplasm_cancer_status
    vital_status <- df$vital_status

    if ("WITH TUMOR" %in% neoplasm_status) {
        label <- "Reported having tumor during follow-up but unknown recurrence"
    } else if (!("TUMOR FREE" %in% neoplasm_status)) {
        label <- "Unknown neoplasm status"
    } else {
        drug_records <- subset(hormone, bcr_patient_barcode == patient)

        # NOTE(review): max() is called without na.rm, so one NA among a patient's
        # records makes the whole value NA. Kept as-is so results do not change.
        t <- max(drug_records$days_to_drug_therapy_end) # max_end
        thres <- 5
        if (is.na(t)) {
            t <- max(drug_records$days_to_drug_therapy_start) # max_start
            thres <- 10
        }

        if ("Dead" %in% vital_status) {
            # Several follow-up records can be "Dead"; use the first one so the
            # conditions below are scalar (a longer condition is an error in
            # R >= 4.2 and older versions used the first element anyway).
            days_2_death <- df$days_to_death[which(vital_status == "Dead")[1]]
            if (is.na(days_2_death)) {
                label <- "Dead tumor free but unknown death date"
            } else if ((days_2_death - t) / 365 > thres) {
                low_risk_flag[k] <- TRUE
                label <- "Dead tumor free after risky period"
            } else {
                label <- "Dead tumor free during risky period"
            }
        } else if ("Alive" %in% vital_status) {
            # NOTE(review): NA when any follow-up record lacks the day (no na.rm).
            last_follow <- max(df$days_to_last_followup)
            if (is.na(last_follow)) {
                label <- "Alive tumor free but unknown follow-up date"
            } else if ((last_follow - t) / 365 > thres) {
                low_risk_flag[k] <- TRUE
                label <- "Alive tumor free after risky period"
            } else {
                label <- "Alive tumor free, last follow-up within risky period"
            }
        } else {
            label <- "Unknown vital status"
        }
    }

    log$recurrence_log[log$case_barcode == patient] <- label
}

low_risk <- select[low_risk_flag]

In [129]:
# Count patients per recurrence-log reason after the refinement above.
data.frame(table(log$recurrence_log))

Var1,Freq
Alive tumor free after risky period,8
Alive tumor free but unknown follow-up date,8
"Alive tumor free, last follow-up within risky period",386
Dead tumor free but unknown death date,1
Dead tumor free during risky period,10
Dead with tumor,12
Drug start and end dates unknown,17
New tumor after/during treatment,33
Reported having tumor during follow-up but unknown recurrence,8
Unknown neoplasm status,10


In [130]:
# Number of low-risk patients, followed by their barcodes.
print(length(low_risk))
low_risk

[1] 8


### Unknown recurrence status

In [136]:
# Patients with follow-up records that are neither flagged as recurrent nor as low risk.
select <- c(poor, good, others)
select <- select[select %in% follow_up_hormone$bcr_patient_barcode]
unknown_recurrence <- select[!(select %in% c(recurrence_occured, low_risk))]
length(unknown_recurrence)

### Table of recurrence status and log details

In [144]:
# Table with one row per patient that has follow-up records and its recurrence group.
# The data frame is built directly instead of filling a pre-sized matrix, whose
# column assignment would silently recycle values on a length mismatch.
recurrence_ids <- c(recurrence_occured, low_risk, unknown_recurrence)

# The three groups are expected to cover every patient with follow-up exactly once.
stopifnot(
    length(recurrence_ids) == length(unique(follow_up_hormone$bcr_patient_barcode))
)

recurrence_status <- data.frame(
    case_barcode = as.character(recurrence_ids),
    recurrence_status = rep(
        c("Occured", "Low risk", "Unknown"),
        times = c(length(recurrence_occured), length(low_risk), length(unknown_recurrence))
    ),
    stringsAsFactors = FALSE
)

# str() prints by itself and returns NULL; wrapping it in print() only adds a stray "NULL".
str(recurrence_status)
table(recurrence_status$recurrence_status)

'data.frame':	493 obs. of  2 variables:
 $ case_barcode     : chr  "TCGA-A2-A0EW" "TCGA-A7-A3RF" "TCGA-A8-A08O" "TCGA-D8-A73W" ...
 $ recurrence_status: chr  "Occured" "Occured" "Occured" "Occured" ...
NULL



Low risk  Occured  Unknown 
       8       45      440 

In [145]:
# Attach the detailed log reason to each patient's recurrence group.
# all = TRUE keeps patients that appear in only one of the two tables.
recurrence <- merge(
    recurrence_status,
    log,
    by = "case_barcode",
    all = TRUE
)
str(recurrence)

'data.frame':	493 obs. of  3 variables:
 $ case_barcode     : chr  "TCGA-5T-A9QA" "TCGA-A2-A04N" "TCGA-A2-A04R" "TCGA-A2-A04V" ...
 $ recurrence_status: chr  "Unknown" "Low risk" "Unknown" "Occured" ...
 $ recurrence_log   : chr  "Unknown neoplasm status" "Alive tumor free after risky period" "Alive tumor free, last follow-up within risky period" "New tumor after/during treatment" ...


## Merged table of hormone therapy initial responses + recurrence statuses

In [146]:
# Combine initial response and recurrence information per patient.
# all = TRUE keeps patients without follow-up; their recurrence columns are NA.
hormone_response_recurrence <- merge(
    initial_response,
    recurrence,
    by = "case_barcode",
    all = TRUE
)
str(hormone_response_recurrence)

'data.frame':	500 obs. of  4 variables:
 $ case_barcode     : chr  "TCGA-5L-AAT0" "TCGA-5L-AAT1" "TCGA-5T-A9QA" "TCGA-A1-A0SF" ...
 $ initial_response : chr  "Unknown" "Unknown" "Unknown" "Unknown" ...
 $ recurrence_status: chr  NA NA "Unknown" NA ...
 $ recurrence_log   : chr  NA NA "Unknown neoplasm status" NA ...


In [147]:
# Save the merged response/recurrence table as a tab-separated file.
write.table(
    x = hormone_response_recurrence,
    file = "./metadata/hormone_response_recurrence.tsv",
    sep = "\t",
    row.names = FALSE
)

**All remaining patients, whether dead or alive, are tumor free** ==> We classify this group using `f = days_to_last_followup` or `d = days_to_death`.

Take `e` = `days_to_drug_therapy_end` of `max(days_to_drug_therapy_start)`, <br/>
`t` = `e` if `!is.na(e)`, else `max(days_to_drug_therapy_start)`

If `!is.na(e)` <br/>
    AND the last follow-up day > 5 years after therapy end, meaning `[max(f) - e]/365 > 5` <br/>
        OR `[d - t]/365 > 5` ==> Low risk of recurrence
        
Elif `[max(f) - max(days_to_drug_therapy_start)]/365 > 10` OR `[d - max(days_to_drug_therapy_start)]/365 > 10` <br/>
    ==> Low risk of recurrence
    
Else ==> Unknown risk of recurrence