In [2]:
library(dplyr)
library(miceadds)

# 1) Nejm Dataset

In [79]:
# Load the NEJM validation table and give its clinical columns the same
# names as the Cardiff table built further down in this file.
df_nejm <- read.table("../Validation/full_data_validation_NO_ITD.tsv") %>%
  rename(
    OS_stat = OS_Status,
    gender = Gender,
    age = Age,
    hb = HB,
    bm_blasts = BM_Blasts,
    plt = PLT,
    wbc = WBC
  )

# Add the CR date, the relapse-free time and its status from the AMLSG
# clinical data. load.Rdata() makes the stored object available as `val_data`.
load.Rdata("../Validation/AMLSG_Clinical_Anon.RData", "val_data")
rownames(val_data) <- val_data$PDID

# Join on row names; merge() keeps the shared key in a "Row.names" column,
# which is turned back into row names once all columns have been derived.
df_nejm <- merge(
  df_nejm,
  val_data[, c("PDID", "CR_date", "rfs", "rfsstat")],
  by = "row.names"
) %>%
  rename(CR = CR_date, Relapse = rfs, Relapse_stat = rfsstat)

# Rescale both times by 365 (presumably days -> years; confirm the units of
# CR_date and rfs against the clinical data dictionary).
df_nejm$CR <- as.numeric(df_nejm$CR) / 365
df_nejm$Relapse <- df_nejm$Relapse / 365



# Derive event indicators and event times for CR, relapse and death.

# CR status: 0 when CR is missing or recorded later than OS, 1 otherwise.
df_nejm$CR_stat <- ifelse(
  is.na(df_nejm$CR) | df_nejm$CR > df_nejm$OS,
  0,
  1
)

# Relapse status: keep the recorded status only when both CR and the relapse
# time are available and CR + Relapse is strictly lower than OS. rfs is either
# the relapse or the OS time, so reaching OS means the event was death, not
# relapse. Patients without CR cannot relapse, so they get 0 as well.
df_nejm[
  is.na(df_nejm$CR) | is.na(df_nejm$Relapse) |
    df_nejm$CR + df_nejm$Relapse >= df_nejm$OS,
  "Relapse_stat"
] <- 0

# CR time: patients without an observed CR are carried up to their OS time.
no_cr <- df_nejm$CR_stat == 0
df_nejm[no_cr, "CR"] <- df_nejm[no_cr, "OS"]

# Relapse time: OS time when no relapse was observed; otherwise the time to CR
# plus the relapse time that was recorded relative to CR.
relapse_censored <- df_nejm$Relapse_stat == 0
relapse_observed <- df_nejm$Relapse_stat == 1
df_nejm[relapse_censored, "Relapse"] <- df_nejm[relapse_censored, "OS"]
df_nejm[relapse_observed, "Relapse"] <-
  df_nejm[relapse_observed, "CR"] + df_nejm[relapse_observed, "Relapse"]

# Death is split by the state the patient was in; every death time is OS.
reached_cr <- df_nejm$CR_stat == 1
died <- df_nejm$OS_stat == 1

df_nejm$Death_in_CR <- df_nejm$OS
df_nejm$Death_in_CR_stat <- ifelse(reached_cr & relapse_censored & died, 1, 0)

df_nejm$Death_in_Relapse <- df_nejm$OS
df_nejm$Death_in_Relapse_stat <- ifelse(
  reached_cr & relapse_observed & died, 1, 0
)

df_nejm$Death_without <- df_nejm$OS
df_nejm$Death_without_stat <- ifelse(no_cr & relapse_censored & died, 1, 0)


# Add New Proposal: assign every patient to a molecular risk class.

df_nejm$molecular_classification <- "none"

# Rules are tried in the order listed, and a rule only labels patients that
# are still "none", so earlier rules take precedence over later ones.
# NOTE(review): `$` yields NULL for a column that does not exist, which makes
# the mask zero-length and the assignment a silent no-op - confirm that every
# column referenced below (including ITD) is present in this table.
class_rules <- list(
  list(
    label = "NEW_favorable",
    hit = df_nejm$full_component_NPM1 == 1 |
      df_nejm$full_component_inv_16 == 1 |
      df_nejm$full_component_t_8_21 == 1 |
      df_nejm$full_component_t_15_17 == 1
  ),
  list(
    label = "NEW_intermediate",
    hit = df_nejm$full_component_chr_splicing_1 == 1 |
      df_nejm$full_component_t_6_9 == 1
  ),
  list(
    label = "NEW_adverse",
    hit = (df_nejm$full_component_additions == 1 & df_nejm$overlap == 1) |
      df_nejm$full_component_chr_splicing_multiple == 1 |
      df_nejm$full_component_TP53_complex == 1 |
      df_nejm$full_component_inv_3 == 1
  ),
  list(
    label = "NEW_intermediate",
    hit = (df_nejm$full_component_additions == 1 & df_nejm$overlap == 0) |
      df_nejm$full_component_t_11 == 1 |
      df_nejm$full_component_WT1 == 1 |
      df_nejm$full_component_DNMT3A_IDH1_2 == 1 |
      df_nejm$full_component_not_assigned == 1
  ),
  list(
    label = "NEW_favorable",
    hit = df_nejm$full_component_CEBPA_bi == 1 |
      df_nejm$full_component_no_events == 1
  )
)

for (class_rule in class_rules) {
  unassigned <- df_nejm$molecular_classification == "none"
  df_nejm[class_rule$hit & unassigned, "molecular_classification"] <-
    class_rule$label
}

# ITD moves a patient up one risk class. Order is important: intermediate ->
# adverse has to run first, otherwise a favorable patient promoted to
# intermediate would be promoted a second time.
itd_positive <- df_nejm$ITD == 1
df_nejm[
  df_nejm$molecular_classification == "NEW_intermediate" & itd_positive,
  "molecular_classification"
] <- "NEW_adverse"
df_nejm[
  df_nejm$molecular_classification == "NEW_favorable" & itd_positive,
  "molecular_classification"
] <- "NEW_intermediate"

# One 0/1 indicator column per risk class, named after the class.
for (risk_class in c("NEW_favorable", "NEW_intermediate", "NEW_adverse")) {
  df_nejm[[risk_class]] <- ifelse(
    df_nejm$molecular_classification == risk_class, 1, 0
  )
}
table(df_nejm$molecular_classification)

# Turn the "Row.names" key column left by merge() back into row names.
row.names(df_nejm) <- df_nejm[["Row.names"]]
df_nejm[["Row.names"]] <- NULL

# Print the 0/1 counts of every derived status column as a sanity check.
for (status_col in c(
  "Relapse_stat",
  "CR_stat",
  "Death_in_CR_stat",
  "Death_in_Relapse_stat",
  "Death_without_stat"
)) {
  print(table(df_nejm[[status_col]]))
}

# NOTE(review): a later cell in this file ("OLD NEJM") rebuilds df_nejm with
# the older status rules and writes to this same path - confirm which version
# is meant to end up on disk.
write.table(x = df_nejm, file = "df_nejm_personnalization.tsv")


     NEW_adverse    NEW_favorable NEW_intermediate 
             356              372              448 


  0   1 
632 544 


  0   1 
200 976 


   0    1 
1111   65 


  0   1 
742 434 


  0   1 
981 195 

# OLD NEJM

In [80]:
# Reload the NEJM validation table from scratch (this overwrites any df_nejm
# built earlier) and rename its clinical columns like in the Cardiff table.
df_nejm <- read.table("../Validation/full_data_validation_NO_ITD.tsv") %>%
  rename(
    OS_stat = OS_Status,
    gender = Gender,
    age = Age,
    hb = HB,
    bm_blasts = BM_Blasts,
    plt = PLT,
    wbc = WBC
  )

# Add the CR date, the relapse-free time and its status from the AMLSG
# clinical data. load.Rdata() makes the stored object available as `val_data`.
load.Rdata("../Validation/AMLSG_Clinical_Anon.RData", "val_data")
rownames(val_data) <- val_data$PDID

# Join on row names; merge() keeps the shared key in a "Row.names" column.
df_nejm <- merge(
  df_nejm,
  val_data[, c("PDID", "CR_date", "rfs", "rfsstat")],
  by = "row.names"
) %>%
  rename(CR = CR_date, Relapse = rfs, Relapse_stat = rfsstat)

# Rescale both times by 365 (presumably days -> years; confirm the units).
df_nejm$CR <- as.numeric(df_nejm$CR) / 365
df_nejm$Relapse <- df_nejm$Relapse / 365

# Derive event indicators and event times (older rule set).

# CR status is 0 exactly when the CR time is missing. Per the original
# author, in all other cases the CR date differs from OS.
cr_missing <- is.na(df_nejm$CR)
df_nejm$CR_stat <- ifelse(cr_missing, 0, 1)

# Relapse status is kept as recorded, except that patients who never reached
# CR get 0 (CR is required before a relapse).
df_nejm[cr_missing, "Relapse_stat"] <- 0

# CR time: patients without CR are carried up to their OS time.
df_nejm[cr_missing, "CR"] <- df_nejm[cr_missing, "OS"]

# Relapse time: OS time when the status is 0; otherwise the time to CR plus
# the relapse time that was recorded relative to CR.
relapse_censored <- df_nejm$Relapse_stat == 0
relapse_observed <- df_nejm$Relapse_stat == 1
df_nejm[relapse_censored, "Relapse"] <- df_nejm[relapse_censored, "OS"]
df_nejm[relapse_observed, "Relapse"] <-
  df_nejm[relapse_observed, "CR"] + df_nejm[relapse_observed, "Relapse"]

# Sanity fix: cap the relapse time at OS. The original author reports that
# rounding errors push it above OS for 63 patients.
relapse_after_os <- df_nejm$Relapse > df_nejm$OS
df_nejm[relapse_after_os, "Relapse"] <- df_nejm[relapse_after_os, "OS"]

# Death is split by the state the patient was in; every death time is OS.
reached_cr <- df_nejm$CR_stat == 1
no_cr <- df_nejm$CR_stat == 0
died <- df_nejm$OS_stat == 1

df_nejm$Death_in_CR <- df_nejm$OS
df_nejm$Death_in_CR_stat <- ifelse(reached_cr & relapse_censored & died, 1, 0)

df_nejm$Death_in_Relapse <- df_nejm$OS
df_nejm$Death_in_Relapse_stat <- ifelse(
  reached_cr & relapse_observed & died, 1, 0
)

df_nejm$Death_without <- df_nejm$OS
df_nejm$Death_without_stat <- ifelse(no_cr & relapse_censored & died, 1, 0)


# Add New Proposal: assign every patient to a molecular risk class.

df_nejm$molecular_classification <- "none"

# Rules are tried in the order listed, and a rule only labels patients that
# are still "none", so earlier rules take precedence over later ones.
# NOTE(review): `$` yields NULL for a column that does not exist, which makes
# the mask zero-length and the assignment a silent no-op - confirm that every
# column referenced below (including ITD) is present in this table.
class_rules <- list(
  list(
    label = "NEW_favorable",
    hit = df_nejm$full_component_NPM1 == 1 |
      df_nejm$full_component_inv_16 == 1 |
      df_nejm$full_component_t_8_21 == 1 |
      df_nejm$full_component_t_15_17 == 1
  ),
  list(
    label = "NEW_intermediate",
    hit = df_nejm$full_component_chr_splicing_1 == 1 |
      df_nejm$full_component_t_6_9 == 1
  ),
  list(
    label = "NEW_adverse",
    hit = (df_nejm$full_component_additions == 1 & df_nejm$overlap == 1) |
      df_nejm$full_component_chr_splicing_multiple == 1 |
      df_nejm$full_component_TP53_complex == 1 |
      df_nejm$full_component_inv_3 == 1
  ),
  list(
    label = "NEW_intermediate",
    hit = (df_nejm$full_component_additions == 1 & df_nejm$overlap == 0) |
      df_nejm$full_component_t_11 == 1 |
      df_nejm$full_component_WT1 == 1 |
      df_nejm$full_component_DNMT3A_IDH1_2 == 1 |
      df_nejm$full_component_not_assigned == 1
  ),
  list(
    label = "NEW_favorable",
    hit = df_nejm$full_component_CEBPA_bi == 1 |
      df_nejm$full_component_no_events == 1
  )
)

for (class_rule in class_rules) {
  unassigned <- df_nejm$molecular_classification == "none"
  df_nejm[class_rule$hit & unassigned, "molecular_classification"] <-
    class_rule$label
}

# ITD moves a patient up one risk class. Order is important: intermediate ->
# adverse has to run first, otherwise a favorable patient promoted to
# intermediate would be promoted a second time.
itd_positive <- df_nejm$ITD == 1
df_nejm[
  df_nejm$molecular_classification == "NEW_intermediate" & itd_positive,
  "molecular_classification"
] <- "NEW_adverse"
df_nejm[
  df_nejm$molecular_classification == "NEW_favorable" & itd_positive,
  "molecular_classification"
] <- "NEW_intermediate"

# One 0/1 indicator column per risk class, named after the class.
for (risk_class in c("NEW_favorable", "NEW_intermediate", "NEW_adverse")) {
  df_nejm[[risk_class]] <- ifelse(
    df_nejm$molecular_classification == risk_class, 1, 0
  )
}
table(df_nejm$molecular_classification)

# Turn the "Row.names" key column left by merge() back into row names.
row.names(df_nejm) <- df_nejm[["Row.names"]]
df_nejm[["Row.names"]] <- NULL

# Print the 0/1 counts of every derived status column as a sanity check.
for (status_col in c(
  "Relapse_stat",
  "CR_stat",
  "Death_in_CR_stat",
  "Death_in_Relapse_stat",
  "Death_without_stat"
)) {
  print(table(df_nejm[[status_col]]))
}


     NEW_adverse    NEW_favorable NEW_intermediate 
             356              372              448 


  0   1 
567 609 


  0   1 
200 976 


   0 
1176 


  0   1 
677 499 


  0   1 
981 195 

In [8]:
# Persist the NEJM table (row names and header included, write.table defaults).
# NOTE(review): the first NEJM cell in this file already writes this path;
# when the cells run top to bottom, this call replaces that file with whatever
# df_nejm currently holds (the "OLD NEJM" version) - confirm this is intended.
write.table(x = df_nejm, file = "df_nejm_personnalization.tsv")

# 2) Cardiff Dataset

In [83]:
# Load the Cardiff feature table and define the column groups by position.
df <- read.table("../../../clustering/clustering_Final_1/df_final_full_component.tsv")

# Gene columns (positions 5:88). `vect` holds, per column, the percentage of
# rows equal to 1 (missing values count as "not 1"); `gen` keeps the columns
# reaching at least 2%.
# The names are taken straight from `vect`, so the result stays a character
# vector even when exactly one column passes the threshold (indexing the data
# frame with a single column would drop to a vector and lose the name).
all_gen <- colnames(df)[5:88]
vect <- vapply(
  df[all_gen],
  function(x) 100 * sum(x == 1, na.rm = TRUE) / nrow(df),
  numeric(1)
)
gen <- names(vect)[vect >= 2]

# Cytogenetic columns (positions 89:158): same 2% filter, and "inv_3" and
# "t_15_17" are always appended regardless of their frequency.
all_cyto <- colnames(df)[89:158]
vect <- vapply(
  df[all_cyto],
  function(x) 100 * sum(x == 1, na.rm = TRUE) / nrow(df),
  numeric(1)
)
cyto <- c(names(vect)[vect >= 2], "inv_3", "t_15_17")

# Remaining column groups, again selected by position.
comp <- colnames(df)[170:186]

clin <- colnames(df)[159:165]
demo <- colnames(df)[166:167]

# Master clinical sheet, keyed by the data_pd identifier.
master <- read.table(
  "../../../../data/initial_dataset/Master_04_10_2019.csv",
  sep = ",",
  header = TRUE
)
rownames(master) <- master$data_pd

# Join features and outcomes on row names; merge() keeps the shared key in a
# "Row.names" column, which is reused as row names and as patient_id.
df_cardiff <- merge(
  df[, c(
    comp, all_gen, all_cyto, clin, demo,
    "eln_2017_favorable", "eln_2017_intermediate", "eln_2017_adverse"
  )],
  master[, c("os", "os_status", "RelapseCIStatus", "OS_CR", "RFSyears")],
  by = 0
)
rownames(df_cardiff) <- df_cardiff$Row.names
df_cardiff$patient_id <- df_cardiff$Row.names
              

# New molecular classification: assign every patient to a risk class.

df_cardiff$molecular_classification <- "none"

# Rules are tried in the order listed, and a rule only labels patients that
# are still "none", so earlier rules take precedence over later ones.
# NOTE(review): `$` yields NULL for a column that does not exist, which makes
# the mask zero-length and the assignment a silent no-op - confirm that every
# column referenced below (including overlap and ITD) survived the merge.
class_rules <- list(
  list(
    label = "NEW_favorable",
    hit = df_cardiff$full_component_NPM1 == 1 |
      df_cardiff$full_component_inv_16 == 1 |
      df_cardiff$full_component_t_8_21 == 1 |
      df_cardiff$full_component_t_15_17 == 1
  ),
  list(
    label = "NEW_intermediate",
    hit = df_cardiff$full_component_chr_splicing_1 == 1 |
      df_cardiff$full_component_t_6_9 == 1
  ),
  list(
    label = "NEW_adverse",
    hit = (df_cardiff$full_component_additions == 1 &
      df_cardiff$overlap == 1) |
      df_cardiff$full_component_chr_splicing_multiple == 1 |
      df_cardiff$full_component_TP53_complex == 1 |
      df_cardiff$full_component_inv_3 == 1
  ),
  list(
    label = "NEW_intermediate",
    hit = (df_cardiff$full_component_additions == 1 &
      df_cardiff$overlap == 0) |
      df_cardiff$full_component_t_11 == 1 |
      df_cardiff$full_component_WT1 == 1 |
      df_cardiff$full_component_DNMT3A_IDH1_2 == 1 |
      df_cardiff$full_component_not_assigned == 1
  ),
  list(
    label = "NEW_favorable",
    hit = df_cardiff$full_component_CEBPA_bi == 1 |
      df_cardiff$full_component_no_events == 1
  )
)

for (class_rule in class_rules) {
  unassigned <- df_cardiff$molecular_classification == "none"
  df_cardiff[class_rule$hit & unassigned, "molecular_classification"] <-
    class_rule$label
}

# ITD moves a patient up one risk class. Order is important: intermediate ->
# adverse has to run first, otherwise a favorable patient promoted to
# intermediate would be promoted a second time.
itd_positive <- df_cardiff$ITD == 1
df_cardiff[
  df_cardiff$molecular_classification == "NEW_intermediate" & itd_positive,
  "molecular_classification"
] <- "NEW_adverse"
df_cardiff[
  df_cardiff$molecular_classification == "NEW_favorable" & itd_positive,
  "molecular_classification"
] <- "NEW_intermediate"

# One 0/1 indicator column per risk class, named after the class.
for (risk_class in c("NEW_favorable", "NEW_intermediate", "NEW_adverse")) {
  df_cardiff[[risk_class]] <- ifelse(
    df_cardiff$molecular_classification == risk_class, 1, 0
  )
}
table(df_cardiff$molecular_classification)



# Remove NA values from Relapse Status (2125 to 2044 patients)
df_cardiff <- df_cardiff[!is.na(df_cardiff$RelapseCIStatus), ]

# Remove weird cases (15): we do not know when they relapsed or reached CR.
#
# Every drop condition is passed through `%in% TRUE` before it is negated, so
# a condition that evaluates to NA (e.g. a relapsed patient with a missing
# os_status or RFSyears) keeps the row. A bare NA in a logical row index would
# not keep or drop the patient: it would insert a row made entirely of NA.

# 11 cases: they relapsed but we do not know when.
relapse_time_unknown <- df_cardiff$RelapseCIStatus == 1 &
  !is.na(df_cardiff$RelapseCIStatus) &
  is.na(df_cardiff$OS_CR)
df_cardiff <- df_cardiff[!(relapse_time_unknown %in% TRUE), ]

# 3 cases: a patient who died cannot relapse from CR after the death.
relapse_after_death <- df_cardiff$RelapseCIStatus == 1 &
  !is.na(df_cardiff$RelapseCIStatus) &
  df_cardiff$os_status == 1 &
  df_cardiff$RFSyears > df_cardiff$OS_CR
df_cardiff <- df_cardiff[!(relapse_after_death %in% TRUE), ]

# 1 case: remove the patient whose relapse-free time is exactly 0.
relapse_at_zero <- df_cardiff$RFSyears == 0 & !is.na(df_cardiff$RFSyears)
df_cardiff <- df_cardiff[!(relapse_at_zero %in% TRUE), ]


# Handle CR

# CR status is 0 exactly when OS_CR (survival time counted from CR) is missing.
# Original author's rationale: the patients with a missing OS_CR are the only
# ones whose RFSyears equals overall survival, so they did not experience CR;
# all other patients have a relapse-free time from CR that differs from
# overall survival, which means they did reach CR. The author also reports the
# rule to be concordant with RelapseCIStatus ("death without relapse").
cr_missing <- is.na(df_cardiff$OS_CR)
df_cardiff$CR_stat <- ifelse(cr_missing, 0, 1)

# CR time from time 0: the OS time when CR was not reached, otherwise
# survival time minus survival time counted from CR.
df_cardiff[cr_missing, "CR"] <- df_cardiff[cr_missing, "os"]
df_cardiff[!cr_missing, "CR"] <-
  df_cardiff[!cr_missing, "os"] - df_cardiff[!cr_missing, "OS_CR"]

# Handle RFS (relapse from first CR)

# Relapse status is 1 only for a recorded RelapseCIStatus of 1.
df_cardiff$Relapse_stat <- ifelse(
  !is.na(df_cardiff$RelapseCIStatus) & df_cardiff$RelapseCIStatus == 1,
  1,
  0
)

# Relapse time from time 0: the OS time by default; for relapsed patients the
# time to CR (computed above) plus the relapse-free time from CR (RFSyears).
relapsed <- df_cardiff$Relapse_stat == 1
df_cardiff$Relapse <- df_cardiff$os
df_cardiff[relapsed, "Relapse"] <-
  df_cardiff[relapsed, "CR"] + df_cardiff[relapsed, "RFSyears"]

# Expose survival under the same column names as the NEJM table.
df_cardiff$OS <- df_cardiff$os
df_cardiff$OS_stat <- df_cardiff$os_status

# Death is split by the state the patient was in; every death time is OS.
reached_cr <- df_cardiff$CR_stat == 1
no_cr <- df_cardiff$CR_stat == 0
not_relapsed <- df_cardiff$Relapse_stat == 0
died <- df_cardiff$OS_stat == 1

df_cardiff$Death_in_CR <- df_cardiff$OS
df_cardiff$Death_in_CR_stat <- ifelse(reached_cr & not_relapsed & died, 1, 0)

df_cardiff$Death_in_Relapse <- df_cardiff$OS
df_cardiff$Death_in_Relapse_stat <- ifelse(reached_cr & relapsed & died, 1, 0)

df_cardiff$Death_without <- df_cardiff$OS
df_cardiff$Death_without_stat <- ifelse(no_cr & not_relapsed & died, 1, 0)

# Drop the merge() key column; the identifier lives on in patient_id and in
# the row names.
df_cardiff$Row.names <- NULL
              

              
              


     NEW_adverse    NEW_favorable NEW_intermediate 
             752              689              684 


   0    1 
1274  755 

In [21]:
# Persist the Cardiff table (row names and header included, write.table
# defaults).
write.table(x = df_cardiff, file = "df_cardiff_personnalization.tsv")