In [36]:
library(gridExtra)
library(matrixStats)
library(dplyr)
library(rlang)
library(reshape2)
library(clusterCrit)
library(grid)
library(ggrepel)
library(RColorBrewer)
library(dplyr)
library(IRdisplay)
library(scales)
library(survival)
library(corrplot)
library(Hmisc)
library(ggplot2)
library(survminer)
library(muhaz)
library(dplyr)
library(stringr)
library(dummies)
library(miceadds)
options(warn=-1)
library(data.table)
library(tidyr)

source('../../../../src/tools.R')     # custom tools function
source('../../../../src/hdp_tools_yanis.R')
source("../../../../src/merge_df.R")
source("../../../../src/my_toolbox.R")
source("../../../../src/my_components.R")
source("../../../../src/my_utils.R")
source("../../../../src/ggstyles.R")
source("../../../../src/my_hotspots.R")
###


# Every ggplot drawn after this point uses the minimal theme.
theme_set(theme_minimal())

# Jupyter notebook display parameters.
options(
  repr.matrix.max.cols = 200, # maximum number of columns displayed
  repr.matrix.max.rows = 200, # maximum number of rows displayed
  repr.plot.res = 100         # medium-definition plot resolution (DPI)
)







In [92]:
# Clinical table: read the whole CSV, then keep only the columns used below.
# (An earlier variant of this selection left out "HB".)
data_clin <- read.csv(file = "ULM.csv", header = TRUE, sep = ",")
data_clin <- data_clin[, c("ID", "Gender", "Age", "HB", "BM_Blasts", "PLT",
                           "WBC", "OS", "OS_Status")]
# Recode the status column: 1 where it reads "TRUE", 0 otherwise.
data_clin$OS_Status <- ifelse(data_clin$OS_Status == "TRUE", 1, 0)

# Selected columns of the AMLSG classification table. The five cytogenetic
# columns are renamed to the underscore spelling (old name = new name).
cyto_renames <- c(inv16 = "inv_16", t15_17 = "t_15_17", t8_21 = "t_8_21",
                  t6_9 = "t_6_9", inv3 = "inv_3")
data2 <- read.table(file = "AMLSG_Classification.txt", header = TRUE, sep = "")
data2 <- data2[, c("ID", "NPM1", "CEBPA_bi", names(cyto_renames))]
setnames(data2, old = names(cyto_renames), new = unname(cyto_renames))

# FLT3-ITD table: keep the patient ID and the status column, call the latter
# "ITD", and recode it to 1 where the status reads "ITD" and 0 otherwise.
data_itd <- read.table(file = "AMLSG_FLT3ITD.txt", header = TRUE, sep = "")
data_itd <- data_itd[, c("ID", "FLT3_ITD_status")]
names(data_itd) <- c("ID", "ITD")
data_itd$ITD <- ifelse(data_itd$ITD == "ITD", 1, 0)

# Gene table: turn the GENE column into one indicator column per gene and
# collapse the rows to a single row per ID.
data_genes <- read.table(file = "AMLSG_Genetic.txt", header = TRUE, sep = "")
names(data_genes)[names(data_genes) == "SAMPLE_NAME"] <- "ID"

# Append the GENE_<name> dummy columns, then keep only ID and those dummies.
data_genes <- cbind(data_genes, dummy("GENE", data_genes, sep = "_"))
is_id_or_gene <- names(data_genes) == "ID" | grepl("GENE_", names(data_genes))
data_genes <- data_genes[, is_id_or_gene]

# Drop exact duplicate rows before summing so that a repeated (ID, gene) row
# is counted once, then add up the indicators within each ID.
data_genes <- data_genes[!duplicated(data_genes), ]
data_genes <- summarise_all(group_by(data_genes, ID), list(sum))

# The source file spells SRSF2 as "SFRS2"; correct the column name, then strip
# the "GENE_" prefix from every column name.
names(data_genes)[names(data_genes) == "GENE_SFRS2"] <- "GENE_SRSF2"
names(data_genes) <- gsub("GENE_", "", names(data_genes), fixed = TRUE)

# NPM1 and CEBPA are dropped here (marked "already introduced" upstream).
data_genes$NPM1 <- NULL
data_genes$CEBPA <- NULL

# Load the clinical RData file into `val_data` and rename its PDID column to ID.
load.Rdata("AMLSG_Clinical_Anon.RData", "val_data")
names(val_data)[names(val_data) == "PDID"] <- "ID"

# One-to-one copies: new column name = existing source column.
cyto_alias <- c(
  del_3  = "abn3q_other",
  del_5  = "minus5_5q",
  del_9  = "minus9q",
  del_12 = "mono12_12p_abn12p",
  del_17 = "mono17_17p_abn17p",
  del_18 = "minus18_18q",
  del_20 = "minus20_20q",
  minusy = "minusY",
  add_8  = "plus8_8q",
  add_11 = "plus11_11q",
  add_13 = "plus13",
  add_21 = "plus21",
  add_22 = "plus22"
)
for (alias in names(cyto_alias)) {
  val_data[[alias]] <- val_data[[cyto_alias[[alias]]]]
}

# del_7 is the sum of the three chromosome-7 columns.
# NOTE(review): this sum can exceed 1 if several of them are set — confirm
# that downstream `del_7 == 1` tests are still what is intended.
val_data$del_7 <- val_data$minus7 + val_data$minus7q + val_data$abn7other

# t_v_11 is stored as text; recode to 1 where it reads "True", 0 otherwise.
val_data$t_v_11 <- ifelse(val_data$t_v_11 == "True", 1, 0)

# Keep only the columns used downstream (minus7 is kept only for the ELN).
val_data <- val_data[, c("ID",
                         "del_3", "del_5", "del_7", "del_9", "del_12",
                         "del_17", "del_18", "del_20", "minusy",
                         "add_8", "add_11", "add_13", "add_21", "add_22",
                         "t_9_22", "t_9_11", "t_v_11", "complex",
                         "FLT3_TKD", "FLT3_other", "minus7")]

# Inner-join the four tables on ID, one at a time, showing the dimensions
# after each step so that lost patients are visible.
nejm_data <- merge(data_clin, data2, by = "ID")
dim(nejm_data)
nejm_data <- merge(nejm_data, data_itd, by = "ID")
dim(nejm_data)
nejm_data <- merge(nejm_data, data_genes, by = "ID")   # not the same number of patients
dim(nejm_data)
nejm_data <- merge(nejm_data, val_data, by = "ID")

# Reorder the merged columns by position. The groups below are inferred from
# the column lists of the merged tables — TODO confirm with colnames(nejm_data).
column_order <- c(
  1,             # ID
  10, 11, 17:69, # presumably NPM1, CEBPA_bi, ITD and the gene indicators
  88, 89,        # presumably FLT3_TKD, FLT3_other
  12:16,         # presumably inv_16, t_15_17, t_8_21, t_6_9, inv_3
  84:87,         # presumably t_9_22, t_9_11, t_v_11, complex
  70:83,         # presumably the del_*, minusy and add_* columns
  8, 9,          # presumably OS, OS_Status
  2:7,           # presumably Gender ... WBC
  90             # presumably minus7
)
nejm_data <- nejm_data[, column_order]

# Use the patient ID as row name, then drop the ID column itself.
row.names(nejm_data) <- nejm_data$ID
dim(nejm_data)
nejm_data$ID <- NULL

# Complete-case analysis: discard every row that has a missing value.
nejm_data <- na.omit(nejm_data)
write.table(nejm_data, file = "nejm_data.tsv")

In [91]:
# Compare tmp.tsv with nejm_data.tsv after reading both files the same way.
# `header = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned.
all.equal(
  read.table("tmp.tsv", sep = " ", header = TRUE),
  read.table("nejm_data.tsv", sep = " ", header = TRUE)
)

# DATASET ITD SEPARATION FOR COMPONENT

In [96]:
# Build the full_component_* indicators (ITD-aware variant) and write them,
# together with the input columns, to full_data_validation.tsv.

# TRUE for every row in which at least one of the named full_component_*
# indicators equals 1 (element-wise OR over the columns).
has_component <- function(df, components) {
  Reduce(`|`, lapply(paste0("full_component_", components),
                     function(col) df[[col]] == 1))
}

# Reset one full_component_* indicator to 0 on the rows that carry it and are
# flagged in `blocked`. Returns the modified data frame.
clear_component <- function(df, component, blocked) {
  col <- paste0("full_component_", component)
  df[df[[col]] == 1 & blocked, col] <- 0
  df
}

tmp <- read.table("nejm_data_eln.tsv", sep = " ", header = TRUE)
rownames(tmp) <- tmp$X
tmp$X <- NULL
genes <- colnames(tmp)[1:57]
cyto <- colnames(tmp)[58:80]

factors <- c("ZRSR2", "U2AF1", "SRSF2", "SF3B1", "ASXL1", "STAG2", "BCOR",
             "RUNX1", "EZH2", "MLL", "PHF6", "SF1", "NF1", "CUX1")    # "SETBP1" missing
# Uses the underscore spellings (inv_16, t_15_17, ...), i.e. the column names
# this block actually reads from tmp.
mol_defining <- c("NPM1", "CEBPA_bi", "inv_16", "t_15_17", "t_8_21", "t_6_9",
                  "inv_3", "DNMT3A", "IDH1", "IDH2", "WT1", factors)

# Hoisted row totals; the underlying columns are never modified below.
n_events <- rowSums(tmp[, c(genes, cyto)])
n_factor_hits <- rowSums(tmp[, factors])

# ---- Initial indicators (column order is part of the output file) ----
tmp$full_component_t_15_17 <- ifelse(tmp$t_15_17 == 1, 1, 0)

tmp$full_component_inv_16 <- ifelse(tmp$inv_16 == 1, 1, 0)

tmp$full_component_t_8_21 <- ifelse(tmp$t_8_21 == 1 & tmp$ITD == 0, 1, 0)
tmp$full_component_t_8_21_ITD <- ifelse(tmp$t_8_21 == 1 & tmp$ITD == 1, 1, 0)

tmp$full_component_t_11 <- ifelse(tmp$t_v_11 == 1 | tmp$t_9_11 == 1, 1, 0)

tmp$full_component_t_6_9 <- ifelse(tmp$t_6_9 == 1 & tmp$ITD == 0, 1, 0)
tmp$full_component_t_6_9_ITD <- ifelse(tmp$t_6_9 == 1 & tmp$ITD == 1, 1, 0)

tmp$full_component_inv_3 <- ifelse(tmp$inv_3 == 1, 1, 0)

tmp$full_component_additions <- ifelse(tmp$add_8 == 1 | tmp$add_13 == 1 |
                                         tmp$add_21 == 1 | tmp$add_22 == 1, 1, 0)

tmp$full_component_TP53_complex <- ifelse(tmp$TP53 == 1 | tmp$complex == 1, 1, 0)

tmp$full_component_NPM1 <- ifelse(tmp$NPM1 == 1 & tmp$ITD == 0, 1, 0)
tmp$full_component_NPM1_ITD <- ifelse(tmp$NPM1 == 1 & tmp$ITD == 1, 1, 0)

tmp$full_component_CEBPA_bi <- ifelse(tmp$CEBPA_bi == 1 & tmp$ITD == 0, 1, 0)
tmp$full_component_CEBPA_bi_ITD <- ifelse(tmp$CEBPA_bi == 1 & tmp$ITD == 1, 1, 0)

tmp$full_component_DNMT3A_IDH1_2 <- ifelse(tmp$DNMT3A == 1 & (tmp$IDH1 == 1 | tmp$IDH2 == 1) & tmp$ITD == 0, 1, 0)
tmp$full_component_DNMT3A_IDH1_2_ITD <- ifelse(tmp$DNMT3A == 1 & (tmp$IDH1 == 1 | tmp$IDH2 == 1) & tmp$ITD == 1, 1, 0)

tmp$full_component_WT1 <- ifelse(tmp$WT1 == 1 & tmp$ITD == 0, 1, 0)
tmp$full_component_WT1_ITD <- ifelse(tmp$WT1 == 1 & tmp$ITD == 1, 1, 0)

tmp$full_component_chr_splicing_multiple <- ifelse(n_factor_hits > 1, 1, 0)

tmp$full_component_chr_splicing_1 <- ifelse(n_factor_hits == 1, 1, 0)

# ---- Conflict resolution (order matters: later steps read earlier results) ----
# Components that are never cleared below; they take precedence everywhere.
primary <- c("t_15_17", "inv_16", "t_8_21", "t_8_21_ITD", "t_11",
             "t_6_9", "t_6_9_ITD", "inv_3",
             "NPM1", "NPM1_ITD", "CEBPA_bi", "CEBPA_bi_ITD")

# Handling additions
# 1) cleared when any primary, DNMT3A/IDH or WT1 component is present
#    (DNMT3A/IDH and WT1 are read here before their own clean-up).
tmp <- clear_component(
  tmp, "additions",
  has_component(tmp, c(primary, "DNMT3A_IDH1_2", "DNMT3A_IDH1_2_ITD", "WT1", "WT1_ITD"))
)
# 2) cleared when TP53/complex is present without any of the listed deletions.
deletion_cols <- c("del_7", "del_17", "del_5", "del_9", "minusy",
                   "del_20", "del_18", "del_12", "del_3")
any_deletion <- Reduce(`|`, lapply(deletion_cols, function(col) tmp[[col]] == 1))
tmp <- clear_component(tmp, "additions",
                       tmp$full_component_TP53_complex == 1 & !any_deletion)

# Mask shared by the remaining steps: any primary component or TP53/complex.
blocked <- has_component(tmp, c(primary, "TP53_complex"))

# Handling DNMT3A IDH (also blocked by the cleaned-up additions component)
dnmt3a_blocked <- blocked | tmp$full_component_additions == 1
tmp <- clear_component(tmp, "DNMT3A_IDH1_2", dnmt3a_blocked)
tmp <- clear_component(tmp, "DNMT3A_IDH1_2_ITD", dnmt3a_blocked)

# Handling WT1
tmp <- clear_component(tmp, "WT1", blocked)
tmp <- clear_component(tmp, "WT1_ITD", blocked)

# Handling chr_splicing_1
tmp <- clear_component(tmp, "chr_splicing_1", blocked)

# ---- Residual classes ----
# Rows with at least one event but no component so far, split by ITD status.
unassigned <- rowSums(tmp[, startsWith(colnames(tmp), "full_component_")]) == 0 & n_events > 0
tmp$full_component_not_assigned <- ifelse(unassigned & tmp$ITD == 0, 1, 0)
tmp$full_component_not_assigned_ITD <- ifelse(unassigned & tmp$ITD == 1, 1, 0)

tmp$full_component_no_events <- ifelse(n_events == 0, 1, 0)

# overlap = 1 for rows that ended up in more than one component.
tmp$overlap <- 0
tmp[rowSums(tmp[, startsWith(colnames(tmp), "full_component_")]) > 1, "overlap"] <- 1

tmp <- cbind(tmp, dummy("eln_2017", tmp, sep = "_"))

write.table(tmp, "full_data_validation.tsv")

# DATASET NO ITD COMPONENT SEPARATION

In [102]:
# Build the full_component_* indicators (variant without the ITD split) and
# write them, together with the input columns, to
# full_data_validation_NO_ITD.tsv.

# TRUE for every row in which at least one of the named full_component_*
# indicators equals 1 (element-wise OR over the columns).
has_component <- function(df, components) {
  Reduce(`|`, lapply(paste0("full_component_", components),
                     function(col) df[[col]] == 1))
}

# Reset one full_component_* indicator to 0 on the rows that carry it and are
# flagged in `blocked`. Returns the modified data frame.
clear_component <- function(df, component, blocked) {
  col <- paste0("full_component_", component)
  df[df[[col]] == 1 & blocked, col] <- 0
  df
}

tmp <- read.table("nejm_data_eln.tsv", sep = " ", header = TRUE)
rownames(tmp) <- tmp$X
tmp$X <- NULL
genes <- colnames(tmp)[1:57]
cyto <- colnames(tmp)[58:80]

factors <- c("ZRSR2", "U2AF1", "SRSF2", "SF3B1", "ASXL1", "STAG2", "BCOR",
             "RUNX1", "EZH2", "MLL", "PHF6", "SF1", "NF1", "CUX1")    # "SETBP1" missing
# Uses the underscore spellings (inv_16, t_15_17, ...), i.e. the column names
# this block actually reads from tmp.
mol_defining <- c("NPM1", "CEBPA_bi", "inv_16", "t_15_17", "t_8_21", "t_6_9",
                  "inv_3", "DNMT3A", "IDH1", "IDH2", "WT1", factors)

# Hoisted row totals; the underlying columns are never modified below.
n_events <- rowSums(tmp[, c(genes, cyto)])
n_factor_hits <- rowSums(tmp[, factors])

# ---- Initial indicators (column order is part of the output file) ----
tmp$full_component_t_15_17 <- ifelse(tmp$t_15_17 == 1, 1, 0)

tmp$full_component_inv_16 <- ifelse(tmp$inv_16 == 1, 1, 0)

tmp$full_component_t_8_21 <- ifelse(tmp$t_8_21 == 1, 1, 0)

tmp$full_component_t_11 <- ifelse(tmp$t_v_11 == 1 | tmp$t_9_11 == 1, 1, 0)

tmp$full_component_t_6_9 <- ifelse(tmp$t_6_9 == 1, 1, 0)

tmp$full_component_inv_3 <- ifelse(tmp$inv_3 == 1, 1, 0)

tmp$full_component_additions <- ifelse(tmp$add_8 == 1 | tmp$add_13 == 1 |
                                         tmp$add_21 == 1 | tmp$add_22 == 1, 1, 0)

tmp$full_component_TP53_complex <- ifelse(tmp$TP53 == 1 | tmp$complex == 1, 1, 0)

tmp$full_component_NPM1 <- ifelse(tmp$NPM1 == 1, 1, 0)

tmp$full_component_CEBPA_bi <- ifelse(tmp$CEBPA_bi == 1, 1, 0)

tmp$full_component_DNMT3A_IDH1_2 <- ifelse(tmp$DNMT3A == 1 & (tmp$IDH1 == 1 | tmp$IDH2 == 1), 1, 0)

tmp$full_component_WT1 <- ifelse(tmp$WT1 == 1, 1, 0)

tmp$full_component_chr_splicing_multiple <- ifelse(n_factor_hits > 1, 1, 0)

tmp$full_component_chr_splicing_1 <- ifelse(n_factor_hits == 1, 1, 0)

# ---- Conflict resolution (order matters: later steps read earlier results) ----
# Components that are never cleared below; they take precedence everywhere.
primary <- c("t_15_17", "inv_16", "t_8_21", "t_11", "t_6_9", "inv_3",
             "NPM1", "CEBPA_bi")

# Handling additions
# 1) cleared when any primary, DNMT3A/IDH or WT1 component is present
#    (DNMT3A/IDH and WT1 are read here before their own clean-up).
tmp <- clear_component(tmp, "additions",
                       has_component(tmp, c(primary, "DNMT3A_IDH1_2", "WT1")))
# 2) cleared when TP53/complex is present without any of the listed deletions.
deletion_cols <- c("del_7", "del_17", "del_5", "del_9", "minusy",
                   "del_20", "del_18", "del_12", "del_3")
any_deletion <- Reduce(`|`, lapply(deletion_cols, function(col) tmp[[col]] == 1))
tmp <- clear_component(tmp, "additions",
                       tmp$full_component_TP53_complex == 1 & !any_deletion)

# Mask shared by the remaining steps: any primary component or TP53/complex.
blocked <- has_component(tmp, c(primary, "TP53_complex"))

# Handling DNMT3A IDH (also blocked by the cleaned-up additions component)
tmp <- clear_component(tmp, "DNMT3A_IDH1_2",
                       blocked | tmp$full_component_additions == 1)

# Handling WT1
tmp <- clear_component(tmp, "WT1", blocked)

# Handling chr_splicing_1
tmp <- clear_component(tmp, "chr_splicing_1", blocked)

# ---- Residual classes ----
# Rows with at least one event but no component so far. The `n_events > 0`
# guard keeps event-free rows out of this class; without it they would be
# flagged as both not_assigned and no_events and therefore as overlapping.
tmp$full_component_not_assigned <- ifelse(
  rowSums(tmp[, startsWith(colnames(tmp), "full_component_")]) == 0 & n_events > 0,
  1, 0
)

tmp$full_component_no_events <- ifelse(n_events == 0, 1, 0)

# overlap = 1 for rows that ended up in more than one component.
tmp$overlap <- 0
tmp[rowSums(tmp[, startsWith(colnames(tmp), "full_component_")]) > 1, "overlap"] <- 1

tmp <- cbind(tmp, dummy("eln_2017", tmp, sep = "_"))

write.table(tmp, "full_data_validation_NO_ITD.tsv")