---
# Visualize LISI metric and runtime
*L.Richards*  
*2021-06-09*    
*/cluster/projects/pughlab/projects/cancer_scrna_integration/figures*    

---

https://davemcg.github.io/post/lets-plot-scrna-dotplots/

In [6]:
library(ggplot2)
library(ggpubr)
library(data.table)
library(ComplexHeatmap)
library(tidyr)
library(viridis)

setwd("~/Desktop/H4H/pughlab/projects/cancer_scrna_integration/figures")

---
## 1.0 Calculate median LISI scores
---

Make a big matrix of metadata across all samples; this will require formatting the header columns to be the same.

In [8]:
# Read the per-cell LISI calculations table with data.table's fread()
lisi <- fread("~/Desktop/H4H/pughlab/projects/cancer_scrna_integration/evalutation/lisi/LISI_calculations.csv")

In [9]:
# Convert the loaded LISI table to a plain data.frame. data.frame() is used
# deliberately: it makes duplicated column names unique, which is where the
# "V1.1" column referenced below comes from.
lisi <- data.frame(lisi)

# Drop the V1 column, then promote V1.1 to row names and drop it as well
lisi[["V1"]] <- NULL
rownames(lisi) <- lisi[["V1.1"]]
lisi[["V1.1"]] <- NULL

---
### 1.1 Batch LISI for Samples and PatientIDs
---


In [None]:
# Build a combined "<Study>_<Method>" identifier for every row
lisi$Study_Method <- paste(lisi$Study, lisi$Method, sep = "_")

In [None]:
# Median normalized batch LISI per Study_Method, computed separately for the
# SampleID and PatientID batch variables. Both tables end up with the columns
# Study_Method, Median_LISI, Category so they can be stacked later.

# SAMPLEID
sample <- aggregate(
    LISI_SampleID_Norm ~ Study_Method,
    data = lisi,
    FUN = median
)
names(sample)[2] <- "Median_LISI"
sample$Category <- "SampleID"

# Patient ID
patient <- aggregate(
    LISI_PatientID_Norm ~ Study_Method,
    data = lisi,
    FUN = median
)
names(patient)[2] <- "Median_LISI"
patient$Category <- "PatientID"

In [None]:
# Stack the SampleID and PatientID median tables row-wise into one data frame
# (both share the columns Study_Method, Median_LISI, Category)
lisi_batch <- rbind(sample, patient)

---
### 1.2 Cell Type LISI for SampleID
---

In [None]:
# CELL TYPE
# Key every row by "<Study>_<Method>_<CellType>" and take the median
# normalized cell-type LISI within each key.
lisi$Study_Method_CellType <- paste(lisi$Study_Method, lisi$CellType, sep = "_")
celltype <- aggregate(
    LISI_CellType_SampleID_Norm ~ Study_Method_CellType,
    data = lisi,
    FUN = median
)
celltype$Category <- "SampleID"


In [None]:
# Split each "Study_Method_CellType" key on "_" once. The first two tokens are
# the Study and the Method; everything after them is the cell-type label,
# which may itself contain underscores (e.g. "T_cells").
key_parts <- strsplit(celltype$Study_Method_CellType, "_")

# extract cell types
# vapply() guarantees exactly one string per row. sapply() changes its return
# type with the input (vector, matrix or list), so it can produce the wrong
# number of values when every key has the same number of tokens.
cell_type_labels <- vapply(
    key_parts,
    function(tokens) paste(tokens[-c(1:2)], collapse = "_"),
    character(1)
)

# reformat studyID
study_method <- vapply(
    key_parts,
    function(tokens) paste(tokens[1], tokens[2], sep = "_"),
    character(1)
)

# add celltypes to dataframe
celltype$Category <- cell_type_labels
celltype$Study_Method_CellType <- study_method

# rename columns
colnames(celltype) <- c("Study_Method", "Median_LISI", "Category")

---
## 2.0 Visualize LISI scores with dotplot
---

In [None]:
# List the distinct cell-type labels now stored in celltype$Category
unique(celltype$Category)

In [None]:
# Stack the batch-level and cell-type-level medians into one table; both
# inputs carry the columns Study_Method, Median_LISI, Category.
res <- rbind(lisi_batch, celltype)
res$Category <- factor(res$Category,
                       levels = c(unique(celltype$Category), "PatientID", "SampleID")
                      )

# remove lisi batch patient, since not all cohorts have multiple samples
# from the same patient anyways
res <- res[res$Category != "PatientID", ]

# row.names = FALSE: after the filter above the row names are just leftover,
# non-contiguous indices; writing them would add a meaningless "X" column
# when the file is read back with read.csv().
write.csv(res, file = "LISI_integration_median.csv", row.names = FALSE)

In [None]:
# Reload the medians written to LISI_integration_median.csv and preview the
# first rows
res <- read.csv("LISI_integration_median.csv")
head(res)

In [None]:
# Reload the saved medians from disk
res <- read.csv("LISI_integration_median.csv")

# order cells (y-axis): cell-type categories first, in order of first
# appearance, followed by the two batch-level categories
observed_categories <- unique(res$Category)
is_batch_category <- observed_categories %in% c("PatientID", "SampleID")
cells <- as.character(observed_categories[!is_batch_category])
res$Category <- factor(res$Category, levels = c(cells, "PatientID", "SampleID"))

In [None]:
# order study-method (x-axis)
method_order <- c("NoBatchCorrection", "Conos", "fastmnn", "Harmony", "RPCA", "STACAS")
study_order <- c("Richards-GBM-LGG", "Bi-RCC", "Caron-ALL", "Ma-LIHC", "Yost-BCC")

# Build every "<study>_<method>" label in one vectorized call: studies vary
# slowest, and the methods cycle in method_order within each study.
levels <- paste0(
    rep(study_order, each = length(method_order)),
    "_",
    rep(method_order, times = length(study_order))
)

# factor() silently turns any value that is not listed in `levels` into NA,
# which would drop those points from the plot; surface that case explicitly.
unmatched <- setdiff(unique(as.character(res$Study_Method)), levels)
if (length(unmatched) > 0) {
    warning(
        "Study_Method values without a matching level (will become NA): ",
        paste(unmatched, collapse = ", "),
        call. = FALSE
    )
}

res$Study_Method <- factor(res$Study_Method, levels = levels)

In [None]:
# plot dotplot
# One point per Study_Method (x) and Category (y); both the fill colour and
# the point size encode the median LISI.
lisi_dotplot <- ggplot(res, aes(x = Study_Method, y = Category, fill = Median_LISI, size = Median_LISI)) +
    geom_point(pch = 21) +
    scale_fill_viridis_c(option = "plasma", name = 'Median LISI') +
    cowplot::theme_cowplot() +
    ylab('') + xlab('') +
    theme(axis.text.x = element_text(angle = 90, hjust = 0.95, size = 12)) +
    theme(axis.text.y = element_text(size = 15))

# Print explicitly so the figure is drawn even when this code is run through
# source(), where top-level values are not auto-printed. `finally` closes the
# PDF device even if rendering fails, so no device is left open.
pdf("Figure2_LISI_Dotplot.pdf", width = 13, height = 8.5)
tryCatch(
    print(lisi_dotplot),
    finally = dev.off()
)

---
## 3.0 Visualize LISI scores with boxplot
---

In [30]:
# Reload the LISI calculations and convert them to a plain data.frame.
# data.frame() makes duplicated column names unique, which is where the
# "V1.1" column referenced below comes from.
lisi <- data.frame(
    fread("~/Desktop/H4H/pughlab/projects/cancer_scrna_integration/evalutation/lisi/LISI_calculations.csv")
)

# Drop the V1 column, then promote V1.1 to row names and drop it as well
lisi[["V1"]] <- NULL
rownames(lisi) <- lisi[["V1.1"]]
lisi[["V1.1"]] <- NULL

In [31]:
# Build a combined "<Study>_<Method>" identifier for every row
lisi$Study_Method <- paste(lisi$Study, lisi$Method, sep = "_")

---
### 3.1 Boxplot of dataset LISI scores
---

In [52]:
# Show the current working directory; the PDFs below are opened with relative
# file names, so this is where they are written
getwd()

In [32]:
# Fix the display order of the integration methods
lisi$Method <- factor(
    lisi$Method,
    levels = c("NoBatchCorrection", "Conos", "fastmnn", "Harmony", "RPCA", "STACAS")
)

### remove malignant cells
# Keep every row whose CellType is not "Malignant"
lisi_normal <- lisi[lisi$CellType != "Malignant", ]

In [55]:
# One fill colour per integration method
cols <- c("#E6AB02", "#1B9E77", "#D95F02", "#7570B3", "#E7298A", "#66A61E")

### across entire dataset
#options(repr.plot.width=11, repr.plot.height=5)
dataset_boxplot <- ggboxplot(lisi,
          x = "Study",
          y = "LISI_SampleID_Norm",
          fill = "Method",
          ylab = "Normalized LISI Score",
          xlab = "",
          palette = cols,
          ylim = c(0,1),
          outlier.shape = 1,
          legend = "right",
          main = "Batch mixing across entire dataset"
         ) + border()

### across all normal cells
#options(repr.plot.width=11, repr.plot.height=5)
normal_boxplot <- ggboxplot(lisi_normal,
          x = "Study",
          y = "LISI_CellType_SampleID_Norm",
          fill = "Method",
          palette = cols,
          ylab = "Normalized Cell Type LISI Score",
          xlab = "",
          ylim = c(0,1),
          outlier.shape = 1,
          legend = "right",
          main = "Batch mixing within normal cells"
         ) + border()

# Write both plots to one PDF, one page each. Printing explicitly makes the
# pages appear even when this code is run through source(), where top-level
# values are not auto-printed. `finally` closes the device even if rendering
# fails, so no device is left open.
pdf("LISI_Boxplots.pdf", width = 11, height = 5)
tryCatch(
    {
        print(dataset_boxplot)
        print(normal_boxplot)
    },
    finally = dev.off()
)

In [71]:
### median cell-type LISI for T cells in the Caron-ALL study, per Study_Method
t_cells <- lisi[lisi$CellType == "T_cells", ]
test <- t_cells[t_cells$Study == "Caron-ALL", ]
aggregate(LISI_CellType_SampleID_Norm ~ Study_Method, data = test, FUN = median)

Study_Method,LISI_CellType_SampleID_Norm
<chr>,<dbl>
Caron-ALL_Conos,0.3190285
Caron-ALL_fastmnn,0.3237171
Caron-ALL_Harmony,0.3242445
Caron-ALL_NoBatchCorrection,0.1007892
Caron-ALL_RPCA,0.3196658
Caron-ALL_STACAS,0.1067079


In [70]:
### median LISI for T cells Richards
# Restrict to the Richards-GBM-LGG study so the filter agrees with the heading
# above and with the Richards-GBM-LGG_* rows recorded as this cell's output.
test <- lisi[lisi$CellType == "T_cells", ]
test <- test[test$Study == "Richards-GBM-LGG", ]
aggregate(LISI_CellType_SampleID_Norm ~ Study_Method, test, median)

Study_Method,LISI_CellType_SampleID_Norm
<chr>,<dbl>
Richards-GBM-LGG_Conos,0.3221801
Richards-GBM-LGG_fastmnn,0.3360566
Richards-GBM-LGG_Harmony,0.2822244
Richards-GBM-LGG_NoBatchCorrection,0.2602462
Richards-GBM-LGG_RPCA,0.2877669
Richards-GBM-LGG_STACAS,0.1890623
