---
## Visualize UMAPs of different batch correction methods across datasets
*L.Richards*  
*2021-06-14*  
*/cluster/projects/pughlab/projects/cancer_scrna_integration/figures* 

---

Plot one grid per dataset. Each grid has 6 columns (the five batch-correction methods plus the uncorrected data) and 3 rows, coloring cells by SampleID, PatientID and cell type. 

---
### 1.0 Extract metadata from all datasets and integrations (H4H)
---

Output metadata so we can plot locally 

In [None]:
library(Seurat)
library(taRifx)

# set up parameters
# directory containing the integration .rds files
integration.results <- "~/pughlab/projects/cancer_scrna_integration/integration"
# bare file names (no directory prefix) ending in ".rds"; the dot is escaped so
# it is matched literally instead of acting as a regex wildcard
files <- list.files(integration.results, pattern = "\\.rds$")

# for each study, harmonize the metadata columns
# SampleID, PatientID, CellType
# and extract coords for each integration

#### 1.1 Caron-ALL

In [None]:
# 1) Caron-ALL
# already has SampleID, PatientID and CellType in @meta.data

study <- "Caron-ALL"
# match the study name literally (fixed = TRUE) rather than as a regex
study.files <- grep(study, files, fixed = TRUE, value = TRUE)
if (length(study.files) == 0) {
    stop("No integration .rds files found for study: ", study, call. = FALSE)
}

# metadata columns kept for plotting (identical for every method)
keep <- c("Dataset", "Method",
          "SampleID", "PatientID",
          "CellType", "Reduction",
          "Coords_1", "Coords_2"
         )

# one metadata data.frame per integration method, keyed by method name
metas <- list()

for (i in seq_along(study.files)){

   # `files` holds bare file names (list.files() without full.names), so the
   # integration directory has to be prepended before reading
   dat <- readRDS(file.path(integration.results, study.files[i]))

   # method name is the second "_"-delimited field of the file name
   method <- strsplit(study.files[i], "_")[[1]][2]
   if (is.na(method)) {
       stop("Cannot parse integration method from file name: ",
            study.files[i], call. = FALSE)
   }

   dat@meta.data$Dataset <- study
   dat@meta.data$Method <- method

   # extract reduction coordinates for method
   # Conos objects are read from the largeVis reduction, all others from umap
   if (method == "Conos"){
       reduction.label <- "LARGEVIS"
       embedding <- dat@reductions$largeVis@cell.embeddings
   } else {
       reduction.label <- "UMAP"
       embedding <- dat@reductions$umap@cell.embeddings
   }
   dat@meta.data$Reduction <- reduction.label
   dat@meta.data$Coords_1 <- embedding[ ,1]
   dat@meta.data$Coords_2 <- embedding[ ,2]

   # filter metadata to the relevant columns and convert factors away
   metas[[method]] <- remove.factors(dat@meta.data[ ,keep])

}

# combine metadata across methods into one big df
metas <- do.call(rbind, metas)

# save results dataframe
meta.file <- paste0("~/pughlab/projects/cancer_scrna_integration/figures/",
                    study, "_MergedMeta.csv"
                   )
write.csv(metas, file = meta.file)

#### 1.2 Ma-LIHC

In [None]:
# 2) Ma-LIHC

study <- "Ma-LIHC"
# match the study name literally (fixed = TRUE) rather than as a regex
study.files <- grep(study, files, fixed = TRUE, value = TRUE)
if (length(study.files) == 0) {
    stop("No integration .rds files found for study: ", study, call. = FALSE)
}

# metadata columns kept for plotting (identical for every method)
keep <- c("Dataset", "Method",
          "SampleID", "PatientID",
          "CellType", "Reduction",
          "Coords_1", "Coords_2"
         )

# one metadata data.frame per integration method, keyed by method name
metas <- list()

for (i in seq_along(study.files)){

   # `files` holds bare file names (list.files() without full.names), so the
   # integration directory has to be prepended before reading
   dat <- readRDS(file.path(integration.results, study.files[i]))

   # method name is the second "_"-delimited field of the file name
   method <- strsplit(study.files[i], "_")[[1]][2]
   if (is.na(method)) {
       stop("Cannot parse integration method from file name: ",
            study.files[i], call. = FALSE)
   }

   dat@meta.data$Dataset <- study
   dat@meta.data$Method <- method

   # rename appropriate meta columns
   # NOTE(review): every column whose name contains "Sample" is renamed;
   # this assumes exactly one such column exists -- confirm
   colnames(dat@meta.data)[grep("Sample", colnames(dat@meta.data))] <- "SampleID"
   # PatientID is filled with a copy of SampleID for this study
   dat@meta.data$PatientID <- dat@meta.data$SampleID

   # extract reduction coordinates for method
   # Conos objects are read from the largeVis reduction, all others from umap
   if (method == "Conos"){
       reduction.label <- "LARGEVIS"
       embedding <- dat@reductions$largeVis@cell.embeddings
   } else {
       reduction.label <- "UMAP"
       embedding <- dat@reductions$umap@cell.embeddings
   }
   dat@meta.data$Reduction <- reduction.label
   dat@meta.data$Coords_1 <- embedding[ ,1]
   dat@meta.data$Coords_2 <- embedding[ ,2]

   # filter metadata to the relevant columns and convert factors away
   metas[[method]] <- remove.factors(dat@meta.data[ ,keep])

}

# combine metadata across methods into one big df
metas <- do.call(rbind, metas)

# save results dataframe
meta.file <- paste0("~/pughlab/projects/cancer_scrna_integration/figures/",
                    study, "_MergedMeta.csv"
                   )
write.csv(metas, file = meta.file)

#### 1.3 Yost-BCC

In [None]:
# 3) Yost-BCC

study <- "Yost-BCC"
# match the study name literally (fixed = TRUE) rather than as a regex
study.files <- grep(study, files, fixed = TRUE, value = TRUE)
if (length(study.files) == 0) {
    stop("No integration .rds files found for study: ", study, call. = FALSE)
}

# metadata columns kept for plotting (identical for every method)
keep <- c("Dataset", "Method",
          "SampleID", "PatientID",
          "CellType", "Reduction",
          "Coords_1", "Coords_2"
         )

# one metadata data.frame per integration method, keyed by method name
metas <- list()

for (i in seq_along(study.files)){

   # `files` holds bare file names (list.files() without full.names), so the
   # integration directory has to be prepended before reading
   dat <- readRDS(file.path(integration.results, study.files[i]))

   # method name is the second "_"-delimited field of the file name
   method <- strsplit(study.files[i], "_")[[1]][2]
   if (is.na(method)) {
       stop("Cannot parse integration method from file name: ",
            study.files[i], call. = FALSE)
   }

   dat@meta.data$Dataset <- study
   dat@meta.data$Method <- method

   # rename appropriate meta columns
   # NOTE(review): every column whose name contains "patient" is renamed;
   # this assumes exactly one such column exists -- confirm
   colnames(dat@meta.data)[grep("patient", colnames(dat@meta.data))] <- "PatientID"

   # extract reduction coordinates for method
   # Conos objects are read from the largeVis reduction, all others from umap
   if (method == "Conos"){
       reduction.label <- "LARGEVIS"
       embedding <- dat@reductions$largeVis@cell.embeddings
   } else {
       reduction.label <- "UMAP"
       embedding <- dat@reductions$umap@cell.embeddings
   }
   dat@meta.data$Reduction <- reduction.label
   dat@meta.data$Coords_1 <- embedding[ ,1]
   dat@meta.data$Coords_2 <- embedding[ ,2]

   # filter metadata to the relevant columns and convert factors away
   metas[[method]] <- remove.factors(dat@meta.data[ ,keep])

}

# combine metadata across methods into one big df
metas <- do.call(rbind, metas)

# save results dataframe
meta.file <- paste0("~/pughlab/projects/cancer_scrna_integration/figures/",
                    study, "_MergedMeta.csv"
                   )
write.csv(metas, file = meta.file)

#### 1.4 Richards-GBM-LGG

In [None]:
# 4) Richards-GBM-LGG

study <- "Richards-GBM-LGG"
# match the study name literally (fixed = TRUE) rather than as a regex
study.files <- grep(study, files, fixed = TRUE, value = TRUE)
if (length(study.files) == 0) {
    stop("No integration .rds files found for study: ", study, call. = FALSE)
}

# metadata columns kept for plotting (identical for every method)
keep <- c("Dataset", "Method",
          "SampleID", "PatientID",
          "CellType", "Reduction",
          "Coords_1", "Coords_2"
         )

# one metadata data.frame per integration method, keyed by method name
metas <- list()

for (i in seq_along(study.files)){

   # `files` holds bare file names (list.files() without full.names), so the
   # integration directory has to be prepended before reading
   dat <- readRDS(file.path(integration.results, study.files[i]))

   # method name is the second "_"-delimited field of the file name
   method <- strsplit(study.files[i], "_")[[1]][2]
   if (is.na(method)) {
       stop("Cannot parse integration method from file name: ",
            study.files[i], call. = FALSE)
   }

   dat@meta.data$Dataset <- study
   dat@meta.data$Method <- method

   # rename appropriate meta columns
   colnames(dat@meta.data)[grep("PairID", colnames(dat@meta.data))] <- "PatientID"
   colnames(dat@meta.data)[grep("SingleR_CollapsedLabels", colnames(dat@meta.data))] <- "CellType"

   # rename cell labels to match other studies
   # the optional trailing "s" keeps the substitution idempotent: a label that
   # is already plural is not turned into "T_cellss" / "Macrophagess"
   cell.labels <- dat@meta.data$CellType
   cell.labels <- gsub("Tcells?", "T_cells", cell.labels)
   cell.labels <- gsub("Vascular", "Endothelial", cell.labels)
   cell.labels <- gsub("Macrophages?", "Macrophages", cell.labels)
   # microglia are merged into the Macrophages label
   cell.labels <- gsub("Microglia", "Macrophages", cell.labels)
   dat@meta.data$CellType <- cell.labels

   # extract reduction coordinates for method
   # Conos objects are read from the largeVis reduction, all others from umap
   if (method == "Conos"){
       reduction.label <- "LARGEVIS"
       embedding <- dat@reductions$largeVis@cell.embeddings
   } else {
       reduction.label <- "UMAP"
       embedding <- dat@reductions$umap@cell.embeddings
   }
   dat@meta.data$Reduction <- reduction.label
   dat@meta.data$Coords_1 <- embedding[ ,1]
   dat@meta.data$Coords_2 <- embedding[ ,2]

   # filter metadata to the relevant columns and convert factors away
   metas[[method]] <- remove.factors(dat@meta.data[ ,keep])

}

# combine metadata across methods into one big df
metas <- do.call(rbind, metas)

# save results dataframe
meta.file <- paste0("~/pughlab/projects/cancer_scrna_integration/figures/",
                    study, "_MergedMeta.csv"
                   )
write.csv(metas, file = meta.file)

#### 1.5 Bi-RCC

In [None]:
# 5) Bi-RCC
# no columns are renamed here, so SampleID, PatientID and CellType must
# already be present in @meta.data for the column filter below to succeed

study <- "Bi-RCC"
# match the study name literally (fixed = TRUE) rather than as a regex
study.files <- grep(study, files, fixed = TRUE, value = TRUE)
if (length(study.files) == 0) {
    stop("No integration .rds files found for study: ", study, call. = FALSE)
}

# metadata columns kept for plotting (identical for every method)
keep <- c("Dataset", "Method",
          "SampleID", "PatientID",
          "CellType", "Reduction",
          "Coords_1", "Coords_2"
         )

# one metadata data.frame per integration method, keyed by method name
metas <- list()

for (i in seq_along(study.files)){

   # `files` holds bare file names (list.files() without full.names), so the
   # integration directory has to be prepended before reading
   dat <- readRDS(file.path(integration.results, study.files[i]))

   # method name is the second "_"-delimited field of the file name
   method <- strsplit(study.files[i], "_")[[1]][2]
   if (is.na(method)) {
       stop("Cannot parse integration method from file name: ",
            study.files[i], call. = FALSE)
   }

   dat@meta.data$Dataset <- study
   dat@meta.data$Method <- method

   # rename appropriate meta columns (disabled for this study)
   # colnames(dat@meta.data)[grep("patient", colnames(dat@meta.data))] <- "PatientID"

   # extract reduction coordinates for method
   # Conos objects are read from the largeVis reduction, all others from umap
   if (method == "Conos"){
       reduction.label <- "LARGEVIS"
       embedding <- dat@reductions$largeVis@cell.embeddings
   } else {
       reduction.label <- "UMAP"
       embedding <- dat@reductions$umap@cell.embeddings
   }
   dat@meta.data$Reduction <- reduction.label
   dat@meta.data$Coords_1 <- embedding[ ,1]
   dat@meta.data$Coords_2 <- embedding[ ,2]

   # filter metadata to the relevant columns and convert factors away
   metas[[method]] <- remove.factors(dat@meta.data[ ,keep])

}

# combine metadata across methods into one big df
metas <- do.call(rbind, metas)

# save results dataframe
meta.file <- paste0("~/pughlab/projects/cancer_scrna_integration/figures/",
                    study, "_MergedMeta.csv"
                   )
write.csv(metas, file = meta.file)

---
### 2.0 Plot UMAPs across datasets and methods (local)
---

In [1]:
library(ggplot2)
library(ggpubr)
library(ggExtra)
library(gridExtra)
library(data.table)
library(ggrepel)
library(dplyr)
library(cowplot)

# work from the local figures directory: the merged metadata CSVs are later
# found with a relative list.files() call, so this must run before listing them
setwd("~/Desktop/H4H/pughlab/projects/cancer_scrna_integration/figures/")


Attaching package: ‘dplyr’


The following objects are masked from ‘package:data.table’:

    between, first, last


The following object is masked from ‘package:gridExtra’:

    combine


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘cowplot’


The following object is masked from ‘package:ggpubr’:

    get_legend




In [2]:
# define universal color scheme for cell types across all studies
# `celltypes` and `celltypes.col` are parallel vectors: the i-th colour belongs
# to the i-th cell type (they are passed to scale_color_manual() as
# breaks / values). Each cell type must therefore be listed exactly once;
# a duplicated entry shifts every colour that follows it.
# Several colours are reused for study-specific types (e.g. "blue" for both
# Astrocyte and Melanocytes).
celltypes <- c("Malignant", "T_cells", 
               "B_cells", "NK_cells", "Macrophages",
               "CAFs", "Endothelial", 
               
               "Erythrocytes", # only in ALL
               
               "HPCs", # only in liver
               
               "Astrocyte", "Neuron", #only in glioma
               "Oligodendrocyte", 
               
               "Melanocytes", "DCs", # only in BCC
               "Myofibroblasts", "pDC",
               "Plasma_cells"
              )

celltypes.col <- c("#7570b3", "#e7298a", 
                   "#e6ab02", "#66a61e", "#a6761d",
                   "#d95f02", "#1b9e77", 
               
                   "#1b9e77", # only in ALL
               
                   "black", # only in liver
               
                   "blue", "darkgrey", #only in glioma
                   "#e31a1c", 
               
                   "blue", "darkgrey", # only in BCC
                   "#e31a1c", "black",
                   "#ff7f00"
                  )

# guard the positional pairing between cell types and colours
stopifnot(length(celltypes) == length(celltypes.col),
          anyDuplicated(celltypes) == 0
         )


In [4]:
# collect the merged metadata CSVs found in the current working directory
files <- list.files(path = ".", pattern = "MergedMeta.csv$")
# show which files were found
files

In [7]:
# load data
# j is the position in `files` of the merged metadata file to plot
j <- 3
# read with data.table::fread, then convert to a plain data.frame
dat <- data.frame(fread(files[j], stringsAsFactors = FALSE))
# move the V1 column into the row names and drop it from the table
rownames(dat) <- dat$V1
dat$V1 <- NULL

In [8]:
# order methods by rank
order <- c("NoBatchCorrection", "STACAS", "fastmnn", "RPCA", "Conos", "Harmony")
methods <- as.character(unique(dat$Method))
methods <- order[order %in% methods]
if (length(methods) == 0) {
    stop("dat$Method contains none of the expected methods: ",
         paste(order, collapse = ", "), call. = FALSE)
}

# title styling shared by data panels and placeholder panels
panel.title <- element_text(hjust = 0.5, face = "bold", size = 6)

# Empty titled panel, used as a placeholder for a method that has no data
# (e.g. if STACAS is missing) so the grid always keeps the same columns.
blank_panel <- function(title) {
    ggplot() + theme_void() + ggtitle(title) + theme(plot.title = panel.title)
}

# Scatter plot of the embedding coordinates for one method, coloured by the
# metadata column named in `col1`. The legend is left on (at the bottom) so
# the caller can extract it before stripping it from the panel.
method_panel <- function(sub.dat, col1, title) {
    p <- ggplot(sub.dat, aes_string(x = "Coords_1", y = "Coords_2", color = col1)) +
        geom_point(alpha = 0.1, size = 0.001, shape = 21) +
        theme_void() +
        ggtitle(title) +
        theme(plot.title = panel.title,
              legend.position = "bottom",
              panel.border = element_rect(linetype = "solid", fill = NA, size = 0.1)
             )
    # only cell types use the fixed palette; SampleID / PatientID keep the
    # default ggplot colours
    if (col1 == "CellType") {
        p <- p + scale_color_manual(breaks = celltypes, values = celltypes.col)
    }
    p
}

# Open a 600 dpi tiff device, run `draw()` into it, and close the device even
# if drawing fails so no graphics device is left open.
write_tiff <- function(path, width, height, draw) {
    tiff(path, units = "in", width = width, height = height, res = 600)
    on.exit(dev.off(), add = TRUE)
    draw()
}

# Plot one row of panels (one per entry of `method.order`) coloured by `col1`
# and write two files: "<prefix>_<col1>.tiff" with the panel grid and
# "<prefix>_<col1>_legend.tiff" with the legend on its own.
#   dat          data.frame with Method, Coords_1, Coords_2 and `col1` columns
#   col1         name of the metadata column used for colouring
#   methods      methods present in `dat`, already in display order
#   method.order full display order; entries absent from `methods` get a
#                blank placeholder panel
#   prefix       output path prefix
plot_method_grid <- function(dat, col1, methods, method.order, prefix) {

    panels <- list()
    with.legend <- NULL
    for (m in methods) {
        # subset data to specific method
        with.legend <- method_panel(dat[dat$Method == m, ], col1, m)
        # panels in the grid carry no legend
        panels[[m]] <- with.legend + theme(legend.position = "none")
    }

    # the legend is taken from the last method plotted (built once, after the
    # loop), with enlarged opaque keys so the colours are readable
    leg <- get_legend(with.legend +
                      guides(colour = guide_legend(override.aes = list(size = 2, alpha = 1, pch = 19))))

    # add a blank plot for every expected method that has no data
    for (m in method.order[!method.order %in% methods]) {
        panels[[m]] <- blank_panel(m)
    }

    # reorder plots
    panels <- panels[match(method.order, names(panels))]

    plot.name <- paste0(prefix, "_", col1, ".tiff")
    write_tiff(plot.name, width = 5, height = 1,
               draw = function() {
                   print(do.call(grid.arrange, c(panels, ncol = length(method.order))))
               })

    plot.name2 <- paste0(prefix, "_", col1, "_legend.tiff")
    write_tiff(plot.name2, width = 10, height = 5,
               draw = function() print(as_ggplot(leg)))

    invisible(panels)
}

# output files are written to the Desktop, named after the study
out.prefix <- paste0("~/Desktop/", gsub("_MergedMeta.csv", "", files[j]))

########################
# PLOT CELL TYPES, SAMPLEID, PATIENTID
########################
for (col1 in c("CellType", "SampleID", "PatientID")) {
    plot_method_grid(dat, col1, methods, order, out.prefix)
}

TableGrob (1 x 6) "arrange": 6 grobs
                  z     cells    name           grob
NoBatchCorrection 1 (1-1,1-1) arrange gtable[layout]
STACAS            2 (1-1,2-2) arrange gtable[layout]
fastmnn           3 (1-1,3-3) arrange gtable[layout]
RPCA              4 (1-1,4-4) arrange gtable[layout]
Conos             5 (1-1,5-5) arrange gtable[layout]
Harmony           6 (1-1,6-6) arrange gtable[layout]


TableGrob (1 x 6) "arrange": 6 grobs
                  z     cells    name           grob
NoBatchCorrection 1 (1-1,1-1) arrange gtable[layout]
STACAS            2 (1-1,2-2) arrange gtable[layout]
fastmnn           3 (1-1,3-3) arrange gtable[layout]
RPCA              4 (1-1,4-4) arrange gtable[layout]
Conos             5 (1-1,5-5) arrange gtable[layout]
Harmony           6 (1-1,6-6) arrange gtable[layout]


TableGrob (1 x 6) "arrange": 6 grobs
                  z     cells    name           grob
NoBatchCorrection 1 (1-1,1-1) arrange gtable[layout]
STACAS            2 (1-1,2-2) arrange gtable[layout]
fastmnn           3 (1-1,3-3) arrange gtable[layout]
RPCA              4 (1-1,4-4) arrange gtable[layout]
Conos             5 (1-1,5-5) arrange gtable[layout]
Harmony           6 (1-1,6-6) arrange gtable[layout]
