----
# Visualize results from batch correction bake off
---

In [4]:
options(repos='http://cran.rstudio.com/')
#install.packages("ggExtra")
library(Seurat)
library(ggplot2)
library(ggpubr)
library(ggExtra)
library(ggrepel)
library(dplyr)

In [5]:
setwd("~/Desktop/H4H/pughlab/projects/BTSCs_scRNAseq/Manuscript_G607removed/NatCan_Rebuttal/BatchCorrection/")

----
## 1.0 Plot UMAPs
----

In [4]:
#load data
meta <- readRDS("Global_GSC_BatchCorrection_metadata.rds")

In [7]:
colnames(meta)

In [122]:
## Define sample color palette

colfunc <- colorRampPalette(c("#54278f", "#bcbddc", "#084081", "#4eb3d3", "#238b45", "#ccebc5"))
dirks <- colfunc(21)
colfunc <- colorRampPalette(c("#800026", "#fc4e2a", "#feb24c", "#ffeda0"))
weiss <- colfunc(8)
cols <- c(weiss, dirks)
length(cols)

In [43]:
#### ORIGINAL CLUSTERING


#calculate centroids 
hc.norm.cent <- meta %>% group_by(Original_clusters) %>% select(Original_UMAP1, 
    Original_UMAP2) %>% summarize_all(median)
#hc.norm.cent



original_sample <- ggplot(meta, aes(x=Original_UMAP1, y=Original_UMAP2, color=SampleID)) + 
                   geom_point(alpha = 0.3, size = 0.8, pch = 16) +  
                   labs(x = "UMAP 1", y = "UMAP 2") +
                   scale_colour_manual(values = cols) + 
                   theme_bw() + 
                   theme(axis.text.x = element_blank(), axis.text.y = element_blank(), 
                         axis.ticks = element_blank(),
                        panel.border = element_rect(linetype = "solid", fill = NA, size = 1),
                        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
                        panel.background = element_blank())  +
                    guides(colour = guide_legend(override.aes = list(size=4, alpha = 1))) +
                    theme(legend.position="none") 

original_clusters <- ggplot(meta, aes(x=Original_UMAP1, y=Original_UMAP2, color=Original_clusters)) + 
                   geom_point(alpha = 0.3, size = 0.8, pch = 16) +  
                   labs(x = "UMAP 1", y = "UMAP 2") +
                   #scale_colour_manual(values = cols) + 
                   theme_bw() + 
                   theme(axis.text.x = element_blank(), axis.text.y = element_blank(), 
                         axis.ticks = element_blank(),
                        panel.border = element_rect(linetype = "solid", fill = NA, size = 1),
                        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
                        panel.background = element_blank())  +
                    guides(colour = guide_legend(override.aes = list(size=4, alpha = 1))) +
                    theme(legend.position="none") +
                    geom_label_repel(aes(label = Original_clusters), 
                                     data = hc.norm.cent, 
                                     label.size = 0.05, 
                                     parse = T, 
                                     size = 3)

pdf("~/Desktop/OriginalClustering_NatCan_Cluster.pdf", width = 8, height= 8)
original_clusters
dev.off()

pdf("~/Desktop/OriginalClustering_NatCan_Sample.pdf", width = 8, height= 8)
original_sample
dev.off()

Adding missing grouping variables: `Original_clusters`



In [40]:
#### CONOS Clustering


#calculate centroids 
hc.norm.cent <- meta %>% group_by(Conos_clusters) %>% select(Conos_UMAP1, 
    Conos_UMAP2) %>% summarize_all(median)
#hc.norm.cent



Conos_sample <- ggplot(meta, aes(x=Conos_UMAP1, y=Conos_UMAP2, color=SampleID)) + 
                   geom_point(alpha = 0.3, size = 0.8, pch = 16) +  
                   labs(x = "UMAP 1", y = "UMAP 2") +
                   scale_colour_manual(values = cols) + 
                   theme_bw() + 
                   theme(axis.text.x = element_blank(), axis.text.y = element_blank(), 
                         axis.ticks = element_blank(),
                        panel.border = element_rect(linetype = "solid", fill = NA, size = 1),
                        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
                        panel.background = element_blank())  +
                    guides(colour = guide_legend(override.aes = list(size=4, alpha = 1))) +
                    theme(legend.position="bottom") + theme(legend.position="none")

Conos_clusters <- ggplot(meta, aes(x=Conos_UMAP1, y=Conos_UMAP2, color=Conos_clusters)) + 
                   geom_point(alpha = 0.3, size = 0.8, pch = 16) +  
                   labs(x = "UMAP 1", y = "UMAP 2") +
                   #scale_colour_manual(values = cols) + 
                   theme_bw() + 
                   theme(axis.text.x = element_blank(), axis.text.y = element_blank(), 
                         axis.ticks = element_blank(),
                        panel.border = element_rect(linetype = "solid", fill = NA, size = 1),
                        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
                        panel.background = element_blank())  +
                    guides(colour = guide_legend(override.aes = list(size=4, alpha = 1))) +
                    theme(legend.position="none") +
                    geom_label_repel(aes(label = Conos_clusters), 
                                     data = hc.norm.cent, 
                                     label.size = 0.05, 
                                     parse = T, 
                                     size = 3)

pdf("~/Desktop/ConosClustering_NatCan_Cluster.pdf", width = 8, height= 8)
Conos_clusters
dev.off()

pdf("~/Desktop/ConosClustering_NatCan_Sample.pdf", width = 8, height= 8)
Conos_sample
dev.off()

Adding missing grouping variables: `Conos_clusters`



In [41]:
#### Liger Clustering


#calculate centroids 
hc.norm.cent <- meta %>% group_by(Liger_clusters) %>% select(Liger_UMAP1, 
    Liger_UMAP2) %>% summarize_all(median)
#hc.norm.cent



Liger_sample <- ggplot(meta, aes(x=Liger_UMAP1, y=Liger_UMAP2, color=SampleID)) + 
                   geom_point(alpha = 0.3, size = 0.8, pch = 16) +  
                   labs(x = "UMAP 1", y = "UMAP 2") +
                   scale_colour_manual(values = cols) + 
                   theme_bw() + 
                   theme(axis.text.x = element_blank(), axis.text.y = element_blank(), 
                         axis.ticks = element_blank(),
                        panel.border = element_rect(linetype = "solid", fill = NA, size = 1),
                        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
                        panel.background = element_blank())  +
                    guides(colour = guide_legend(override.aes = list(size=4, alpha = 1))) +
                    theme(legend.position="bottom") + theme(legend.position="none")

Liger_clusters <- ggplot(meta, aes(x=Liger_UMAP1, y=Liger_UMAP2, color=Liger_clusters)) + 
                   geom_point(alpha = 0.3, size = 0.8, pch = 16) +  
                   labs(x = "UMAP 1", y = "UMAP 2") +
                   #scale_colour_manual(values = cols) + 
                   theme_bw() + 
                   theme(axis.text.x = element_blank(), axis.text.y = element_blank(), 
                         axis.ticks = element_blank(),
                        panel.border = element_rect(linetype = "solid", fill = NA, size = 1),
                        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
                        panel.background = element_blank())  +
                    guides(colour = guide_legend(override.aes = list(size=4, alpha = 1))) +
                    theme(legend.position="none") +
                    geom_label_repel(aes(label = Liger_clusters), 
                                     data = hc.norm.cent, 
                                     label.size = 0.05, 
                                     parse = T, 
                                     size = 3)

pdf("~/Desktop/LigerClustering_NatCan_Cluster.pdf", width = 8, height= 8)
Liger_clusters
dev.off()

pdf("~/Desktop/LigerClustering_NatCan_Sample.pdf", width = 8, height= 8)
Liger_sample
dev.off()

Adding missing grouping variables: `Liger_clusters`



In [44]:
#### fastMNN Clustering


#calculate centroids 
hc.norm.cent <- meta %>% group_by(fastMNN_clusters) %>% select(fastMNN_UMAP1, 
    fastMNN_UMAP2) %>% summarize_all(median)
#hc.norm.cent



fastMNN_sample <- ggplot(meta, aes(x=fastMNN_UMAP1, y=fastMNN_UMAP2, color=SampleID)) + 
                   geom_point(alpha = 0.3, size = 0.8, pch = 16) +  
                   labs(x = "UMAP 1", y = "UMAP 2") +
                   scale_colour_manual(values = cols) + 
                   theme_bw() + 
                   theme(axis.text.x = element_blank(), axis.text.y = element_blank(), 
                         axis.ticks = element_blank(),
                        panel.border = element_rect(linetype = "solid", fill = NA, size = 1),
                        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
                        panel.background = element_blank())  +
                    guides(colour = guide_legend(override.aes = list(size=4, alpha = 1))) +
                    theme(legend.position="bottom") + theme(legend.position="none")

fastMNN_clusters <- ggplot(meta, aes(x=fastMNN_UMAP1, y=fastMNN_UMAP2, color=fastMNN_clusters)) + 
                   geom_point(alpha = 0.3, size = 0.8, pch = 16) +  
                   labs(x = "UMAP 1", y = "UMAP 2") +
                   #scale_colour_manual(values = cols) + 
                   theme_bw() + 
                   theme(axis.text.x = element_blank(), axis.text.y = element_blank(), 
                         axis.ticks = element_blank(),
                        panel.border = element_rect(linetype = "solid", fill = NA, size = 1),
                        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
                        panel.background = element_blank())  +
                    guides(colour = guide_legend(override.aes = list(size=4, alpha = 1))) +
                    theme(legend.position="none") +
                    geom_label_repel(aes(label = fastMNN_clusters), 
                                     data = hc.norm.cent, 
                                     label.size = 0.05, 
                                     parse = T, 
                                     size = 3)

pdf("~/Desktop/fastMNNClustering_NatCan_Cluster.pdf", width = 8, height= 8)
fastMNN_clusters
dev.off()

pdf("~/Desktop/fastMNNClustering_NatCan_Sample.pdf", width = 8, height= 8)
fastMNN_sample
dev.off()

Adding missing grouping variables: `fastMNN_clusters`



----
## 2.0 Plot cluster properties
----

In [39]:
setwd("./Comparison/")

In [8]:
#### load data
meta <- readRDS("./Comparison/Global_GSC_BatchCorrection_metadata.rds")
head(meta)

Unnamed: 0_level_0,nGene,nUMI,percent.mito,PatientID,SampleID,Passage,CultureMethod,Pathology,Stage,Age,⋯,Conos_clusters,Conos_UMAP1,Conos_UMAP2,Liger_clusters,Liger_quantNorm_clusters,Liger_UMAP1,Liger_UMAP2,fastMNN_clusters,fastMNN_UMAP1,fastMNN_UMAP2
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<fct>,<chr>,<int>,<fct>,<fct>,<fct>,<int>,⋯,<chr>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>
BT127_L_AAACCTGCACGGACAA,640,875,0.043428571,BT127,BT127_L,,Sphere,GLIOBLASTOMA (GRADE IV),PRIMARY,55,⋯,C11,3.91472455,-2.492949,C67,C13,-0.4835586,2.379755,C37,6.795939,4.973665
BT127_L_AAACCTGCATCCGGGT,1036,2408,0.002076412,BT127,BT127_L,,Sphere,GLIOBLASTOMA (GRADE IV),PRIMARY,55,⋯,C2,-2.32181506,1.148725,C14,C21,0.9863441,5.40831,C14,1.2473862,-1.664298
BT127_L_AAACCTGGTACAGTTC,3240,10058,0.078047326,BT127,BT127_L,,Sphere,GLIOBLASTOMA (GRADE IV),PRIMARY,55,⋯,C1,-0.07283644,-2.741365,C56,C10,8.9876844,1.797849,C10,2.2776213,1.684389
BT127_L_AAACCTGTCTACGAGT,3337,10798,0.061863308,BT127,BT127_L,,Sphere,GLIOBLASTOMA (GRADE IV),PRIMARY,55,⋯,C1,2.88304897,-4.185107,C28,C7,-3.7655035,-4.287652,C21,0.3696045,4.849369
BT127_L_AAACGGGAGTGGTAAT,4140,14601,0.081501267,BT127,BT127_L,,Sphere,GLIOBLASTOMA (GRADE IV),PRIMARY,55,⋯,C4,-3.40395693,-1.246072,C8,C6,-2.0646044,5.311308,C3,3.0244613,-2.603526
BT127_L_AAACGGGCAGGACGTA,543,820,0.108536585,BT127,BT127_L,,Sphere,GLIOBLASTOMA (GRADE IV),PRIMARY,55,⋯,C6,3.92768212,-2.28528,C7,C17,4.213423,-1.124725,C20,6.4704309,4.624188


---
### Number clusters

In [139]:
##### NUMBER CLUSTERS
##### HISTOGRAM

clusters <- c(61, 12, 78, 39)
method <- c("Original", "Conos", "Liger", "fastMNN")
clust.hist <- cbind(method, clusters)
colnames(clust.hist) <- c("method", "clusters")
clust.hist <- data.frame(clust.hist)
clust.hist$method <- factor(clust.hist$method, 
                               levels = c("Original", "Conos", "Liger", "fastMNN")
                              )

pdf("ClusterCounts.pdf", width = 5, height = 5)
cluster.plot <- ggplot(clust.hist, aes(y=clusters, x=method, fill = method)) +
                geom_bar(stat="identity") + theme_classic() +
                ylab("Number of Clusters") + xlab("") +
                theme(legend.position="none") +
                theme(text = element_text(size=20))

cluster.plot
dev.off()

---
### Number samples per cluster

In [140]:
#### NUMBER SAMPLES / CLUSTER (>10 cells) 
#### BOXPLOT WITH POINTS

original <- table(meta$Original_clusters, meta$SampleID)
original <- original > 10
original <- rowSums(original)
original <- data.frame(original)
colnames(original) <- "Samples"
original$method <- "Original"
original$ClusterID <- rownames(original)
print(mean(original$Samples))

conos <- table(meta$Conos_clusters, meta$SampleID)
conos <- conos > 10
conos <- rowSums(conos)
conos <- data.frame(conos)
colnames(conos) <- "Samples"
conos$method <- "Conos"
conos$ClusterID <- rownames(conos)
print(mean(conos$Samples))

liger <- table(meta$Liger_clusters, meta$SampleID)
liger <- liger > 10
liger <- rowSums(liger)
liger <- data.frame(liger)
colnames(liger) <- "Samples"
liger$method <- "Liger"
liger$ClusterID <- rownames(liger)
print(mean(liger$Samples))


fastmnn <- table(meta$fastMNN_clusters, meta$SampleID)
fastmnn <- fastmnn > 10
fastmnn <- rowSums(fastmnn)
fastmnn<- data.frame(fastmnn)
colnames(fastmnn) <- "Samples"
fastmnn$method <- "fastMNN"
fastmnn$ClusterID <- rownames(fastmnn)
print(mean(fastmnn$Samples))

plot.dat <- rbind(original, conos, liger, fastmnn)
plot.dat$method <- factor(plot.dat$method, 
                               levels = c("Original", "Conos", "Liger", "fastMNN")
                              )
head(plot.dat)

[1] 1.229508
[1] 17.41667
[1] 19.78205
[1] 9.769231


Unnamed: 0_level_0,Samples,method,ClusterID
Unnamed: 0_level_1,<dbl>,<fct>,<chr>
C1,1,Original,C1
C10,1,Original,C10
C11,1,Original,C11
C12,1,Original,C12
C13,2,Original,C13
C14,1,Original,C14


In [141]:
pdf("ClusterSampleBoxplot.pdf", width = 5, height = 5)
p <- ggplot(plot.dat, aes(x=method, y=Samples, fill = method)) + 
      geom_boxplot() + theme_classic() + geom_jitter(shape=16, position=position_jitter(0.2)) +
      ylab("Samples with > 10 cells / cluster") + xlab("") + 
        theme(legend.position="none") + theme(text = element_text(size=20))
p
dev.off()

---
### Proportion samples per cluster

In [None]:
colfunc <- colorRampPalette(c("#54278f", "#bcbddc", "#084081", "#4eb3d3", "#238b45", "#ccebc5"))
dirks <- colfunc(21)
colfunc <- colorRampPalette(c("#800026", "#fc4e2a", "#feb24c", "#ffeda0"))
weiss <- colfunc(8)
cols <- c(weiss, dirks)
length(cols)

In [135]:
original <- prop.table(table(meta$SampleID, meta$Original_clusters), margin = 2)
original <- data.matrix(original)
original <- data.frame(original)
colnames(original) <- c("Sample", "Cluster", "Prop")

pdf("Original_ClusterProportions.pdf", height = 8, width = 23)
p1 <- ggplot() + geom_bar(aes(y=Prop, x=Cluster, fill = Sample), data = original, stat = "identity") +
     scale_fill_manual(values = cols) + theme_classic() + ggtitle("Original") + 
     xlab("") + ylab("Proportion Cluster")+ 
     theme(text = element_text(size=20), axis.text.x = element_text(angle=90, hjust=1))
p1
dev.off()

In [136]:
conos <- prop.table(table(meta$SampleID, meta$Conos_clusters), margin = 2)
conos <- data.matrix(conos)
conos <- data.frame(conos)
colnames(conos) <- c("Sample", "Cluster", "Prop")

pdf("Conos_ClusterProportions.pdf", height = 8, width = 23)
p1 <- ggplot() + geom_bar(aes(y=Prop, x=Cluster, fill = Sample), data = conos, stat = "identity") +
     scale_fill_manual(values = cols) + theme_classic() + ggtitle("Conos") + 
     xlab("") + ylab("Proportion Cluster") + 
     theme(text = element_text(size=20), axis.text.x = element_text(angle=90, hjust=1))
p1
dev.off()

In [137]:
liger <- prop.table(table(meta$SampleID, meta$Liger_clusters), margin = 2)
liger<- data.matrix(liger)
liger <- data.frame(liger)
colnames(liger) <- c("Sample", "Cluster", "Prop")

pdf("Liger_ClusterProportions.pdf", height = 8, width = 23)
p1 <- ggplot() + geom_bar(aes(y=Prop, x=Cluster, fill = Sample), data = liger, stat = "identity") +
     scale_fill_manual(values = cols) + theme_classic() + ggtitle("Liger") + 
     xlab("") + ylab("Proportion Cluster") + 
     theme(text = element_text(size=20), axis.text.x = element_text(angle=90, hjust=1))
p1
dev.off()


In [138]:
fastmnn <- prop.table(table(meta$SampleID, meta$fastMNN_clusters), margin = 2)
fastmnn<- data.matrix(fastmnn)
fastmnn <- data.frame(fastmnn)
colnames(fastmnn) <- c("Sample", "Cluster", "Prop")

pdf("fastMNN_ClusterProportions.pdf", height = 8, width = 23)
p1 <- ggplot() + geom_bar(aes(y=Prop, x=Cluster, fill = Sample), data = fastmnn, stat = "identity") +
     scale_fill_manual(values = cols) + theme_classic() + ggtitle("fastMNN") + 
     xlab("") + ylab("Proportion Cluster") + 
     theme(text = element_text(size=20), axis.text.x = element_text(angle=90, hjust=1))
p1
dev.off()