---
# Data Integration: Ma-LIHC
*L.Richards*  
*2020-06-07*  
*/cluster/projects/pughlab/projects/cancer_scrna_integration/integration/*    

---



In [None]:
# Dataset-specific settings read by the sections below.

# prefix for output file names, and the input Seurat object
filePrefix <- "Ma-LIHC"
inputSeurat <- "/cluster/projects/pughlab/projects/cancer_scrna_integration/data/Ma-LIHC/Ma-LIHC_seurat.rds"

# metadata columns used for splitting by sample and for grouping the UMAP plots
cellCol <- "CellType"
patientCol <- "Sample" # same column as sampleCol in this configuration
sampleCol <- "Sample"

# handed to IntegrateData() as k.weight
k.weight <- 50 # default = 100

In [None]:
# Work from the integration directory: the plots and .rds files written by
# the sections below use relative file names.
setwd(dir = "/cluster/projects/pughlab/projects/cancer_scrna_integration/integration/")

# Load the project's helper functions.
# NOTE(review): quickCluster(), used in section 1.0, is not defined in this
# file; presumably it comes from this script -- confirm.
source(file = "~/github/oicr-brain-tri-gbm/src/scRNA_helper_functions.r")

---
## 1.0 Cluster data without integration
---

module load R/4.0.0

In [None]:
library(Seurat)

# Baseline run: cluster the merged object without any batch correction,
# plot the UMAP, and save the object.

# load input seurat object
dat <- readRDS(inputSeurat)

# cluster merged data with quickCluster(), which is not defined in this file
# (inline argument notes are the original author's)
# Time elapsed.... 54.26 secs
dat <- quickCluster(dat,
                    normalize = TRUE,
                    vars.to.regress = NULL,
                    #k.param = 20,
                    dims = 20, # max dims 1:dims
                    n.vargenes = 2000,
                    min.resolution = 1.5,
                    max.resolution = 1.5,
                    n.resolution = 1, # how many resolutions to cluster over
                    verbose = FALSE,
                    pc.calc = 75, # how many PCs to calculate
                    pca.genes = "var" # accepts "all" or "var"
                   )

# plot data
# print() is explicit so the figure is drawn even when this code is run via
# source() or from inside a loop/function, where the value of DimPlot() is
# not auto-printed and the PDF would otherwise be empty.
plot.name <- paste0(filePrefix, "_NoBatchCorrection_UMAP.pdf")
pdf(plot.name, width = 18, height = 5)
print(DimPlot(dat,
              group.by = c(sampleCol, patientCol, cellCol),
              ncol = 3
             ))
dev.off()

# add umap coords to meta, with "Uncorrected_" prefixed to the column names
umap <- data.frame(Embeddings(dat, reduction = "umap"))
colnames(umap) <- paste0("Uncorrected_", colnames(umap))
dat <- AddMetaData(dat, metadata = umap)

# save data
seurat.name <- paste0(filePrefix, "_NoBatchCorrection_seurat.rds")
saveRDS(dat, file = seurat.name)

---
## 2.0 Data Integration with Conos
---

http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/conos.html

module load R/4.0

In [None]:
library(conos)
library(Seurat)
library(SeuratWrappers)

In [None]:
# read the Seurat object from the path stored in inputSeurat
dat <- readRDS(file = inputSeurat)

In [None]:
# Split the object into one Seurat object per sample, then normalize, find
# variable features, scale and run PCA on each sample separately.
dat.panel <- SplitObject(dat, split.by = sampleCol)

# lapply() keeps the list names and is a no-op on an empty list (unlike a
# 1:length() loop). The steps are called one after another, so no pipe
# operator has to be attached.
dat.panel <- lapply(dat.panel, function(obj) {
    obj <- NormalizeData(obj)
    obj <- FindVariableFeatures(obj)
    obj <- ScaleData(obj)
    RunPCA(obj, verbose = FALSE)
})

# run Conos
start <- Sys.time()

dat.con <- Conos$new(dat.panel)
dat.con$buildGraph(k = 15,
                   k.self = 5,
                   space = "PCA",
                   ncomps = 30,
                   n.odgenes = 2000,
                   matching.method = "mNN",
                   metric = "angular",
                   score.component.variance = TRUE,
                   verbose = TRUE
                  )
dat.con$findCommunities()
dat.con$embedGraph()

# convert back to a Seurat object; this overwrites the un-integrated `dat`
dat <- as.Seurat(dat.con)

end <- Sys.time()

# explicit print() so the timing is shown even when run via source()
print(end - start) # Time difference of 3.443902 mins

# plot results
# print() is explicit so the figure is drawn even when the value of DimPlot()
# is not auto-printed (source(), loops, functions).
plot.name <- paste0(filePrefix, "_Conos_UMAP.pdf")
pdf(plot.name, width = 18, height = 5)
print(DimPlot(dat,
              reduction = "largeVis",
              group.by = c(sampleCol, patientCol, cellCol),
              ncol = 3
             ))
dev.off()

# save results
seurat.name <- paste0(filePrefix, "_Conos_seurat.rds")
saveRDS(dat, file = seurat.name)

---
## 3.0 Data Integration with STACAS
---


https://carmonalab.github.io/STACAS/tutorial.html

module load R/4.0.0

Error = "Error in idx[i, ] <- res[[i]][[1]] :
  number of items to replace is not a multiple of replacement length" @ integrate data step

In [None]:
library(Seurat)
library(STACAS)

# read the input object, then break it into one Seurat object per sample
dat <- readRDS(file = inputSeurat)
ref.list <- SplitObject(object = dat, split.by = sampleCol)

In [None]:
# define STACAS running parameters
var.genes.n <- 2000 # variable genes kept per sample; twice this many are requested below (previously was 1000)
var.genes.integrated.n <- 2000 # passed as anchor.features to FindAnchors.STACAS
ndim <- 20 # dimensions 1:ndim are used for anchors, integration and UMAP
dist.pct <- 0.8 # passed to FilterAnchors.STACAS

# Normalize each sample and choose its variable genes, leaving out
# mitochondrial (^MT-) and ribosomal (^RP[LS]) genes.
# seq_along() rather than 1:length() so an empty list is a no-op.
for (i in seq_along(ref.list)) {

    ref.list[[i]] <- NormalizeData(ref.list[[i]], verbose = FALSE)

    # request 2x the target so var.genes.n can still be kept after exclusions
    ref.list[[i]] <- FindVariableFeatures(ref.list[[i]],
                                          selection.method = "vst",
                                          nfeatures = var.genes.n * 2,
                                          verbose = FALSE
                                         )

    mito.genes <- grep(pattern = "^MT-", rownames(ref.list[[i]]), value = TRUE)
    ribo.genes <- grep(pattern = "^RP[LS]", rownames(ref.list[[i]]), value = TRUE)

    # drop the excluded genes in one pass, then keep the first var.genes.n
    hvg <- ref.list[[i]]@assays$RNA@var.features
    # disabled: hvg <- setdiff(hvg, cellCycle.symbol)
    hvg <- setdiff(hvg, c(mito.genes, ribo.genes))
    ref.list[[i]]@assays$RNA@var.features <- head(hvg, var.genes.n)

}

# Run STACAS
start <- Sys.time()
ref.anchors <- FindAnchors.STACAS(ref.list,
                                  dims = 1:ndim,
                                  anchor.features = var.genes.integrated.n
                                 )

ref.anchors.filtered <- FilterAnchors.STACAS(ref.anchors,
                                             dist.pct = dist.pct
                                            )

# Genes present in every sample. Reduce() also handles a one-sample list,
# where a 2:length(ref.list) loop would count down (2, 1) and index past the
# end of the list.
all.genes <- Reduce(intersect, lapply(ref.list, row.names))

mySampleTree <- SampleTree.STACAS(ref.anchors.filtered)
print(mySampleTree)

# NOTE(review): the "number of items to replace is not a multiple of
# replacement length" error recorded in the notes for this section is raised
# at this step; a k.weight larger than the number of usable cells/anchors in
# a sample is a commonly reported cause -- confirm and lower k.weight if so.
ref.integrated <- IntegrateData(anchorset = ref.anchors.filtered,
                                dims = 1:ndim,
                                features.to.integrate = all.genes,
                                sample.tree = mySampleTree,
                                preserve.order = TRUE,
                                k.weight = k.weight
                               )

# process and cluster
ref.integrated <- ScaleData(ref.integrated, verbose = TRUE)
ref.integrated <- RunPCA(ref.integrated,
                         features = ref.integrated@assays$integrated@var.features,
                         ndims.print = 1:5,
                         nfeatures.print = 5
                        )
ref.integrated <- RunUMAP(ref.integrated,
                          reduction = "pca",
                          dims = 1:ndim,
                          seed.use = 123,
                          n.neighbors = 30,
                          min.dist = 0.3
                         )

end <- Sys.time()

# explicit print() so the timing is shown even when run via source()
print(end - start)

# plot results of integration
# print() is explicit so the figure is drawn even when the value of DimPlot()
# is not auto-printed (source(), loops, functions).
plot.name <- paste0(filePrefix, "_STACAS_UMAP.pdf")
pdf(plot.name, width = 18, height = 5)
print(DimPlot(ref.integrated,
              reduction = "umap",
              group.by = c(sampleCol, patientCol, cellCol),
              ncol = 3
             ))
dev.off()

# save results
seurat.name <- paste0(filePrefix, "_STACAS_seurat.rds")
saveRDS(ref.integrated, file = seurat.name)

---
## 4.0 Data Integration with Reciprocal PCA (RPCA)
---



module load R/4
https://satijalab.org/seurat/articles/integration_rpca.html

RPCA seems like it will be a better fit for cancer samples: it is more conservative, you can adjust the strength of the integration, and it is recommended over CCA when a substantial fraction of cells in one dataset have no matching type in the other -- perfect for a mixed-pathology cohort. If it produces similar results to STACAS, this could be preferable since it likely runs much faster.

Run over a range of k.anchor from 5 (the default, most conservative) to 20 (stronger integration) to see which gives the best results.

In [None]:
library(Seurat)

# load dataset
dat <- readRDS(inputSeurat)

# split up dataset by sample
dat.list <- SplitObject(dat, split.by = sampleCol)

# normalize and identify variable features for each dataset independently
dat.list <- lapply(X = dat.list, FUN = function(x) {
    x <- NormalizeData(x)
    FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})

# select features that are repeatedly variable across datasets for integration run PCA on each
# dataset using these features
features <- SelectIntegrationFeatures(object.list = dat.list) #2000
dat.list <- lapply(X = dat.list, FUN = function(x) {
    x <- ScaleData(x, features = features, verbose = FALSE)
    RunPCA(x, features = features, verbose = FALSE)
})

# perform integration
# The results show that rpca-based integration is more conservative,
# You can increase the strength of alignment by increasing the
# k.anchor parameter, which is set to 5 by default.
# Increasing this parameter to 20 will assist in aligning these
# populations.

# define range of k.anchors
# k.anchors = How many neighbors (k) to use when picking anchors
k.anchors <- 5

start <- Sys.time()
# The body below is kept indented for the (currently disabled) loop over
# several k.anchors values. Results are print()ed explicitly because values
# are not auto-printed inside a for loop, so the PDF and the timing would be
# lost as soon as the loop is re-enabled.
# NOTE(review): the output file names do not include k, so a re-enabled loop
# would overwrite the same files on every iteration.
#for (i in 2:length(k.anchors)){
    
    dat.anchors <- FindIntegrationAnchors(object.list = dat.list,
                                          anchor.features = features,
                                          reduction = "rpca",
                                          k.anchor = k.anchors
                                         )

    # this command creates an 'integrated' data assay
    dat.combined <- IntegrateData(anchorset = dat.anchors, k.weight = k.weight)

    # specify that we will perform downstream analysis on the corrected data note that the original
    # unmodified data still resides in the 'RNA' assay
    DefaultAssay(dat.combined) <- "integrated"

    # Run the standard workflow for visualization and clustering
    dat.combined <- ScaleData(dat.combined, verbose = FALSE)
    dat.combined <- RunPCA(dat.combined, npcs = 30, verbose = FALSE)
    dat.combined <- RunUMAP(dat.combined, reduction = "pca", dims = 1:20)
    dat.combined <- FindNeighbors(dat.combined, reduction = "pca", dims = 1:20)
    dat.combined <- FindClusters(dat.combined, resolution = 1.5)

    end <- Sys.time()
    print(end - start) # Time difference of 6.263618 mins

    # Visualization of results
    # plot results of integration
    plot.name <- paste0(filePrefix, "_RPCA_UMAP.pdf")
    pdf(plot.name, width = 18, height = 5)
    print(DimPlot(dat.combined,
                  reduction = "umap",
                  group.by = c(sampleCol, patientCol, cellCol),
                  ncol = 3
                 ))
    dev.off()

    # save results
    seurat.name <- paste0(filePrefix, "_RPCA_seurat.rds")
    saveRDS(dat.combined, file = seurat.name)
    
#}

---
## 5.0 Data Integration with fastMNN
---

http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/fast_mnn.html

module load R/3.6.1

This is being a pain because the objects are saved using Seurat v4, which needs SeuratObject to run... but it can't be installed in R/v3.6.1

In [None]:
# module load R/3.6.1
library(Matrix)
library(Seurat)
library(batchelor)
library(SeuratWrappers)

# Build the Seurat object from the exported count matrix and metadata files
# rather than from the .rds object used in the earlier sections.

# load count matrix
matrix_dir <- "/cluster/projects/pughlab/projects/cancer_scrna_integration/data/Ma-LIHC/"
barcode.path <- paste0(matrix_dir, filePrefix, "_", "barcodes.tsv.gz")
features.path <- paste0(matrix_dir, filePrefix, "_", "features.tsv.gz")
matrix.path <- paste0(matrix_dir, filePrefix, "_", "matrix.mtx.gz")
mat <- readMM(file = matrix.path)
feature.names <- read.delim(features.path,
                            header = FALSE,
                            stringsAsFactors = FALSE)
barcode.names <- read.delim(barcode.path,
                            header = FALSE,
                            stringsAsFactors = FALSE)
# columns are named from the first barcode column, rows from the second
# feature column
colnames(mat) <- barcode.names$V1
rownames(mat) <- feature.names$V2

# load metadata
# column X is moved into the row names and then dropped
# NOTE(review): presumably X holds the cell barcodes matching colnames(mat)
# -- confirm.
meta <- read.csv(paste0(matrix_dir, "/", filePrefix, "_meta.csv"))
rownames(meta) <- meta$X
meta$X <- NULL

# make seurat object (stray trailing comma / empty argument removed)
dat <- CreateSeuratObject(counts = mat,
                          meta.data = meta
                         )

# run fastmnn
dat <- NormalizeData(dat)
dat <- FindVariableFeatures(dat, nfeatures = 2000)
start <- Sys.time()
dat <- RunFastMNN(object.list = SplitObject(dat, split.by = sampleCol))
dat <- RunUMAP(dat, reduction = "mnn", dims = 1:20)
dat <- FindNeighbors(dat, reduction = "mnn", dims = 1:20)
dat <- FindClusters(dat)
end <- Sys.time()

# explicit print() so the timing is shown even when run via source()
print(end - start) # Time difference of 2.32405 mins

# plot results
# print() is explicit so the figure is drawn even when the value of DimPlot()
# is not auto-printed (source(), loops, functions).
plot.name <- paste0(filePrefix, "_fastmnn_UMAP.pdf")
pdf(plot.name, width = 18, height = 5)
print(DimPlot(dat,
              group.by = c(sampleCol, patientCol, cellCol),
              ncol = 3
             ))
dev.off()

# save results
seurat.name <- paste0(filePrefix, "_fastmnn_seurat.rds")
saveRDS(dat, file = seurat.name)

---
## 6.0 Data Integration with Harmony
---

https://github.com/satijalab/seurat-wrappers/blob/master/docs/harmony.md

module load R/3.6.1

In [None]:
# module load R/3.6.1

# Matrix is attached explicitly because readMM() below comes from it and
# none of the other packages loaded here attach it.
library(Matrix)
library(Seurat)
library(harmony)
library(SeuratWrappers)

# Build the Seurat object from the exported count matrix and metadata files.

# load count matrix
matrix_dir <- "/cluster/projects/pughlab/projects/cancer_scrna_integration/data/Ma-LIHC/"
barcode.path <- paste0(matrix_dir, filePrefix, "_", "barcodes.tsv.gz")
features.path <- paste0(matrix_dir, filePrefix, "_", "features.tsv.gz")
matrix.path <- paste0(matrix_dir, filePrefix, "_", "matrix.mtx.gz")
mat <- readMM(file = matrix.path)
feature.names <- read.delim(features.path,
                            header = FALSE,
                            stringsAsFactors = FALSE)
barcode.names <- read.delim(barcode.path,
                            header = FALSE,
                            stringsAsFactors = FALSE)
# columns are named from the first barcode column, rows from the second
# feature column
colnames(mat) <- barcode.names$V1
rownames(mat) <- feature.names$V2

# load metadata
# column X is moved into the row names and then dropped
# NOTE(review): presumably X holds the cell barcodes matching colnames(mat)
# -- confirm.
meta <- read.csv(paste0(matrix_dir, "/", filePrefix, "_meta.csv"))
rownames(meta) <- meta$X
meta$X <- NULL

# make seurat object (stray trailing comma / empty argument removed)
dat <- CreateSeuratObject(counts = mat,
                          meta.data = meta
                         )

In [None]:
# run harmony correction
# Preprocessing is written as plain sequential calls so it does not rely on
# a pipe operator being attached.
dat <- NormalizeData(dat)
dat <- FindVariableFeatures(dat)
dat <- ScaleData(dat)
dat <- RunPCA(dat, verbose = FALSE)

# only the Harmony run, UMAP and clustering are timed
start <- Sys.time()
dat <- RunHarmony(dat, group.by.vars = sampleCol)
dat <- RunUMAP(dat, reduction = "harmony", dims = 1:20)
dat <- FindNeighbors(dat, reduction = "harmony", dims = 1:20)
dat <- FindClusters(dat)
end <- Sys.time()

# explicit print() so the timing is shown even when run via source()
print(end - start) # Time difference of 1.715977 mins

# plot results
# print() is explicit so the figure is drawn even when the value of DimPlot()
# is not auto-printed (source(), loops, functions).
plot.name <- paste0(filePrefix, "_Harmony_UMAP.pdf")
pdf(plot.name, width = 18, height = 5)
print(DimPlot(dat,
              group.by = c(sampleCol, patientCol, cellCol),
              ncol = 3
             ))
dev.off()

# save results
seurat.name <- paste0(filePrefix, "_Harmony_seurat.rds")
saveRDS(dat, file = seurat.name)