# Rectangular integration using CCA + WNN

In [1]:
source("path/preprocess/utils.R")
setwd("path")
library(gridExtra)
library(RColorBrewer)

Attaching SeuratObject

Registered S3 method overwritten by 'SeuratDisk':
  method            from  
  as.sparse.H5Group Seurat

Loading required package: ensembldb

Loading required package: BiocGenerics


Attaching package: 'BiocGenerics'


The following objects are masked from 'package:stats':

    IQR, mad, sd, var, xtabs


The following objects are masked from 'package:base':

    Filter, Find, Map, Position, Reduce, anyDuplicated, append,
    as.data.frame, basename, cbind, colnames, dirname, do.call,
    duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
    lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
    pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,
    tapply, union, unique, unsplit, which.max, which.min


Loading required package: GenomicRanges

Loading required package: stats4

Loading required package: S4Vectors


Attaching package: 'S4Vectors'


The following object is masked from 'package:future':

    values


The fol

In [2]:
# Directory that receives every artefact of this notebook (one sub-directory
# per integration step is created below it later on).
# NOTE(review): `pj` and `mkdir` come from the sourced utils.R; `pj` appears to
# join path components and `remove_old = FALSE` presumably keeps an existing
# directory intact -- confirm against utils.R.
output_dir <- pj("result", "comparison", "wnn_full_cellmask", "increamental_integration_cca")
mkdir(output_dir, remove_old = FALSE)

# Number of leading principal components passed to the WNN step for each
# modality; read from the `dim_c` entry of the default model configuration.
K <- parseTOML("configs/model.toml")[["default"]]$dim_c
l <- 7.5  # figure size
L <- 10   # figure size
m <- 0.5  # legend margin

In [3]:
# Load the complete RNA and ADT datasets (all batches); later cells subset
# them by the `batch` metadata column.
rna_all <- LoadH5Seurat(
    "path/data/raw/rna+adt/wnn/rna_cellmask.h5seurat"
)
adt_all <- LoadH5Seurat(
    "path/data/raw/rna+adt/wnn/adt_cellmask.h5seurat"
)

Validating h5Seurat file

Initializing rna with data



In [None]:
# Restrict both modalities to the cells of the two starting batches
# ("p1_0" and "p2_0"); these form the first integration step.
in_first_batches <- function(batch) {
    batch == "p1_0" | batch == "p2_0"
}
rna <- rna_all[, in_first_batches(rna_all[["batch"]])]
adt <- adt_all[, in_first_batches(adt_all[["batch"]])]

In [None]:
# Build the multimodal object: start from the RNA subset and attach the ADT
# counts of the same subset as a second assay called "adt".
obj <- rna
obj[["adt"]] <- CreateAssayObject(
    counts = GetAssayData(adt, assay = "adt", slot = "counts")
)
obj

An object of class Seurat 
3841 features across 10120 samples within 2 assays 
Active assay: rna (3617 features, 0 variable features)
 1 other assay present: adt

# Step 1: integrate batches P1 and P2

In [6]:
# Step 1 of the incremental integration.
# Each modality is split by batch, integrated with Seurat CCA anchors and
# reduced with PCA on the integrated assay. The two PCAs are then combined
# with weighted nearest neighbours (WNN); the resulting graph, the UMAP and
# the object itself are written to <output_dir>/step_1.

# ---- RNA: per-batch preprocessing, CCA integration, PCA ----
obj_rna <- GetAssayData(object = obj, assay = "rna")
obj_rna <- CreateSeuratObject(counts = obj_rna, assay = "rna")
# Go through `[[<-` (as the incremental loop does) rather than writing into
# @meta.data with `$`: rna[["..."]] presumably returns a one-column data
# frame, which `$<-` would store as a nested data-frame column.
obj_rna[["l1"]] <- rna[["l1"]]
obj_rna[["batch"]] <- rna[["batch"]]
obj_rna.list <- SplitObject(obj_rna, split.by = "batch")
obj_rna.list <- lapply(X = obj_rna.list, FUN = function(x) {
    x <- NormalizeData(x)
    FindVariableFeatures(x, nfeatures = 5000)
})

rna_features <- SelectIntegrationFeatures(object.list = obj_rna.list, nfeatures = 5000)
obj_rna.list <- lapply(X = obj_rna.list, FUN = function(x) {
    x <- ScaleData(x, features = rna_features, verbose = FALSE)
    RunPCA(x, features = rna_features, verbose = FALSE, reduction.name = "pca")
})

rna.anchors <- FindIntegrationAnchors(
    object.list = obj_rna.list,
    anchor.features = rna_features,
    reduction = "cca")
rna.combined <- IntegrateData(anchorset = rna.anchors)

# Store the batch-corrected RNA values as their own assay and run PCA on it.
obj[["rna_int"]] <- GetAssay(rna.combined, assay = "integrated")
DefaultAssay(obj) <- "rna_int"
obj <- ScaleData(obj, verbose = FALSE)
obj <- RunPCA(obj, reduction.name = "pca_cca_rna")

# ---- ADT: same pipeline, with CLR normalisation ----
obj_adt <- GetAssayData(object = obj, assay = "adt")
obj_adt <- CreateSeuratObject(counts = obj_adt, assay = "adt")
obj_adt[["l1"]] <- adt[["l1"]]
obj_adt[["batch"]] <- adt[["batch"]]
obj_adt.list <- SplitObject(obj_adt, split.by = "batch")
obj_adt.list <- lapply(X = obj_adt.list, FUN = function(x) {
    x <- NormalizeData(x, normalization.method = "CLR", margin = 2)
    FindVariableFeatures(x)
})

adt_features <- SelectIntegrationFeatures(object.list = obj_adt.list)
obj_adt.list <- lapply(X = obj_adt.list, FUN = function(x) {
    x <- ScaleData(x, features = adt_features, verbose = FALSE)
    RunPCA(x, features = adt_features, verbose = FALSE, reduction.name = "pca")
})
adt.anchors <- FindIntegrationAnchors(
    object.list = obj_adt.list,
    anchor.features = adt_features,
    reduction = "cca")
adt.combined <- IntegrateData(anchorset = adt.anchors)

obj[["adt_int"]] <- GetAssay(adt.combined, assay = "integrated")
DefaultAssay(obj) <- "adt_int"
obj <- ScaleData(obj, verbose = FALSE)
obj <- RunPCA(obj, reduction.name = "pca_cca_adt")

# ---- WNN on the first K components of both integrated PCAs ----
obj <- FindMultiModalNeighbors(obj, list("pca_cca_rna", "pca_cca_adt"),
                                    list(1:K, 1:K))
# save connectivity matrices for benchmarking (self-connections zeroed)
connectivities <- obj$wsnn
diag(connectivities) <- 0
mkdir(pj(output_dir, 'step_1'), remove_old = FALSE)
invisible(writeMM(connectivities, pj(output_dir, 'step_1', "connectivities.mtx")))
obj <- RunUMAP(obj, nn.name = "weighted.nn", reduction.name = "umap")
# This file is the starting point of the next integration step.
SaveH5Seurat(obj, pj(output_dir, 'step_1', "obj.h5seurat"), overwrite = TRUE)
# obj <- LoadH5Seurat(pj(output_dir, 'step_1', "obj.h5seurat"), assays = "ADT", reductions = "umap")

# UMAP coloured by cell type (l1) and by batch.
dim_plot(obj, w = l, h = l, reduction = "umap",
    group.by = "l1", label = FALSE,
    repel = TRUE, label.size = 4, pt.size = 0.5, cols = NULL,
    title = 'cca', legend = FALSE,
    save_path = pj(output_dir, 'step_1', paste0('cca_', "celltype")))
dim_plot(obj, w = l, h = l, reduction = "umap",
    group.by = "batch", label = FALSE,
    repel = TRUE, label.size = 4, pt.size = 0.5, cols = NULL,
    title = 'cca', legend = FALSE,
    save_path = pj(output_dir, 'step_1', paste0('cca_', "batch")))

Scaling features for provided objects

Finding all pairwise anchors

Running CCA

Merging objects

Finding neighborhoods

Finding anchors

	Found 12238 anchors

Filtering anchors

	Retained 5448 anchors

Merging dataset 2 into 1

Extracting anchors for merged samples

Finding integration vectors

Finding integration vector weights

Integrating data

PC_ 1 
Positive:  RPS27, RPS12, CD3E, RPL13, CD3D, TRBC2, IL7R, MALAT1, CD3G, IL32 
	   ARL4C, CD2, LTB, TRAC, CD247, LCK, SYNE2, LINC00861, CLEC2D, TRBC1 
	   TCF7, CD7, LBH, GZMM, CD52, ISG20, STK17A, BCL11B, SKAP1, SPOCK2 
Negative:  MNDA, FCN1, CSTA, SERPINA1, CST3, CYBB, CLEC12A, FGL2, VCAN, NCF2 
	   LYZ, CFD, GRN, MPEG1, MS4A6A, S100A9, CD14, CD68, S100A8, TNFAIP2 
	   SPI1, CFP, TYMP, IGSF6, LST1, AIF1, CLEC7A, KCTD12, CTSS, CTSZ 
PC_ 2 
Positive:  TMSB4X, S100A6, CD48, PTPRC, S100A11, S100A4, ACTB, IL32, ITGB2, LYZ 
	   CD3E, S100A9, S100A8, LGALS1, SRGN, CCL5, GZMA, GZMM, S100A12, CEBPB 
	   CD2, CD3D, AGTRAP, TRAC, CD14, CST7, CD

# Incremental integration of each new batch

In [None]:
L <- 4

# Remove the listed metadata columns from a Seurat object.
# Works on @meta.data directly so that a column which is not present is
# silently skipped; NOTE(review): `object[[col]] <- NULL` is believed to raise
# "Cannot find object" for a missing column -- confirm for the installed
# SeuratObject version.
drop_meta_cols <- function(object, cols) {
    for (col in cols) {
        object@meta.data[[col]] <- NULL
    }
    object
}

# Per-cell QC columns that are stripped from the new batch before merging.
stale_qc_cols <- c("nCount_rna", "nCount_adt", "nFeature_adt",
                   "nFeature_rna", "nCount_RNA", "nFeature_RNA")

# Steps 2..8 add batches p3_0 .. p9_0 one at a time. At every step the result
# of the previous step is collapsed into a single "reference" batch and the
# new batch is CCA-integrated against it, separately for RNA and ADT.
for (step in 2:8) {
    new_batch <- paste0('p', step + 1, '_0')

    # Load the previous step from the same location SaveH5Seurat writes to
    # (<output_dir>/step_<n>/obj.h5seurat).
    obj <- LoadH5Seurat(pj(output_dir, paste0('step_', step - 1), "obj.h5seurat"))

    # ---- new batch ----
    rna <- rna_all[, rna_all[['batch']] == new_batch]
    rna <- drop_meta_cols(rna, stale_qc_cols)

    adt <- GetAssayData(object = adt_all[, adt_all[['batch']] == new_batch], assay = "adt")
    adt <- CreateSeuratObject(counts = adt, assay = "adt")
    adt <- drop_meta_cols(adt, stale_qc_cols)

    # ---- reference: integrated values of the previous step ----
    # The integrated assay data are placed in the counts slot of a fresh
    # object and all of its cells are labelled as one batch, 'reference'.
    rna_ref <- GetAssayData(object = obj, assay = "rna_int")
    rna_ref <- CreateSeuratObject(counts = rna_ref, assay = "rna")
    rna_ref[['l1']] <- obj[['l1']]
    rna_ref[['batch']] <- 'reference'
    rna_ref <- drop_meta_cols(rna_ref, c("nCount_RNA", "nFeature_RNA"))

    adt_ref <- GetAssayData(object = obj, assay = "adt_int")
    adt_ref <- CreateSeuratObject(counts = adt_ref, assay = "adt")
    adt_ref[['l1']] <- obj[['l1']]
    adt_ref[['batch']] <- 'reference'
    adt_ref <- drop_meta_cols(adt_ref, c("nCount_ADT", "nFeature_ADT"))

    rna <- merge(rna_ref, rna)
    adt <- merge(adt_ref, adt)

    # Multimodal object for this step: merged RNA plus merged ADT counts.
    obj <- rna
    obj[["adt"]] <- CreateAssayObject(counts = adt@assays$adt@counts)

    # ---- RNA integration: reference vs. new batch ----
    obj_rna <- GetAssayData(object = obj, assay = "rna")
    obj_rna <- CreateSeuratObject(counts = obj_rna, assay = "rna")
    obj_rna[['l1']] <- rna[['l1']]
    obj_rna[['batch']] <- rna[['batch']]

    obj_rna.list <- SplitObject(obj_rna, split.by = "batch")

    # Only the new batch is normalised here; the reference is left as loaded
    # from the previous step's integrated assay.
    obj_rna.list[[new_batch]] <- NormalizeData(obj_rna.list[[new_batch]])

    obj_rna.list <- lapply(X = obj_rna.list, FUN = function(x) {
        FindVariableFeatures(x, nfeatures = 5000)
    })
    rna_features <- SelectIntegrationFeatures(object.list = obj_rna.list, nfeatures = 5000)
    obj_rna.list <- lapply(X = obj_rna.list, FUN = function(x) {
        x <- ScaleData(x, features = rna_features, verbose = FALSE)
        RunPCA(x, features = rna_features, verbose = FALSE, reduction.name = "pca")
    })
    rna.anchors <- FindIntegrationAnchors(
        object.list = obj_rna.list,
        anchor.features = rna_features,
        reduction = "cca")
    rna.combined <- IntegrateData(anchorset = rna.anchors)

    obj[["rna_int"]] <- GetAssay(rna.combined, assay = "integrated")
    DefaultAssay(obj) <- "rna_int"
    obj <- ScaleData(obj, verbose = FALSE)
    obj <- RunPCA(obj, reduction.name = "pca_cca_rna")

    # ---- ADT integration: reference vs. new batch ----
    obj_adt <- GetAssayData(object = obj, assay = "adt")
    obj_adt <- CreateSeuratObject(counts = obj_adt, assay = "adt")
    obj_adt[['l1']] <- obj[['l1']]
    obj_adt[['batch']] <- obj[['batch']]
    obj_adt.list <- SplitObject(obj_adt, split.by = "batch")
    # As for RNA, only the new batch is (CLR-)normalised.
    obj_adt.list[[new_batch]] <- NormalizeData(obj_adt.list[[new_batch]], normalization.method = "CLR", margin = 2)
    obj_adt.list <- lapply(X = obj_adt.list, FUN = function(x) {
        FindVariableFeatures(x)
    })

    adt_features <- SelectIntegrationFeatures(object.list = obj_adt.list)
    obj_adt.list <- lapply(X = obj_adt.list, FUN = function(x) {
        x <- ScaleData(x, features = adt_features, verbose = FALSE)
        RunPCA(x, features = adt_features, verbose = FALSE, reduction.name = "pca")
    })
    adt.anchors <- FindIntegrationAnchors(
        object.list = obj_adt.list,
        anchor.features = adt_features,
        reduction = "cca")
    adt.combined <- IntegrateData(anchorset = adt.anchors)

    obj[["adt_int"]] <- GetAssay(adt.combined, assay = "integrated")
    DefaultAssay(obj) <- "adt_int"
    obj <- ScaleData(obj, verbose = FALSE)
    obj <- RunPCA(obj, reduction.name = "pca_cca_adt")

    # ---- WNN on the first K components of both integrated PCAs ----
    obj <- FindMultiModalNeighbors(obj, list("pca_cca_rna", "pca_cca_adt"),
                                        list(1:K, 1:K))
    # save connectivity matrices for benchmarking (self-connections zeroed)
    connectivities <- obj$wsnn
    diag(connectivities) <- 0
    mkdir(pj(output_dir, paste0('step_', step)), remove_old = FALSE)
    invisible(writeMM(connectivities, pj(output_dir, paste0('step_', step), "connectivities.mtx")))

    obj <- RunUMAP(obj, nn.name = "weighted.nn", reduction.name = "umap")
    # Saved object is the input of the next iteration.
    SaveH5Seurat(obj, pj(output_dir, paste0('step_', step), "obj.h5seurat"), overwrite = TRUE)

    # obj <- LoadH5Seurat(pj(output_dir, "obj.h5seurat"), assays = "adt", reductions = "umap")

    # dim_plot(obj, w = L, h = L, reduction = 'umap', no_axes = T,
    #     split.by = NULL, group.by = "orig.ident", label = F, repel = T, label.size = 4, pt.size = 0.1, cols = col_4, legend = F,
    #     save_path = pj(output_dir, paste0('step_',step), paste('cca', "merged_batch", sep = "_")))

    # dim_plot(obj, w = L, h = L, reduction = 'umap', no_axes = T,
    #     split.by = NULL, group.by = "l1", label = F, repel = T, label.size = 4, pt.size = 0.1, cols = col_8, legend = F,
    #     save_path = pj(output_dir, paste0('step_',step),paste('cca', "merged_label", sep = "_")))
}