Goal: assess performance of tensor decompsition, and in particular the "how" paramater, in the presence of changing/missing cell types across contexts

In [1]:
suppressPackageStartupMessages({
    library(splatter)
    
    library(scater)
    library(scran)
    library(bluster)
    
    library(reshape2)
    library(StabEco, quietly = T)
    library(igraph, quietly = T)
    
    library(liana, quietly = T)
    library(tibble, quietly = T)

})

seed <- 888
set.seed(seed)
n.cores <- 15

In [2]:
# generate a scale-free, undirected, bipartite PPI network
# emulate from c2c_sim (https://github.com/hmbaghdassarian/c2c_sim) based on Simulate.LR_network method
# with same parameters as in first tensor-cell2cell paper (https://github.com/hmbaghdassarian/tc2c_analyses_1/tree/master/notebooks/time_simulation)
generate.lr.ppi<-function(lr.genes, seed = 888){
    alpha<-2
    degrees<-3
    edges<-NULL

    set.seed(seed)
    B = StabEco::BiGraph$new(n1=length(lr.genes)/2, beta=alpha, k=degrees, m=edges, 
                             type = 'bipartite_sf', directed = T, is_adj = F)# simulate
    G = B$get_graph()  # adjacency matrix

    if (!isSymmetric(G)){stop('Not a bipartite network')}

    node_groups = list(ligand = 1:B$n1, receptor = (B$n1+1):(B$n1+B$n2))

    G[lower.tri(G, diag = TRUE)] <- NA # symmetric
    G<-reshape2::melt(G) #adjacency list
    G = G[!is.na(G$value), ] # remove symmetric bidirectionality
    G = G[G$value != 0, ] # remove disconnected nodes

    colnames(G)<-c('Ligand', 'Receptor', 'Interaction.Strength')
    rownames(G)<-NULL
    G<-G[, c('Ligand', 'Receptor')]

    # map to simulated dataset gene names
    # the nodes must be a subset of those that are interacting
    if ((length(setdiff(G$Ligand, node_groups$ligand)) > 0) | (length(setdiff(G$Receptor, node_groups$receptor)) > 0)){
        stop('Something went wrong in ligand & receptor assignment')
    }
    ligand.map<-setNames(lr.genes[1:B$n1], node_groups$ligand)
    receptor.map<-setNames(lr.genes[(B$n1+1):(B$n1+B$n2)], node_groups$receptor)
    lr.map<-c(ligand.map, receptor.map)
    G[['Ligand']]<-ligand.map[as.character(G$Ligand)]
    G[['Receptor']]<-receptor.map[as.character(G$Receptor)]

    if (length(intersect(G$Ligand, G$Receptor)) != 0){stop('Not a bipartite network')}
    
    # format for LIANA
    colnames(G)<-c('source_genesymbol', 'target_genesymbol')
    G<-as_tibble(G)
    
    return(G)
    
}

qc.data<-function(sce){
    # taken from PMID: 34949812
    
    # QC of cells
    sce <- scater::addPerCellQC(sce) # typical QC as in batch correction paper
    discard <- lapply(unique(colData(sce)$Batch), function(batch) {
        in_batch <- colData(sce)$Batch == batch
        scater::quickPerCellQC(colData(sce)[in_batch, ], nmads = 2)$discard
    })
    discard <- unlist(discard)
    colData(sce)$Discard <- discard
    sce <- sce[, !discard]

    # QC of genes
    sce <- scater::addPerFeatureQC(sce)
    is_exprs <- rowData(sce)$detected >= 0.01
    sce <- sce[is_exprs, ]
    
    return(sce)
}

random.omit<-function(sim, md.group.label, frac, seed = 888){
    md.group<-unique(sim[[md.group.label]])
    n.md.group<-length(md.group)
    set.seed(seed)
    omit.md.group = sort(as.character(sample(md.group, size = frac*n.md.group, replace = FALSE)))
    
    return(omit.md.group)
}

split.by.context<-function(sim){
    sim.bc<-list()
    contexts<-unique(sim[['Batch']])
    for (context in contexts){
        bc<-rownames(colData(sim)[(colData(sim)$Batch == context),])
        sim.bc[[context]]<-sim[, bc]
    }
    return(sim.bc)
}

score.communication.sce<-function(sce, lr.ppi, seed = 888){
    communication.scores<-liana_wrap(sce = sce, 
                                       method = c('natmi', 'sca'), 
                                       idents_col = 'Group', 
                                       assay.type = 'logcounts',
                                       expr_prop = 0.1, # liana default
                                       seed = seed,
                                       parallelize = T, 
                                       workers = n.cores, 
                                       permutation.params = list(nperms = 1), # since we don't use p-values
                                       resource = 'custom',
                                       external_resource = lr.ppi
                                      )

    # filter for columns of interest and format
    communication.scores[['natmi']] <- communication.scores$natmi[,c('source', 'target', 'ligand', 'receptor', 'prod_weight')]
    communication.scores[['sca']] <- communication.scores$sca[,c('source', 'target', 'ligand', 'receptor', 'LRscore')]

    colnames(communication.scores$natmi) <- c('source', 'target', 'ligand', 'receptor', 'score')
    colnames(communication.scores$sca) <- c('source', 'target', 'ligand', 'receptor', 'score')

    if (min(communication.scores$natmi$score) < 0){stop('Unexpected negative score')}
    if (min(communication.scores$sca$score) < 0){stop('Unexpected negative score')}
    
    return(communication.scores)
}

# stuff to run on both sim.gold.list and sim.omit.list
score.communication<-function(sim.list, lr.ppi, do.contexts=NULL){
    if (!is.null(do.context)){
        sim.list <- sim.list[do.contexts]
    }
    
    #### score communication
    suppressMessages({
        suppressWarnings({
            score.list<-lapply(sim.list, FUN = function(sce) score.communication.sce(sce = sce, lr.ppi = lr.ppi))
            names(score.list)<-names(sim.list)
        })
    })
        
    # separate into the two scoring methods
    natmi.scores<-list()
    sca.scores<-list()
    for (context in names(sim.list)){
        natmi.scores[[context]]<-score.list[[context]]$natmi
        sca.scores[[context]]<-score.list[[context]]$sca
    }
    
    all.scores <- list(natmi.scores = natmi.scores, sca.scores = sca.scores)
    return(all.scores)
}

Simulation parameters:

In [3]:
base_params <- newSplatParams()
n.cells<-2e3#60e3
n.cell.types<-10#10
n.genes<-2e3
n.lrs<-2e2

n.batches<-5 # of contexts

sim_params <- setParams(
    base_params,
    seed = seed,
    nGenes = n.genes,
    
    # batches
    batchCells = rep(n.cells/n.batches, n.batches),
    batch.rmEffect = TRUE, # no batch effects, only used as Context medata
    
    # cell types
    group.prob = rep(1/n.cell.types, n.cell.types), 
)

Gold standard CCC tensor with no omitted cell types or LR pairs:

In [4]:
# create the gold standard dataset
sim.gold<-splatSimulateGroups(sim_params, verbose = F)

# run basic qc 
sim.gold<-qc.data(sim.gold)

# generate a LR PPI on a subset of the genes
set.seed(seed)
lr.genes<-sort(as.character(sample(rownames(sim.gold), size = n.lrs, replace = FALSE)))
lr.ppi<-generate.lr.ppi(lr.genes)
interacting.lr.genes<-unique(c(lr.ppi$source_genesymbol, lr.ppi$target_genesymbol))

#####

# split by context -- needed for LR omission and scoring
sim.gold.list<-split.by.context(sim.gold) 

# log-normalize
sim.gold.list<-lapply(sim.gold.list, FUN = function(sce) {
    sce <- scater::logNormCounts(sce)
})

scores.gold<-score.communication(sim.gold.list, lr.ppi)

ERROR: Error in score.communication(sim.gold.list, lr.ppi): object 'do.context' not found


In [None]:
# for loop

# fraction of .* to omit
frac.cts = seq(0.1, 0.6, 0.1) # fraction of cell types to be missing
frac.lrs = seq(0.1, 0.6, 0.1)
frac.contexts = seq(0.2, 0.6, 0.2) # fraction of contexts to omit cell types

# same or different .* to omit across contexts -- true/same is more difficult problem
consistent.remove.cts = c(TRUE, FALSE) 
consistent.remove.lrs = c(TRUE, FALSE)


#### practice params
frac.ct = 0.2
frac.lr = 0.2
frac.context = 0.2
cr.ct = F
cr.lr = F

In [None]:
# run.omitted.sim<-function(sim.gold.list, interacting.lr.genes, params){}
# select contexts, cell types, and LRs to omit-------------------------------------------------------------
seed.iter<-1
omit.contexts.ct <- random.omit(sim = sim.gold, md.group.label = 'Batch', frac = frac.context, seed = seed.iter)
seed.iter<-seed.iter + 1
omit.contexts.lr <- random.omit(sim = sim.gold, md.group.label = 'Batch', frac = frac.context, seed = seed.iter)
seed.iter<-seed.iter + 1

omit.cts <- list()
if (cr.ct){
    ocs <- random.omit(sim = sim.gold, md.group.label = 'Group', frac = frac.ct, seed = seed.iter)
    seed.iter<-seed.iter + 1
    for (oc in omit.contexts.ct){
        omit.cts[[oc]] = ocs
    }
}else{
    for (oc in omit.contexts.ct){
        omit.cts[[oc]] = random.omit(sim = sim.gold, md.group.label = 'Group', frac = frac.ct, seed = seed.iter)
        seed.iter<-seed.iter + 1
    }
}

omit.lrs <- list()
if (cr.lr){
    set.seed(seed.iter)
    olrs<-sort(as.character(sample(interacting.lr.genes, size = frac.lr*length(interacting.lr.genes), replace = FALSE)))
    seed.iter<-seed.iter + 1
    for (oc in omit.contexts.lr){
        omit.lrs[[oc]] = olrs
    }
}else{
    for (oc in omit.contexts.lr){
        set.seed(seed.iter)
        olrs<-as.character(sample(interacting.lr.genes, size = frac.lr*length(interacting.lr.genes), replace = FALSE))
        omit.lrs[[oc]] = olrs
        seed.iter<-seed.iter + 1
    }
}
# ----------------------------------------------------------------------------------------------------------
set.seed(seed) # reset to main

# apply the omissions
sim.omit.list<-sim.gold.list

for (context in names(omit.cts)){
    sce.omit <- sim.omit.list[[context]]
    barcodes.keep = rownames(colData(sce.omit)[!(colData(sce.omit)$Group %in% cell.types.omit),])
    sim.omit.list[[context]] <- sce.omit[, barcodes.keep]
}

for (context in names(omit.lrs)){
    sce.omit<-sim.omit.list[[context]]
    sim.omit.list[[context]]<-sce.omit[!(rownames(sce.omit) %in% omit.lrs[[context]]), ]
}

# get the communication scores------------------------------------------------------------------------------
# only re-calculate scores on altered contexts to save computation time

changed.contexts<-c(omit.contexts.ct, omit.contexts.lr)
scores.omit<-score.communication(sim.omit.list, lr.ppi, changed.contexts)

scores.omit[['natmi.scores']]<-c(scores.omit$natmi.scores, 
                                  scores.gold$natmi.scores[!(names(scores.gold$natmi.scores) %in% changed.contexts)])
scores.omit[['sca.scores']]<-c(scores.omit$sca.scores, 
                                  scores.gold$sca.scores[!(names(scores.gold$sca.scores) %in% changed.contexts)])
scores.omit<-lapply(scores.omit, function(x) x[sort(names(x))])

# build tensor
# calculate and record total fraction of missing indices
# decompose tensor
# calculate corrindex
