Goal: assess the performance of tensor decomposition, and in particular the "how" parameter, in the presence of changing/missing cell types across contexts

In [1]:
suppressPackageStartupMessages({
    library(splatter)
    
    library(scater)
    library(scran)
    library(bluster)
    
    library(reshape2)
    library(StabEco, quietly = T)
    library(igraph, quietly = T)
    
    library(liana, quietly = T)

})

# seed shared by the rest of the notebook; also initializes the RNG state here
seed <- 888
set.seed(seed)

In [2]:
# Verify that no edge of G connects two nodes belonging to the same group.
#   node_groups: list with elements `ligand` and `receptor` holding the node ids
#   G: edge table whose endpoints are stored in the columns 'Var1' and 'Var2'
# Stops with 'Not bipartite' when a ligand-ligand or receptor-receptor edge
# exists; otherwise returns NULL invisibly.
check.bipartite<-function(node_groups, G){
    edge.from <- G[['Var1']]
    edge.to <- G[['Var2']]

    # TRUE when at least one edge has both of its endpoints inside `members`
    has.within.group.edge <- function(members){
        any(edge.from %in% members & edge.to %in% members)
    }

    violations <- c(has.within.group.edge(node_groups$ligand),
                    has.within.group.edge(node_groups$receptor))

    if (any(violations)){stop('Not bipartite')}
    invisible(NULL)
}


# generate a scale-free, undirected, bipartite PPI network
# emulate from c2c_sim (https://github.com/hmbaghdassarian/c2c_sim) based on Simulate.LR_network method
# with same parameters as in first tensor-cell2cell paper (https://github.com/hmbaghdassarian/tc2c_analyses_1/tree/master/notebooks/time_simulation)
#
# Arguments:
#   lr.genes: gene names; the first B$n1 entries are mapped to the ligand nodes and
#             the next B$n2 entries to the receptor nodes
#             (NOTE(review): n1 is length(lr.genes)/2, so an even length is presumably expected -- confirm)
#   alpha, degrees, edges: passed to StabEco::BiGraph$new as `beta`, `k` and `m`
#   rng.seed: value given to set.seed() before simulating; defaults to the global `seed`
# Returns: data.frame with the columns 'Out' and 'In' (gene names), one row per
#          nonzero entry of the simulated adjacency matrix
generate.lr.ppi<-function(lr.genes, alpha = 2, degrees = 3, edges = NULL, rng.seed = seed){

    set.seed(rng.seed)
    B <- StabEco::BiGraph$new(n1=length(lr.genes)/2, beta=alpha, k=degrees, m=edges,
                              type = 'bipartite_sf', directed = FALSE, is_adj = TRUE)# simulate
    G <- B$get_graph()  # adjacency matrix

    node_groups <- list(ligand = 1:B$n1, receptor = (B$n1+1):(B$n1+B$n2))
    G <- reshape2::melt(G) #adjacency list
    G <- G[G$value != 0, ] # keep only the nonzero entries, i.e. the actual edges
    # NOTE(review): if the adjacency matrix is symmetric, every edge is listed in both
    # orientations, so 'Out' would also contain receptors -- confirm against StabEco

#     check.bipartite(node_groups, G)
    colnames(G) <- c('Out', 'In', 'Interaction.Strength')
    rownames(G) <- NULL
    G <- G[, c('Out', 'In')]

    # map to simulated dataset gene names
    ligand.map <- setNames(lr.genes[1:B$n1], node_groups$ligand)
    receptor.map <- setNames(lr.genes[(B$n1+1):(B$n1+B$n2)], node_groups$receptor)
    lr.map <- c(ligand.map, receptor.map)
    # the lookup is positional: it assumes melt() produced integer node indices, which
    # coincide with the positions in lr.map -- TODO confirm the matrix has no dimnames
    G[['Out']] <- lr.map[G$Out]
    G[['In']] <- lr.map[G$In]

    return(G)

}

# Basic quality control of cells and genes (procedure taken from PMID: 34949812).
#   sce: object accepted by scater::addPerCellQC whose colData has a `Batch` column
# Returns sce with the discarded cells removed (the flags are first stored in
# colData(sce)$Discard) and with the genes below the detection cutoff removed.
qc.data<-function(sce){
    # QC of cells, evaluated separately within each batch
    sce <- scater::addPerCellQC(sce) # typical QC as in batch correction paper
    cell.md <- colData(sce)

    # write the flags through the batch mask so they stay aligned with the columns
    # of sce even when the cells of a batch are not stored contiguously
    discard <- logical(ncol(sce))
    for (batch in unique(cell.md$Batch)){
        in_batch <- cell.md$Batch == batch
        discard[in_batch] <- as.logical(
            scater::quickPerCellQC(cell.md[in_batch, ], nmads = 2)$discard
        )
    }
    colData(sce)$Discard <- discard
    sce <- sce[, !discard]

    # QC of genes
    sce <- scater::addPerFeatureQC(sce)
    # NOTE(review): `detected` is documented as a percentage of cells; confirm that
    # 0.01 (rather than 1) is the intended cutoff
    is_exprs <- rowData(sce)$detected >= 0.01
    sce <- sce[is_exprs, ]

    return(sce)
}

# Randomly pick a fraction of the unique values of a metadata field.
#   sim: object for which sim[[md.group.label]] returns the metadata values
#   md.group.label: name of the metadata field (e.g. 'Batch' or 'Group')
#   frac: fraction of the unique values to pick; frac * (number of unique values)
#         is rounded down
#   seed: passed to set.seed() before sampling (this resets the global RNG state)
# Returns the picked values as a sorted character vector.
random.omit<-function(sim, md.group.label, frac, seed = 1){
    md.group <- unique(sim[[md.group.label]])
    n.md.group <- length(md.group)

    # the small tolerance keeps products such as 0.57*100 = 56.99999999999999 from
    # losing one element when the size is truncated to an integer
    n.omit <- floor(frac*n.md.group + sqrt(.Machine$double.eps))

    set.seed(seed)
    # index-based sampling draws the same values as sample(md.group, ...) but does not
    # reinterpret a single numeric group as the range 1:group
    picked <- md.group[sample.int(n.md.group, size = n.omit, replace = FALSE)]
    omit.md.group <- sort(as.character(picked))

    return(omit.md.group)
}

# Split an object into one object per context.
#   sim: object whose colData has a `Batch` column and that supports sim[, columns]
# Returns a list named by the Batch values (in order of first appearance); each
# element holds the columns of sim that belong to that context.
split.by.context<-function(sim){
    batch <- as.character(colData(sim)$Batch)
    contexts <- unique(batch)

    # subset with the logical mask itself rather than via cell names, so the split
    # does not depend on the columns having unique names
    sim.bc <- lapply(contexts, function(context) sim[, batch == context])
    names(sim.bc) <- contexts

    return(sim.bc)
}

# processing shared by sim.gold and sim.omit
#   sim: list of objects accepted by scater::logNormCounts (one per context)
# returns the same list with every element log-normalized
joint.processing<-function(sim){
    # log-normalize each element of the list on its own
    normalized <- lapply(sim, scater::logNormCounts)

    return(normalized)
}

Simulation parameters:

In [3]:
# sizes of the simulated dataset
base_params <- newSplatParams()
n.cells <- 2e3 #60e3
n.cell.types <- 10 #10
n.genes <- 2e3
n.lrs <- 2e2

n.batches <- 10 # number of contexts

# no trailing comma after the last argument: it would pass an empty argument
# through `...`
sim_params <- setParams(
    base_params,
    seed = seed,
    nGenes = n.genes,

    # batches: the cells are divided equally among the batches
    batchCells = rep(n.cells/n.batches, n.batches),
    batch.rmEffect = TRUE, # no batch effects, only used as Context metadata

    # cell types: equal probability for every group
    group.prob = rep(1/n.cell.types, n.cell.types)
)

Gold standard CCC tensor with no omitted cell types or LR pairs:

In [None]:
# create the gold standard dataset
sim.gold <- splatSimulateGroups(sim_params, verbose = FALSE)

# run basic qc (drops cells and genes, see qc.data)
sim.gold <- qc.data(sim.gold)

# randomly select the LR genes among the genes that passed QC
set.seed(seed)
lr.genes <- sort(as.character(sample(rownames(sim.gold), size = n.lrs, replace = FALSE)))
lr.ppi <- generate.lr.ppi(lr.genes)

#####
# per-context list of the gold standard, log-normalized
sim.gold.2 <- split.by.context(sim.gold) # for consistent formatting with sim.omit when omitting LRs
sim.gold.2 <- joint.processing(sim.gold.2)

In [4]:
# for loop

# fractions of each entity to omit
frac.cts <- seq(from = 0.1, to = 0.9, by = 0.1) # fraction of cell types to be missing
frac.lrs <- seq(from = 0.1, to = 0.9, by = 0.1) # fraction of LRs to be missing
frac.contexts <- seq(from = 0.1, to = 0.9, by = 0.1) # fraction of contexts to omit cell types

# whether the same (TRUE) or different (FALSE) entities are omitted across
# contexts -- true/same is more difficult problem
consistent.remove.cts <- c(TRUE, FALSE)
consistent.remove.lrs <- c(TRUE, FALSE)


#### practice params: a single combination used while developing
frac.ct <- 0.2
frac.lr <- 0.2
frac.context <- 0.2
cr.ct <- FALSE
cr.lr <- FALSE

In [6]:
# Build sim.omit: a per-context list derived from sim.gold in which selected cell
# types and LR genes have been removed from selected contexts.
# run.omitted.sim<-function(sim.gold, lr.genes, params){}
# select contexts, cell types, and LRs to omit-------------------------------------------------------------
# contexts that lose cell types (default seed of random.omit) and contexts that lose LRs (seed 2)
omit.contexts.ct <- random.omit(sim = sim.gold, md.group.label = 'Batch', frac = frac.context)
omit.contexts.lr <- random.omit(sim = sim.gold, md.group.label = 'Batch', frac = frac.context, seed =2)

# omit.cts: named list, context -> cell types to remove from that context
omit.cts <- list()
if (cr.ct){
    # consistent removal: one draw of cell types, reused for every selected context
    ocs <- random.omit(sim = sim.gold, md.group.label = 'Group', frac = frac.ct)
    for (oc in omit.contexts.ct){
        omit.cts[[oc]] = ocs
    }
}else{
    # independent removal: a separate draw per context, using seeds 1, 2, ...
    seed_ <- 1
    for (oc in omit.contexts.ct){
        omit.cts[[oc]] = random.omit(sim = sim.gold, md.group.label = 'Group', frac = frac.ct, seed = seed_)
        seed_ <- seed_ + 1
    }
}

# omit.lrs: named list, context -> LR genes to remove from that context
omit.lrs <- list()
if (cr.lr){
    # consistent removal: one draw of LR genes, reused for every selected context
    set.seed(1)
    olrs<-sort(as.character(sample(lr.genes, size = frac.lr*length(lr.genes), replace = FALSE)))
    for (oc in omit.contexts.lr){
        omit.lrs[[oc]] = olrs
    }
}else{
    # independent removal: a separate draw per context, using seeds 1, 2, ...
    # (unlike the branch above, the drawn genes are not sorted)
    seed_ <- 1
    for (oc in omit.contexts.lr){
        set.seed(seed_)
        olrs<-as.character(sample(lr.genes, size = frac.lr*length(lr.genes), replace = FALSE))
        omit.lrs[[oc]] = olrs
        seed_ <- seed_ + 1
    }
}
# ----------------------------------------------------------------------------------------------------------
set.seed(seed) # reset to as top of script

# apply the omissions
sim.omit<-sim.gold

# omit the cell types across context 
# collect the names of the cells that are in a selected context AND of an omitted cell type
barcodes.exclude<-c()
for (context in names(omit.cts)){
    cell.types.omit = omit.cts[[context]]
    be = rownames(colData(sim.omit)[(colData(sim.omit)$Batch == context & colData(sim.omit)$Group %in% cell.types.omit),])
    barcodes.exclude<-c(barcodes.exclude, be)
}
# sanity check: no cell name may have been collected more than once
if (length(unique(barcodes.exclude)) != length(barcodes.exclude)){
    stop('Something went wrong in excluding barcodes')
}

sim.omit<-sim.omit[, !(colnames(sim.omit) %in% barcodes.exclude)]

# omit the LRs across context 
sim.omit<-split.by.context(sim.omit) # split by context prior to filtering lrs in order to log-normalize appropriately

# drop the omitted LR genes (rows) from each selected context
for (context in names(omit.lrs)){
    so<-sim.omit[[context]]
    sim.omit[[context]]<-so[!(rownames(so) %in% omit.lrs[[context]]), ]
}

sim.omit<-joint.processing(sim.omit) # TODO: complete this

In [7]:
# processing shared by sim.gold and sim.omit
#   sim: list of objects accepted by scater::logNormCounts (one per context)
# currently returns the list with every element log-normalized; the remaining
# steps below are placeholders that are not implemented yet
joint.processing<-function(sim){
    # log-normalize each element of the list on its own
    normalized <- lapply(sim, scater::logNormCounts)

    # score communication

    # build tensor

    # decompose tensor

    return(normalized) # tensor
}