-
Notifications
You must be signed in to change notification settings - Fork 17
/
findDoubletClusters.R
224 lines (204 loc) · 10.7 KB
/
findDoubletClusters.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
#' Detect doublet clusters
#'
#' Identify potential clusters of doublet cells based on whether they have intermediate expression profiles,
#' i.e., their profiles lie between two other \dQuote{source} clusters.
#'
#' @param x A numeric matrix-like object of count values,
#' where each column corresponds to a cell and each row corresponds to an endogenous gene.
#'
#' Alternatively, a \linkS4class{SummarizedExperiment} or \linkS4class{SingleCellExperiment} object containing such a matrix.
#' @param clusters A vector of length equal to \code{ncol(x)}, containing cluster identities for all cells.
#' If \code{x} is a SingleCellExperiment, this is taken from \code{\link{colLabels}(x)} by default.
#' @param subset.row See \code{?"\link{scran-gene-selection}"}.
#' @param threshold A numeric scalar specifying the FDR threshold with which to identify significant genes.
#' @param ... For the generic, additional arguments to pass to specific methods.
#'
#' For the ANY method, additional arguments to pass to \code{\link{findMarkers}}.
#'
#' For the SummarizedExperiment method, additional arguments to pass to the ANY method.
#'
#' For the SingleCellExperiment method, additional arguments to pass to the SummarizedExperiment method.
#' @param assay.type A string specifying which assay values to use, e.g., \code{"counts"} or \code{"logcounts"}.
#' @param get.all.pairs Logical scalar indicating whether statistics for all possible source pairings should be returned.
#'
#' @return
#' A \linkS4class{DataFrame} containing one row per query cluster with the following fields:
#' \describe{
#' \item{\code{source1}:}{String specifying the identity of the first source cluster.}
#' \item{\code{source2}:}{String specifying the identity of the second source cluster.}
#' \item{\code{num.de}:}{Integer, number of genes that are significantly non-intermediate
#' in the query cluster compared to the two putative source clusters.}
#' \item{\code{median.de}:}{Integer, median number of genes that are significantly non-intermediate
#' in the query cluster across all possible source cluster pairings.}
#' \item{\code{best}:}{String specifying the identify of the top gene with the lowest p-value
#' against the doublet hypothesis for this combination of query and source clusters.}
#' \item{\code{p.value}:}{Numeric, containing the adjusted p-value for the \code{best} gene.}
#' \item{\code{lib.size1}:}{Numeric, ratio of the median library sizes for the first source cluster to the query cluster.}
#' \item{\code{lib.size2}:}{Numeric, ratio of the median library sizes for the second source cluster to the query cluster.}
#' \item{\code{prop}:}{Numeric, proportion of cells in the query cluster.}
#' \item{\code{all.pairs}:}{A \linkS4class{SimpleList} object containing the above statistics
#' for every pair of potential source clusters, if \code{get.all.pairs=TRUE}.}
#' }
#' Each row is named according to its query cluster.
#'
#' @details
#' This function detects clusters of doublet cells in a manner similar to the method used by Bach et al. (2017).
#' For each \dQuote{query} cluster, we examine all possible pairs of \dQuote{source} clusters,
#' hypothesizing that the query consists of doublets formed from the two sources.
#' If so, gene expression in the query cluster should be strictly intermediate
#' between the two sources after library size normalization.
#'
#' We apply pairwise t-tests to the normalized log-expression profiles to reject this null hypothesis.
#' This is done by identifying genes that are consistently up- or down-regulated in the query compared to \emph{both} sources.
#' We count the number of genes that reject the null hypothesis at the specified FDR \code{threshold}.
#' For each query cluster, the most likely pair of source clusters is that which minimizes the number of significant genes.
#'
#' Potential doublet clusters are identified using the following characteristics, in order of importance:
#' \itemize{
#' \item Low number of significant genes (i.e., \code{num.de}).
#' Ideally, \code{median.de} is also high to indicate that the absence of strong DE is not due to a lack of power.
#' \item A reasonable proportion of cells in the cluster, i.e., \code{prop}.
#' This requires some expectation of the doublet rate in the experimental protocol.
#' \item Library sizes of the source clusters that are below that of the query cluster, i.e., \code{lib.size*} values below unity.
#' This assumes that the doublet cluster will contain more RNA and have more counts than either of the two source clusters.
#' }
#'
#' For each query cluster, the function will only report the pair of source clusters with the lowest \code{num.de}.
#' Setting \code{get.all.pairs=TRUE} will retrieve statistics for all pairs of potential source clusters.
#' This can be helpful for diagnostics to identify relationships between specific clusters.
#'
#' The reported \code{p.value} is of little use in a statistical sense, and is only provided for inspection.
#' Technically, it could be treated as the Simes combined p-value against the doublet hypothesis for the query cluster.
#' However, this does not account for the multiple testing across all pairs of clusters for each chosen cluster,
#' especially as we are chosing the pair that is most concordant with the doublet null hypothesis.
#'
#' We use library size normalization (via \code{\link{librarySizeFactors}}) even if existing size factors are present.
#' This is because intermediate expression of the doublet cluster is not guaranteed for arbitrary size factors.
#' For example, expression in the doublet cluster will be higher than that in the source clusters if normalization was performed with spike-in size factors.
#'
#' @author
#' Aaron Lun
#'
#' @references
#' Bach K, Pensa S, Grzelak M, Hadfield J, Adams DJ, Marioni JC and Khaled WT (2017).
#' Differentiation dynamics of mammary epithelial cells revealed by single-cell RNA sequencing.
#' \emph{Nat Commun.} 8, 1:2128.
#'
#' @seealso
#' \code{\link{findMarkers}}, to detect DE genes between clusters.
#'
#' @examples
#' # Mocking up an example.
#' library(SingleCellExperiment)
#' sce <- mockDoubletSCE(c(200,300,200))
#'
#' # Compute doublet-ness of each cluster:
#' dbl <- findDoubletClusters(counts(sce), sce$cluster)
#' dbl
#'
#' # Narrow this down to clusters with very low 'N':
#' library(scuttle)
#' isOutlier(dbl$num.de, log=TRUE, type="lower")
#'
#' # Get help from "lib.size" below 1.
#' dbl$lib.size1 < 1 & dbl$lib.size2 < 1
#'
#' @name findDoubletClusters
NULL
#' @importFrom scuttle librarySizeFactors logNormCounts
#' @importFrom scran findMarkers .logBH
#' @importFrom BiocGenerics "sizeFactors<-" sizeFactors
#' @importFrom stats p.adjust median
#' @importFrom methods as
#' @importClassesFrom S4Vectors SimpleList
.doublet_cluster <- function(x, clusters, subset.row=NULL, threshold=0.05, get.all.pairs=FALSE, ...) {
if (length(unique(clusters)) < 3L) {
stop("need at least three clusters to detect doublet clusters")
}
# Computing normalized counts using the library size (looking for compositional differences!)
sce <- SingleCellExperiment(list(counts=x))
sizeFactors(sce) <- librarySizeFactors(x, subset_row=subset.row)
sce <- logNormCounts(sce)
degs <- findMarkers(sce, clusters, subset.row=subset.row, full.stats=TRUE, ...)
med.lib.size <- vapply(split(sizeFactors(sce), clusters), FUN=median, FUN.VALUE=0)
n.cluster <- table(clusters)/length(clusters)
# Setting up the output.
all.clusters <- names(degs)
collected.top <- collected.all <- vector("list", length(all.clusters))
names(collected.top) <- names(collected.all) <- all.clusters
# Running through all pairs of clusters and testing against the third cluster.
for (ref in all.clusters) {
ref.stats <- degs[[ref]]
remnants <- setdiff(all.clusters, ref)
num <- length(remnants) * (length(remnants) - 1L)/2L
all.N <- med.N <- all.gene <- all.parent1 <- all.parent2 <- integer(num)
all.p <- numeric(num)
idx <- 1L
for (i1 in seq_along(remnants)) {
stats1 <- ref.stats[[paste0("stats.", remnants[i1])]]
for (i2 in seq_len(i1-1L)) {
stats2 <- ref.stats[[paste0("stats.", remnants[i2])]]
# Obtaining the IUT and setting opposing log-fold changes to 1.
max.log.p <- pmax(stats1$log.p.value, stats2$log.p.value)
max.log.p[sign(stats1$logFC) != sign(stats2$logFC)] <- 0
# Correcting across genes. We use [1] to get NA when there are
# no genes, which avoids an nrow() mismatch in DataFrame().
log.adj.p <- .logBH(max.log.p)
best.gene <- which.min(max.log.p)[1]
all.N[idx] <- sum(log.adj.p <= log(threshold), na.rm=TRUE)
all.gene[idx] <- best.gene
all.p[idx] <- exp(log.adj.p[best.gene])
all.parent1[idx] <- i1
all.parent2[idx] <- i2
idx <- idx + 1L
}
}
# Formatting the output.
parent1 <- remnants[all.parent1]
parent2 <- remnants[all.parent2]
stats <- DataFrame(source1=parent1, source2=parent2,
num.de=all.N,
median.de=rep(0, length(all.N)), # placeholder, see below.
best=rownames(ref.stats)[all.gene],
p.value=all.p,
lib.size1=unname(med.lib.size[parent1]/med.lib.size[ref]),
lib.size2=unname(med.lib.size[parent2]/med.lib.size[ref]))
o <- order(all.N, -all.p)
top <- cbind(stats[o[1],], prop=n.cluster[[ref]])
med.de <- median(all.N)
top$median.de <- med.de
rownames(top) <- ref
collected.top[[ref]] <- top
if (get.all.pairs) {
stats$median.de <- NULL
collected.all[[ref]] <- stats[o,]
}
}
# Returning the DataFrame of compiled results.
out <- do.call(rbind, collected.top)
if (get.all.pairs) {
out$all.pairs <- as(collected.all, "SimpleList")
}
out[order(out$num.de),]
}
##############################
# S4 method definitions here #
##############################
#' @export
#' @rdname findDoubletClusters
setGeneric("findDoubletClusters", function(x, ...) standardGeneric("findDoubletClusters"))
#' @export
#' @rdname findDoubletClusters
setMethod("findDoubletClusters", "ANY", .doublet_cluster)
#' @export
#' @rdname findDoubletClusters
#' @importFrom SummarizedExperiment assay
setMethod("findDoubletClusters", "SummarizedExperiment", function(x, ..., assay.type="counts") {
.doublet_cluster(assay(x, i=assay.type), ...)
})
#' @export
#' @rdname findDoubletClusters
#' @importFrom SingleCellExperiment colLabels
setMethod("findDoubletClusters", "SingleCellExperiment", function(x, clusters=colLabels(x, onAbsence="error"), ...) {
callNextMethod(x=x, clusters=clusters, ...)
})