-
Notifications
You must be signed in to change notification settings - Fork 17
/
atac.R
316 lines (293 loc) · 12.8 KB
/
atac.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
#' amulet
#'
#- A reimplementation of the Amulet doublet detection method for single-cell
#' ATACseq (Thibodeau, Eroglu, et al., Genome Biology 2021). The rationale is
#' that cells with unexpectedly many loci covered by more than two reads are
#' more likely to be doublets.
#'
#' @param x The path to a fragments file, or a GRanges object containing the
#' fragments (with the `name` column containing the barcode, and the `score`
#' column containing the count).
#' @param ... Any argument to \code{\link{getFragmentOverlaps}}.
#'
#' @details When used on normal (or compressed) fragment files, this
#' implementation is relatively fast (except for reading in the data) but it
#' has a large memory footprint since the overlaps are performed in memory. It
#' is therefore recommended to compress the fragment files using bgzip and index
#' them with Tabix; in this case each chromosome will be read and processed
#' separately, leading to a considerably lower memory footprint. See the
#' underlying \code{\link{getFragmentOverlaps}} for details.
#'
#' @return A data.frame including, for each barcode, the number sites covered by
#' more than two reads, the number of reads, and p- and q-values (low values
#' indicative of doublets).
#'
#' @examples
#' # here we use a dummy fragment file for example:
#' fragfile <- system.file( "extdata", "example_fragments.tsv.gz",
#' package="scDblFinder" )
#' res <- amulet(fragfile)
#'
#' @importFrom GenomicRanges reduce
#' @importFrom S4Vectors splitAsList mcols
#' @importFrom IRanges overlapsAny
#' @export
amulet <- function(x, ...){
d <- getFragmentOverlaps(x, ...)
d$p.value <- ppois(d$nAbove2, mean(d$nAbove2), lower.tail=FALSE)
d$q.value <- p.adjust(d$p.value, method="BH")
d
}
#' clamulet
#'
#' Classification-powered Amulet-like method
#'
#' @param x The path to a fragment file (see \code{\link{getFragmentOverlaps}}
#' for performance/memory-related guidelines)
#' @param artificialDoublets The number of artificial doublets to generate
#' @param iter The number of learning iterations (should be 1 to)
#' @param k The number(s) of nearest neighbors at which to gather statistics
#' @param minCount The minimum number of cells in which a locus is detected to
#' be considered. If lower than 1, it is interpreted as a fraction of the
#' number of cells.
#' @param threshold The score threshold used during iterations
#' @param maxN The maximum number of regions per cell to consider to establish
#' windows for meta-features
#' @param nfeatures The number of meta-features to consider
#' @param max_depth The maximum tree depth
#' @param returnAll Logical; whether to return data also for artificial doublets
#' @param verbose Logical; whether to print progress information
#' @param ... Arguments passed to \code{\link{getFragmentOverlaps}}
#'
#' @details
#' `clamulet` operates similarly to the `scDblFinder` method, but generates
#' doublets by operating on the fragment coverages. This has the advantage that
#' the number of loci covered by more than two reads can be computed for
#' artificial doublets, enabling the use of this feature (along with the
#' kNN-based ones) in a classification scheme. It however has the disadvantage
#' of being rather slow and memory hungry, and appears to be outperformed by a
#' simple p-value combination of the two methods (see vignette).
#'
#' @return A data.frame
#' @export
#' @importFrom IRanges Views viewMaxs slice
clamulet <- function(x, artificialDoublets=NULL, iter=2, k=NULL, minCount=0.001,
maxN=500, nfeatures=25, max_depth=5, threshold=0.75,
returnAll=FALSE, verbose=TRUE, ...){
m <- .clamulet.buildTable(x, nADbls=artificialDoublets, maxN=maxN,
nfeatures=nfeatures, verbose=verbose, ...)
d <- m$d
labels <- m$labels
if(verbose) message(format(Sys.time(), "%X"), " - Scoring network")
k <- .defaultKnnKs(k, sum(labels=="real"))
kd <- .evaluateKNN(m$m, labels, rep(NA_integer_,length(labels)), k=k)$d
d <- m <- cbind(d, kd[,grep("weighted|ratio", colnames(kd))])
m <- as.matrix(m)
if(verbose) message(format(Sys.time(), "%X"), " - Iterative training")
d$include.in.training <- TRUE
d$type <- labels
d$score <- predict(.xgbtrain(m, labels, max_depth=3), m)
max.iter <- iter
while(iter>0){
# remove cells with a high chance of being doublets from the training
w <- which(d$type=="real" & d$score > 0.6)
if(verbose) message("iter=",max.iter-iter,", ", length(w),
" cells excluded from training.")
d$score <- tryCatch({
fit <- .xgbtrain(m[-w,], d$type[-w], max_depth=max_depth, nthreads=1L)
predict(fit, m)
}, error=function(e) d$score)
iter <- iter-1
}
d$include.in.training[w] <- FALSE
if(!returnAll){
d <- d[d$type=="real",]
d$type <- NULL
}
if(verbose) message(format(Sys.time(), "%X"), " Done!")
d
}
#' @importFrom GenomicRanges seqnames
.clamulet.buildTable <- function(x, nADbls=NULL, minCount=0.001, maxN=500,
nfeatures=25, verbose=TRUE, ...){
co <- getFragmentOverlaps(x, ..., ret="coverages", fullInMemory=TRUE,
verbose=verbose)
if(verbose) message(format(Sys.time(), "%X"), " - Obtaining windows")
seqlvls <- unique(unlist(lapply(head(co,100), names)))
if(minCount<1) minCount <- max(as.integer(round(minCount*length(co))),2L)
windows <- coverage(unlist(GRangesList(lapply(co, FUN=function(x){
x <- .sliceGR(x, seqlvls=seqlvls)
if(length(x)>maxN) x <- x[sample.int(length(x),maxN)]
x
}))))
windows <- reduce(.sliceGR(windows, lower=minCount, seqlvls=seqlvls),
min.gapwidth=100L)
if(verbose) message(format(Sys.time(), "%X"), " - Obtaining window counts")
windows <- split(ranges(windows), seqnames(windows))
counts <- .getCovCounts(co, windows)
if(verbose) message(format(Sys.time(), "%X"), " - Aggregating features")
counts <- aggregateFeatures(counts, seq_len(min(ncol(counts)-1,10)),
k=min(nrow(counts), nfeatures))
if(verbose) message(format(Sys.time(), "%X"),
" - Computing features for artificial doublets")
# get combination of cells for artificial doublets
if(is.null(nADbls)) nADbls <- length(co)
comb <- matrix(sample.int(length(co), size=2*floor(nADbls*2.2), replace=TRUE),
ncol=2)
comb <- comb[comb[,1]!=comb[,2],]
comb <- head(comb[!duplicated(comb),], nADbls)
# get counts for artificial doublets
acounts <- counts[,comb[,1]]+counts[,comb[,2]]
colnames(acounts) <- paste("artificial", seq_len(ncol(acounts)))
if(verbose) message(format(Sys.time(), "%X"),
" - Counting overlaps for real cells")
d <- data.frame(row.names=c(colnames(counts), colnames(acounts)),
type=rep(factor(c("real","doublet"), c("real","doublet")),
c(ncol(counts),ncol(acounts))),
total=c(colSums(counts),colSums(acounts)))
d$total.nAbove2 <- d$nAbove2 <- 0L
# get overlaps for real cells
gr2 <- .getOvsFromCovs(co, seqlvls = seqlvls)
tt <- table(gr2$name)
d[names(tt),"total.nAbove2"] <- as.integer(tt)
gr2 <- .removeHighOverlapSites(gr2, retExclusionRanges=TRUE)
exclusion <- gr2$exclusion
gr2 <- gr2$gr
tt <- table(gr2$name)
d[names(tt),"nAbove2"] <- as.integer(tt)
if(verbose) message(format(Sys.time(), "%X"),
" - Counting overlaps for artificial doublets")
# get overlaps for artificial cells
co <- lapply(setNames(seq_len(nrow(comb)), colnames(acounts)),
FUN=function(x){
x <- comb[x,]
.sumRleLists(co[[x[1]]], co[[x[2]]])
})
gr2 <- .getOvsFromCovs(co, seqlvls=seqlvls)
tt <- table(gr2$name)
d[names(tt),"total.nAbove2"] <- as.integer(tt)
gr2 <- gr2[!overlapsAny(gr2, exclusion)]
tt <- table(gr2$name)
d[names(tt),"nAbove2"] <- as.integer(tt)
m <- t(scale(t(t(10000*cbind(counts, acounts))/d$total)))
list(m=m, d=d[,-1], labels=d$type)
}
# adds up to RleLists
#' @importFrom S4Vectors runValue
.sumRleLists <- function(x,y){
names(useqlvls) <- useqlvls <- union(names(x),names(y))
isEmptyRle <- function(x) length(runValue(x))==1
as(lapply(useqlvls, FUN=function(seql){
if(is.null(x[[seql]]) || isEmptyRle(x[[seql]])) return(y[[seql]])
if(is.null(y[[seql]]) || isEmptyRle(y[[seql]])) return(x[[seql]])
suppressWarnings(x[[seql]]+y[[seql]])
}), "RleList")
}
# obtains loci with >lower coverage from a list of coverage RleList
.getOvsFromCovs <- function(co, seqlvls=NULL, lower=3L){
if(is.null(seqlvls)) seqlvls <- unique(unlist(lapply(co,names)))
grl <- GRangesList(lapply(co, lower=lower, seqlvls=seqlvls, FUN=.sliceGR))
gr2 <- unlist(grl, use.names=FALSE)
gr2$name <- rep(factor(names(grl), names(co)),lengths(grl))
gr2
}
# obtains maximum count per window from a list of coverage RleList
#' @importFrom S4Vectors Rle
#' @importFrom GenomicRanges seqnames start end
.getCovCounts <- function(co, windows, fun=IRanges::viewMaxs){
if(is(windows, "GRanges"))
windows <- split(ranges(windows), seqnames(windows))
windows <- windows[lengths(windows)>0]
def <- as(lapply(windows, FUN=function(x) Rle(0L, max(end(x)))), "RleList")
as(sapply(co, FUN=function(x){
if(length( missing <- setdiff(names(def), names(x)) )>0)
x <- c(x, def[missing])
x <- x[names(windows)]
unlist(fun(Views(x, windows)), use.names=FALSE)
}), "CsparseMatrix")
}
# slide a RleList and returns a GR
.sliceGR <- function(x, lower=1L, seqlvls=NULL){
if(is.null(seqlvls)) seqlvls <- names(x)
x <- x[names(x) %in% seqlvls]
x <- slice(x, lower=as.integer(round(lower)), rangesOnly=TRUE)
if(sum(lengths(x))==0) return(GRanges(factor(levels=seqlvls)))
GRanges(rep(factor(names(x), seqlvls), lengths(x)),
unlist(x, use.names=FALSE))
}
#' amuletFromCounts
#'
#' A reimplementation of the Amulet doublet detection method for single-cell
#' ATACseq (Thibodeau, Eroglu, et al., Genome Biology 2021), based on tile/peak
#' counts. Note that this is only a fast approximation to the original Amulet
#' method, and *performs considerably worse*; for an equivalent implementation,
#' see \code{\link{amulet}}.
#'
#' @param x A `SingleCellExperiment` object, or a matrix of counts with cells
#' as columns. If the rows represent peaks, it is recommended to limite their
#' width (see details).
#' @param maxWidth the maximum width for a feature to be included. This is
#' ignored unless `x` is a `SingleCellExperiment` with `rowRanges`.
#' @param exclude an optional `GRanges` of regions to be excluded. This is
#' ignored unless `x` is a `SingleCellExperiment` with `rowRanges`.
#'
#' @return If `x` is a `SingleCellExperiment`, returns the object with an
#' additional `amuletFromCounts.q` colData column. Otherwise returns a vector of
#' the amulet doublet q-values for each cell.
#'
#' @details
#' The rationale for the amulet method is that a single diploid cell should not
#' have more than two reads covering a single genomic location, and the method
#' looks for cells enriched with sites covered by more than two reads.
#' If the method is applied on a peak-level count matrix, however, larger peaks
#' can however contain multiple reads even though no single nucleotide is
#' covered more than once. Therefore, in such case we recommend to limit the
#' width of the peaks used for this analysis, ideally to maximum twice the upper
#' bound of the fragment size. For example, with a mean fragment size of 250bp
#' and standard deviation of 125bp, peaks larger than 500bp are very likely to
#' contain non-overlapping fragments, and should therefore be excluded using the
#' `maxWidth` argument.
#'
#' @seealso \code{\link{amulet}}
#'
#' @importFrom IRanges width
#' @importFrom SummarizedExperiment ranges
#' @export
#' @examples
#' x <- mockDoubletSCE()
#' x <- amuletFromCounts(x)
#' table(call=x$amuletFromCounts.q<0.05, truth=x$type)
amuletFromCounts <- function(x, maxWidth=500L, exclude=c("chrM","M","Mt")){
if(is(x, "SingleCellExperiment")){
if(!is.null(ranges(x)) &&
!all(sum(IRanges::width(ranges(x)))==0L) ){
if(!is.null(exclude)){
if(is.character(exclude)){
x <- x[!(seqnames(granges(x)) %in% exclude),]
}else{
x <- x[!overlapsAny(granges(x),exclude)]
}
}
y <- counts(x)[which(IRanges::width(ranges(x))<=maxWidth),]
}else{
y <- counts(x)
}
}else if(is.matrix(x)){
y <- x
}else{
stop("Object should be a count matrix or a SingleCellExperiment")
}
libsizes <- colSums(y)
y <- y>2L
rs <- rowSums(y)
rp <- p.adjust(ppois(rs, lambda=mean(rs), lower.tail=FALSE))
y <- y[rp>0.01,]
d <- data.frame(row.names=colnames(y), nFrags=libsizes, nAbove2=colSums(y))
q <- p.adjust(ppois(d$nAbove2, lambda=mean(d$nAbove2), lower.tail=FALSE),
method="fdr")
if(is(x,"SummarizedExperiment")){
x$amuletFromCounts.q <- q
return(x)
}
q
}