/
de_analysis.R
512 lines (486 loc) · 20.4 KB
/
de_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
#' @rdname de_analysis
#'
#' @title Differential Expression Analysis using a Topic Model
#'
#' @description Implements methods for differential expression
#' analysis using a topic model. These methods are motivated by gene
#' expression studies, but could have other uses, such as identifying
#' \dQuote{key words} for topics.
#'
#' @details The methods are based on the Poisson model
#' \deqn{x_i ~ Poisson(\lambda_i),} in which the Poisson rates are
#' \deqn{\lambda_i = \sum_{j=1}^k s_i l_{ij} f_j,} the \eqn{l_{ik}}
#' are the topic proportions and the \eqn{f_j} are the unknowns to be
#' estimated. This model is applied separately to each column of
#' \code{X}. When \eqn{s_i} (specified by input argument \code{s}) is
#' equal the total count in row i (this is the default), the Poisson
#' model will closely approximate a binomial model of the count data,
#' and the unknowns \eqn{f_j} will approximate binomial
#' probabilities. (The Poisson approximation to the binomial is most
#' accurate when the total counts \code{rowSums(X)} are large and the
#' unknowns \eqn{f_j} are small.) Other choices for \code{s} are
#' possible, and implement different normalization schemes.
#'
#' To allow for some flexibility, \code{de_analysis} allows for the
#' log-fold change to be measured in several ways.
#'
#' One option is to compare against the probability under the null
#' model: \eqn{LFC(j) = log2(f_j/f_0)}, where \eqn{f_0} is the single
#' parameter in the Poisson model \eqn{x_i ~ Poisson(\lambda_i)} with
#' rates \eqn{\lambda_i = s_i f_0}. This LFC definition is chosen with
#' \code{lfc.stat = "vsnull"}.
#'
#' Another option is to compare against a chosen topic, k: \eqn{LFC(j)
#' = log2(f_j/f_k)}. By definition, \eqn{LFC(k)} is zero, and
#' statistics such as z-scores and p-values for topic k are set to
#' \code{NA}. This LFC definition is selected by setting
#' \code{lfc.stat = k}.
#'
#' A final option (which is the default) computes the \dQuote{least
#' extreme} LFC, defined as \eqn{LFC(j) = log2(f_j/f_k)} such that
#' \eqn{k} is the topic other than \eqn{j} that gives the ratio
#' \eqn{f_j/f_k} closest to 1. This option is chosen with
#' \code{lfc.stat = "le"}.
#'
#' We recommend setting \code{shrink.method = "ash"}, which uses the
#' \dQuote{adaptive shrinkage} method (Stephens, 2016) to improve
#' accuracy of the posterior mean estimates and z-scores. We follow
#' the settings used in \code{lfcShrink} from the \sQuote{DESeq2}
#' package, with \code{type = "ashr"}.
#'
#' Note that all LFC statistics are defined using the base-2 logarithm
#' following the conventioned used in differential expression
#' analysis.
#'
#' The \code{control} argument is a list in which any of the
#' following named components will override the default optimization
#' algorithm settings (as they are defined by
#' \code{de_analysis_control_default}):
#'
#' \describe{
#'
#' \item{\code{numiter}}{Maximum number of iterations performed in
#' fitting the Poisson models. When \code{fit.method = "glm"}, this is
#' passed as argument \code{maxit} to the \code{glm} function.}
#'
#' \item{\code{minval}}{A small, positive number. All topic
#' proportions less than this value and greater than \code{1 - minval}
#' are set to this value.}
#'
#' \item{\code{tol}}{Controls the convergence tolerance for fitting
#' the Poisson models. When \code{fit.method = "glm"}, this is passed
#' as argument \code{epsilon} to function \code{glm}.}
#'
#' \item{\code{conf.level}}{The size of the highest posterior density
#' (HPD) intervals. Should be a number greater than 0 and less than 1.}
#'
#' \item{\code{ns}}{Number of Monte Carlo samples simulated by
#' random-walk MCMC for estimating posterior LFC quantities.}
#'
#' \item{\code{rw}}{The standard deviation of the normal density used
#' to propose new states in the random-walk MCMC.}
#'
#' \item{\code{eps}}{A small, non-negative number added to the terms
#' inside the logarithms to avoid computing logarithms of zero.}
#'
#' \item{\code{nc}}{Number of threads used in the multithreaded
#' computations. This controls both (a) the number of RcppParallel
#' threads used to fit the factors in the Poisson models, and (b)
#' the number of cores used in \code{\link[parallel]{mclapply}} for
#' the MCMC simulation step. Note that mclapply relies on forking
#' hence is not available on Windows; will return an error on
#' Windows unless \code{nc = 1}. Also note that setting \code{nc >
#' 1} copies the contents of memory \code{nc} times, which can lead
#' to poor performance if the total resident memory required exceeds
#' available physical memory.}
#'
#' \item{\code{nc.blas}}{Number of threads used in the numerical
#' linear algebra library (e.g., OpenBLAS), if available. For best
#' performance, we recommend setting this to 1 (i.e., no
#' multithreading).}
#'
#' \item{\code{nsplit}}{The number of data splits used in the
#' multithreaded computations (only relevant when \code{nc > 1}). More
#' splits increase the granularity of the progress bar, but can also
#' slow down the mutithreaded computations by introducing more
#' overhead in the call to \code{\link[pbapply]{pblapply}}.}}
#'
#' @param fit An object of class \dQuote{poisson_nmf_fit} or
#' \dQuote{multinom_topic_model_fit}, or an n x k matrix of topic
#' proportions, where k is the number of topics. (The elements in each
#' row of this matrix should sum to 1.) If a Poisson NMF fit is
#' provided as input, the corresponding multinomial topic model fit is
#' automatically recovered using \code{\link{poisson2multinom}}.
#'
#' @param X The n x m counts matrix. It can be a sparse matrix (class
#' \code{"dgCMatrix"}) or dense matrix (class \code{"matrix"}).
#'
#' @param s A numeric vector of length n determining how the rates are
#' scaled in the Poisson models. See \dQuote{Details} for guidance on
#' the choice of \code{s}.
#'
#' @param pseudocount Observations with this value are added to the
#' counts matrix to stabilize maximum-likelihood estimation.
#'
#' @param fit.method Method used to fit the Poisson models. Note that
#' \code{fit.method = "glm"} is the slowest method, and is mainly used
#' for testing.
#'
#' @param shrink.method Method used to stabilize the posterior mean
#' LFC estimates. When \code{shrink.method = "ash"}, the "adaptive
#' shrinkage" method implemented in the \sQuote{ashr} package is used to
#' compute posterior. When \code{shrink.method = "none"}, no
#' stabilization is performed, and the \dQuote{raw} posterior mean LFC
#' estimates are returned.
#'
#' @param lfc.stat The log-fold change statistics returned:
#' \code{lfc.stat = "vsnull"}, the log-fold change relative to the
#' null; \code{lfc.stat = "le"}, the \dQuote{least extreme} LFC; or a
#' topic name or number, in which case the LFC is defined relative to
#' the selected topic. See \dQuote{Details} for more detailed
#' explanations of these choices.
#'
#' @param control A list of parameters controlling behaviour of
#' the optimization and Monte Carlo algorithms. See \sQuote{Details}.
#'
#' @param verbose When \code{verbose = TRUE}, progress information is
#' printed to the console.
#'
#' @param \dots When \code{shrink.method = "ash"}, these are
#' additional arguments passed to \code{\link[ashr]{ash}}.
#'
#' @return A list with the following elements:
#'
#' \item{est}{The log-fold change maximum-likelihood estimates.}
#'
#' \item{postmean}{Posterior mean LFC estimates.}
#'
#' \item{lower}{Lower limits of estimated HPD intervals. Note that
#' these are not updated by the shrinkage step.}
#'
#' \item{upper}{Upper limits of estimated HPD intervals. Note that
#' these are not updated by the shrinkage step.}
#'
#' \item{z}{z-scores for posterior mean LFC estimates.}
#'
#' \item{lpval}{-log10 two-tailed p-values obtained from the
#' z-scores. When \code{shrink.method = "ash"}, this is \code{NA}, and
#' the s-values are returned instead (see below).}
#'
#' \item{svalue}{s-values returned by
#' \code{\link[ashr]{ash}}. s-values are analogous to q-values, but
#' based on the local false sign rate; see Stephens (2016).}
#'
#' \item{lfsr}{When \code{shrink.method = "ash"} only, this output
#' contains the estimated local false sign rates.}
#'
#' \item{ash}{When \code{shrink.method = "ash"} only, this output
#' contains the \code{\link[ashr]{ash}} return value (after removing
#' the \code{"data"}, \code{"result"} and \code{"call"} list
#' elements).}
#'
#' \item{F}{Maximum-likelihood estimates of the Poisson model
#' parameters.}
#'
#' \item{f0}{Maximum-likelihood estimates of the null model
#' parameters.}
#'
#' \item{ar}{A vector containing the Metropolis acceptance ratios
#' from each MCMC run.}
#'
#' @references
#' Stephens, M. (2016). False discovery rates: a new deal.
#' \emph{Biostatistics} \bold{18}(2), kxw041.
#' \doi{10.1093/biostatistics/kxw041}
#'
#' Zhu, A., Ibrahim, J. G. and Love, M. I. (2019). Heavy-tailed prior
#' distributions for sequence count data: removing the noise and
#' preserving large differences. \emph{Bioinformatics} \bold{35}(12),
#' 2084–2092.
#'
#' @examples
#' # Perform a differential expression (DE) analysis using the previously
#' # fitted multinomial topic model. Note that the de_analysis call could
#' # take several minutes to complete.
#' \donttest{
#' set.seed(1)
#' data(pbmc_facs)
#' de <- de_analysis(pbmc_facs$fit,pbmc_facs$counts)
#'
#' # Compile the DE analysis results for topic 4 into a table, and
#' # rank the genes by the posterior mean log-fold change, limiting to
#' # DE genes identified with low lfsr ("local false sign rate").
#' k <- 4
#' dat <- data.frame(postmean = de$postmean[,k],
#' z = de$z[,k],
#' lfsr = de$lfsr[,k])
#' rownames(dat) <- with(pbmc_facs$genes,paste(symbol,ensembl,sep = "_"))
#' dat <- subset(dat,lfsr < 0.01)
#' dat <- dat[order(dat$postmean,decreasing = TRUE),]
#'
#' # Genes at the top of this ranking are genes that are much more
#' # highly expressed in the topic compared to other topics.
#' head(dat,n = 10)
#'
#' # The genes at the bottom of the ranking are genes that are much less
#' # expressed in the topic.
#' tail(dat,n = 10)
#'
#' # Create a volcano plot from the DE results for topic 4.
#' volcano_plot(de,k = k,ymax = 50,labels = pbmc_facs$genes$symbol)
#' }
#'
#' @importFrom utils modifyList
#' @importFrom Matrix rowSums
#' @importFrom Matrix colSums
#' @importFrom stats rnorm
#' @importFrom stats runif
#' @importFrom ashr ash
#' @importFrom RhpcBLASctl blas_set_num_threads
#' @importFrom RhpcBLASctl blas_get_num_procs
#'
#' @export
#'
de_analysis <- function (fit, X, s = rowSums(X), pseudocount = 0.01,
fit.method = c("scd","em","mu","ccd","glm"),
shrink.method = c("ash","none"), lfc.stat = "le",
control = list(), verbose = TRUE, ...) {
# CHECK AND PROCESS INPUTS
# ------------------------
# Check and process input argument "fit".
if (is.matrix(fit)) {
L <- fit
if (any((rowSums(L) - 1) > 1e-15))
warning("\"fit\" is a matrix but may not be topic proportions matrix; ",
"rowSums(fit) should be a vector of all ones")
m <- ncol(X)
k <- ncol(L)
F <- matrix(1,m,k)
rownames(F) <- colnames(X)
colnames(F) <- colnames(L)
fit <- init_poisson_nmf(X,F = F,L = L)
fit <- poisson2multinom(fit)
}
if (!(inherits(fit,"poisson_nmf_fit") |
inherits(fit,"multinom_topic_model_fit")))
stop("Input \"fit\" should be an object of class \"poisson_nmf_fit\" or ",
"\"multinom_topic_model_fit\", or a matrix of topic proportions")
if (inherits(fit,"poisson_nmf_fit"))
fit <- poisson2multinom(fit)
# Check and process input argument "X".
if (!((is.numeric(X) & is.matrix(X)) | is.sparse.matrix(X)))
stop("Input argument \"X\" should be a numeric matrix (a \"matrix\" or ",
"a \"dgCMatrix\")")
verify.fit.and.count.matrix(X,fit)
if (is.matrix(X) & is.integer(X))
storage.mode(X) <- "double"
# Get the number of rows (n) and columns (m) in the counts matrix, and
# the number of groups or topics (k).
n <- nrow(X)
m <- ncol(X)
k <- ncol(fit$F)
# Check input argument "s".
verify.positive.vector(s)
if (length(s) != n)
stop("Input argument \"s\" should be a vector of positive numbers, ",
"in which length(s) = nrow(X)")
# Check input argument "pseudocount".
if (any(pseudocount <= 0))
stop("Input argument \"pseudocount\" should be a positive number")
# Process input arguments "fit.method" and "shrink.method".
fit.method <- match.arg(fit.method)
shrink.method <- match.arg(shrink.method)
# Check and process input argument "control".
control <- modifyList(de_analysis_control_default(),control,keep.null = TRUE)
if (control$nc > 1 & .Platform$OS.type == "windows")
stop("Multithreading is not available on Windows; try again with ",
"control$nc = 1")
# Check and process input argument "lfc.stat".
if (!(all(lfc.stat == "vsnull") | all(lfc.stat == "le"))) {
if (!(any(lfc.stat == 1:k) | any(lfc.stat == colnames(fit$F))))
stop("Input argument \"lfc.stat\" should be either \"vsnull\", \"le\" ",
"a number between 1 and k, where k is the number of topics, or ",
"a name of a topic (column of fit$F)")
if (is.character(lfc.stat))
lfc.stat <- match(lfc.stat,colnames(fit$F))
}
# FIT NULL MODELS
# ---------------
# Compute the MLE f0 in the "null" model x ~ Poisson(u), with u =
# s*f0. (This calculation is performed for each column of the counts
# matrix. It must be done before adding pseudocounts to the data.)
# Equivalently, this is the maximum-likelihood estimate of the
# binomial probability in the "null" Binomial model x ~ Binom(s,p0).
f0 <- colSums(X)/sum(s)
names(f0) <- rownames(fit$F)
# SET UP DATA FOR FITTING POISSON MODELS
# --------------------------------------
# Ensure that none of the topic proportions are exactly zero or
# exactly one.
L <- fit$L
L <- pmax(L,control$minval)
L <- pmin(L,1 - control$minval)
# Add "pseudocounts" to the data, and get the Poisson NMF loadings
# matrix. From this point on, we will fit Poisson glm models x ~
# Poisson (u), u = sum(L*f), where x is a column of X.
out <- add_pseudocounts(X,s*L,pseudocount)
X <- out$X
L <- out$L
# FIT POISSON MODELS
# ------------------
# For each column j of the counts matrix, compute MLEs of the
# parameters in the Poisson glm, x ~ Poisson(u), in which the
# Poisson rates are u = sum(L*f), and f = F[j,].
if (verbose)
cat(sprintf("Fitting %d Poisson models with k=%d using method=\"%s\".\n",
m,k,fit.method))
nc <- initialize.multithreading(control$nc,verbose)
F <- fit_poisson_models(X,L,fit.method,control$eps,control$numiter,
control$tol,control$nc)
F <- pmax(F,control$minval)
dimnames(F) <- dimnames(fit$F)
# COMPUTE LOG-FOLD CHANGE STATISTICS
# ----------------------------------
# Perform MCMC to simulate the posterior distribution of the LFC
# statistics, then compute key posterior quantities from the
# simulated Monte Carlo samples.
if (verbose) {
cat("Computing log-fold change statistics from ")
cat(sprintf("%d Poisson models with k=%d.\n",m,k))
}
ns <- control$ns
D <- matrix(rnorm(ns*k),ns,k)
U <- matrix(runif(ns*k),ns,k)
M <- matrix(sample(k,ns*k,replace = TRUE),ns,k) - 1
ncb <- blas_get_num_procs()
blas_set_num_threads(control$nc.blas)
if (nc == 1)
out <- compute_lfc_stats(X,F,L,f0,D,U,M,lfc.stat,control$conf.level,
control$rw,control$eps,verbose)
else {
out <- compute_lfc_stats_multicore(X,F,L,f0,D,U,M,lfc.stat,
control$conf.level,control$rw,
control$eps,control$nc,control$nsplit,
verbose)
}
blas_set_num_threads(ncb)
if (any(out$ar == 0))
warning("One or more MCMC simulations yielded acceptance rates of zero; ",
"consider increasing the number of Monte Carlo samples ",
"(control$ns) or modifying the noise level of the random-walk ",
"proposal distribution (control$rw) to improve the acceptance ",
"rates")
# STABILIZE ESTIMATES USING ADAPTIVE SHRINKAGE
# --------------------------------------------
# If requested, use adaptive shrinkage to stabilize the log-fold
# change estimates. Here we need to carefully edge cases such as
# se's of zero.
if (shrink.method == "ash") {
if (verbose)
cat("Stabilizing posterior log-fold change estimates using adaptive",
"shrinkage.\n")
se <- with(out,postmean/z)
se[out$z == 0] <- as.numeric(NA)
se[out$postmean == 0] <- 0
res <- shrink_estimates(out$postmean,se,...)
out$postmean <- res$b
out$z <- res$z
out$lfsr <- res$lfsr
out$lpval <- as.numeric(NA)
out$svalue <- res$svalue
out$ash <- res$ash
dimnames(out$lfsr) <- dimnames(F)
dimnames(out$svalue) <- dimnames(F)
} else {
# Compute the -log10 two-tailed p-values computed from the z-scores.
out$lpval <- -lpfromz(out$z)
out$svalue <- as.numeric(NA)
out$lfsr <- as.numeric(NA)
}
# Return the Poisson model MLEs (F), the log-fold change statistics
# (est, postmean, lower, upper, z, lpval) and local false sign
# rates (lfsr), and the relative rates under the "null" model (f0).
out$F <- F
out$f0 <- f0
class(out) <- c("topic_model_de_analysis","list")
return(out)
}
# Perform adaptive shrinkage on the matrix of effect estimates b and
# their standard errors, then output the revised effect estimates (b),
# standard errors (se) and z-scores. All effects i in which either
# b[i] or se[i] is missing (NA) are not revised.
shrink_estimates <- function (b, se, ...) {
# Set up the z-scores output.
z <- b
z[is.na(b) | is.na(se)] <- as.numeric(NA)
# Run adaptive shrinkage.
i <- which(!(is.na(b) | is.na(se)))
out <- ash(b[i],se[i],mixcompdist = "normal",method = "shrink",...)
b1 <- out$result$PosteriorMean
se1 <- out$result$PosteriorSD
# Extract the posterior estimates and their standard errors.
b[i] <- b1
se[i] <- se1
z[i] <- b[i]/se[i]
z[i[b1 == 0]] <- 0
z[i[se1 == 0]] <- as.numeric(NA)
# Extract the lfsr estimates.
m <- nrow(b)
k <- ncol(b)
lfsr <- matrix(as.numeric(NA),m,k)
svalue <- matrix(as.numeric(NA),m,k)
lfsr[i] <- out$result$lfsr
svalue[i] <- out$result$svalue
# Output the revised estimates (b), the standard errors (se), the
# z-scores (z), the local false sign rates (lfsr), the s-values
# (svalue) and the raw ash output (ash).
out[c("data","result","call")] <- NULL
return(list(b = b,se = se,z = z,lfsr = lfsr,svalue = svalue,ash = out))
}
#' @rdname de_analysis
#'
#' @export
#'
de_analysis_control_default <- function()
list(numiter = 20,
minval = 1e-10,
tol = 1e-8,
conf.level = 0.68,
ns = 1000,
rw = 0.3,
eps = 1e-15,
nc = 1,
nc.blas = 1,
nsplit = 100)
# Select genes based on a de_analysis result. Input "de" is an object
# of class "topic_model_de_analysis"; k is a topic, and "subset" is a
# function that returns TRUE or FALSE depending on whether to select
# the gene based on the DE analysis result. Here, "rank" and
# "quantile" should be interpreted as the rank and quantile of the
# gene based on the posterior mean estimate of the log-fold change, so
# that largest positive log-fold change has rank 1 and quantile 1.
# The output is a vector of gene indices (e.g., corresponding to
# rows of de$postmean).
select_de_genes <-
function (de, k,
subset = function (postmean, lpval, lfsr, rank, quantile)
lfsr < 0.05) {
# Compile data used to select genes.
n <- nrow(de$postmean)
dat <- data.frame(postmean = de$postmean[,k],
lpval = de$lpval[,k],
lfsr = as.numeric(NA),
rank = rank(-de$postmean[,k],na.last = TRUE,
ties.method = "average"),
quantile = rank(de$postmean[,k],na.last = FALSE,
ties.method = "average")/n)
if (!all(is.na(de$lfsr)))
dat$lfsr <- de$lfsr[,k]
# Select the genes according to the provided subset function.
x <- rep(FALSE,n)
names(x) <- rownames(de$postmean)
for (i in 1:n)
x[i] <- all(subset(dat[i,"postmean"],dat[i,"lpval"],dat[i,"lfsr"],
dat[i,"rank"],dat[i,"quantile"]))
return(which(x))
}