-
Notifications
You must be signed in to change notification settings - Fork 0
/
testMotifPos.R
221 lines (211 loc) · 9.25 KB
/
testMotifPos.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
#' Test for a Uniform Distribution across a set of best matches
#'
#' @details
#' This function tests for an even positional spread of motif matches across a
#' set of sequences, using the assumption (i.e. H~0~) that if there is no
#' positional bias, matches will be evenly distributed across all positions
#' within a set of sequences.
#' Conversely, if there is positional bias, typically but not necessarily near
#' the centre of a range, this function intends to detect this signal, as a
#' rejection of the null hypothesis.
#'
#' Input can be provided as the output from \link{getPwmMatches} setting
#' `best_only = TRUE` if these matches have already been identified.
#' If choosing to provide this object to the argument `matches`, nothing is required
#' for the arguments `pwm`, `stringset`, `rc`, `min_score` or `break_ties`
#' Otherwise, a Position Weight Matrix (PWM) and an `XStringSet` are required,
#' along with the relevant arguments, with best matches identified within the
#' function.
#'
#' The set of best matches are then grouped into bins along the range, with the
#' central bin containing zero, and tallied.
#' Setting `abs` to `TRUE` will set all positions from the centre as
#' *absolute values*, returning counts purely as bins with distances from zero,
#' marking this as an inclusive lower bound.
#' Motif alignments are assigned into bins based on the central position of the
#' match, as provided in the column `from_centre` when calling
#' \link{getPwmMatches}.
#'
#' The \link[stats]{binom.test} is performed on each bin using the alternative
#' hypothesis, with the returned p-values across all bins combined using the
#' Harmonic Mean p-value (HMP) (See \link[harmonicmeanp]{p.hmp}).
#' All bins with raw p-values below the HMP are identified and the returned
#' values for start, end, centre, width, matches in region, expected and
#' enrichment are across this set of bins.
#' The expectation is that where a positional bias is evident, this will be a
#' narrow range containing a non-trivial proportion of the total matches.
#'
#' It should also be noted that `binom.test()` can return p-values of zero, as
#' beyond machine precision. In these instances, zero p-values are excluded from
#' calculation of the HMP. This will give a very slight conservative bias, and
#' assumes that for these extreme cases, neighbouring bins are highly likely to
#' also return extremely low p-values and no significance will be lost.
#'
#'
#' @return
#' A data.frame with columns `start`, `end`, `centre`, `width`, `total_matches`,
#' `matches_in_region`, `expected`, `enrichment`, `prop_total`, `p`
#' and `consensus_motif`
#' The total matches represent the total number of matches within the set of
#' sequences, whilst the number observed in the final region are also given,
#' along with the proportion of the total this represents.
#' Enrichment is simply the ratio of observed to expected based on the
#' expectation of the null hypothesis
#'
#' The consensus motif across all matches is returned as a Position Frequency
#' Matrix (PFM) using \link[Biostrings]{consensusMatrix}.
#'
#'
#' @param x A Position Weight Matrix, universalmotif object or list thereof.
#' Alternatively can be a single DataFrame or list of DataFrames as returned
#' by \link{getPwmMatches} with `best_only = TRUE`
#' @param stringset An XStringSet. Not required if matches are supplied as x
#' @param binwidth Width of bins across the range to group data into
#' @param abs Use absolute positions around zero to find symmetrical enrichment
#' @param rc logical(1) Also find matches using the reverse complement of pwm
#' @param min_score The minimum score to return a match
#' @param break_ties Choose how to resolve matches with tied scores
#' @param alt Alternative hypothesis for the binomial test
#' @param sort_by Column to sort results by
#' @param mc.cores Passed to \link[parallel]{mclapply}
#' @param ... Passed to \link[Biostrings]{matchPWM}
#'
#'
#' @examples
#' ## Load the example PWM
#' data("ex_pwm")
#' esr1 <- ex_pwm$ESR1
#'
#' ## Load the example sequences
#' data("ar_er_seq")
#'
#' ## Get the best match and use this data
#' matches <- getPwmMatches(esr1, ar_er_seq, best_only = TRUE)
#' ## Test for enrichment in any position
#' testMotifPos(matches)
#'
#' ## Provide a list of PWMs, testing for distance from zero
#' testMotifPos(ex_pwm, ar_er_seq, abs = TRUE, binwidth = 10)
#'
#'
#' @importFrom parallel mclapply
#' @importFrom stats p.adjust
#' @export
testMotifPos <- function(
x, stringset, binwidth = 10, abs = FALSE, rc = TRUE, min_score = "80%",
break_ties = "all", alt = c("greater", "less", "two.sided"),
sort_by = c("p", "none"), mc.cores = 1,
...
) {
## Handle single objects by coercing to a list where appropriate
valid_cl <- c("matrix", "universalmotif", "DataFrame")
if (any(vapply(valid_cl, \(cl) is(x, cl), logical(1)))) x <- list(x)
if (!is.list(x))
stop("Input should be provided as a list, PWM/matrix or DF of matches")
## Handle a list of PWMs
isPWM <- vapply(
x, \(el) is(el, "matrix") | is(el, "universalmotif"), logical(1)
)
if (all(isPWM)) {
## Ensure all sequences are the same width
if (length(unique(width(stringset))) != 1)
stop("All sequences are required to be the same width")
matches <- getPwmMatches(
x, stringset, rc, min_score, best_only = TRUE, break_ties,
mc.cores, ...
)
} else {
## Now for a list of matches
.checkMatches(x) ## Will fail if not valid
## Also check for sequences of the same width
same_width <- vapply(x, \(x) length(unique(x$seq_width)) <= 1, logical(1))
stopifnot(all(same_width))
matches <- x
}
if (missing(matches)) stop("Provided input is not in a recognised format")
alt <- match.arg(alt)
cols <- c(
"start", "end", "centre", "width", "total_matches", "matches_in_region",
"expected", "enrichment", "prop_total", "p", "fdr", "consensus_motif"
)
out <- mclapply(
matches, .testSingleMotifPos, binwidth = binwidth, abs = abs, rc = rc,
min_score = min_score, break_ties = break_ties, alt = alt, ...,
mc.cores = mc.cores
)
out <- do.call("rbind", out)
out$fdr <- p.adjust(out$p, "fdr")
o <- seq_len(nrow(out))
sort_by <- match.arg(sort_by)
if (sort_by != "none") o <- order(out[[sort_by]])
out[o, cols]
}
#' @importFrom harmonicmeanp p.hmp
#' @importFrom stats binom.test
#' @keywords internal
.testSingleMotifPos <- function(
matches, binwidth, abs, rc, min_score, break_ties, alt, ...
) {
## Setup a well formed output object for easy combining with other tests
out_cols <- c(
"start", "end", "centre", "width", "total_matches", "matches_in_region",
"expected", "enrichment", "prop_total", "p", "consensus_motif"
)
out <- lapply(out_cols, \(x) integer())
out$consensus_motif <- list()
names(out) <- out_cols
n_matches <- nrow(matches)
if (n_matches == 0) return(data.frame(out))
matches <- .makeBmBins(matches, binwidth, abs)
bins <- levels(matches$bin)
## Get the counts & form a df with every bin as a row
counts <- vapply(
splitAsList(matches, matches$bin), \(x) sum(x$weight), numeric(1)
)
df <- data.frame(bin = names(counts), matches_in_bin = as.integer(counts))
bin_coords <- do.call("rbind", strsplit(df$bin, split = ","))
bin_coords <- apply(
bin_coords, 2, \(x) as.integer(gsub("\\[|\\(|\\]", "", x))
)
df$start <- bin_coords[, 1]
df$end <- bin_coords[, 2]
df$bin <- factor(df$bin, levels = bins)
## Given matches are measured from the centre, only bases > d_from_edge
## away from the ends of the range are viable
motif_width <- max(width(matches$match))
d_from_edge <- (motif_width - 1) / 2
valid_pos <- seq(min(df$start) + d_from_edge, max(df$end) - d_from_edge)
if (abs) valid_pos <- seq(d_from_edge %% 1, max(df$end) - d_from_edge)
df$valid_width <- vapply(
seq_len(nrow(df)),
\(i) sum(valid_pos > df$start[i] & valid_pos <= df$end[i]), numeric(1)
)
df$bin_prob <- df$valid_width / sum(df$valid_width)
df$expected <- n_matches * df$bin_prob
df$p <- vapply(
seq_along(df$bin),
\(i) {
x <- df[i,]
binom.test(x$matches_in_bin, n_matches, x$bin_prob, alt)$p.value
}, numeric(1)
)
## It is entirely possible for this to return p < Machine precision
## This will cause the HMP to fail so setting to the lowest Machine
## precision will be conservative, but a bit of a hack.
df$p[df$p < .Machine$double.xmin] <- .Machine$double.xmin
hmp <- p.hmp(df$p, L = length(bins))
df <- subset(df, df$p < hmp | df$p == min(df$p))
out <- data.frame(start = min(df$start), end = max(df$end))
out$centre <- (out$start + out$end) / 2
out$width <- out$end - out$start
out$total_matches <- n_matches
out$matches_in_region <- sum(df$matches_in_bin)
out$expected <- sum(df$expected)
out$enrichment <- out$matches_in_region / out$expected
out$prop_total <- out$matches_in_region / out$total_matches
out$p <- as.numeric(hmp)
## Setup the consensus motif to return
consensus <- consensusMatrix(matches$match)[c("A", "C", "G", "T"),]
out$consensus_motif <- list(consensus)
out[, out_cols]
}