-
Notifications
You must be signed in to change notification settings - Fork 0
/
standard_stats.R
209 lines (143 loc) · 7.03 KB
/
standard_stats.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#' Convert a list of peaks (rt / m/z pairs) to a Region of Interest (ROI) list for use with \code{\link{findPeaks}}
#'
#' @param tbl \code{\link{tibble}} containing the columns "rt" and "mz". rt needs to be in seconds.
#' @param raw \code{\link{xcmsRaw}} object to create ROI for. It needs to be a specific \code{\link{xcmsRaw}} to match retention times to scan nubmers.
#' @param ppm ppm tolerance for the generated ROI.
#' @param rt_tol Retention time tolerance (in sec!) for the generated ROI.
#'
#' @return List containing the ROIs. Each list contains mz, mzmin, mzmax, scmin, scmax, length (set to -1, not used by centWave) and intensity (set to -1, not used by centWave) columns.
#' @export
#'
#' @importFrom dplyr %>% rowwise transmute ungroup mutate
#' @importFrom purrr map
#' @importFrom purrrlyr by_row
#' @importFrom massageR attr_rem
#' @importFrom magrittr extract2
#'
tbl2ROI <- function(tbl, raw, ppm, rt_tol) {
rt <- mz <- .out <- NULL
tbl %>%
rowwise %>%
transmute(mz = mz, # not used!
mzmin = mz - ppm * mz * 1E-6,
mzmax = mz + ppm * mz * 1E-6,
scmin = which.min(abs((rt-rt_tol)-raw@scantime)),
scmax = which.min(abs((rt+rt_tol)-raw@scantime)),
length = -1, # not used!
intensity = -1 # not used!
) %>%
ungroup %>%
by_row(as.list) %>%
mutate( .out = map(.out, ~ attr_rem(.x,"indices"))) %>% # we get some indices attribute. Dunno why. But lets remove it.
extract2(".out") ->
out
return(out)
}
#' Match list of standards to peak table
#'
#' This function will match a table of standard compounds and a peak table by m/z and retention time.
#' If there is more than one possible hit the highest intensity peak will be chosen.
#'
#' @param stds \code{\link{tibble}} of standards to match to a peak table
#' @param peakTable \code{\link{tibble}} containing peak table supplied by \code{\link{findPeaks}} (but converted to \code{\link{tibble}}/\code{\link{data.frame}}).
#' @param rt_tol Retention time tolerance for matching peaks. Pay attention to the unit of your tables.
#' rt_tol should match and stds and peakTable should use same units (i.e. minutes of seconds).
#' @param mz_ppm ppm for matching peaks.
#' @param rt_col Character string giving the column containing the retention times. Must be same in standards and peak table.
#' @param mz_col Character string giving the column containing the m/z values. Must be same in standards and peak table.
#' @param int_col Character string giving the column containing the intensities in the peak table.
#'
#' @return A vector having the length equivalent to the number of rows in stds giving the indices of the hits in peakTable.
#'
#' @export
#'
#' @importFrom magrittr extract2
#' @importFrom dplyr %>% slice
#'
closest_match <- function(stds, peakTable, rt_tol = 0.25, mz_ppm = 30, rt_col = "rt", mz_col = "mz", int_col = "into"){
indices <- rep_len(as.numeric(NA), nrow(stds))
peakTable_mz <- peakTable %>% extract2(mz_col)
peakTable_rt <- peakTable %>% extract2(rt_col)
for(i in 1:nrow(stds)){
stds_mz <- stds %>% slice(i) %>% extract2(mz_col)
stds_rt <- stds %>% slice(i) %>% extract2(rt_col)
idx <- abs(stds_rt-peakTable_rt) < rt_tol &
(abs(stds_mz-peakTable_mz)/stds_mz)*1E6 < mz_ppm
# no match is found
if(sum(idx, na.rm = TRUE)==0){ next } # we filled the vector with NA so we don't need to do anyting
# 1 match is found
if(sum(idx, na.rm = TRUE)==1){
indices[i] <- which(idx)
next
}
# More than one match is found. We take the highest intensity peak then
if(sum(idx, na.rm = TRUE)>1){
idx2 <- peakTable %>% slice(which(idx)) %>% extract2(int_col) %>% which.max
indices[i] <- which(idx)[idx2]
next
}
}
return(indices)
}
#' Calculate Tailing Factor and Asymmetry Factor
#'
#' @param EIC EIC containing the peak to calculate for.
#' \code{\link{tibble}} as produced with \code{\link{get_EICs}}.
#' @param rt Retention time of the center of the peak (Numeric)
#' @param factor to calculate. Character string either "TF" (Tailing Factor) or "ASF" (Asymmetry Factor).
#'
#' @references http://www.chromforum.org/viewtopic.php?t=20079
#'
#' @return Numeric
#' @export
#'
#' @importFrom dplyr %>% filter arrange desc mutate slice
#' @importFrom stats approx smooth
#'
peak_factor <- function(EIC, rt, factor="TF"){
intensity <- per_max <- scan_rt <- NULL
if(is.na(rt)) return(NA)
# C = midpoint = highest scan
C <- rt # EIC is in minutes so we change here
C_scan <- EIC$scan[ which.min(abs(EIC$scan_rt-C)) ]
C <- EIC %>% filter(scan==C_scan) %>% extract2("scan_rt")
# we get the max of the 5 central scans to avoid problems if the center is not he highest
max_int <- max(EIC$intensity[(C_scan-2):(C_scan+2)], na.rm = TRUE)
# if all 3 middle peaks are zero give up
if( all( EIC$intensity[(C_scan-1):(C_scan+1)] == 0) ) return(NA)
# Median smoothing. Avoids single zero values breaking things. Already smooth peaks are unaffected.
if( !all( c(EIC$intensity[C_scan-1], EIC$intensity[C_scan+1]) == 0 ) ){ # If the values on each side of the mid of the peak are both 0 don't do smoothing (a 1 scan spike would cause this).
EIC <- EIC %>% mutate(intensity = stats::smooth(intensity, kind="3"))
}
# If middle scan is zero after smoothing give up
# We check agina after smoothing
if( EIC$intensity[C_scan] == 0 ) return(NA)
# Get the lower RT side of the peak
A_side <- EIC %>%
filter(scan <= C_scan) %>%
arrange(desc(scan)) %>%
mutate(per_max = intensity/max_int)
# Get the upper RT side of the peak
B_side <- EIC %>%
filter(scan >= C_scan) %>%
arrange(scan) %>%
mutate(per_max = intensity/max_int)
# which factor are we doing?
cut_off <- switch(factor, TF=0.05, ASF=0.1)
# Get A
A_scans <- A_side %>% with(match(-1,sign(per_max-cut_off)))
if(is.na(A_scans)) return(NA)
A <- A_side %>%
slice((A_scans-1):A_scans)
if(nrow(A)<2) return(NA)
A %<>% with(approx(per_max,scan_rt,xout=cut_off)$y)
# Get B
B_scans <- B_side %>% with(match(-1,sign(per_max-cut_off)))
if(is.na(B_scans)) return(NA)
B <- B_side %>%
slice((B_scans-1):B_scans)
if(nrow(B)<2) return(NA)
B %<>% with(approx(per_max,scan_rt,xout=cut_off)$y)
if(factor=="TF") return( (B-A)/(2*(C-A)) )
if(factor=="ASF") return( (B-C)/(C-A) )
}