-
Notifications
You must be signed in to change notification settings - Fork 16
/
subset_funcs.R
293 lines (251 loc) · 11.8 KB
/
subset_funcs.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
#' Identify All Biomolecules for Use in Normalization
#'
#' Selects biomolecules for normalization via choosing all biomolecules
#' currently in the data
#'
#' @param e_data a \eqn{p \times n + 1} data.frame, where \eqn{p} is the number
#' of peptides, proteins, lipids, or metabolites and \eqn{n} is the number of
#' samples. Each row corresponds to data for a peptide, protein, lipid, or
#' metabolite, with one column giving the biomolecule identifier name.
#' @param edata_id character string indicating the name of the peptide, protein,
#' lipid, or metabolite identifier. Usually obtained by calling
#' \code{attr(omicsData, "cnames")$edata_cname}.
#'
#' @details This function returns the subset of all biomolecules. These will be
#' used for normalization.
#'
#' @return Character vector containing all biomolecules.
#'
#' @author Kelly Stratton
#'
all_subset <- function(e_data, edata_id) {
# pull off the column for edata_id
edata_id_ind <- which(colnames(e_data) == edata_id)
# subset to all cases
peps <- as.character(e_data[, edata_id_ind])
if (length(peps) < 2) stop("There are <2 biomolecules in the subset; cannot proceed.")
return(peps)
}
#' Identify Biomolecules from the Top L Order Statistics for Use in
#' Normalization
#'
#' Select biomolecules for normalization via the method of the top L order
#' statistics (LOS)
#'
#' @param e_data a \eqn{p \times n + 1} data.frame, where \eqn{p} is the number
#' of peptides, proteins, lipids, or metabolites and \eqn{n} is the number of
#' samples. Each row corresponds to data for a peptide, protein, lipid, or
#' metabolite, with one column giving the biomolecule identifier name.
#' @param edata_id character string indicating the name of the column giving the
#' peptide, protein, lipid, or metabolite identifier. Usually obtained by
#' calling \code{attr(omicsData, "cnames")$edata_cname}.
#' @param L numeric value between 0 and 1, indicating the top proportion of
#' biomolecules to be retained (default value 0.05)
#'
#' @details The biomolecule abundances of the top \code{L} order statistics are
#' identified and returned. Specifically, for each sample, the biomolecules with
#' the top \code{L} proportion of highest absolute abundance are retained, and
#' the union of these biomolecules is taken as the subset identified.
#'
#' @return Character vector containing the biomolecules belonging to the subset.
#'
#' @author Kelly Stratton, Lisa Bramer
#'
los <- function(e_data, edata_id, L = .05) {
# pull off the column for edata_id
edata_id_ind <- which(colnames(e_data) == edata_id)
n_samps <- ncol(e_data[, -edata_id_ind])
n_peps <- nrow(e_data[, -edata_id_ind])
peps <- e_data[, edata_id_ind]
row.names(e_data) <- peps
mydata <- e_data[, -edata_id_ind]
# get number of peps to keep from L that was provided
num_kp <- round(L * n_peps)
kp <- c()
# for each sample, ... #
for (i in 1:n_samps) {
# rank the features from highest to lowest (NAs are low) #
cur_data <- mydata[, i]
names(cur_data) <- peps
# keep the top L percent of the features (excluding NAs, if NAs are included)
ordered_data <- sort(abs(cur_data), decreasing = TRUE) # absolute value!
# take the union of those features - that is the subset to return #
kp <- c(kp, names(ordered_data)[1:num_kp])
}
kp <- unique(kp)
if (length(kp) < 2) stop("There are <2 biomolecules in the subset; cannot proceed.")
return(kp)
}
#' Identify Biomolecules from the Proportion Present (PPP) for Use in
#' Normalization
#'
#' Selects biomolecules for normalization via the method of percentage of the
#' peptides (or proteins, metabolites, etc.) present (PPP)
#'
#' @param e_data a \eqn{p \times n + 1} data.frame, where \eqn{p} is the number
#' of peptides, proteins, lipids, or metabolites and \eqn{n} is the number of
#' samples. Each row corresponds to data for a peptide, protein, lipid, or
#' metabolite, with one column giving the biomolecule identifier name.
#' @param edata_id character string indicating the name of the peptide, protein,
#' lipid, or metabolite identifier. Usually obtained by calling
#' \code{attr(omicsData, "cnames")$edata_cname}.
#' @param proportion numeric value between 0 and 1, indicating the proportion at
#' or above which a biomolecule must be present across all samples in order to be
#' retained (default value 0.5)
#'
#' @details Biomolecules present across \code{proportion} samples are designated as
#' PPP.
#'
#' @return Character vector containing the biomolecules belonging to the PPP
#' subset.
#'
#' @author Kelly Stratton
#'
ppp <- function(e_data, edata_id, proportion = 0.5) {
# pull off the column for edata_id
edata_id_ind <- which(colnames(e_data) == edata_id)
# get column of proportion present #
mydata_pct_present <- rowSums(!is.na(e_data[, -edata_id_ind])) /
ncol(e_data[, -edata_id_ind])
# which features have proportion present above the value of "proportion" #
inds <- which(mydata_pct_present >= proportion)
subset_peps <- as.character(e_data[inds, edata_id_ind])
if (length(subset_peps) < 2) stop("There are <2 biomolecules in the subset; cannot proceed.")
return(subset_peps)
}
#' Identify Proportion of Peptides Present (PPP) and Rank Invariant Peptides
#' (RIP) for Use in Normalization
#'
#' Selects biomolecules for normalization via the method of proportion of
#' biomolecules present and rank invariant biomolecules (ppp_rip)
#'
#' @param e_data a \eqn{p \times n + 1} data.frame, where \eqn{p} is the number
#' of peptides, proteins, lipids, or metabolites and \eqn{n} is the number of
#' samples. Each row corresponds to data for a peptide, protein, lipid, or
#' metabolite, with one column giving the biomolecule identifier name.
#' @param edata_id character string indicating the name of the peptide, protein,
#' lipid, or metabolite identifier. Usually obtained by calling
#' \code{attr(omicsData, "cnames")$edata_cname}.
#' @param fdata_id character string indicating the name of the sample column
#' name in f_data.
#' @param groupDF data.frame created by \code{group_designation} with columns
#' for sample.id and group. If two main effects are provided the original main
#' effect levels for each sample are returned as the third and fourth columns
#' of the data.frame.
#' @param alpha numeric p-value threshold, above which the biomolecules are retained
#' as rank invariant (default value 0.25)
#' @param proportion numeric value between 0 and 1, indicating the percentage at
#' or above which a biomolecule must be present across all samples in order to be
#' retained (default value 0.5)
#'
#' @details Biomolecules present across \code{proportion} samples are subjected to a
#' Kruskal-Wallis test (non-parametric one-way ANOVA, where NAs are ignored)
#' on group membership, and those biomolecules with p-value greater than a
#' defined threshold \code{alpha} (common values include 0.1 or 0.25) are
#' retained as rank-invariant biomolecules.
#'
#' @return Character vector containing the biomolecules belonging to the ppp_rip subset.
#'
#' @author Kelly Stratton
#'
ppp_rip <- function(e_data, edata_id, fdata_id, groupDF, alpha = 0.2, proportion = 0.5) {
samp_id = fdata_id
# pull off the column for edata_id
edata_id_ind <- which(colnames(e_data) == edata_id)
# subset to features present in at least proportion samples
peps <- e_data[, edata_id_ind]
# get column of proportion present #
mydata_pct_present <- rowSums(!is.na(e_data[, -edata_id_ind])) /
ncol(e_data[, -edata_id_ind])
# which features have proportion present above the value of "proportion" #
inds <- which(mydata_pct_present >= proportion)
mydata <- e_data[inds, -edata_id_ind]
peps <- peps[inds]
# make sure groups are aligned with the columns of e_data
reorder = match(colnames(mydata), as.character(groupDF[,fdata_id]))
group_dat = as.character(groupDF[reorder,]$Group)
# conduct K-W test using kw_rcpp function
pvals = kw_rcpp(as.matrix(mydata), group_dat)
pvals = data.frame(pvals)
RIPeps <- as.character(peps[as.numeric(pvals[, 1]) > alpha &
!is.na(as.numeric(pvals[, 1]))])
if (length(RIPeps) < 2) stop("There are <2 biomolecules in the subset; cannot proceed.")
return(RIPeps)
}
#' Identify Rank-Invariant Biomolcules for Use in Normalization
#'
#' Selects biomolecules for normalization via the method of rank-invariant
#' biomolcules (RIP)
#'
#' @param e_data a \eqn{p \times n} data.frame, where \eqn{p} is the number of
#' peptides, proteins, lipids, or metabolites and \eqn{n} is the number of
#' samples. Each row corresponds to data for a peptide, protein, lipid, or
#' metabolite, with one column giving the biomolecule identifier name.
#' @param edata_id character string indicating the name of the peptide, protein,
#' lipid, or metabolite identifier. Usually obtained by calling
#' \code{attr(omicsData, "cnames")$edata_cname}.
#' @param fdata_id character string indicating the name of the sample column
#' name in f_data.
#' @param groupDF data.frame created by \code{group_designation} with columns
#' for sample.id and group. If two main effects are provided the original main
#' effect levels for each sample are returned as the third and fourth columns
#' of the data.frame.
#' @param alpha numeric p-value threshold, above which the biomolecules are
#' retained as rank invariant (default value 0.25)
#'
#' @details Biomolecules with complete data are subjected to a Kruskal-Wallis
#' test (non-parametric one-way ANOVA) on group membership, and those
#' biomolecules with p-value greater than a defined threshold \code{alpha}
#' (common values include 0.1 or 0.25) are retained as rank-invariant
#' biomolecules.
#'
#' @return Character vector containing the biomolecules belonging to the RIP
#' subset.
#'
#' @author Kelly Stratton
#'
rip <- function(e_data, edata_id, fdata_id, groupDF, alpha = .2) {
# pull off the column for edata_id
edata_id_ind <- which(colnames(e_data) == edata_id)
# subset to complete cases (features with complete data)
peps <- e_data[, edata_id_ind]
inds <- which(complete.cases(e_data[, -edata_id_ind]) == TRUE)
mydata <- e_data[inds, -edata_id_ind]
peps <- peps[inds]
# added 2/6/17 iobani
group_dat = as.character(groupDF$Group[order(groupDF$Group)])
mydata <- mydata[, order(groupDF$Group)]
# conduct K-W test on un-normalized data, used kw_rcpp function
pvals = kw_rcpp(as.matrix(mydata), group_dat)
pvals = data.frame(pvals)
RIPeps <- as.character(peps[as.numeric(pvals[, 1]) > alpha &
!is.na(as.numeric(pvals[, 1]))])
if (length(RIPeps) < 2) stop("There are <2 biomolecules in the subset; cannot proceed.")
return(RIPeps)
}
#' Identify biomolecules with no missing values across samples
#'
#' Selects biomolecules that have complete rows in e_data, equivalent to 'ppp'
#' with proportion = 1.
#'
#' @param e_data a \eqn{p \times n + 1} data.frame, where \eqn{p} is the number
#' of peptides, proteins, lipids, or metabolites and \eqn{n} is the number of
#' samples. Each row corresponds to data for a peptide, protein, lipid, or
#' metabolite, with one column giving the biomolecule identifier name.
#' @param edata_id character string indicating the name of the peptide, protein,
#' lipid, or metabolite identifier. Usually obtained by calling
#' \code{attr(omicsData, "cnames")$edata_cname}.
#'
#' @return Character vector containing the biomolecules with no missing values
#' across all samples.
#'
complete_mols <- function(e_data, edata_id) {
# pull off the column for edata_id
edata_id_ind <- which(colnames(e_data) == edata_id)
# rows with no missing values
complete_inds <- which(complete.cases(e_data[, -edata_id_ind]) == TRUE)
# retain only character peptide names for complete rows
peps <- as.character(e_data[complete_inds, edata_id_ind])
if (length(peps) < 2) stop("There are <2 biomolecules in the subset; cannot proceed.")
return(peps)
}