-
-
Notifications
You must be signed in to change notification settings - Fork 3
/
rcompanion_groupwiseMean.R
316 lines (313 loc) · 11.2 KB
/
rcompanion_groupwiseMean.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
#' @title Get group means and CIs (rcompanion::groupwiseMean)
#'
#' @description Get group means and bootstrapped effect sizes
#' from the `rcompanion::groupwiseMean` function.
#' The function had to be taken separately from the package as
#' the dependency is failing upon install of the current package.
#'
#' From the original documentation: "Calculates means and
#' confidence intervals for groups."
#'
#' From: https://rcompanion.org/handbook/C_03.html
#'
#' "For routine use, I recommend using bootstrapped confidence
#' intervals, particularly the BCa or percentile methods (but...)
#' by default, the function reports confidence intervals by the
#' traditional method."
#'
#' @param formula A formula indicating the measurement variable and
#' the grouping variables. e.g. y ~ x1 + x2.
#' @param data The data frame to use.
#' @param var The measurement variable to use. The name is in double quotes.
#' @param group The grouping variable to use. The name is in double quotes.
#' Multiple names are listed as a vector. (See example.)
#' @param trim The proportion of observations trimmed from each end of the
#' values before the mean is calculated. (As in \code{mean()})
#' @param na.rm If \code{TRUE}, \code{NA} values are removed during
#' calculations. (As in \code{mean()})
#' @param conf The confidence interval to use.
#' @param R The number of bootstrap replicates to use for bootstrapped
#' statistics.
#' @param boot If \code{TRUE}, includes the mean of the bootstrapped means.
#' This can be used as an estimate of the mean for
#' the group.
#' @param traditional If \code{TRUE}, includes the traditional confidence
#' intervals for the group means, using the t-distribution.
#' If \code{trim} is not 0,
#' the traditional confidence interval
#' will produce \code{NA}.
#' Likewise, if there are \code{NA} values that are not
#' removed, the traditional confidence interval
#' will produce \code{NA}.
#' @param normal If \code{TRUE}, includes the normal confidence
#' intervals for the group means by bootstrap.
#' See \code{{boot::boot.ci}}.
#' @param basic If \code{TRUE}, includes the basic confidence
#' intervals for the group means by bootstrap.
#' See \code{{boot::boot.ci}}.
#' @param percentile If \code{TRUE}, includes the percentile confidence
#' intervals for the group means by bootstrap.
#' See \code{{boot::boot.ci}}.
#' @param bca If \code{TRUE}, includes the BCa confidence
#' intervals for the group means by bootstrap.
#' See \code{{boot::boot.ci}}.
#' @param digits The number of significant figures to use in output.
#' @param ... Other arguments passed to the \code{boot} function.
#'
#' @details The input should include either \code{formula} and \code{data};
#' or \code{data}, \code{var}, and \code{group}. (See examples).
#'
#' Results for ungrouped (one-sample) data can be obtained by either
#' setting the right side of the formula to 1, e.g. y ~ 1, or by
#' setting \code{group=NULL} when using \code{var}.
#'
#' @note The parsing of the formula is simplistic. The first variable on the
#' left side is used as the measurement variable. The variables on the
#' right side are used for the grouping variables.
#'
#' In general, it is advisable to handle \code{NA} values before
#' using this function.
#' With some options, the function may not handle missing values well,
#' or in the manner desired by the user.
#' In particular, if \code{bca=TRUE} and there are \code{NA} values,
#' the function may fail.
#'
#' For a traditional method to calculate confidence intervals
#' on trimmed means,
#' see Rand Wilcox, Introduction to Robust Estimation and
#' Hypothesis Testing.
#'
#' @export
#' @author Salvatore Mangiafico, \email{mangiafico@njaes.rutgers.edu}
#' @references \url{https://rcompanion.org/handbook/C_03.html}
#' @concept mean confidence interval bootstrap
#' @return A data frame of requested statistics by group.
#' @keywords group means confidence intervals bootstrapping internal
#' @examplesIf requireNamespace("boot", quietly = TRUE)
#' \donttest{
#' ### Example with formula notation
#' data(mtcars)
#' rcompanion_groupwiseMean(mpg ~ factor(cyl),
#' data = mtcars,
#' traditional = FALSE,
#' percentile = TRUE
#' )
#'
#' # Example with variable notation
#' data(mtcars)
#' rcompanion_groupwiseMean(
#' data = mtcars,
#' var = "mpg",
#' group = c("cyl", "am"),
#' traditional = FALSE,
#' percentile = TRUE
#' )
#' }
#'
#' @importFrom dplyr syms pick group_by summarize rename all_of across everything
rcompanion_groupwiseMean <- function(formula = NULL,
data = NULL,
var = NULL,
group = NULL,
trim = 0,
na.rm = FALSE,
conf = 0.95,
R = 5000,
boot = FALSE,
traditional = TRUE,
normal = FALSE,
basic = FALSE,
percentile = FALSE,
bca = FALSE,
digits = 3,
...) {
rlang::check_installed("boot", reason = "for this function.")
ddply <- function(.data, .variables, var, .fun, ...) {
.data %>%
group_by(across(all_of(.variables))) %>%
summarize(V1 = .fun(as.data.frame(pick(everything())), var), .groups = "drop") %>%
as.data.frame()
}
if (!is.null(formula)) {
var <- all.vars(formula[[2]])[1]
group <- all.vars(formula[[3]])
}
if (na.rm) {
DF <- ddply(.data = data, .variables = group, var, .fun = function(x,
idx) {
sum(!is.na(x[, idx]))
})
}
if (!na.rm) {
DF <- ddply(.data = data, .variables = group, var, .fun = function(x,
idx) {
length(x[, idx])
})
}
fun1 <- function(x, idx) {
as.numeric(mean(x[, idx], trim = trim, na.rm = na.rm))
}
D1 <- ddply(.data = data, .variables = group, var, .fun = fun1)
if (boot == TRUE) {
fun2 <- function(x, idx) {
mean(boot::boot(x[, idx], function(y, j) {
mean(y[j], trim = trim, na.rm = na.rm)
}, R = R, ...)$t[, 1])
}
D2 <- ddply(.data = data, .variables = group, var, .fun = fun2)
}
if (basic == TRUE) {
fun4 <- function(x, idx) {
boot::boot.ci(
boot::boot(x[, idx], function(y, j) {
mean(y[j], trim = trim, na.rm = na.rm)
}, R = R, ...),
conf = conf,
type = "basic", ...
)$basic[4]
}
fun5 <- function(x, idx) {
boot::boot.ci(
boot::boot(x[, idx], function(y, j) {
mean(y[j], trim = trim)
}, R = R, ...),
conf = conf, type = "basic",
...
)$basic[5]
}
D4 <- ddply(.data = data, .variables = group, var, .fun = fun4)
D5 <- ddply(.data = data, .variables = group, var, .fun = fun5)
}
if (normal == TRUE) {
fun6 <- function(x, idx) {
boot::boot.ci(
boot::boot(x[, idx], function(y, j) {
mean(y[j], trim = trim, na.rm = na.rm)
}, R = R, ...),
conf = conf,
type = "norm", ...
)$normal[2]
}
fun7 <- function(x, idx) {
boot::boot.ci(
boot::boot(x[, idx], function(y, j) {
mean(y[j], trim = trim, na.rm = na.rm)
}, R = R, ...),
conf = conf,
type = "norm", ...
)$normal[3]
}
D6 <- ddply(.data = data, .variables = group, var, .fun = fun6)
D7 <- ddply(.data = data, .variables = group, var, .fun = fun7)
}
if (percentile == TRUE) {
fun8 <- function(x, idx) {
boot::boot.ci(
boot::boot(x[, idx], function(y, j) {
mean(y[j], trim = trim, na.rm = na.rm)
}, R = R, ...),
conf = conf,
type = "perc", ...
)$percent[4]
}
fun9 <- function(x, idx) {
boot::boot.ci(
boot::boot(x[, idx], function(y, j) {
mean(y[j], trim = trim, na.rm = na.rm)
}, R = R, ...),
conf = conf,
type = "perc", ...
)$percent[5]
}
D8 <- ddply(.data = data, .variables = group, var, .fun = fun8)
D9 <- ddply(.data = data, .variables = group, var, .fun = fun9)
}
if (bca == TRUE) {
fun10 <- function(x, idx) {
boot::boot.ci(
boot::boot(x[, idx], function(y, j) {
mean(y[j], trim = trim, na.rm = na.rm)
}, R = R, ...),
conf = conf,
type = "bca", ...
)$bca[4]
}
fun11 <- function(x, idx) {
boot::boot.ci(
boot::boot(x[, idx], function(y, j) {
mean(y[j], trim = trim, na.rm = na.rm)
}, R = R, ...),
conf = conf,
type = "bca", ...
)$bca[5]
}
D10 <- ddply(.data = data, .variables = group, var, .fun = fun10)
D11 <- ddply(.data = data, .variables = group, var, .fun = fun11)
}
if (traditional == TRUE) {
Confy <- function(x, ...) {
S <- sd(x, na.rm = na.rm)
if (na.rm) {
N <- length(x[!is.na(x)])
}
if (!na.rm) {
N <- length(x)
}
Dist <- conf + (1 - conf) / 2
Inty <- stats::qt(Dist, df = (N - 1)) * S / sqrt(N)
if (trim == 0) {
return(Inty)
}
if (trim != 0) {
return(NA)
}
}
fun12 <- function(x, idx) {
mean(x[, idx], na.rm = na.rm) - Confy(x[, idx])
}
fun13 <- function(x, idx) {
mean(x[, idx], na.rm = na.rm) + Confy(x[, idx])
}
D12 <- ddply(.data = data, .variables = group, var, .fun = fun12)
D13 <- ddply(.data = data, .variables = group, var, .fun = fun13)
}
DF <- rename(DF, n = V1)
DF$Mean <- signif(D1$V1, digits = digits)
if (boot == TRUE) {
DF$Boot.mean <- signif(D2$V1, digits = digits)
}
if (basic | normal | percentile | bca | traditional) {
DF$Conf.level <- conf
}
if (traditional == TRUE) {
DF$Trad.lower <- signif(D12$V1, digits = digits)
}
if (traditional == TRUE) {
DF$Trad.upper <- signif(D13$V1, digits = digits)
}
if (basic == TRUE) {
DF$Basic.lower <- signif(D4$V1, digits = digits)
}
if (basic == TRUE) {
DF$Basic.upper <- signif(D5$V1, digits = digits)
}
if (normal == TRUE) {
DF$Normal.lower <- signif(D6$V1, digits = digits)
}
if (normal == TRUE) {
DF$Normal.upper <- signif(D7$V1, digits = digits)
}
if (percentile == TRUE) {
DF$Percentile.lower <- signif(D8$V1, digits = digits)
}
if (percentile == TRUE) {
DF$Percentile.upper <- signif(D9$V1, digits = digits)
}
if (bca == TRUE) {
DF$Bca.lower <- signif(D10$V1, digits = digits)
}
if (bca == TRUE) {
DF$Bca.upper <- signif(D11$V1, digits = digits)
}
return(DF)
}