-
Notifications
You must be signed in to change notification settings - Fork 1
/
empirical-tidy-bootstrap.R
93 lines (84 loc) · 2.73 KB
/
empirical-tidy-bootstrap.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#' Bootstrap Empirical Data
#'
#' @family Bootstrap
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @details This function will take in a numeric input vector and produce a tibble
#' of bootstrapped values in a list. The table that is output will have two columns:
#' `sim_number` and `bootstrap_samples`
#'
#' The `sim_number` corresponds to how many times you want the data to be resampled,
#' and the `bootstrap_samples` column contains a list of the boostrapped resampled
#' data.
#'
#' @description Takes an input vector of numeric data and produces a bootstrapped
#' nested tibble by simulation number.
#'
#' @param .x The vector of data being passed to the function. Must be a numeric
#' vector.
#' @param .num_sims The default is 2000, can be set to anything desired. A warning
#' will pass to the console if the value is less than 2000.
#' @param .proportion How much of the original data do you want to pass through
#' to the sampling function. The default is 0.80 (80%)
#' @param .distribution_type This can either be 'continuous' or 'discrete'
#'
#' @examples
#' x <- mtcars$mpg
#' tidy_bootstrap(x)
#'
#' @return
#' A nested tibble
#'
#' @export
#'
tidy_bootstrap <- function(.x, .num_sims = 2000, .proportion = 0.8,
.distribution_type = "continuous") {
# Tidyeval ----
x_term <- as.numeric(.x)
n <- length(x_term)
dist_type <- tolower(as.character(.distribution_type))
num_sims <- as.integer(.num_sims)
prop <- as.numeric(.proportion)
# Checks ----
if (!is.vector(x_term)) {
rlang::abort(
message = "You must pass a vector as the .x argument to this function.",
use_cli_format = TRUE
)
}
if (prop <= 0 | prop > 1) {
rlang::abort(
message = "The '.proportion' parameter must be greater than 0 and up to or including 1.",
use_cli_format = TRUE
)
}
if (!dist_type %in% c("continuous", "discrete")) {
rlang::abort(
message = "You must choose either 'continuous' or 'discrete'.",
use_cli_format = TRUE
)
}
if (num_sims < 2000) {
rlang::warn(
message = "Setting '.num_sims' to less than 2000 means that results can be
potentially unstable. Consider setting to 2000 or more.",
use_cli_format = TRUE
)
}
# Data ----
df <- dplyr::tibble(sim_number = as.factor(1:num_sims)) |>
dplyr::group_by(sim_number) |>
dplyr::mutate(bootstrap_samples = list(
sample(x = x_term, size = floor(prop * n), replace = TRUE)
)) |>
dplyr::ungroup()
# Attach descriptive attributes to tibble
attr(df, "distribution_family_type") <- dist_type
attr(df, ".x") <- .x
attr(df, ".num_sims") <- .num_sims
attr(df, "tibble_type") <- "tidy_bootstrap_nested"
attr(df, "dist_with_params") <- "Empirical"
# Return ----
return(df)
}