/
get_sample.R
57 lines (49 loc) · 1.69 KB
/
get_sample.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#' Samples data for a prediction model with a specified AUC and prevalence.
#'
#' @param auc The Area Under the (receiver operating characteristic) Curve.
#' @param n_samples Number of samples to draw.
#' @param prevalence Prevalence or event rate of the binary outcome as
#' a proportion (0.1 = 10%).
#' @param min_events Minimum number of events required in the sample.
#'
#' @return Returns a \code{data.frame}.
#' @export
#'
#' @examples get_sample(0.7, 1000, 0.1)
get_sample <- function(auc, n_samples, prevalence, min_events = 0) {
# method for converting of auc to Cohen's D
# http://dx.doi.org/10.5093/ejpalc2018a5
t <- sqrt(log(1 / (1 - auc)**2))
z <- t - ((2.515517 + 0.802853 * t + 0.0103328 * t**2) /
(1 + 1.432788 * t + 0.189269 * t**2 + 0.001308 * t**3))
d <- z * sqrt(2)
sampled_data <- sample(
c(0, 1),
n_samples,
replace = TRUE,
prob = c(1 - prevalence, prevalence)
)
n_pos <- sum(sampled_data)
# if n_pos < min_events, add a value to the sample until n_pos == min_events
while (n_pos < min_events) {
added_sample <- sample(
c(0, 1),
size = 1,
replace = TRUE,
prob = c(1 - prevalence, prevalence)
)
if (added_sample == 1) {
n_pos <- n_pos + 1
}
n_samples <- n_samples + 1
}
n_neg <- n_samples - n_pos
# if by chance all samples are either positive or negative, repeat process
# almost all the cutpoint selection methods will fail if there's only 1 class.
if (n_pos == 0 | n_neg == 0) {
return(get_sample(auc, n_samples, prevalence, min_events))
}
x <- c(stats::rnorm(n_neg, mean = 0), stats::rnorm(n_pos, mean = d))
y <- c(rep(0, n_neg), rep(1, n_pos))
return(data.frame(x = x, actual = y))
}