/
esm_gbm.R
288 lines (262 loc) · 10.6 KB
/
esm_gbm.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
#' Fit and validate Generalized Boosted Regression models based on Ensembles of Small Models approach
#'
#' @description This function constructs Generalized Boosted Regression using the
#' Ensembles of Small Models (ESM) approach (Breiner et al., 2015, 2018).
#'
#' @param data data.frame. Database with the response (0,1) and predictors values.
#' @param response character. Column name with species absence-presence data (0,1)
#' @param predictors character. Vector with the column names of quantitative
#' predictor variables (i.e. continuous variables). This can only construct models with continuous variables and does not allow categorical variables.
#' Usage predictors = c("aet", "cwd", "tmin")
#' @param partition character. Column name with training and validation partition groups.
#' @param thr character. Threshold used to get binary suitability values (i.e. 0,1). It is useful for threshold-dependent performance metrics. It is possible to use more than one threshold type. It is necessary to provide a vector for this argument. The following threshold criteria are available:
#' \itemize{
#' \item equal_sens_spec: Threshold at which the sensitivity and specificity are equal.
#' \item max_sens_spec: Threshold at which the sum of the sensitivity and specificity is the highest (aka threshold that maximizes the TSS).
#' \item max_jaccard: The threshold at which the Jaccard index is the highest.
#' \item max_sorensen: The threshold at which the Sorensen index is highest.
#' \item max_fpb: The threshold at which FPB (F-measure on presence-background data) is highest.
#' \item sensitivity: Threshold based on a specified sensitivity value.
#' Usage thr = c('sensitivity', sens='0.6') or thr = c('sensitivity'). 'sens' refers to sensitivity value. If a sensitivity value is not specified, the default value is 0.9.
#' }
#' In the case of use more than one threshold type it is necessary concatenate threshold types, e.g., thr=c('max_sens_spec', 'max_jaccard'), or thr=c('max_sens_spec', 'sensitivity', sens='0.8'), or thr=c('max_sens_spec', 'sensitivity'). Function will use all thresholds if no threshold is specified.
#' @param n_trees Integer specifying the total number of trees to fit.
#' This is equivalent to the number of iterations and the number of basis
#' functions in the additive expansion. Default is 100.
#' @param n_minobsinnode Integer specifying the minimum number of
#' observations in the terminal nodes of the trees. Note that this
#' is the actual number of observations, not the total weight. If n_minobsinnode is NULL,
#' this parameter will assume a value equal to nrow(data)*0.5/4. Default is NULL.
#' @param shrinkage Numeric. This parameter applied to each tree in the
#' expansion. Also known as the learning rate or step-size reduction;
#' 0.001 to 0.1 usually works, but a smaller learning rate typically
#' requires more trees. Default is 0.1.
#'
#'
#' @details This method consists of creating bivariate models with all the pair-wise combinations
#' of predictors and perform an ensemble based on the average of suitability weighted by
#' Somers' D metric (D = 2 x (AUC -0.5)). ESM is recommended for modeling species with few occurrences.
#' This function does not allow categorical variables because the use of these types of variables
#' could be problematic when using with few occurrences. For further detail see
#' Breiner et al. (2015, 2018).
#'
#' @return
#'
#' A list object with:
#' \itemize{
#' \item esm_model: A list with "gbm" class object from gbm package for each bivariate model. This object can be used
#' for predicting ensembles of small models with \code{\link{sdm_predict}} function.
#' \item predictors: A tibble with variables use for modeling.
#' \item performance: Performance metrics (see \code{\link{sdm_eval}}).
#' Threshold dependent metrics are calculated based on the threshold specified in thr argument.
#' }
#'
#' @seealso \code{\link{esm_gam}}, \code{\link{esm_gau}}, \code{\link{esm_glm}},
#' \code{\link{esm_max}}, \code{\link{esm_net}}, and \code{\link{esm_svm}}.
#' @export
#'
#' @references
#' \itemize{
#' \item Breiner, F. T., Guisan, A., Bergamini, A., & Nobis, M. P. (2015). Overcoming limitations of modelling rare species by using ensembles of small models. Methods in Ecology and Evolution, 6(10), 1210-218. https://doi.org/10.1111/2041-210X.12403
#' \item Breiner, F. T., Nobis, M. P., Bergamini, A., & Guisan, A. (2018). Optimizing ensembles of small models for predicting the distribution of species with few occurrences. Methods in Ecology and Evolution, 9(4), 802-808. https://doi.org/10.1111/2041-210X.12957
#' }
#'
#' @importFrom dplyr bind_rows filter group_by distinct pull mutate inner_join select starts_with bind_cols summarise across left_join relocate
#' @importFrom stats sd
#' @importFrom utils combn txtProgressBar setTxtProgressBar
#'
#' @examples
#' \dontrun{
#' data("abies")
#' require(dplyr)
#'
#' # Using k-fold partition method
#' set.seed(10)
#' abies2 <- abies %>%
#' na.omit() %>%
#' group_by(pr_ab) %>%
#' dplyr::slice_sample(n = 10) %>%
#' group_by()
#'
#' abies2 <- part_random(
#' data = abies2,
#' pr_ab = "pr_ab",
#' method = c(method = "rep_kfold", folds = 3, replicates = 5)
#' )
#' abies2
#'
#' # Without threshold specification and with kfold
#' esm_gbm_t1 <- esm_gbm(
#' data = abies2,
#' response = "pr_ab",
#' predictors = c("aet", "cwd", "tmin", "ppt_djf", "ppt_jja", "pH", "awc", "depth"),
#' partition = ".part",
#' thr = NULL,
#' n_trees = 100,
#' n_minobsinnode = NULL,
#' shrinkage = 0.1
#' )
#'
#' esm_gbm_t1$esm_model # bivariate model
#' esm_gbm_t1$predictors
#' esm_gbm_t1$performance
#' }
esm_gbm <- function(data,
response,
predictors,
partition,
thr = NULL,
n_trees = 100,
n_minobsinnode = NULL,
shrinkage = 0.1) {
. <- part <- model <- TPR <- IMAE <- rnames <- thr_value <- n_presences <- n_absences <- AUC_mean <- pr_ab <- NULL
variables <- dplyr::bind_rows(c(c = predictors))
# N of predictor requirement
if (length(predictors) <= 2) {
stop("The 'esm_' family function should be used to build models with more than 2 predictors, use the 'fit_' or 'tune_' family functions instead")
}
if (is.null(n_minobsinnode)) {
n_minobsinnode <- as.integer(nrow(data) * 0.5 / 4)
}
# Formula
formula1 <- utils::combn(variables, 2)
nms <- apply(utils::combn(variables, 2), 2, function(x) paste(x, collapse = "_")) %>%
paste0(".", .)
# Fit models
eval_esm <- list()
list_esm <- list()
pb <- utils::txtProgressBar(min = 0, max = ncol(formula1), style = 3)
for (f in 1:ncol(formula1)) {
suppressMessages(
list_esm[[f]] <- fit_gbm(
data = data,
response = response,
predictors = unlist(formula1[, f]),
predictors_f = NULL,
partition = partition,
thr = thr,
fit_formula = NULL,
n_trees = 100,
n_minobsinnode = n_minobsinnode,
shrinkage = 0.1
)
)
utils::setTxtProgressBar(pb, which(1:ncol(formula1) == f))
}
close(pb)
# Extract performance
eval_esm <- lapply(list_esm, function(x) {
x <- x$performance
x$model <- "esm_gbm"
x
})
names(eval_esm) <- nms
eval_esm <- eval_esm %>%
dplyr::bind_rows(., .id = "variables") %>%
dplyr::filter(threshold != "lpt")
# Calculate Somers' metric and remove small models with bad performance (AUC<0.5)
mtrc <- eval_esm %>%
dplyr::group_by(variables) %>%
dplyr::distinct(AUC_mean) %>%
dplyr::pull()
D <- 2 * (mtrc - 0.5) # Somers'D
filt <- D > 0
if (sum(filt) == 0) {
message("None bivariate model had Somer's D > 0.5. Try with another esm_* function. NA will be returned")
return(NA)
}
# Filter data based on Somers<0.5
D <- D[filt]
list_esm <- list_esm[filt]
nms <- nms[filt]
# Perform weighted ensemble
data_ens <- sapply(list_esm, function(x) {
x["data_ens"]
})
data_ens <- mapply(function(x, cn) {
colnames(x)[colnames(x) %in% "pred"] <- cn
x
}, data_ens, nms, SIMPLIFY = FALSE)
data_ens <- lapply(data_ens, function(x) {
x %>% dplyr::mutate(pr_ab = pr_ab %>%
as.character() %>%
as.double())
})
if(length(data_ens) > 1){
data_ens2 <-
dplyr::inner_join(data_ens[[1]],
data_ens[[2]],
by = c("rnames", "replicates", "part", "pr_ab")
)
if (length(data_ens) > 2) {
for (i in 3:length(data_ens)) {
data_ens2 <-
dplyr::inner_join(data_ens2,
data_ens[[i]],
by = c("rnames", "replicates", "part", "pr_ab")
)
}
}
} else {
data_ens2 <- data_ens[[1]]
}
rm(data_ens)
#### Extract predicted suitability of each model
values <- data_ens2 %>%
dplyr::select(dplyr::starts_with("."))
#### Remove suitability values from data_ens2
data_ens2 <- data_ens2 %>% dplyr::select(-dplyr::starts_with("."))
pred <- mapply(function(x, v) {
(x * v)
}, values, D, SIMPLIFY = TRUE) %>%
apply(., 1, function(x) {
sum(x, na.rm = TRUE)
}) / sum(D)
pred_test <- dplyr::bind_cols(data_ens2, pred = pred) # This dataset will be use to calculate
# esm peformance
# Validate ensemble base on weighted average suitability, split data by
# replicates and partition
testlist <- pred_test %>% dplyr::distinct(replicates, part)
replicates <- as.list(unique(pred_test$replicates))
names(replicates) <- unique(pred_test$replicates)
for (r in unique(pred_test$replicates)) {
x0 <- pred_test %>% dplyr::filter(replicates == r)
replicates[[r]] <- lapply(
split(x0, x0$part),
function(x) {
sdm_eval(
p = x$pred[x$pr_ab == 1],
a = x$pred[x$pr_ab == 0],
thr = thr
)
}
) %>%
dplyr::bind_rows(., .id = "part")
}
eval_esm <- dplyr::bind_rows(replicates, .id = "replicates")
eval_final <- eval_esm %>%
dplyr::group_by(threshold) %>%
dplyr::summarise(dplyr::across(
TPR:IMAE,
list(mean = mean, sd = stats::sd)
), .groups = "drop")
# Calculate final threshold
threshold <- sdm_eval(
p = pred_test$pred[pred_test$pr_ab == 1],
a = pred_test$pred[pred_test$pr_ab == 0],
thr = thr
)
# List of models used for prediction
mod <- lapply(list_esm, function(x) x$mode)
names(mod) <- D
result <- list(
esm_model = mod,
predictors = variables,
performance = dplyr::left_join(tibble(model = "esm_gbm", eval_final),
threshold[1:4],
by = "threshold"
) %>%
dplyr::relocate(model, threshold, thr_value, n_presences, n_absences)
)
return(result)
}