/
part_senv.R
369 lines (333 loc) · 12.3 KB
/
part_senv.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
#' Environmental and spatial cross-validation
#'
#' @description This function explores different numbers of environmental partitions (clusters)
#' based on the K-means clustering algorithm and returns the number of partitions best suited for
#' a given presence, presence-absences, or presence-pseudo-absences database. Selection of the
#' best number of partitions is performed automatically considering spatial autocorrelation,
#' environmental similarity, and the number of presence and/or absence records in each partition.
#'
#' @param env_layer SpatRaster. Raster with environmental
#' variable. This will be used to evaluate spatial autocorrelation and
#' environmental similarity between training and testing partitions. Because this function
#' calculate dissimilarity based on Euclidean distances, it can only be used with continuous
#' variables
#' @param data data.frame. Data.frame or tibble object with presence
#' (or presence-absence, or presences-pseudo-absence) records, and coordinates
#' @param x character. Column name with spatial x coordinates
#' @param y character. Column name with spatial y coordinates
#' @param pr_ab character. Column with presences, presence-absence,
#' or pseudo-absence. Presences must be represented by 1 and absences by 0
#' @param min_n_groups integer. Minimum number of groups to be tested. Default 2.
#' @param max_n_groups integer. Maximum number of groups to be tested. Default 10.
#' @param min_occ numeric. Minimum number of presences or absences in a partition fold.
#' The min_occ value should be base on the amount of predictors in order to avoid over-fitting
#' or error when fitting models for a given fold. Default 10.
#' @param prop numeric. Proportion of point used for testing autocorrelation between
#' groups (values > 0 and <=1). The smaller this number is, the faster the function will work.
#' Default 0.5
#'
#' @return
#' A list with:
#' \itemize{
#' \item part: A tibble object with information used in 'data' arguments and a additional column
#' .part with partition group.
#' \item best_part_info: A tibble with information about the best partition. It contains the
#' number of partition (n_groups), standard deviation of presences (sd_p), standard deviation
#' of absences (sd_a), Moran's I spatial autocorrelation (spa_auto) and environmental similarity
#' based on Euclidean distance (env_sim)
#' }
#'
#' @details The part_sblock allows test with different numbers of partitions defined in the
#' envirnomental clusters delimited the K-mean cluster algorithm. This function explores a range
#' of environmental clusters and automatically selects best number of cluster for a given given
#' presence, presence-absences, or presence-pseudo-absences dataset. Such selection of number of
#' clusters is based on an optimization procedure that explores partition size in three dimensions
#' determined by spatial autocorrelation (measured by Moran's I), environmental similarity
#' (Euclidean distance), and difference in the amount of data among clusters
#' (Standard Deviation - SD; Velazco et al., 2019). This procedure will cyclically select
#' those partitions with autocorrelation values less than the lowest quartile of Morans I, then
#' those with environmental similarity values greater than the third quartile of the Euclidean
#' distances than those with a difference in the amount of data less than the lowest quartile of SD.
#' This selection is repeated until only one partition is retained (Velazco et al., 2019). The
#' main benefit of this partition selection are i) this is not subjective, ii) balances the
#' environmental similarity and special autocorrelation between partitions, and iii) controls
#' the partition selection with few data that may be problematic for model fitting
#' ("min_occ" argument)..
#'
#' Partitions geographically structured tend to evaluate model transferability more directly than
#' conventional ones (e.g., those performed by \code{\link{part_random}}) (Roberts et al., 2017;
#' Santini et al., 2021), being relevant for models that want to be used for projections in other
#' regions outside the calibration area or for other periods.
#'
#' This function can interact with \code{\link{get_block}}, \code{\link{sample_background}},
#' and \code{\link{sample_pseudoabs}} for sampling background points or pseudo-absences within
#' spatial partition broups
#'
#' @references
#' \itemize{
#' \item Roberts, D. R., Bahn, V., Ciuti, S., Boyce, M. S., Elith, J., Guillera-Arroita, G.,
#' Hauenstein, S., Lahoz-Monfort, J. J., Schroder, B., Thuiller, W., Warton, D. I., Wintle, B. A.,
#' Hartig, F., & Dormann, C. F. (2017). Cross-validation strategies for data with temporal, spatial,
#' hierarchical, or phylogenetic structure. Ecography, 40,
#' 913-929. https://doi.org/10.1111/ecog.02881
#' \item Santini, L., Benitez-Lopez, A., Maiorano, L., Cengic, M., & Huijbregts, M. A. J. (2021).
#' Assessing the reliability of species distribution projections in climate change research.
#' Diversity and Distributions, ddi.13252. https://doi.org/10.1111/ddi.13252
#' \item Velazco, S. J. E., Villalobos, F., Galvao, F., & De Marco Junior, P. (2019). A dark
#' scenario for Cerrado plant species: Effects of future climate, land use and protected areas
#' ineffectiveness. Diversity and Distributions, 25(4), 660-673. https://doi.org/10.1111/ddi.12886
#' }
#'
#' @export
#' @importFrom dplyr tibble pull bind_cols group_by count mutate filter select slice_sample
#' @importFrom stats complete.cases kmeans sd
#' @importFrom terra extract
#' @importFrom utils combn
#'
#' @seealso \code{\link{part_random}}, \code{\link{part_sblock}}, and \code{\link{part_sband}}
#'
#' @examples
#' \dontrun{
#' require(terra)
#' require(ggplot2)
#'
#' f <- system.file("external/somevar.tif", package = "flexsdm")
#' somevar <- terra::rast(f)
#'
#' # Select a species
#' spp1 <- spp %>% dplyr::filter(species == "sp1")
#'
#' part1 <- part_senv(
#' env_layer = somevar,
#' data = spp1,
#' x = "x",
#' y = "y",
#' pr_ab = "pr_ab",
#' min_n_groups = 2,
#' max_n_groups = 10,
#' min_occ = 10,
#' prop = 0.2
#' )
#'
#' part1
#'
#' ggplot(part1$part, aes(x, y, col = factor(.part))) +
#' geom_point(aes(shape = factor(pr_ab)))
#'
#' ggplot(part1$part, aes(x, y, col = factor(.part))) +
#' geom_point(aes(shape = factor(pr_ab))) +
#' facet_wrap(. ~ .part)
#'
#' ggplot(part1$part, aes(x, y, col = factor(.part))) +
#' geom_point(aes(shape = factor(pr_ab))) +
#' facet_wrap(. ~ pr_ab)
#' }
part_senv <- function(env_layer,
data,
x,
y,
pr_ab,
min_n_groups = 2,
max_n_groups = 10,
min_occ = 10,
prop = 0.5) {
group <- NULL
# Select columns
data <- data.frame(data)
data <- data[, c(pr_ab, x, y)]
colnames(data) <- c("pr_ab", "x", "y")
if (any(!unique(data[, "pr_ab"][[1]]) %in% c(0, 1))) {
stop(
"values in pr_ab column did not match with 0 and 1:
unique list values in pr_ab column are: ",
paste(unique(data[, "pr_ab"]), collapse = " ")
)
}
# Extract data
data <- dplyr::tibble(data, terra::extract(env_layer, data[, 2:3])[-1])
filt <- stats::complete.cases(data)
if (sum(!filt) > 0) {
data <- data[filt, ]
message(sum(!filt), " rows were excluded from database because NAs were found")
}
rm(filt)
# Vector with presences and absences
pa <- data %>%
dplyr::pull(pr_ab) %>%
unique()
# Vector with grid cell-size used
cell_size <- seq(min_n_groups, max_n_groups)
message(
"The following grid cell sizes will be tested:\n",
paste(round(cell_size, 2), collapse = " | "),
"\n"
)
message("Searching best partition...\n")
# k-mean algorithm
part <- list()
for (i in 1:length(cell_size)) {
part[[i]] <- stats::kmeans(data[, -1], centers = cell_size[i])$cluster
}
names(part) <- paste0(".g", cell_size)
# Bind groups
part <- dplyr::bind_cols(part)
### Remove problematic partition
n_records <- apply(part, 2, function(x) {
dplyr::tibble(x, data[1]) %>%
dplyr::group_by(x, pr_ab) %>%
dplyr::count() %>%
dplyr::mutate(filt = (n <= min_occ)) # 2
})
filt <- sapply(n_records, function(x) any(x %>% dplyr::pull("filt")))
if (sum(!filt) == 0) {
message("It was not possible to find a good partition. Try to change values in 'min_n_groups' and 'max_n_groups'")
return(NA)
}
n_records <- n_records[!filt]
part <- part[!filt]
# Perform SD
sd_p <- sapply(n_records, function(x) {
x %>%
dplyr::filter(pr_ab == 1) %>%
dplyr::pull(n) %>%
stats::sd()
})
sd_a <- sapply(n_records, function(x) {
x %>%
dplyr::filter(pr_ab == 0) %>%
dplyr::pull(n) %>%
sd()
})
rm(n_records)
# # Environmental similarity between train and test based on euclidean -----
Env.P <- data %>% dplyr::select(-pr_ab, -x, -y)
env_sim <- rep(NA, ncol(part))
for (i in 1:ncol(part)) {
cmb <- unique(part[, i][[1]]) %>% utils::combn(2)
Env.P1 <- cbind(part[i], Env.P)
Env.P1 <- Env.P1[stats::complete.cases(Env.P1), ]
Env.P1 <- stats::complete.cases(Env.P1[, -1], Env.P1[, 1])
euq_c <- list()
for (r in 1:ncol(cmb)) {
euq_c[[r]] <- euc_dist(Env.P1[[cmb[1, r]]], Env.P1[[cmb[2, r]]]) %>% mean()
}
env_sim[i] <- euq_c %>%
unlist() %>%
mean()
}
# # I moran-----
spa_auto <- rep(NA, ncol(part))
dist <- euc_dist(
data[, c("x", "y")] %>% as.data.frame(),
data[, c("x", "y")] %>% as.data.frame()
)
dist <- 1 / dist
diag(dist) <- 0
dist[which(dist == Inf)] <- 0
for (p in 1:ncol(part)) {
cmb <- unique(part[, p][[1]]) %>% utils::combn(2)
imoran_grid_c <- rep(NA, ncol(cmb))
dff <- dplyr::tibble(nrow = 1:nrow(part), data["pr_ab"], group = part[p][[1]])
for (c in 1:ncol(cmb)) {
filt <- dff %>%
dplyr::group_by(group, pr_ab) %>%
dplyr::slice_sample(prop = prop) %>%
dplyr::pull(nrow) %>%
sort()
odd <- which((part[p][[1]] == cmb[1, c])[filt])
even <- which((part[p][[1]] == cmb[2, c])[filt])
dist2 <- dist[filt, filt]
dist2[odd, odd] <- 0
dist2[even, even] <- 0
mins <- apply(dist2, 2, function(x) {
max(x, na.rm = TRUE)
})
for (i in 1:length(mins)) {
dist2[, i] <- ifelse(dist2[, i] == mins[i], mins[i], 0)
}
if (nrow(data) < 3) {
imoran_grid_c[c] <- NA
} else {
im <- sapply(
data[filt, names(env_layer)],
function(x) {
morani(x,
dist2,
na.rm = TRUE,
scaled = TRUE
)
}
)
imoran_grid_c[c] <- mean(abs(im))
}
}
spa_auto[p] <- mean(imoran_grid_c)
}
# # SELLECTION OF THE BEST CELL SIZE----
Opt <-
if (any(unique(pa) == 0)) {
data.frame(
n_parition = 1:ncol(part),
n_groups = gsub(".g", "", colnames(part)),
round(
data.frame(sd_p, sd_a, spa_auto, env_sim),
3
)
)
} else {
data.frame(n_groups = gsub(".g", "", colnames(part)), round(
data.frame(
sd_p, spa_auto, env_sim
),
3
))
}
Opt2 <- Opt
while (nrow(Opt2) > 1) {
# SD presence
Opt2 <-
Opt2[which(Opt2$sd_p <= summary(Opt2$sd_p)[2]), ]
if (nrow(Opt2) == 1) {
break
}
# SD absences
if (any(unique(pa) == 0)) {
Opt2 <-
Opt2[which(Opt2$sd_a <= summary(Opt2$sd_a)[2]), ]
if (nrow(Opt2) == 1) {
break
}
}
# I MORAN
if (nrow(Opt2) == 1) {
break
}
Opt2 <-
Opt2[which(Opt2$spa_auto <= summary(Opt2$spa_auto)[2]), ]
if (nrow(Opt2) == 1) {
break
}
# Euclidean
Opt2 <-
Opt2[which(Opt2$env_sim >= summary(Opt2$env_sim)[5]), ]
if (nrow(Opt2) == 1) {
break
}
if ((length(unique(Opt2$spa_auto)) == 1) &&
(length(unique(Opt2$env_sim)) == 1) &&
(length(unique(Opt2$sd_p)) == 1)) {
Opt2 <- Opt2[nrow(Opt2), ]
}
}
# Final data.frame result----
result <- data.frame(data, .part = c(part[, rownames(Opt2)])[[1]])
result <- result %>% dplyr::select(-names(env_layer))
colnames(result) <- c("pr_ab", "x", "y", ".part")
result <- result[c("x", "y", "pr_ab", ".part")]
# Final data.frame result2----
result <- list(
part = dplyr::tibble(result),
best_part_info = dplyr::tibble(Opt2)
)
return(result)
}