-
Notifications
You must be signed in to change notification settings - Fork 23
/
change_em_binning.r
323 lines (299 loc) · 11.4 KB
/
change_em_binning.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
#' Change population and observed length-composition bins
#'
#' `change_em_binning()` alters the bin structure for the population and
#' length-composition data in a Stock Synthesis estimation model (EM).
#' The original length-composition data from the EM `.dat` is changed
#' according to the user's specification. If the data file also contains
#' conditional age-at-length data, then these data will be re-binned as well.
#'
#' @template dat_list
#' @template outfile
#' @param bin_vector A numeric vector of new length bins to substitute into the
#' `*.dat` file.
#' @param lbin_method A numeric value of either `NULL, 1, 2, 3` to change
#' the lbin_method for the population bin.
#' `NULL` means to not re-bin.
#' @param pop_binwidth Population length bin width.
#' Only necessary for `lbin_method = 2`.
#' Note that this value must be smaller than the bin width
#' specified in length-composition data `len_bins` or
#' Stock Synthesis will fail (see notes in the Stock Synthesis manual).
#' @param pop_minimum_size Population minimum length bin value.
#' Only necessary for `lbin_method = 2`.
#' @param pop_maximum_size Population maximum length bin value.
#' Only necessary for `lbin_method = 2`.
#' @export
#' @family sample functions
#' @family change functions
#' @author Kotaro Ono (length-composition rebinning), Sean Anderson
#' (conditional age-at-length rebinning)
#' @examples
#' # Note that typically this function is used with estimation models in ss3sim,
#' # but it is used with an operating model data file in the following examples.
#' f <- system.file("extdata", "models", "cod-om", "codOM.dat", package = "ss3sim")
#' d <- r4ss::SS_readdat(f, verbose = FALSE)
#'
#' # An example with lbin_method = 1
#' l1 <- change_em_binning(d,
#' outfile = NULL, lbin_method = 1,
#' bin_vector = seq(20, 152, by = 4)
#' )
#' l1$lbin_vector
#' head(l1$lencomp)
#'
#' # An example with lbin_method = 2
#' new_bin_vec <- seq(min(d$lbin_vector), max(d$lbin_vector), by = 4)
#' # add the max value if necessary.
#' if (new_bin_vec[length(new_bin_vec)] != d$lbin_vector[length(d$lbin_vector)]) {
#' new_bin_vec <- c(
#' new_bin_vec,
#' d$lbin_vector[length(d$lbin_vector)]
#' )
#' }
#' pop_bin_input <- 5
#' pop_min_size_input <- min(d$lbin_vector_pop) - 1
#' pop_max_size_input <- max(d$lbin_vector_pop) + 5
#' lbin_vec_pop <- seq(pop_min_size_input,
#' pop_max_size_input,
#' length.out = (pop_max_size_input - pop_min_size_input) /
#' pop_bin_input + 1
#' )
#' l2 <- change_em_binning(
#' dat_list = d,
#' bin_vector = new_bin_vec,
#' lbin_method = 2,
#' # Note: need more inputs with lbin_method = 2
#' pop_binwidth = pop_bin_input,
#' pop_minimum_size = pop_min_size_input,
#' pop_maximum_size = pop_max_size_input
#' )
#' l2$lbin_method
#' # note bin width is now the same as the input
#' pop_bin_input
#' l2$binwidth
#' # note the minimum size has changed based on the input:
#' pop_min_size_input
#' l2$minimum_size
#' # so has max
#' l2$maximum_size
#' l2$lbin_vector
#' # other modified components:
#' l2$lbin_vector_pop
#' head(l2$lencomp)
change_em_binning <- function(dat_list, outfile = NULL, bin_vector, lbin_method = NULL,
pop_binwidth = NULL, pop_minimum_size = NULL,
pop_maximum_size = NULL) {
## If lbin_method is NULL then don't do anything
if (is.null(lbin_method)) {
warning("lbin_method must be supplied for em rebinning to run. Skipping rebinning.")
return(dat_list)
}
# error checking
if (!is.numeric(bin_vector)) {
stop("bin_vector must be numeric")
}
if (length(bin_vector) > length(dat_list$lbin_vector)) {
stop(
"The specified bin_vector is longer than the original ",
"lbin_vector in the Stock Synthesis data file and therefore can't be re-binned."
)
}
if (length(bin_vector) == 1) {
warning(
"length(bin_vector) == 1; are you sure you input a full numeric ",
"vector of bins and not a bin size?"
)
}
## verify correct pop bin specification
if (!is.null(lbin_method)) {
## If you implement method 3 need to alter code below
if (lbin_method == 1 & !all(is.null(pop_binwidth) & is.null(pop_minimum_size) & is.null(pop_maximum_size))) {
warning("lbin_method=1 so pop bin parameters ignored and data bins used")
}
if (lbin_method == 2 & any(is.null(pop_binwidth), is.null(pop_minimum_size), is.null(pop_maximum_size))) {
stop("you must specify all pop bin parameters if using lbin_method=2")
}
if (lbin_method > 2) {
stop("lbin_method method should be either NULL, 1 or 2; 3 is not currently implemented")
}
}
if (is.null(dat_list$lencomp)) {
stop("no lcomp data. Verify your input arguments.")
}
if (dat_list$Nsexes > 1) {
stop(
"_Nsexes is greater than 1 in the model.change_em_binning only ",
"works with single-sex models."
)
}
# todo: determine if the commented out stops should be brought back
# if (!identical(as.integer(max(bin_vector)), as.integer(max(dat_list$lbin_vector)))) {
# stop("The maximum value in the bin_vector is not equal to the original ",
# "maximum length bin value.")
# }
# if(!identical(as.integer(min(bin_vector)), as.integer(min(dat_list$lbin_vector)))) {
# stop("The minimum value in the bin_vector is not equal to the original ",
# "minimum length bin value.")
# }
if (any(!is_divisible(bin_vector, by_ = dat_list$binwidth))) {
stop(
"One or more of the values in bin_vector are not divisible by the ",
"population binwidth specified in the Stock Synthesis data file."
)
}
# Find ID columns and data columns to replace:
old_len_columns <- grep("^l[0-9.]+$", names(dat_list$lencomp))
old_lcomp_total <- sum(dat_list$lencomp[, old_len_columns]) # check later
id_columns <- seq_along(names(dat_list$lencomp))[-old_len_columns]
newdummy <- dat_list$lencomp[, old_len_columns]
old_binvector <- dat_list$lbin_vector
# change population length bin width
lcomp_new <- as.data.frame(matrix(0,
nrow = nrow(newdummy),
ncol = length(bin_vector)
))
names(lcomp_new) <- paste0("l", bin_vector)
# Re-bin length comps:
for (i in seq_along(bin_vector)) {
if (i == 1) {
select_col <- which(dat_list$lbin_vector < bin_vector[i + 1])
if (length(select_col) > 1) {
lcomp_new[, i] <- apply(newdummy[, select_col], 1, sum, na.rm = TRUE)
}
if (length(select_col) == 1) {
lcomp_new[, i] <- newdummy[, select_col]
}
}
if (i > 1 & i < length(bin_vector)) {
select_col <- which(dat_list$lbin_vector >= bin_vector[i] &
dat_list$lbin_vector < bin_vector[i + 1])
if (length(select_col) > 1) {
lcomp_new[, i] <- apply(newdummy[, select_col], 1, sum, na.rm = TRUE)
}
if (length(select_col) == 1) {
lcomp_new[, i] <- newdummy[, select_col]
}
}
if (i == length(bin_vector)) {
select_col <- which(dat_list$lbin_vector >= bin_vector[i])
if (length(select_col) > 1) {
lcomp_new[, i] <- apply(newdummy[, select_col], 1, sum, na.rm = TRUE)
}
if (length(select_col) == 1) {
lcomp_new[, i] <- newdummy[, select_col]
}
}
}
new_lcomp_total <- sum(lcomp_new)
# if (!identical(old_lcomp_total, new_lcomp_total)) {
# #TODO: is this a necessary check? remove if not add test if so.
# stop("Number of samples in the new lcomp data matrix does not match the ",
# "number of samples in the original dataset.")
# }
# Substitute new bins:
dat_list$lencomp <- data.frame(dat_list$lencomp[, id_columns], lcomp_new)
dat_list$lbin_vector <- bin_vector
dat_list$N_lbins <- length(dat_list$lbin_vector)
# change the lbin_method, if it's NULL leave it as is
if (!is.null(lbin_method)) {
## otherwise if 1 it will use the data bins and requires no more
## input
if (lbin_method == 1) {
dat_list$lbin_method <- lbin_method
dat_list$binwidth <- NULL
dat_list$minimum_size <- NULL
dat_list$maximum_size <- NULL
dat_list$lbin_vector_pop <- dat_list$lbin_vector
} else {
## it is 2 so we need to specify width, min and max
dat_list <- change_pop_bin(dat_list,
binwidth = pop_binwidth,
minimum_size = pop_minimum_size,
maximum_size = pop_maximum_size
)
}
}
# Re-bin conditional age-at-length comps (not implemented)
# if all Lbin_lo == -1 then there aren't any CAL data:
if (length(unique(dat_list$agecomp$Lbin_lo)) > 1) {
if (!identical(dat_list$Lbin_method, 3)) {
stop(
"Lbin_method was not set to 3 in the Stock Synthesis data file. ",
"change_em_binning() requires the data file to specify conditional ",
"age-at-length data with Lbin_method == 3. See the Stock Synthesis manual. Note ",
"the capital L in Lbin_method."
)
}
if (dat_list$lbin_method == 2) {
population_bins <- seq(dat_list$minimum_size, dat_list$maximum_size,
by = dat_list$binwidth
)
if (!all(bin_vector %in% population_bins)) {
stop(
"One or more of bin_vector is not contained in the ",
"population bins (and lbin_method = 2). This is required in ",
"Stock Synthesis for conditional-age-at-length data."
)
}
}
# to check later:
a_ids <- grep("^a[0-9.]+$", names(dat_list$agecomp))
old_age_dat <- dat_list$agecomp[, a_ids]
old_agecomp_total <- sum(old_age_dat)
# grab the data we'll work with:
old_age <- dat_list$agecomp[dat_list$agecomp$Lbin_lo == -1, ]
old_cal_all <- dat_list$agecomp[dat_list$agecomp$Lbin_lo != -1, ]
# remove columns we'll merge back in after:
a_ids_character <- names(old_cal_all)[a_ids]
old_cal <- old_cal_all[, c("Yr", "Lbin_lo", a_ids_character)]
# make a lookup table of old and new bins:
lookup <- data.frame(
Lbin_lo = old_binvector,
lbin_new_low = bin_vector[findInterval(old_binvector, bin_vector)],
lbin_new_high = bin_vector[findInterval(old_binvector, bin_vector)]
)
# the re-binning happens here:
new_cal <- merge(old_cal, lookup, by = "Lbin_lo", all = FALSE, sort = FALSE)
dat_cols <- names(new_cal)[grep("^a[0-9.]+$", names(new_cal))]
new_cal <- stats::aggregate(new_cal[, dat_cols],
by = list(
"Yr" = new_cal$Yr,
"Lbin_lo" = new_cal$lbin_new_low,
"Lbin_hi" = new_cal$lbin_new_high
),
sum
)
new_cal$Nsamp <- rowSums(new_cal[, grepl("^a[0-9.]+$", names(new_cal))])
new_cal_meta_dat <- old_cal_all[
seq_len(nrow(new_cal)),
-which(names(old_cal_all) %in%
c("Yr", "Lbin_lo", a_ids_character, "Lbin_hi", "Nsamp"))
]
new_cal <- cbind(new_cal_meta_dat, new_cal)
# re-order the columns:
new_cal <- new_cal[, match(names(old_cal_all), names(new_cal))]
# and slot the new data into the .dat file:
dat_list$agecomp <- rbind(old_age, new_cal)
dat_list$N_agecomp <- nrow(dat_list$agecomp)
new_age_dat <- dat_list$agecomp[, grepl("^a[0-9.]+$", names(dat_list$agecomp))]
new_agecomp_total <- sum(new_age_dat)
if (!identical(old_agecomp_total, new_agecomp_total)) {
stop(
"Number of samples in the new agecomp data matrix does not match",
"the number of samples in the original dataset."
)
}
}
if (!is.null(outfile)) {
r4ss::SS_writedat(
datlist = dat_list,
outfile = outfile,
overwrite = TRUE,
verbose = FALSE
)
}
invisible(dat_list)
}
is_divisible <- function(x, by_ = 2L) {
x %% by_ == 0
}