/
airdas_comments_process.R
361 lines (316 loc) · 14.7 KB
/
airdas_comments_process.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
#' Process comments in AirDAS data
#'
#' Extract miscellaneous information recorded in AirDAS data comments, i.e. comment-data
#'
#' @param x \code{airdas_dfr} or \code{airdas_df} object,
#' or a data frame that can be coerced to a \code{airdas_dfr} object
#' @param ... ignored
#' @param comment.format list; default is \code{NULL}.
#' See the 'Using \code{comment.format}' section
#'
#' @details Historically, project-specific or miscellaneous data have been
#' recorded in AirDAS comments using specific formats and character codes.
#' This functions identifies and extracts this data from the comment text strings.
#' However, different data types have different comment-data formats.
#' Specifically, TURTLE and PHOCOENA comment-data uses identifier codes
#' that each signify a certain data pattern, while other comment-data
#' (usually that of CARETTA) uses data separated by some delimiter.
#'
#' @section TURTLE and PHOCOENA comment-data:
#'
#' Current supported data types are: fish balls, molas, jellyfish, and crab pots.
#' See any of the AirDAS format PDFs (\code{\link{airdas_format_pdf}})
#' for information about the specific codes and formats used to
#' record this data. All comments are converted to lower case for processing
#' to avoid missing data.
#'
#' These different codes contain (at most):
#' a level one descriptor (e.g. fish ball or crab pot),
#' a level two descriptor (e.g. size or jellyfish species),
#' and a value (a count or percentage).
#' Thus, the extracted data are returned together in this structure.
#' The output data frame is long data, i.e. it has one piece of information per line.
#' For instance, if the comment is "fb1s fb1m", then the output data frame
#' will have one line for the small fish ball and one for the medium fish ball.
#' See Value section for more details.
#'
#' Currently this function only recognizes mola data recorded using the
#' "m1", "m2", and "m3" codes (small, medium, and large mola, respectively).
#' Thus, "mola" is not recognized and processed.
#'
#' The following codes are used for the level two descriptors:
#' \tabular{lr}{
#' \emph{Description} \tab \emph{Code}\cr
#' Small \tab s\cr
#' Medium \tab m\cr
#' Large \tab l\cr
#' Unknown \tab u\cr
#' Chrysaora \tab c\cr
#' Moon jelly \tab m\cr
#' Egg yolk \tab e\cr
#' Other \tab o\cr
#' }
#'
#' @section Using \code{comment.format}:
#'
#' \code{comment.format} is a list that allows the user to specify the comment-data format.
#' To use this argument, data must be separated by a delimiter.
#' This list must contain three named elements:
#' \itemize{
#' \item n: A single number indicating the number of elements of data in each comment.
#' Must equal the length of \code{type}.
#' A comment must contain exactly this number of \code{sep} to be recognized as comment-data
#' \item sep: A single string indicating the field separator string (delimiter).
#' Values within each comment are separated by this string.
#' Currently accepted values are ";" and ","
#' \item type: A character vector of length \code{n} indicating the data type of
#' each data element (column).
#' All values must be one of: "character", "numeric", or "integer".
#' }
#'
#' For instance, for most CARETTA data \code{comment.format} should be
#' \code{list(n = 5, sep = ";", type = c("character", "character", "numeric", "numeric", "character"))}
#'
#' @return \code{x}, filtered for comments with recorded data,
#' with the following columns added:
#' \itemize{
#' \item comment_str: the full comment string
#' \item Misc#: Some number of descriptor columns. There should be \code{n} columns,
#' although the minimum number will be two columns
#' \item Value: Associated count or percentage for TURTLE/PHOCOENA data
#' \item flag_check: logical indicating if the TURTLE/PHOCOENA
#' comment string was longer than an expected number of characters,
#' and thus should be manually inspected
#' }
#'
#' See the additional sections for more context.
#' If \code{comment.format} is \code{NULL},
#' then the output data frame would two Misc# columns:
#' a level one descriptor, e.g. "Fish ball" or "Jellyfish",
#' and a level two descriptor, e.g. s, m, or c.
#' However, if \code{comment.format$n} is say 4, then the output data frame would have
#' columns Misc1, Misc2, Misc3, and Misc4.
#'
#' Messages are printed if either \code{comment.format} is not \code{NULL}
#' and not comment-data is identified using \code{comment.format}, or if
#' \code{x} has TURTLE/PHOCOENA data but no TURTLE/PHOCOENA comment-data
#'
#' @examples
#' y <- system.file("airdas_sample.das", package = "swfscAirDAS")
#' y.proc <- airdas_process(y)
#'
#' airdas_comments_process(y.proc)
#'
#' @export
airdas_comments_process <- function(x, ...) UseMethod("airdas_comments_process")
#' @name airdas_comments_process
#' @export
airdas_comments_process.data.frame <- function(x, ...) {
airdas_comments_process(as_airdas_dfr(x, ...))
}
#' @name airdas_comments_process
#' @export
airdas_comments_process.airdas_dfr <- function(x, comment.format = NULL,...) {
as_airdas_dfr(.airdas_comments_process(x, comment.format, ...))
}
#' @name airdas_comments_process
#' @export
airdas_comments_process.airdas_df <- function(x, comment.format = NULL, ...) {
as_airdas_df(.airdas_comments_process(x, comment.format, ...))
}
.airdas_comments_process <- function(x, comment.format, ...) {
#----------------------------------------------------------------------------
# Input checks
stopifnot(
inherits(x, "airdas_df") | inherits(x, "airdas_dfr"),
all(x$file_type %in% c("turtle", "caretta", "survey", "phocoena"))
)
# Prep
x.c.all <- airdas_comments(x) %>%
select(.data$file_das, .data$line_num, .data$file_type, .data$comment_str) %>%
mutate(str_lower = tolower(.data$comment_str),
idx = seq_along(.data$file_das))
df.na <- x.c.all %>%
mutate(Misc1 = NA_character_, Misc2 = NA_character_, Value = NA,
flag_check = FALSE) %>%
slice(0)
#----------------------------------------------------------------------------
### Extract data based on comment.format
if (!is.null(comment.format)) {
# comment.format input checks
# stopifnot(
# "comment.format must be a list" = inherits(comment.format, "list"),
# "comment.format must be of length 3" = length(comment.format) == 3,
#
# "The names of comment.format must be 'n', 'sep', and 'type', resepctively" =
# identical(names(comment.format), c("n", "sep", "type")),
#
# "n must be a numeric (or integer) of length 1" =
# length(comment.format$n) == 1 & inherits(comment.format$n, c("integer", "numeric")),
#
# "n must be a whole number of at least 1" =
# isTRUE(all.equal(comment.format$n, as.integer(comment.format$n))) & comment.format$n >= 1,
#
# "sep must be a character of length 1" =
# length(comment.format$sep) == 1 & inherits(comment.format$sep, "character"),
#
# "In comment.format, the value of n must be equal to the length of type" =
# isTRUE(all.equal(comment.format$n, length(comment.format$type))),
#
# "In comment.format, all of type must be one of 'character', 'numeric', or 'integer'" =
# all(comment.format$type %in% c("character", "numeric", "integer")),
#
# "sep must be one of ';' or ','" = comment.format$sep %in% c(";", ",")
# )
stopifnot(
inherits(comment.format, "list"),
length(comment.format) == 3
)
if (!(identical(names(comment.format), c("n", "sep", "type"))))
stop("The names of comment.format must be 'n', 'sep', and 'type', resepctively")
if (!(length(comment.format$n) == 1 & inherits(comment.format$n, c("integer", "numeric"))))
stop("n must be a numeric (or integer) of length 1")
if (!(isTRUE(all.equal(comment.format$n, as.integer(comment.format$n))) & comment.format$n >= 1))
stop("n must be a whole number of at least 1")
if (!(length(comment.format$sep) == 1 & inherits(comment.format$sep, "character")))
stop("sep must be a character of length 1")
if (!(isTRUE(all.equal(comment.format$n, length(comment.format$type)))))
stop("In comment.format, the value of n must be equal to the length of type")
if (!(all(comment.format$type %in% c("character", "numeric", "integer"))))
stop("In comment.format, all of type must be one of 'character', 'numeric', or 'integer'")
if (!(comment.format$sep %in% c(";", ",")))
stop("sep must be one of ';' or ','")
# Process
if (any(str_count(x.c.all$str_lower, comment.format$sep) > (comment.format$n - 1)))
warning("Some comments have more than n of the provided sep character; ",
"did you provide the right values to comment.format?")
x.c.ext <- x.c.all %>%
filter(str_count(.data$str_lower, comment.format$sep) == (comment.format$n - 1)) %>%
mutate(var_extract = str_split(.data$str_lower, comment.format$sep))
if (nrow(x.c.ext) > 0) {
x.c.ext.proc <- data.frame(mapply(function(i, j) {
if (j == "numeric") {
as.numeric(i)
} else if (j == "integer") {
as.integer(i)
} else {
trimws(as.character(i))
}
}, i = data.frame(t(as.data.frame(x.c.ext$var_extract))),
j = comment.format$type, SIMPLIFY = FALSE))
names(x.c.ext.proc) <- paste0("Misc", seq_len(comment.format$n))
x.c.ext <- bind_cols(x.c.ext, x.c.ext.proc) %>%
select(-.data$var_extract)
} else {
message("None of the data contained any comments with the format ",
"specified in comment.format")
x.c.ext <- df.na
}
} else { #if comment.format is NULL
x.c.ext <- df.na
}
#----------------------------------------------------------------------------
### Extract specific coded data from turtle and phocoena
if (any(x.c.all$file_type %in% c("turtle", "phocoena"))) {
# Prep
x.c <- x.c.all %>% filter(.data$file_type %in% c("turtle", "phocoena"))
#--------------------------------------------
### Fish balls
x.c.fb <- x.c %>% filter(str_detect(.data$str_lower, "fb"))
if (nrow(x.c.fb) > 0) {
# Get data for 0-9 fbs
fb.4 <- x.c.fb %>%
mutate(var_extract = str_match_all(.data$str_lower, "fb(..)")) %>%
unnest(cols = c(.data$var_extract), keep_empty = FALSE) %>%
mutate(var_extract = .data$var_extract[, 1],
Misc1 = "fish ball",
Misc2 = substr(.data$var_extract, 4, 4),
Value = suppressWarnings(as.numeric(substr(.data$var_extract, 3, 3))))
# Get data for 10+ fbs
fb.5 <- x.c.fb %>%
mutate(var_extract = str_match_all(.data$str_lower, "fb(...)")) %>%
unnest(cols = c(.data$var_extract), keep_empty = FALSE) %>%
mutate(var_extract = .data$var_extract[, 1],
Misc1 = "fish ball",
Misc2 = substr(.data$var_extract, 5, 5),
Value = suppressWarnings(as.numeric(substr(.data$var_extract, 3, 4))))
fb.df <- fb.4 %>%
bind_rows(fb.5) %>%
mutate(flag_check = nchar(.data$str_lower) > 12) %>%
filter(!is.na(.data$Value), .data$Misc2 %in% c("s", "m", "l", "u")) %>%
arrange(.data$line_num)
} else {
fb.df <- df.na
}
#--------------------------------------------
### Molas
x.c.mola <- x.c %>%
filter(str_detect(.data$str_lower, "m1") | str_detect(.data$str_lower, "m2") |
str_detect(.data$str_lower, "m3")) #str_detect(.data$str_lower, "mola")
mola.df <- if (nrow(x.c.mola) > 0) {
x.c.mola %>%
mutate(str_lower = paste0(" ", .data$str_lower), #in case comment is e.g. "1 m1"
var_extract = str_match_all(.data$str_lower, "(..) m(.)")) %>%
unnest(cols = c(.data$var_extract), keep_empty = FALSE) %>%
mutate(var_extract = .data$var_extract[, 1],
Misc1 = "mola",
Misc2 = substr(.data$var_extract, 5, 5),
Value = suppressWarnings(as.numeric(substr(.data$var_extract, 1, 2))),
flag_check = nchar(.data$str_lower) > 12) %>%
filter(!is.na(.data$Value), .data$Misc2 %in% c(1, 2, 3))
} else {
df.na
}
#--------------------------------------------
### Jellyfish
x.c.jelly <- x.c %>% filter(str_detect(.data$str_lower, "jf"))
jelly.df <- if (nrow(x.c.jelly) > 0) {
x.c.jelly %>%
mutate(var_extract = str_match_all(.data$str_lower, "jf(....)")) %>%
unnest(cols = c(.data$var_extract), keep_empty = FALSE) %>%
mutate(var_extract = .data$var_extract[, 1],
Misc1 = "jellyfish",
Misc2 = substr(.data$var_extract, 3, 3),
Value = suppressWarnings(as.numeric(substr(.data$var_extract, 4, 6))),
flag_check = nchar(.data$str_lower) > 15) %>%
filter(!is.na(.data$Value),
.data$Misc2 %in% c("c", "m", "e", "o", "u"))
} else {
df.na
}
#--------------------------------------------
### Crab pots
x.c.cp <- x.c %>% filter(str_detect(.data$str_lower, "cp"))
cp.df <- if (nrow(x.c.cp) > 0) {
x.c.cp %>%
mutate(str_lower = paste0(" ", .data$str_lower), #in case comment is e.g. "1 cp"
var_extract = str_match_all(.data$str_lower, "(..) cp")) %>%
unnest(cols = c(.data$var_extract), keep_empty = FALSE) %>%
mutate(var_extract = .data$var_extract[, 1],
Misc1 = "crab pot",
Misc2 = NA_character_,
Value = suppressWarnings(as.numeric(substr(.data$var_extract, 1, 2))),
flag_check = nchar(.data$str_lower) > 10) %>%
filter(!is.na(.data$Value))
} else {
df.na
}
#--------------------------------------------
# Concatenate different data type data frames
df.proc <- bind_rows(fb.df, mola.df, jelly.df, cp.df)
if (nrow(df.proc) == 0) {
message("None of the turtle/phocoena data contained any ",
"comment-recorded data")
} else {
df.proc <- df.proc %>% select(-.data$var_extract)
}
} else { #if there are no turtle or phocoena data in the data
df.proc <- df.na
}
#----------------------------------------------------------------------------
### Combine, sort, and return
df.out <- bind_rows(x.c.ext, df.proc) %>%
arrange(.data$idx) %>%
select(-.data$idx, -.data$str_lower)
right_join(x, df.out, by = c("file_das", "line_num", "file_type"))
}