/
dbGetFieldsIntoDf.R
199 lines (168 loc) · 6.72 KB
/
dbGetFieldsIntoDf.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
### ctrdata package
#' Create data frame of specified fields from database collection
#'
#' Fields in the collection are retrieved from all records into a data
#' frame (or tibble). Within a given trial record, a fields can be
#' hierarchical and structured, that is, nested.
#' Th function uses the field names to appropriately type the values
#' that it returns, harmonising original values (e.g. "Information not present
#' in EudraCT" to `NA`, "Yes" to `TRUE`, "false" to `FALSE`,
#' date strings to dates or time differences, number strings to numbers).
#' The function simplifies the structure of nested data and
#' may concatenate multiple strings in a field using " / " (see example)
#' and may have widened the returned data frame with additional columns that
#' were recursively expanded from simply nested data (e.g., "externalRefs"
#' to columns "externalRefs.doi", "externalRefs.eudraCTNumber" etc.).
#' For an alternative way for handling the complex nested data, see
#' \link{dfTrials2Long} followed by \link{dfName2Value} for extracting
#' the sought variable(s).
#'
#' @param fields Vector of one or more strings, with names of sought fields.
#' See function \link{dbFindFields} for how to find names of fields.
#' Dot path notation ("field.subfield") without indices is supported.
#' If compatibility with `nodbi::src_postgres()` is needed, specify fewer
#' than 50 fields, consider also using parent fields e.g., `"a.b"` instead
#' of `c("a.b.c.d", "a.b.c.e")`, accessing sought fields with
#' \link{dfTrials2Long} followed by \link{dfName2Value} or other R functions.
#'
#' @param verbose Printing additional information if set to \code{TRUE};
#' (default \code{FALSE}).
#'
#' @inheritParams ctrDb
#'
#' @param ... Do not use (captures deprecated parameter \code{stopifnodata})
#'
#' @return A data frame (or tibble, if \code{tibble} is loaded)
#' with columns corresponding to the sought fields.
#' A column for the records' `_id` will always be included.
#' The maximum number of rows of the returned data frame is equal to,
#' or less than the number of trial records in the database collection.
#'
#' @importFrom nodbi docdb_query
#' @importFrom stats na.omit
#' @importFrom tibble as_tibble
#'
#' @export
#'
#' @examples
#'
#' dbc <- nodbi::src_sqlite(
#' dbname = system.file("extdata", "demo.sqlite", package = "ctrdata"),
#' collection = "my_trials")
#'
#' # get fields that are nested within another field
#' # and can have multiple values with the nested field
#' dbGetFieldsIntoDf(
#' fields = "b1_sponsor.b31_and_b32_status_of_the_sponsor",
#' con = dbc)
#'
#' # fields that are lists of string values are
#' # returned by concatenating values with a slash
#' dbGetFieldsIntoDf(
#' fields = "keyword",
#' con = dbc)
#'
dbGetFieldsIntoDf <- function(fields = "",
con, verbose = FALSE, ...) {
# check fields
if (!is.vector(fields) ||
!all(class(fields) %in% "character")) {
stop("Input should be a vector of strings of field names.", call. = FALSE)
}
# deprecated
params <- list(...)
if (!is.null(params[["stopifnodata"]])) warning(
'Parameter "stopifnodata" is deprecated.'
)
# remove NA, NULL if included in fields
fields <- fields[!is.null(fields) & !is.na(fields)]
# remove _id if included in fields
fields <- unique(fields["_id" != fields])
# check if valid fields
if (any(fields == "") || (length(fields) == 0)) {
stop("'fields' contains empty elements; ",
"please provide a vector of strings of field names. ",
"Function dbFindFields() can be used to find field names. ",
call. = FALSE)
}
if (length(fields) > 49L) {
message(
"If compatibility with nodbi::src_postgres() is needed, specify fewer ",
"than 50 (was: ", length(fields), ") fields; parent fields, ",
'e.g., "a.b" instead of c("a.b.c.d", "a.b.c.e"), can also be used ',
"and sought fields can be extracted with dfTrials2Long() followed by ",
"dfName2Value() or other R functions.")
}
# check database connection
if (is.null(con$ctrDb)) con <- ctrDb(con = con)
# get the data
dfi <- nodbi::docdb_query(
src = con,
key = con$collection,
query = '{}',
fields = paste0('{"', paste0(fields, collapse = '": 1, "'), '": 1}')
)
# early exit
if (is.null(dfi) || !ncol(dfi)) stop(
"No records with values for any specified field."
)
# user info
if (verbose) message("Retrieved ", nrow(dfi), " rows of data.")
# warn
notFound <- setdiff(fields, names(dfi))
if (length(notFound)) warning("No data could be extracted for '",
paste0(notFound, collapse = "', '"), "'.")
# recursively widen results if a column is a data frame
# by adding the data frame's columns as result columns.
# this is a convenience function and historically users
# may expect such behaviour from package ctrdata.
doneNmnt <- NULL
while (TRUE) {
nm <- names(dfi)
nt <- sapply(dfi, is.data.frame)
if (!any(nt)) break
doNmnt <- nm[nt]
if (!length(doNmnt) || setequal(doNmnt, doneNmnt)) break
for (colN in doNmnt) {
expDf <- dfi[[colN]]
if (!is.data.frame(expDf) || !ncol(expDf) || !nrow(expDf)) next
names(expDf) <- paste0(colN, ".", names(expDf))
# handle any data frames within expDf
colK <- sapply(expDf, function(c) ifelse(is.null(ncol(c)) || ncol(c) > 0L, 1L, 0L))
colK <- names(colK)[sapply(colK, function(c) !is.integer(c) || c > 0L)]
if (length(colK)) expDf <- expDf[, colK, drop = FALSE]
# merge with dfi
colD <- na.omit(match(c(colN, names(expDf)), names(dfi)))
dfi <- cbind(dfi[, -colD, drop = FALSE], expDf)
}
doneNmnt <- doNmnt
} # while
# for date time conversion
lct <- Sys.getlocale("LC_TIME")
on.exit(Sys.setlocale("LC_TIME", lct), add = TRUE)
Sys.setlocale("LC_TIME", "C")
# iterates over columns for typing
nm <- names(dfi)
for (i in seq_len(ncol(dfi))) {
if (nm[i] == "_id") next
if (inherits(dfi[[i]], "data.frame") && !ncol(dfi[[i]])) next
# simplify and replace NULL with NA
dfi[[i]][!sapply(dfi[[i]], length)] <- NA
# type column (NOTE this should not be a data frame because
# the extraction would result in a data frame and this would
# break the typing approach, thus the recursion above)
dfi[[i]] <- typeField(dfi[[i]], nm[i])
if (verbose) message(nm[i], " of type ", typeof(dfi[[i]]), ", ", appendLF = FALSE)
}
# remove rows with only NAs
naout <- is.na(dfi)
nc <- length(setdiff(attr(naout, "dimnames")[[2]], "_id"))
dfi <- dfi[rowSums(naout) < nc, , drop = FALSE]
# sort, add meta data
dfi <- addMetaData(
dfi[order(dfi[["_id"]]), , drop = FALSE],
con = con)
# return
if (any("tibble" == .packages())) return(tibble::as_tibble(dfi))
return(dfi)
} # end dbGetFieldsIntoDf