-
Notifications
You must be signed in to change notification settings - Fork 8
/
epmc_search.r
222 lines (218 loc) · 7.39 KB
/
epmc_search.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
#' Search Europe PMC publication database
#'
#' @description This is the main function to search Europe PMC RESTful Web
#'   Service (\url{https://europepmc.org/RestfulWebService}). It fully supports
#'   the comprehensive Europe PMC query language. Simply copy & paste your query
#'   terms to R. To get familiar with the Europe PMC query syntax, check the
#'   Advanced Search Query Builder \url{https://europepmc.org/advancesearch}.
#'
#' @seealso \url{https://europepmc.org/Help}
#'
#' @param query character, search query. For more information on how to build a
#'   search query, see \url{https://europepmc.org/Help}
#' @param output character, what kind of output should be returned. One of
#'   'parsed', 'id_list' or 'raw'. By default, parsed key metadata will be
#'   returned as a tibble. 'id_list' returns a list of IDs and sources. Use
#'   'raw' to get full metadata as list. Please be aware that these lists can
#'   become very large.
#' @param limit integer, limit the number of records you wish to retrieve. By
#'   default, 100 are returned.
#' @param synonym logical, synonym search. If TRUE, synonym terms from MeSH
#'   terminology and the UniProt synonym list are queried, too.
#'   In order to replicate results from the website, with the REST API
#'   you need to turn synonyms ON!
#' @param sort character, relevance ranking is used by default. Use
#'   \code{sort = 'cited'} for sorting by the number of citations, or
#'   \code{sort = 'date'} by the most recent publications.
#' @param verbose logical, print progress bar. Activated by default. The bar
#'   only advances when more than 100 records were found.
#' @return A tibble for \code{output = 'parsed'} and
#'   \code{output = 'id_list'}, or a list for \code{output = 'raw'}, holding at
#'   most \code{limit} records. The total number of records found is attached
#'   as attribute \code{"hit_count"}. If nothing matches the query, a message
#'   is printed and \code{NULL} is returned.
#' @examples \dontrun{
#' #Search articles for 'Gabi-Kat'
#' my.data <- epmc_search(query='Gabi-Kat')
#'
#' #Get article metadata by DOI
#' my.data <- epmc_search(query = 'DOI:10.1007/bf00197367')
#'
#' #Get article metadata by PubMed ID (PMID)
#' my.data <- epmc_search(query = 'EXT_ID:22246381')
#'
#' #Get only PLOS Genetics article with EMBL database references
#' my.data <- epmc_search(query = 'ISSN:1553-7404 HAS_EMBL:y')
#' #Limit search to 250 PLOS Genetics articles
#' my.data <- epmc_search(query = 'ISSN:1553-7404', limit = 250)
#'
#' # exclude MeSH synonyms in search
#' my.data <- epmc_search(query = 'aspirin', synonym = FALSE)
#'
#' # get 100 most cited articles from PLOS ONE published in 2014
#' epmc_search(query = '(ISSN:1932-6203) AND FIRST_PDATE:2014', sort = 'cited')
#'
#' # print number of records found
#' attr(my.data, "hit_count")
#'
#' # change output
#' epmc_search(query = 'Gabi-Kat', output = 'id_list')
#' }
#' @export
epmc_search <- function(query = NULL,
                        output = 'parsed',
                        synonym = TRUE,
                        verbose = TRUE,
                        limit = 100,
                        sort = NULL) {
  #--- Input validation
  stopifnot(is.logical(c(verbose, synonym)))
  stopifnot(is.numeric(limit))
  # sort
  if (!is.null(sort)) {
    # Keep the value returned by match.arg(): it resolves partial matches
    # (e.g. "d" -> "date"). Without the assignment, switch() below would find
    # no matching branch for a partial value and silently set `query` to NULL.
    sort <- match.arg(sort, c("date", "cited"))
    query <- switch(
      sort,
      date = paste(query, "sort_date:y"),
      cited = paste(query, "sort_cited:y")
    )
  }
  # get the correct hit count when mesh and uniprot synonyms are also searched
  # this is so far the only way how I got the synonym param working after
  # the API change.
  # there is a possible conflict with the resumption token and decoding
  # the API call.
  query <- transform_query(paste0(query, "&synonym=", synonym))
  # paging plan is derived from the limit as requested by the caller
  res_chunks <- chunks(limit = limit)
  # super hacky to control limit, better approach using pageSize param needed
  hits <- epmc_hits(query, synonym = synonym)
  if (hits == 0) {
    # nothing to fetch, so do not issue a search request at all
    message("There are no results matching your query")
    return(NULL)
  }
  # never ask for more records than the query actually matches
  limit <- as.integer(limit)
  if (hits <= limit) {
    limit <- hits
  }
  message(paste(hits, "records found, returning", limit))

  # fetch a single page of results starting at the given cursor
  fetch_page <- function(token) {
    epmc_search_(
      query = query,
      limit = limit,
      output = output,
      verbose = verbose,
      page_token = token,
      sort = sort
    )
  }

  # "*" is the cursor of the first page
  out <- fetch_page("*")
  if (is.null(out$next_cursor)) {
    # no cursor came back: the first page is all there is
    md <- out$results
  } else {
    # progress
    progress <- pb(limit = limit)
    results <- if (output == "raw") NULL else tibble::tibble()
    pages <- 0
    # loop until page max is reached or no further cursor is returned; the
    # page already fetched above is reused instead of being requested twice
    repeat {
      # store the page before looking at its cursor, so that a final page
      # that carries no further cursor is not thrown away
      if (output == "raw") {
        results <- c(results, out$results)
      } else {
        results <- dplyr::bind_rows(results, out$results)
      }
      if (is.null(out$next_cursor)) {
        break
      }
      pages <- pages + 1
      if (isTRUE(verbose) && hits > 100) {
        progress$tick()
      }
      if (pages >= res_chunks$page_max) {
        break
      }
      out <- fetch_page(out$next_cursor)
    }
    # again, approach needed to use param pageSize instead.
    # Trim to `limit`, but never index past what was collected: 1:limit would
    # pad the result with NA rows / NULL entries.
    if (output == "raw") {
      md <- results[seq_len(min(limit, length(results)))]
    } else {
      md <- results[seq_len(min(limit, nrow(results))), ]
    }
  }
  # return hit counts (thanks to @cstubben)
  attr(md, "hit_count") <- hits
  md
}
#' Get one page of results when searching Europe PubMed Central
#'
#' In general, use \code{\link{epmc_search}} instead. It calls this function,
#' calling all pages within the defined limit.
#'
#' @param query character, search query. For more information on how to
#'   build a search query, see \url{https://europepmc.org/Help}
#' @param output character, what kind of output should be returned. One of
#'   'parsed', 'id_list' or 'raw'. By default, parsed key metadata will be
#'   returned as a tibble. 'id_list' returns a list of IDs and sources.
#'   Use 'raw' to get full metadata as list. Please be aware that these lists
#'   can become very large.
#' @param limit integer, limit the number of records you wish to retrieve.
#'   By default, 100. A single page never holds more records than the
#'   package's batch size.
#' @param page_token cursor marking the page
#'
#' @param ... further params from \code{\link{epmc_search}}. They are accepted
#'   for convenience but not used by this function.
#'
#' @return A list with two elements: \code{next_cursor}, the cursor mark of
#'   the following page as returned by the API (\code{NULL} if none was
#'   returned), and \code{results}, the records of this page (a tibble for
#'   'parsed' and 'id_list', a list for 'raw').
#'
#' @export
#'
#' @seealso \link{epmc_search}
epmc_search_ <-
  function(query = NULL,
           limit = 100,
           output = "parsed",
           page_token = NULL,
           ...) {
    # control limit: request at most one batch per page. A plain if/else is
    # used for this scalar choice so the selected value keeps its own type.
    limit <- as.integer(limit)
    max_page_size <- batch_size()
    page_size <- if (max_page_size <= limit) max_page_size else limit
    # choose output
    if (!output %in% c("id_list", "parsed", "raw"))
      stop("'output' must be one of 'parsed', 'id_list', or 'raw'",
           call. = FALSE)
    # map the user-facing output name to the API's resulttype value
    result_types <- c("id_list" = "idlist",
                      "parsed" = "lite",
                      "raw" = "core")
    resulttype <- result_types[[output]]
    # build query
    args <-
      list(
        query = query,
        format = "json",
        resulttype = resulttype,
        pageSize = page_size,
        cursorMark = page_token
      )
    # call API
    out <-
      rebi_GET(path = paste0(rest_path(), "/search"), query = args)
    # remove nested lists from resulting data.frame, get these infos
    # with epmc_details or using output "raw"
    if (resulttype != "core") {
      md <- out$resultList$result
      if (length(md) == 0) {
        md <- tibble::tibble()
      } else {
        md <- md %>%
          dplyr::select_if(Negate(is.list)) %>%
          tibble::as_tibble()
      }
    } else {
      # NOTE(review): for "core" the response is parsed here, so rebi_GET
      # presumably hands back unparsed JSON in this case - confirm.
      out <- jsonlite::fromJSON(out, simplifyDataFrame = FALSE)
      md <- out$resultList$result
    }
    list(next_cursor = out$nextCursorMark, results = md)
  }