Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implementing PubChem PUG-View web service #230

Merged
merged 18 commits into from
May 4, 2020
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ Imports:
methods,
dplyr,
purrr,
data.tree,
stitam marked this conversation as resolved.
Show resolved Hide resolved
tibble
Suggests:
testthat,
Expand Down
6 changes: 6 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,12 @@ export(opsin_query)
export(pan_query)
export(parse_mol)
export(pc_prop)
export(pc_sect)
export(pc_synonyms)
export(ping_cs)
export(ping_pan)
export(ping_pubchem)
export(ping_pubchem_pw)
export(pp_query)
export(ppdb)
export(ppdb_parse)
Expand All @@ -83,6 +85,9 @@ import(jsonlite)
import(rvest)
import(stringr)
import(xml2)
importFrom(data.tree,Do)
importFrom(data.tree,FindNode)
importFrom(data.tree,as.Node)
importFrom(dplyr,bind_rows)
importFrom(dplyr,left_join)
importFrom(httr,GET)
Expand All @@ -97,6 +102,7 @@ importFrom(purrr,map2)
importFrom(purrr,map_df)
importFrom(purrr,map_dfr)
importFrom(rvest,html_table)
importFrom(stats,rexp)
importFrom(stats,rgamma)
importFrom(stats,setNames)
importFrom(tibble,as_tibble)
Expand Down
4 changes: 2 additions & 2 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
webchem 0.5.0.9001
webchem 0.5.0.9005
======================

NEW FEATURES

* Retrieve chemical data from PubChem content pages with pc_sect().
* get_etoxid() now can search by CAS, EC, GSBL and RTECS numbers. Added `from = ` argument. [PR #241, added by @andschar]

* nist_ri() now can search by name, InChI, InChIKey, or CAS. The `cas` argument is deprecated. Use `query` instead with `from = "cas"`

MINOR IMPROVEMENTS
Expand Down
19 changes: 19 additions & 0 deletions R/ping.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,25 @@ ping_pubchem <- function(...) {
res$status_code == 200
}

# pubchem PUG-VIEW-----------------------------------------------------------------
#' @import httr
#' @rdname ping
#' @return TRUE if pubchem PUG-VIEW is reachable
#' @export
#' @examples
#' \dontrun{
#' # might fail if API is not available
#' ping_pubchem_pw()
#' }
ping_pubchem_pw <- function(...) {
qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data",
"compound/176/JSON?heading=pka", sep = "/")
res <- POST(qurl,
user_agent("webchem (https://github.com/ropensci/webchem)"))
res$status_code == 200
}



# ChemSpider webpage -----------------------------------------------------------
#' @import httr
Expand Down
248 changes: 240 additions & 8 deletions R/pubchem.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,12 @@
#' PUG-SOAP and PUG-REST: web services for programmatic access to chemical
#' information in PubChem. Nucleic acids research, gkv396.
#' @note Please respect the Terms and Conditions of the National Library of
#' Medicine, \url{https://www.nlm.nih.gov/databases/download.html} and the data
#' Medicine, \url{https://www.nlm.nih.gov/databases/download.html} the data
#' usage policies of National Center for Biotechnology Information,
#' \url{https://www.ncbi.nlm.nih.gov/home/about/policies/},
#' \url{https://pubchemdocs.ncbi.nlm.nih.gov/programmatic-access}.
#' \url{https://pubchemdocs.ncbi.nlm.nih.gov/programmatic-access}, and the data
#' usage policies of the indicidual data sources
#' \url{https://pubchem.ncbi.nlm.nih.gov/sources/}.
#' @author Eduard Szoecs, \email{eduardszoecs@@gmail.com}
#' @import httr
#' @importFrom purrr map map2
Expand Down Expand Up @@ -156,7 +158,7 @@ get_cid <-
#'
#' @return a data.frame
#' @author Eduard Szoecs, \email{eduardszoecs@@gmail.com}
#' @seealso \code{\link{get_cid}} to retrieve Pubchem IDs.
#' @seealso \code{\link{get_cid}}, \code{\link{pc_sect}}
#' @references Wang, Y., J. Xiao, T. O. Suzek, et al. 2009 PubChem: A Public
#' Information System for
#' Analyzing Bioactivities of Small Molecules. Nucleic Acids Research 37:
Expand All @@ -170,10 +172,12 @@ get_cid <-
#' PUG-SOAP and PUG-REST: web services for programmatic access to chemical
#' information in PubChem. Nucleic acids research, gkv396.
#' @note Please respect the Terms and Conditions of the National Library of
#' Medicine, \url{https://www.nlm.nih.gov/databases/download.html} and the data
#' Medicine, \url{https://www.nlm.nih.gov/databases/download.html} the data
#' usage policies of National Center for Biotechnology Information,
#' \url{https://www.ncbi.nlm.nih.gov/home/about/policies/},
#' \url{https://pubchemdocs.ncbi.nlm.nih.gov/programmatic-access}.
#' \url{https://pubchemdocs.ncbi.nlm.nih.gov/programmatic-access}, and the data
#' usage policies of the indicidual data sources
#' \url{https://pubchem.ncbi.nlm.nih.gov/sources/}.
#' @export
#' @examples
#' \donttest{
Expand Down Expand Up @@ -255,7 +259,6 @@ pc_prop <- function(cid, properties = NULL, verbose = TRUE, ...) {
return(out)
}


#' Search synonyms in pubchem
#'
#' Search synonyms using PUG-REST,
Expand Down Expand Up @@ -289,10 +292,12 @@ pc_prop <- function(cid, properties = NULL, verbose = TRUE, ...) {
#' PUG-SOAP and PUG-REST: web services for programmatic access to chemical
#' information in PubChem. Nucleic acids research, gkv396.
#' @note Please respect the Terms and Conditions of the National Library of
#' Medicine, \url{https://www.nlm.nih.gov/databases/download.html} and the data
#' Medicine, \url{https://www.nlm.nih.gov/databases/download.html} the data
#' usage policies of National Center for Biotechnology Information,
#' \url{https://www.ncbi.nlm.nih.gov/home/about/policies/},
#' \url{https://pubchemdocs.ncbi.nlm.nih.gov/programmatic-access}.
#' \url{https://pubchemdocs.ncbi.nlm.nih.gov/programmatic-access}, and the data
#' usage policies of the indicidual data sources
#' \url{https://pubchem.ncbi.nlm.nih.gov/sources/}.
#' @author Eduard Szoecs, \email{eduardszoecs@@gmail.com}
#' @export
#' @examples
Expand Down Expand Up @@ -343,3 +348,230 @@ pc_synonyms <- function(query, from = "name", choices = NULL, verbose = TRUE,
out <- unlist(out)
return(out)
}

#' Retrieve data from PubChem content pages
#'
#' When you search for an entity at \url{https://pubchem.ncbi.nlm.nih.gov/},
#' e.g. a compound or a substance, and select the record you are interested in,
#' you will be forwarded to a PubChem content page. When you look at a PubChem
#' content page, you can see that chemical information is organised into
#' sections, subsections, etc. The chemical data live at the lowest levels of
#' these sections. Use this function to retrieve the lowest level information
#' from PubChem content pages.
#' @param id numeric or character; a vector of PubChem identifiers to search
stitam marked this conversation as resolved.
Show resolved Hide resolved
#' for.
#' @param section character; the section of the content page to be imported.
#' @param domain character; the query domain. Can be one of \code{"compound"},
#' \code{"substance"}, \code{"assay"}, \code{"gene"}, \code{"protein"} or
#' \code{"patent"}.
#' @param verbose logical; should a verbose output be printed on the console?
#' @return Returns a tibble of query results. In the returned tibble,
#' \code{SourceName} is the name of the depositor, and \code{SourceID} is the
#' ID of the search term within the depositor's database. You can browse
#' \url{https://pubchem.ncbi.nlm.nih.gov/sources/} for more information about
#' the depositors.
#' @details \code{section} is not case sensitive but it is sensitive to typing
#' errors and it requires the full name of the section as it is printed on the
#' content page. The PubChem Table of Contents Tree can also be found at
#' \url{https://pubchem.ncbi.nlm.nih.gov/classification/#hid=72}.
#' @note Please respect the Terms and Conditions of the National Library of
#' Medicine, \url{https://www.nlm.nih.gov/databases/download.html} the data
#' usage policies of National Center for Biotechnology Information,
#' \url{https://www.ncbi.nlm.nih.gov/home/about/policies/},
#' \url{https://pubchemdocs.ncbi.nlm.nih.gov/programmatic-access}, and the data
#' usage policies of the individual data sources
#' \url{https://pubchem.ncbi.nlm.nih.gov/sources/}.
#' @references Kim, S., Thiessen, P.A., Cheng, T. et al. PUG-View: programmatic
#' access to chemical annotations integrated in PubChem. J Cheminform 11, 56
#' (2019). https://doi.org/10.1186/s13321-019-0375-2.
#' @author Tamas Stirling, \email{stirling.tamas@@gmail.com}
#' @seealso \code{\link{get_cid}}, \code{\link{pc_prop}}
#' @examples
#' # might fail if API is not available
#' \donttest{
#' pc_sect(176, "pka")
#' pc_sect(c(176, 311), "density")
#' pc_sect(2231, "depositor-supplied synonyms", "substance")
#' pc_sect(780286, "modify date", "assay")
#' pc_sect(9023, "Ensembl ID", "gene")
#' pc_sect("1ZHY_A", "Sequence", "protein")
#' pc_sect("US2013040379", "Patent Identifier Synonyms", "patent")
#' }
#' @export
pc_sect <- function(id,
section,
stitam marked this conversation as resolved.
Show resolved Hide resolved
domain = c("compound", "substance", "assay", "gene",
"protein", "patent"),
verbose = TRUE) {
domain <- match.arg(domain)
section <- tolower(gsub(" +", "+", section))
if (section %in% c("standard non-polar",
"Semi-standard non-polar",
"Standard polar")) {
stop("use nist_ri() to obtain more information on this.")
}
res <- pc_page(id, section, domain, verbose)
out <- pc_extract(res, section)
return(out)
}

#' Import PubChem content pages
#'
#' @importFrom jsonlite fromJSON
#' @importFrom data.tree as.Node Do
#' @importFrom stats rexp
#' @param id numeric or character; a vector of identifiers to search for.
#' @param section character; the section of the content page to be imported.
#' @param domain character; the query domain. Can be one of \code{"compound"},
#' \code{"substance"}, \code{"assay"}, \code{"gene"}, \code{"protein"} or
#' \code{"patent"}.
#' @return A named list of content pages where each element is either a
#' data.tree or NA.
#' @details \code{section} can be any section of a PubChem content page, e.g.
#' \code{section = "solubility"} will import the section on solubility, or
#' \code{section = "experimental properties"} will import all experimental
#' properties. The \code{section} argument is not case sensitive but it
#' is sensitive to typing errors and it requires the full name of the section as
#' it is printed on the content page. The PubChem Table of Contents Tree can
#' also be found at
#' \url{https://pubchem.ncbi.nlm.nih.gov/classification/#hid=72}.
#' @references Kim, S., Thiessen, P.A., Cheng, T. et al. PUG-View: programmatic
#' access to chemical annotations integrated in PubChem. J Cheminform 11, 56
#' (2019). https://doi.org/10.1186/s13321-019-0375-2.
#' @author Tamas Stirling, \email{stirling.tamas@@gmail.com}
#' @examples
#' # might fail if API is not available
#' \donttest{
#' pc_page(c(176, 311), "pka")
#' pc_page(49854366, "external id", domain = "substance")
#' }
#' @noRd
pc_page <- function(id,
section,
domain = c("compound", "substance", "assay", "gene",
"protein", "patent"),
verbose = TRUE) {
domain <- match.arg(domain)
section <- tolower(gsub(" +", "+", section))
foo <- function(id, section, domain) {
qurl <- paste0("https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/",
domain, "/", id, "/JSON?heading=", section)
if (verbose == TRUE) message("Searching ", id, ". ", appendLF = FALSE)
if (is.na(id)) {
if (verbose == TRUE) {
message("Invalid input. Returning NA.")
}
return(NA)
}
Sys.sleep(0.3 + stats::rexp(1, rate = 10 / 0.3))
res <- httr::POST(
qurl,
user_agent("webchem (https://github.com/ropensci/webchem)"),
handle = handle("")
)
if (res$status_code < 300) {
if (verbose == TRUE) message(httr::message_for_status(res))
cont <- httr::content(res, type = "text", encoding = "UTF-8")
cont <- jsonlite::fromJSON(cont, simplifyDataFrame = FALSE)
tree <- data.tree::as.Node(cont, nameName = "TOCHeading")
tree$Do(function(node) node$name <- tolower(node$name))
return(tree)
}
else {
if (verbose == TRUE) {
message(paste0(httr::message_for_status(res), " Returning NA."))
}
return(NA)
}
}
cont <- lapply(id, function(x) foo(x, section, domain))
names(cont) <- id
attr(cont, "domain") <- domain
attr(cont, "id") <- switch(domain, compound = "CID", substance = "SID",
assay = "AID", gene = "GeneID", protein = "pdbID",
patent = "PatentID")
return(cont)
}

#' Extract data from PubChem content pages
#'
#' This function takes a list of PubChem content pages, and extracts the
#' required information from them.
#' @importFrom data.tree FindNode
#' @importFrom dplyr bind_rows
#' @importFrom tibble as_tibble
#' @param pages list; a list of PubChem content pages.
#' @param section character; the lowest level section of the data to be
#' accessed.
#' @return A tibble of chemical information with references.
#' @details When you look at a PubChem content page, you can see that chemical
#' information is organised into sections, subsections, etc. The chemical data
#' live at the lowest levels of these sections. Use this function to extract the
#' lowest level information from PubChem content pages, e.g. IUPAC Name, Boiling
#' Point, Lower Explosive Limit (LEL).
#' @details The \code{section} argument is not case sensitive, but it is
#' sensitive to typing errors, and requires the full name of the section as it
#' is printed on the content page. The PubChem Table of Contents Tree can also
#' be found at \url{https://pubchem.ncbi.nlm.nih.gov/classification/#hid=72}.
#' @references Kim, S., Thiessen, P.A., Cheng, T. et al. PUG-View: programmatic
#' access to chemical annotations integrated in PubChem. J Cheminform 11, 56
#' (2019). https://doi.org/10.1186/s13321-019-0375-2.
#' @author Tamas Stirling, \email{stirling.tamas@@gmail.com}
#' @examples
#' # might fail if API is not available
#' \donttest{
#' comps <- pc_page(c(176, 311), "pka")
#' pc_extract(comps, "pka")
#' subs <- pc_page(49854366, "external id", domain = "substance")
#' pc_extract(subs, "external id")
#' }
#' @noRd
pc_extract <- function(page, section) {
section <- tolower(section)
ids <- names(page)
foo <- function(i, section) {
tree <- page[[i]]
if (length(tree) == 1 && is.na(tree)) return(tibble(ID = ids[i]))
node <- FindNode(tree, "information")
if (is.null(node)) return(tibble(ID = ids[i],
Name = tree$record$RecordTitle))
info <- lapply(node, function(y) {
lownode <- data.tree::FindNode(data.tree::as.Node(y), "stringwithmarkup")
if (is.null(lownode)) {
info <- tibble(Result = paste(y$value, collapse = " "),
ReferenceNumber = y$ReferenceNumber)
return(info)
}
else{
string <- sapply(lownode, function(z) z$String)
info <- tibble(Result = string,
ReferenceNumber = y$ReferenceNumber)
}
})
info <- dplyr::bind_rows(info)
info <- tibble(ID = ids[i],
Name = tree$record$RecordTitle,
info)
node <- FindNode(tree, "reference")
if (is.null(node)) return(tibble(info, SourceName = NA, SourceID = NA))
ref <- lapply(node, function(y) {
ref <- tibble(ReferenceNumber = y$ReferenceNumber,
SourceName = y$SourceName,
SourceID = y$SourceID)
return(ref)
})
ref <- dplyr::bind_rows(ref)
info$SourceName <- sapply(info$ReferenceNumber, function(x) {
ref$SourceName[ref$ReferenceNumber == x]
})
info$SourceID <- sapply(info$ReferenceNumber, function(x) {
ref$SourceID[ref$ReferenceNumber == x]
})
return(info)
}
info <- lapply(seq_along(page), function(x) foo(x, section))
info <- dplyr::bind_rows(info)
info <- info[, -which(names(info) == "ReferenceNumber")]
names(info)[1] <- attr(page, "id")
return(info)
}