Skip to content

Commit

Permalink
Merge pull request #230 from stitam/pubchem
Browse files Browse the repository at this point in the history
Implementing PubChem PUG-View web service
  • Loading branch information
Aariq committed May 4, 2020
2 parents 008c9d0 + 759b3be commit 21e1cf7
Show file tree
Hide file tree
Showing 19 changed files with 514 additions and 72 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ Imports:
methods,
dplyr,
purrr,
data.tree,
tibble
Suggests:
testthat,
Expand Down
6 changes: 6 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,12 @@ export(opsin_query)
export(pan_query)
export(parse_mol)
export(pc_prop)
export(pc_sect)
export(pc_synonyms)
export(ping_cs)
export(ping_pan)
export(ping_pubchem)
export(ping_pubchem_pw)
export(pp_query)
export(ppdb)
export(ppdb_parse)
Expand All @@ -83,6 +85,9 @@ import(jsonlite)
import(rvest)
import(stringr)
import(xml2)
importFrom(data.tree,Do)
importFrom(data.tree,FindNode)
importFrom(data.tree,as.Node)
importFrom(dplyr,bind_rows)
importFrom(dplyr,left_join)
importFrom(httr,GET)
Expand All @@ -97,6 +102,7 @@ importFrom(purrr,map2)
importFrom(purrr,map_df)
importFrom(purrr,map_dfr)
importFrom(rvest,html_table)
importFrom(stats,rexp)
importFrom(stats,rgamma)
importFrom(stats,setNames)
importFrom(tibble,as_tibble)
Expand Down
4 changes: 2 additions & 2 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
webchem 0.5.0.9001
webchem 0.5.0.9005
======================

NEW FEATURES

* Retrieve chemical data from PubChem content pages with pc_sect().
* get_etoxid() now can search by CAS, EC, GSBL and RTECS numbers. Added `from = ` argument. [PR #241, added by @andschar]

* nist_ri() now can search by name, InChI, InChIKey, or CAS. The `cas` argument is deprecated. Use `query` instead with `from = "cas"`

MINOR IMPROVEMENTS
Expand Down
19 changes: 19 additions & 0 deletions R/ping.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,25 @@ ping_pubchem <- function(...) {
res$status_code == 200
}

# pubchem PUG-VIEW-----------------------------------------------------------------
#' @import httr
#' @rdname ping
#' @return TRUE if pubchem PUG-VIEW is reachable
#' @export
#' @examples
#' \dontrun{
#' # might fail if API is not available
#' ping_pubchem_pw()
#' }
ping_pubchem_pw <- function(...) {
qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data",
"compound/176/JSON?heading=pka", sep = "/")
res <- POST(qurl,
user_agent("webchem (https://github.com/ropensci/webchem)"))
res$status_code == 200
}



# ChemSpider webpage -----------------------------------------------------------
#' @import httr
Expand Down
248 changes: 240 additions & 8 deletions R/pubchem.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,12 @@
#' PUG-SOAP and PUG-REST: web services for programmatic access to chemical
#' information in PubChem. Nucleic acids research, gkv396.
#' @note Please respect the Terms and Conditions of the National Library of
#' Medicine, \url{https://www.nlm.nih.gov/databases/download.html} and the data
#' Medicine, \url{https://www.nlm.nih.gov/databases/download.html} the data
#' usage policies of National Center for Biotechnology Information,
#' \url{https://www.ncbi.nlm.nih.gov/home/about/policies/},
#' \url{https://pubchemdocs.ncbi.nlm.nih.gov/programmatic-access}.
#' \url{https://pubchemdocs.ncbi.nlm.nih.gov/programmatic-access}, and the data
#' usage policies of the indicidual data sources
#' \url{https://pubchem.ncbi.nlm.nih.gov/sources/}.
#' @author Eduard Szoecs, \email{eduardszoecs@@gmail.com}
#' @import httr
#' @importFrom purrr map map2
Expand Down Expand Up @@ -156,7 +158,7 @@ get_cid <-
#'
#' @return a data.frame
#' @author Eduard Szoecs, \email{eduardszoecs@@gmail.com}
#' @seealso \code{\link{get_cid}} to retrieve Pubchem IDs.
#' @seealso \code{\link{get_cid}}, \code{\link{pc_sect}}
#' @references Wang, Y., J. Xiao, T. O. Suzek, et al. 2009 PubChem: A Public
#' Information System for
#' Analyzing Bioactivities of Small Molecules. Nucleic Acids Research 37:
Expand All @@ -170,10 +172,12 @@ get_cid <-
#' PUG-SOAP and PUG-REST: web services for programmatic access to chemical
#' information in PubChem. Nucleic acids research, gkv396.
#' @note Please respect the Terms and Conditions of the National Library of
#' Medicine, \url{https://www.nlm.nih.gov/databases/download.html} and the data
#' Medicine, \url{https://www.nlm.nih.gov/databases/download.html} the data
#' usage policies of National Center for Biotechnology Information,
#' \url{https://www.ncbi.nlm.nih.gov/home/about/policies/},
#' \url{https://pubchemdocs.ncbi.nlm.nih.gov/programmatic-access}.
#' \url{https://pubchemdocs.ncbi.nlm.nih.gov/programmatic-access}, and the data
#' usage policies of the indicidual data sources
#' \url{https://pubchem.ncbi.nlm.nih.gov/sources/}.
#' @export
#' @examples
#' \donttest{
Expand Down Expand Up @@ -255,7 +259,6 @@ pc_prop <- function(cid, properties = NULL, verbose = TRUE, ...) {
return(out)
}


#' Search synonyms in pubchem
#'
#' Search synonyms using PUG-REST,
Expand Down Expand Up @@ -289,10 +292,12 @@ pc_prop <- function(cid, properties = NULL, verbose = TRUE, ...) {
#' PUG-SOAP and PUG-REST: web services for programmatic access to chemical
#' information in PubChem. Nucleic acids research, gkv396.
#' @note Please respect the Terms and Conditions of the National Library of
#' Medicine, \url{https://www.nlm.nih.gov/databases/download.html} and the data
#' Medicine, \url{https://www.nlm.nih.gov/databases/download.html} the data
#' usage policies of National Center for Biotechnology Information,
#' \url{https://www.ncbi.nlm.nih.gov/home/about/policies/},
#' \url{https://pubchemdocs.ncbi.nlm.nih.gov/programmatic-access}.
#' \url{https://pubchemdocs.ncbi.nlm.nih.gov/programmatic-access}, and the data
#' usage policies of the indicidual data sources
#' \url{https://pubchem.ncbi.nlm.nih.gov/sources/}.
#' @author Eduard Szoecs, \email{eduardszoecs@@gmail.com}
#' @export
#' @examples
Expand Down Expand Up @@ -343,3 +348,230 @@ pc_synonyms <- function(query, from = "name", choices = NULL, verbose = TRUE,
out <- unlist(out)
return(out)
}

#' Retrieve data from PubChem content pages
#'
#' When you search for an entity at \url{https://pubchem.ncbi.nlm.nih.gov/},
#' e.g. a compound or a substance, and select the record you are interested in,
#' you will be forwarded to a PubChem content page. When you look at a PubChem
#' content page, you can see that chemical information is organised into
#' sections, subsections, etc. The chemical data live at the lowest levels of
#' these sections. Use this function to retrieve the lowest level information
#' from PubChem content pages.
#' @param id numeric or character; a vector of PubChem identifiers to search
#' for.
#' @param section character; the section of the content page to be imported.
#' @param domain character; the query domain. Can be one of \code{"compound"},
#' \code{"substance"}, \code{"assay"}, \code{"gene"}, \code{"protein"} or
#' \code{"patent"}.
#' @param verbose logical; should a verbose output be printed on the console?
#' @return Returns a tibble of query results. In the returned tibble,
#' \code{SourceName} is the name of the depositor, and \code{SourceID} is the
#' ID of the search term within the depositor's database. You can browse
#' \url{https://pubchem.ncbi.nlm.nih.gov/sources/} for more information about
#' the depositors.
#' @details \code{section} is not case sensitive but it is sensitive to typing
#' errors and it requires the full name of the section as it is printed on the
#' content page. The PubChem Table of Contents Tree can also be found at
#' \url{https://pubchem.ncbi.nlm.nih.gov/classification/#hid=72}.
#' @note Please respect the Terms and Conditions of the National Library of
#' Medicine, \url{https://www.nlm.nih.gov/databases/download.html} the data
#' usage policies of National Center for Biotechnology Information,
#' \url{https://www.ncbi.nlm.nih.gov/home/about/policies/},
#' \url{https://pubchemdocs.ncbi.nlm.nih.gov/programmatic-access}, and the data
#' usage policies of the individual data sources
#' \url{https://pubchem.ncbi.nlm.nih.gov/sources/}.
#' @references Kim, S., Thiessen, P.A., Cheng, T. et al. PUG-View: programmatic
#' access to chemical annotations integrated in PubChem. J Cheminform 11, 56
#' (2019). https://doi.org/10.1186/s13321-019-0375-2.
#' @author Tamas Stirling, \email{stirling.tamas@@gmail.com}
#' @seealso \code{\link{get_cid}}, \code{\link{pc_prop}}
#' @examples
#' # might fail if API is not available
#' \donttest{
#' pc_sect(176, "pka")
#' pc_sect(c(176, 311), "density")
#' pc_sect(2231, "depositor-supplied synonyms", "substance")
#' pc_sect(780286, "modify date", "assay")
#' pc_sect(9023, "Ensembl ID", "gene")
#' pc_sect("1ZHY_A", "Sequence", "protein")
#' pc_sect("US2013040379", "Patent Identifier Synonyms", "patent")
#' }
#' @export
pc_sect <- function(id,
section,
domain = c("compound", "substance", "assay", "gene",
"protein", "patent"),
verbose = TRUE) {
domain <- match.arg(domain)
section <- tolower(gsub(" +", "+", section))
if (section %in% c("standard non-polar",
"Semi-standard non-polar",
"Standard polar")) {
stop("use nist_ri() to obtain more information on this.")
}
res <- pc_page(id, section, domain, verbose)
out <- pc_extract(res, section)
return(out)
}

#' Import PubChem content pages
#'
#' @importFrom jsonlite fromJSON
#' @importFrom data.tree as.Node Do
#' @importFrom stats rexp
#' @param id numeric or character; a vector of identifiers to search for.
#' @param section character; the section of the content page to be imported.
#' @param domain character; the query domain. Can be one of \code{"compound"},
#' \code{"substance"}, \code{"assay"}, \code{"gene"}, \code{"protein"} or
#' \code{"patent"}.
#' @return A named list of content pages where each element is either a
#' data.tree or NA.
#' @details \code{section} can be any section of a PubChem content page, e.g.
#' \code{section = "solubility"} will import the section on solubility, or
#' \code{section = "experimental properties"} will import all experimental
#' properties. The \code{section} argument is not case sensitive but it
#' is sensitive to typing errors and it requires the full name of the section as
#' it is printed on the content page. The PubChem Table of Contents Tree can
#' also be found at
#' \url{https://pubchem.ncbi.nlm.nih.gov/classification/#hid=72}.
#' @references Kim, S., Thiessen, P.A., Cheng, T. et al. PUG-View: programmatic
#' access to chemical annotations integrated in PubChem. J Cheminform 11, 56
#' (2019). https://doi.org/10.1186/s13321-019-0375-2.
#' @author Tamas Stirling, \email{stirling.tamas@@gmail.com}
#' @examples
#' # might fail if API is not available
#' \donttest{
#' pc_page(c(176, 311), "pka")
#' pc_page(49854366, "external id", domain = "substance")
#' }
#' @noRd
pc_page <- function(id,
section,
domain = c("compound", "substance", "assay", "gene",
"protein", "patent"),
verbose = TRUE) {
domain <- match.arg(domain)
section <- tolower(gsub(" +", "+", section))
foo <- function(id, section, domain) {
qurl <- paste0("https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/",
domain, "/", id, "/JSON?heading=", section)
if (verbose == TRUE) message("Searching ", id, ". ", appendLF = FALSE)
if (is.na(id)) {
if (verbose == TRUE) {
message("Invalid input. Returning NA.")
}
return(NA)
}
Sys.sleep(0.3 + stats::rexp(1, rate = 10 / 0.3))
res <- httr::POST(
qurl,
user_agent("webchem (https://github.com/ropensci/webchem)"),
handle = handle("")
)
if (res$status_code < 300) {
if (verbose == TRUE) message(httr::message_for_status(res))
cont <- httr::content(res, type = "text", encoding = "UTF-8")
cont <- jsonlite::fromJSON(cont, simplifyDataFrame = FALSE)
tree <- data.tree::as.Node(cont, nameName = "TOCHeading")
tree$Do(function(node) node$name <- tolower(node$name))
return(tree)
}
else {
if (verbose == TRUE) {
message(paste0(httr::message_for_status(res), " Returning NA."))
}
return(NA)
}
}
cont <- lapply(id, function(x) foo(x, section, domain))
names(cont) <- id
attr(cont, "domain") <- domain
attr(cont, "id") <- switch(domain, compound = "CID", substance = "SID",
assay = "AID", gene = "GeneID", protein = "pdbID",
patent = "PatentID")
return(cont)
}

#' Extract data from PubChem content pages
#'
#' This function takes a list of PubChem content pages, and extracts the
#' required information from them.
#' @importFrom data.tree FindNode
#' @importFrom dplyr bind_rows
#' @importFrom tibble as_tibble
#' @param pages list; a list of PubChem content pages.
#' @param section character; the lowest level section of the data to be
#' accessed.
#' @return A tibble of chemical information with references.
#' @details When you look at a PubChem content page, you can see that chemical
#' information is organised into sections, subsections, etc. The chemical data
#' live at the lowest levels of these sections. Use this function to extract the
#' lowest level information from PubChem content pages, e.g. IUPAC Name, Boiling
#' Point, Lower Explosive Limit (LEL).
#' @details The \code{section} argument is not case sensitive, but it is
#' sensitive to typing errors, and requires the full name of the section as it
#' is printed on the content page. The PubChem Table of Contents Tree can also
#' be found at \url{https://pubchem.ncbi.nlm.nih.gov/classification/#hid=72}.
#' @references Kim, S., Thiessen, P.A., Cheng, T. et al. PUG-View: programmatic
#' access to chemical annotations integrated in PubChem. J Cheminform 11, 56
#' (2019). https://doi.org/10.1186/s13321-019-0375-2.
#' @author Tamas Stirling, \email{stirling.tamas@@gmail.com}
#' @examples
#' # might fail if API is not available
#' \donttest{
#' comps <- pc_page(c(176, 311), "pka")
#' pc_extract(comps, "pka")
#' subs <- pc_page(49854366, "external id", domain = "substance")
#' pc_extract(subs, "external id")
#' }
#' @noRd
pc_extract <- function(page, section) {
section <- tolower(section)
ids <- names(page)
foo <- function(i, section) {
tree <- page[[i]]
if (length(tree) == 1 && is.na(tree)) return(tibble(ID = ids[i]))
node <- FindNode(tree, "information")
if (is.null(node)) return(tibble(ID = ids[i],
Name = tree$record$RecordTitle))
info <- lapply(node, function(y) {
lownode <- data.tree::FindNode(data.tree::as.Node(y), "stringwithmarkup")
if (is.null(lownode)) {
info <- tibble(Result = paste(y$value, collapse = " "),
ReferenceNumber = y$ReferenceNumber)
return(info)
}
else{
string <- sapply(lownode, function(z) z$String)
info <- tibble(Result = string,
ReferenceNumber = y$ReferenceNumber)
}
})
info <- dplyr::bind_rows(info)
info <- tibble(ID = ids[i],
Name = tree$record$RecordTitle,
info)
node <- FindNode(tree, "reference")
if (is.null(node)) return(tibble(info, SourceName = NA, SourceID = NA))
ref <- lapply(node, function(y) {
ref <- tibble(ReferenceNumber = y$ReferenceNumber,
SourceName = y$SourceName,
SourceID = y$SourceID)
return(ref)
})
ref <- dplyr::bind_rows(ref)
info$SourceName <- sapply(info$ReferenceNumber, function(x) {
ref$SourceName[ref$ReferenceNumber == x]
})
info$SourceID <- sapply(info$ReferenceNumber, function(x) {
ref$SourceID[ref$ReferenceNumber == x]
})
return(info)
}
info <- lapply(seq_along(page), function(x) foo(x, section))
info <- dplyr::bind_rows(info)
info <- info[, -which(names(info) == "ReferenceNumber")]
names(info)[1] <- attr(page, "id")
return(info)
}

0 comments on commit 21e1cf7

Please sign in to comment.