Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generalized biomart database interface #108

Merged
merged 8 commits into from
Nov 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: biomartr
Title: Genomic Data Retrieval
Version: 1.0.8.9000
Version: 1.0.9
Authors@R: c(person("Hajk-Georg", "Drost",
role = c("aut", "cre"),
email = "hajk-georg.drost@tuebingen.mpg.de",
Expand Down
86 changes: 34 additions & 52 deletions R/biomart.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,17 @@
#' @param dataset a character string specifying the dataset within the mart to
#' be used, e.g. \code{dataset} = \code{"hsapiens_gene_ensembl"}.
#' @param attributes a character vector specifying the attributes that shall be
#' used, e.g. \code{attributes} =
#' used, e.g. \code{attributes} =
#' \code{c("start_position","end_position","description")}.
#' @param filters a character vector specifying the filter (query key) for the
#' @param filters a character vector specifying the filter (query key) for the
#' BioMart query, e.g. \code{filter} = \code{"ensembl_gene_id"}.
#' @param ... additional parameters for the
#' \code{\link[biomaRt]{getBM}} function.
#' @param mute_citation logical value indicating whether citation message should be muted.
#' @author Hajk-Georg Drost
#' @details This function is the main query function of the biomartr package.
#'
#' It enables to fastly access annotations of a given gene set based
#' It enables to fastly access annotations of a given gene set based
#' on the \pkg{biomaRt} package
#' implemented by Steffen Durinck et al.
#'
Expand Down Expand Up @@ -46,12 +46,11 @@
#' filters = "ensembl_gene_id")
#'}
#'
#' @return A data.table storing the initial query gene vector in
#' @return A data.table storing the initial query gene vector in
#' the first column, the output
#' gene vector in the second column, and all attributes in
#' the following columns.
#' @seealso \code{\link{organismFilters}}, \code{\link{organismBM}},
#' \code{\link[biomaRt]{listAttributes}}, \code{\link[biomaRt]{getBM}}
#' @family biomaRt
#' @import biomaRt
#' @export
biomart <- function(genes,
Expand All @@ -61,51 +60,34 @@ biomart <- function(genes,
filters,
mute_citation = FALSE,
...) {

message("Starting BioMart query ...")

if (stringr::str_detect(mart, "ENSEMBL"))
# connect to BioMart API
m <- biomaRt::useMart(mart, host = "https://www.ensembl.org")

if (stringr::str_detect(mart, "plants"))
# connect to BioMart API
m <- biomaRt::useMart(mart, host = "https://plants.ensembl.org")

if (stringr::str_detect(mart, "fung"))
# connect to BioMart API
m <- biomaRt::useMart(mart, host = "https://fungi.ensembl.org")

if (stringr::str_detect(mart, "protist"))
# connect to BioMart API
m <- biomaRt::useMart(mart, host = "https://protist.ensembl.org")

if (stringr::str_detect(mart, "metazoa"))
# connect to BioMart API
m <- biomaRt::useMart(mart, host = "https://metazoa.ensembl.org")

d <- biomaRt::useDataset(dataset = dataset, mart = m)

# establishing a biomaRt connection and retrieving the
# information for the given gene list
query <-
biomaRt::getBM(
attributes = as.character(c(filters, attributes)),
filters = as.character(filters),
values = as.character(genes),
mart = d,
...
)

colnames(query) <- c(filters, attributes)

genes <- tibble::tibble(as.character(genes))
colnames(genes) <- as.character(filters)

tbl_biomart <-
merge(query, genes, by = filters, incomparables = NA)

please_cite_biomartr(mute_citation = mute_citation)
return(tbl_biomart)

# connect to BioMart API
host_url <- biomart_base_urls_select(mart)
m <- m <- biomaRt::useMart(mart, host = host_url)

d <- biomaRt::useDataset(dataset = dataset, mart = m)

# establishing a biomaRt connection and retrieving the
# information for the given gene list
query <-
biomaRt::getBM(
attributes = as.character(c(filters, attributes)),
filters = as.character(filters),
values = as.character(genes),
mart = d,
...
)

colnames(query) <- c(filters, attributes)

genes <- tibble::tibble(as.character(genes))
colnames(genes) <- as.character(filters)

tbl_biomart <-
merge(query, genes, by = filters, incomparables = NA)

please_cite_biomartr(mute_citation = mute_citation)
return(tbl_biomart)

}
15 changes: 6 additions & 9 deletions R/custom_download.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,13 @@ custom_download <- function(url, ...) {
custom_download_check_local <- function(url, local_file, rest_api_status, db = "ensembl",
notify_exist = TRUE, ...) {

if (file.exists(local_file)) {
if (!is.logical(local_file) && file.exists(local_file)) {
if (notify_exist) message("File ", local_file,
" exists already. Thus, download has been skipped.")
} else {
if (!is.null(rest_api_status) && rest_api_status$release_coord_system_version == "not_found") {
message("Found organism but given release number did not specify existing file
in ensembl, maybe it is too old? Check that it exists on ensembl
first at all.")
message("Found organism but given assembly type (toplevel/primary) or release number did not specify existing file
in ensembl, did you specify primary for toplevel only? Check that it exists on ensembl through your browser")
return(FALSE)
}

Expand Down Expand Up @@ -113,14 +112,12 @@ test_url_status <- function(url, organism) {
#' @import curl
exists.ftp.file.new <- function(url, file.path) {

url_dir_safe <- gsub("//$", "/", paste0(dirname(url), "/"))
if (!RCurl::url.exists(url_dir_safe))
return(FALSE)
url_dir_safe <- unique(gsub("//$", "/", paste0(dirname(url), "/")))
url_dir_exists <- RCurl::url.exists(url_dir_safe)
if (!url_dir_exists) return(url_dir_exists)

con <- RCurl::getURL(url_dir_safe, ftp.use.epsv = FALSE, dirlistonly = TRUE)

dir_files <- XML::getHTMLLinks(con)

return(is.element(as.character(basename(file.path)),
dir_files))
}
17 changes: 13 additions & 4 deletions R/ensembl_ftp_generics.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
#' files from the ENSEMBL ftp server
#' @inheritParams getBio
#' @param type character, biological sequence type (e.g. "dna", "cds")
#' @param id.type a character, default "toplevel". id type of assembly, either "toplevel" or "primary_assembly" usually.
#' @param id.type a character, default "toplevel".
#' id type of assembly, either "toplevel" or "primary_assembly" for genomes.
#' Can be other strings, for non genome objects.
#' @param path location where file shall be stored.
#' @author Hajk-Georg Drost
#' @return either a character path to downloaded file, or a logical FALSE, specifying failure.
Expand Down Expand Up @@ -39,15 +41,20 @@ getENSEMBL <- function(organism, type = "dna", id.type = "toplevel", release = N
# construct retrieval query
kingdom <- ensembl_summary$division[1]
core_path <- ensembl_ftp_server_url_format_full(kingdom, release, format)
# Go through all possible assemblies, from newest to oldest, only 1 will match!
# Validate that final URL exists, if not, "not_found" is kept.
rest_api_status$release_coord_system_version <- "not_found"
for (assembly_option in all_possible_assemblies) {
ensembl.qry <- ftp_url_ensembl(core_path, new.organism, assembly_option,
ensembl_summary, format, type, id.type,
release)
if (!isFALSE(ensembl.qry)) {
assembly_is_correct <- exists.ftp.file.new(ensembl.qry, ensembl.qry)
if (assembly_is_correct) {
if (any(assembly_is_correct)) {
if (length(assembly_is_correct) > 1) {
id.type <- id.type[assembly_is_correct][1]
message("Auto detected assembly type: ", id.type)
ensembl.qry <- ensembl.qry[assembly_is_correct][1]
}
rest_api_status$release_coord_system_version <- assembly_option
break
}
Expand All @@ -64,6 +71,8 @@ getENSEMBL <- function(organism, type = "dna", id.type = "toplevel", release = N
local_path_ensembl <- function(path, new.organism, rest_api_status,
format, type, release, id.type) {
assembly <- rest_api_status$release_coord_system_version
if (assembly == "not_found") return(FALSE)

if (format == "fasta") {
ensembl_seq_local_path(path, new.organism, assembly,
type, id.type)
Expand All @@ -86,7 +95,7 @@ ftp_url_ensembl <- function(core_path, new.organism, assembly,
url_stem <- file.path(dir, ensembl_assembly_stem(new.organism, assembly))

if (!is.null(type)) type <- paste0(type, ".")
if (!is.null(id.type) && id.type == "none") id.type <- NULL
if (!is.null(id.type) && all(id.type == "none")) id.type <- NULL
if (!is.null(id.type)) id.type <- paste0(id.type, ".")
release. <- release
is_fasta <- format == "fasta"
Expand Down
6 changes: 3 additions & 3 deletions R/get.ensembl.info.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@
#' @seealso \code{\link{ensembl_divisions}}, \code{\link{getKingdomAssemblySummary}}, \code{\link{getENSEMBLInfo}}
#' @export
get.ensembl.info <- function(update = FALSE, division) {
stopifnot(is.logical(update))
tmp_file <- file.path(cachedir(), paste0(division, "_info.tsv"))
if (file.exists(tmp_file) &&
!update) {
if (file.exists(tmp_file) && !update) {
suppressWarnings(
ensembl.info <-
readr::read_tsv(
Expand Down Expand Up @@ -145,7 +145,7 @@ ensembl_ftp_server_url <- function(division = "EnsemblVertebrates") {
if (division == "EnsemblVertebrates") {
"https://ftp.ensembl.org"
} else {
"http://ftp.ensemblgenomes.org"
"https://ftp.ensemblgenomes.ebi.ac.uk" # old: http://ftp.ensemblgenomes.org
}
}

Expand Down
99 changes: 23 additions & 76 deletions R/getAttributes.R
Original file line number Diff line number Diff line change
@@ -1,90 +1,38 @@
#' @title Retrieve All Available Attributes for a Specific Dataset
#' @description This function queries the BioMart Interface and returns a table
#' storing all available attributes for a specific dataset.
#'
#' @param mart a character string specifying the database (mart)
#'
#' @param mart a character string specifying the database (mart)
#' for which datasets shall be listed.
#' @param dataset a character string specifying the dataset for which
#' @param dataset a character string specifying the dataset for which
#' attributes shall be listed.
#' @param mute_citation logical value indicating whether citation message should be muted.
#' @author Hajk-Georg Drost
#' @examples
#' \dontrun{
#' # search for available datasets
#' getMarts()
#'
#'
#' # choose database (mart): ENSEMBL_MART_ENSEMBL
#' # and get a table of all available datasets from this BioMart database
#' head(getDatasets(mart = "ENSEMBL_MART_ENSEMBL"), 10)
#'
#'
#' # choose dataset: "hsapiens_gene_ensembl"
#' head(getAttributes(mart = "ENSEMBL_MART_ENSEMBL",
#' head(getAttributes(mart = "ENSEMBL_MART_ENSEMBL",
#' dataset = "hsapiens_gene_ensembl") , 5)
#' }
#' @seealso \code{\link{getMarts}}, \code{\link{getDatasets}},
#' \code{\link{getFilters}}, \code{\link{organismBM}},
#' \code{\link{organismFilters}}, \code{\link{organismAttributes}}
#' @family biomaRt
#' @export

getAttributes <- function(mart, dataset, mute_citation = FALSE){

if ((!is.character(mart)) || (!is.character(dataset)))
stop("Please use a character string as mart or dataset.",
call. = FALSE)

if (!is.element(mart, getMarts()$mart))
stop("Please select a valid mart with getMarts().", call. = FALSE)

message("Starting retrieval of attribute information from mart ", mart, " and dataset ", dataset, " ...")
# if (!is.element(dataset, getDatasets(mart = mart)$dataset))
# stop("Please select a valid dataset with getDatasets().", call. = FALSE)

if (stringr::str_detect(mart, "ENSEMBL"))
# connect to BioMart API
url <-
paste0(
"http://ensembl.org/biomart/martservice?type=attributes&dataset=",
dataset,
"&requestid=biomart&mart=",
mart
)
if (stringr::str_detect(mart, "plants"))
# connect to BioMart API
url <-
paste0(
"http://plants.ensembl.org/biomart/martservice?type=attributes&dataset=",
dataset,
"&requestid=biomart&mart=",
mart
)
if (stringr::str_detect(mart, "fung"))
# connect to BioMart API
url <-
paste0(
"http://fungi.ensembl.org/biomart/martservice?type=attributes&dataset=",
dataset,
"&requestid=biomart&mart=",
mart
)

if (stringr::str_detect(mart, "protist"))
# connect to BioMart API
url <-
paste0(
"http://protist.ensembl.org/biomart/martservice?type=attributes&dataset=",
dataset,
"&requestid=biomart&mart=",
mart
)
if (stringr::str_detect(mart, "metazoa"))
# connect to BioMart API
url <-
paste0(
"http://metazoa.ensembl.org/biomart/martservice?type=attributes&dataset=",
dataset,
"&requestid=biomart&mart=",
mart
)

if ((!is.character(mart)) || (!is.character(dataset)))
stop("Please use a character string as mart or dataset.",
call. = FALSE)

type <- "attributes"
message("Starting retrieval of ", type, " information from mart ", mart, " and dataset ", dataset, " ...")
url <- biomart_full_url(mart, dataset, type)


testContent <-
httr::content(httr::GET(url), as = "text", encoding = "UTF-8")
Expand All @@ -98,31 +46,30 @@ getAttributes <- function(mart, dataset, mute_citation = FALSE){
attrBioMart <- data.frame(name = "NA", description = "NA")
return(attrBioMart)
}

attributesPage <- httr::handle(url)
xmlContentAttributes <- httr::GET(handle = attributesPage)

httr::stop_for_status(xmlContentAttributes)

# extract attribute name and attribute description
suppressWarnings(rawDF <-
do.call("rbind", apply(as.data.frame(strsplit(
httr::content(xmlContentAttributes, as = "text",
httr::content(xmlContentAttributes, as = "text",
encoding = "UTF-8"),
"\n"
)), 1, function(x)
unlist(strsplit(x, "\t")))))

colnames(rawDF) <- paste0("V", seq_len(ncol(rawDF)))

attrBioMart <-
as.data.frame(rawDF[, c("V1", "V2")],
stringsAsFactors = FALSE,
colClasses = rep("character", 2))
colnames(attrBioMart) <- c("name", "description")

please_cite_biomartr(mute_citation = mute_citation)

return(attrBioMart)
}

Expand Down
13 changes: 12 additions & 1 deletion R/getBioSet.R
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,18 @@ getBioSet <- function(db = "refseq",
#' for further exploration.
#' @param analyse_genome logical, default FALSE. If TRUE, get general genome statistics like
#' gc content etc. For more details, see ?summary_genome
#' @param assembly_type a character, default "toplevel". id type of assembly, either "toplevel" or "primary_assembly" usually.
#' @param assembly_type character, default c("primary_assembly", "toplevel"). Used for ensembl only,
#' specifies the genome assembly type. Searches for both primary and toplevel, and if both are found, uses the
#' first by order (so primary is prioritized by default).
#' The Primary assembly should usually be used if it exists.
#' The "primary assembly" contains all the top-level sequence regions,
#' excluding alternative haplotypes and patches.
#' If the primary assembly file is not present for a species
#' (only defined for standard model organisms),
#' that indicates that there were no haplotype/patch regions,
#' and in such cases, the 'toplevel file is used.
#' For more details see:
#' \\href{https://grch37.ensembl.org/info/genome/genebuild/assembly.html}{ensembl tutorial}
#' @param format "gff3", alternative "gtf" for ensembl.
#' @param mute_citation logical, default FALSE, indicating whether citation message should be muted.
#' @author Hajk-Georg Drost
Expand Down
Loading