Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
af90ee7
commit 3f03911
Showing
14 changed files
with
723 additions
and
313 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,3 @@ | ||
# Register names that are used as bare symbols elsewhere in the package by
# passing them to utils::globalVariables().
# NOTE(review): this span is taken from a rendered diff; the single-name call
# appears to be the pre-change line and the multi-name call its replacement --
# confirm against the actual file before keeping both.
utils::globalVariables("variableName") | ||
utils::globalVariables(c("variableName", | ||
"value", "name", #eml_to_spice | ||
"long", "lat", "region")) #edit_biblio |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,294 @@ | ||
# Collect the elements of the requested types from an EML dataset into one
# flat list.
#
# eml:      an EML object; only element types listed in `names(eml$dataset)`
#           are looked up.
# entities: character vector of EML element names to collect.
# level_id: character vector of field names. An extracted element that has at
#           least one of these fields set is treated as a single entity and is
#           wrapped in a list ("boxed") so that, after flattening, every
#           entity sits at the same level.
#
# Returns the per-type results flattened by one level with
# unlist(recursive = FALSE).
get_entities <- function(eml,
                         entities = c("dataTable", "spatialRaster",
                                      "spatialVector", "storedProcedure",
                                      "view", "otherEntity"),
                         level_id = "entityName") {
  entities <- entities[entities %in% names(eml$dataset)]

  # Predicate used to decide whether an element needs boxing: TRUE when any of
  # the `level_id` fields is non-NULL. `exact = FALSE` keeps the partial-name
  # matching that `$` performs on lists.
  is_single_entity <- function(x) {
    any(vapply(level_id,
               function(id) !is.null(x[[id, exact = FALSE]]),
               logical(1)))
  }

  purrr::map(entities, function(entity) eml2::eml_get(eml, entity)) %>%
    # restructure so that all entities are at the same level
    purrr::map_if(is_single_entity, list) %>%
    unlist(recursive = FALSE)
}
|
||
# Reshape one EML entity into a wide table using the dataspice access column
# names.
#
# x: a single entity (as returned by get_entities()); it is flattened with
#    unlist() and its element names are matched by pattern.
#
# Returns a table with one column per recognised field (fileName, name,
# contentUrl, fileFormat); fields that are not found are absent.
get_access_spice <- function(x) {
  x %>%
    unlist() %>%
    tibble::enframe() %>%
    # map flattened EML element names to dataspice names; anything that is not
    # matched becomes NA and is dropped by na.omit()
    dplyr::mutate(name = dplyr::case_when(
      grepl("objectName", name) ~ "fileName",
      grepl("entityName", name) ~ "name",
      grepl("url", name) ~ "contentUrl",
      grepl("formatName", name) ~ "fileFormat"
    )) %>%
    stats::na.omit() %>%
    # "download" is often also included under a url element
    dplyr::filter(value != "download") %>%
    tidyr::spread(name, value)
}
|
||
#' Get access from EML
#'
#' Return EML access in the dataspice access.csv format.
#'
#' @param eml (emld) an EML object
#' @param path (character) folder path for saving the table to disk. The
#'   folder is created (including parent folders) if it does not exist.
#'
#' @return A data frame with whichever of the columns `fileName`, `name`,
#'   `contentUrl` and `fileFormat` could be extracted, in that order.
#'
#' @export
#' @import eml2
#'
#' @examples
#' \dontrun{
#' eml_path <- system.file("example-dataset/broodTable_metadata.xml", package = "dataspice")
#' eml <- read_eml(eml_path)
#' es_access(eml)
#' }
es_access <- function(eml, path = NULL) {
  entities <- get_entities(eml)
  access_entities <- lapply(entities, get_access_spice)

  out <- dplyr::bind_rows(access_entities)

  # reorder, keeping only the access columns that are actually present
  fields <- c("fileName", "name", "contentUrl", "fileFormat")
  out <- out[, fields[fields %in% colnames(out)]]

  if (!is.null(path)) {
    if (!dir.exists(path)) {
      # recursive so that a nested output folder can be created in one call
      dir.create(path, recursive = TRUE)
    }
    readr::write_csv(out, file.path(path, "access.csv"))
  }

  out
}
|
||
# Reformat the attributes of one entity to the tabular format specified in
# dataspice.
#
# x: a single entity (e.g. a dataTable or otherEntity).
#
# Returns a tibble with the columns fileName, variableName, description and
# unitText. When the entity has no usable attributeList a single row is
# returned whose variableName, description and unitText are NA.
get_attributes_spice <- function(x) {
  obj_name <- eml2::eml_get(x, "objectName")
  # Only a result of length 2 is accepted as a usable objectName
  # (presumably the value plus a context entry -- TODO confirm).
  obj_name <- if (length(obj_name) == 2) obj_name[[1]] else NA

  attr_list <- eml2::eml_get(x, "attributeList")

  if (length(attr_list) <= 1) {
    return(dplyr::tibble(fileName = obj_name,
                         variableName = NA,
                         description = NA,
                         unitText = NA))
  }

  attr_tables <- eml2::get_attributes(attr_list)
  attrs <- attr_tables$attributes

  if (is.null(attrs$unit)) {
    attrs$unit <- NA
  }

  # set datetime format as unitText if available
  if (!is.null(attrs$formatString)) {
    na_units <- is.na(attrs$unit)
    attrs$unit[na_units] <- attrs$formatString[na_units]
  }

  # get missing value info in text form; the literal "NA" code/explanation
  # pair is always appended to the codes declared in the metadata
  missing_val <- dplyr::tibble(
    missingValueCode = c(attrs$missingValueCode, "NA"),
    missingValueCodeExplanation = c(attrs$missingValueCodeExplanation, "NA")
  ) %>%
    dplyr::distinct() %>%
    stats::na.omit()

  missing_val_text <- paste(missing_val$missingValueCode,
                            missing_val$missingValueCodeExplanation,
                            sep = " = ",
                            collapse = "; ")

  dplyr::tibble(fileName = obj_name,
                variableName = attrs$attributeName,
                description = paste0(attrs$attributeDefinition,
                                     "; missing values: ", missing_val_text),
                unitText = attrs$unit)
}
|
||
#' Get attributes from EML
#'
#' Return EML attributes in the dataspice attributes.csv format.
#'
#' @param eml (emld) an EML object
#' @param path (character) folder path for saving the table to disk. The
#'   folder is created (including parent folders) if it does not exist.
#'
#' @return A data frame of attributes with rows lacking a `variableName`
#'   removed. It is empty when the EML has no entities.
#'
#' @export
#'
#' @import dplyr
#' @importFrom readr write_csv
#'
#' @examples
#' \dontrun{
#' eml_path <- system.file("example-dataset/broodTable_metadata.xml", package = "dataspice")
#' eml <- read_eml(eml_path)
#' es_attributes(eml)
#' }
es_attributes <- function(eml, path = NULL) {
  entities <- get_entities(eml)
  attrTables <- lapply(entities, get_attributes_spice)

  out <- dplyr::bind_rows(attrTables)

  # With no entities bind_rows() yields a table without any columns, so only
  # filter when the column exists.
  if ("variableName" %in% colnames(out)) {
    out <- dplyr::filter(out, !is.na(variableName))
  }

  if (!is.null(path)) {
    if (!dir.exists(path)) {
      # recursive so that a nested output folder can be created in one call
      dir.create(path, recursive = TRUE)
    }
    readr::write_csv(out, file.path(path, "attributes.csv"))
  }

  out
}
|
||
#' Get biblio from EML
#'
#' Return EML biblio in the dataspice biblio.csv format.
#'
#' @param eml (emld) an EML object
#' @param path (character) folder path for saving the table to disk. The
#'   folder is created (including parent folders) if it does not exist.
#'
#' @return A one-row data frame holding whichever biblio fields could be
#'   extracted; multiple values for the same field are joined with "; ".
#'
#' @export
#'
#' @examples
#' \dontrun{
#' eml_path <- system.file("example-dataset/broodTable_metadata.xml", package = "dataspice")
#' eml <- read_eml(eml_path)
#' es_biblio(eml)
#' }
es_biblio <- function(eml, path = NULL) {
  biblio_eml <- eml %>%
    unlist() %>%
    tibble::enframe() %>%
    # map flattened EML element names to dataspice names; anything that is not
    # matched becomes NA and is dropped by na.omit()
    dplyr::mutate(name = dplyr::case_when(
      grepl("dataset.title", name) ~ "title",
      grepl("abstract", name) ~ "description",
      grepl("pubDate", name) ~ "datePublished",
      grepl("packageId", name) ~ "identifier",
      grepl("keyword", name) ~ "keywords",
      grepl("intellectual", name) ~ "license",
      grepl("fund", name) ~ "funder",
      grepl("geographicDescription", name) ~ "geographicDescription",
      grepl("northBoundingCoordinate", name) ~ "northBoundCoord",
      grepl("eastBoundingCoordinate", name) ~ "eastBoundCoord",
      grepl("southBoundingCoordinate", name) ~ "southBoundCoord",
      grepl("westBoundingCoordinate", name) ~ "westBoundCoord",
      #wktString?
      grepl("beginDate|singleDateTime", name) ~ "startDate",
      grepl("endDate", name) ~ "endDate"
    )) %>%
    stats::na.omit() %>%
    # collapse repeated fields (e.g. several keywords) into one value
    dplyr::group_by(name) %>%
    dplyr::summarize(value = paste(value, collapse = "; ")) %>%
    tidyr::spread(name, value)

  # reorder, keeping only the biblio columns that are actually present
  fields <- c("title", "description", "datePublished", "citation",
              "keywords", "license", "funder", "geographicDescription",
              "northBoundCoord", "eastBoundCoord", "southBoundCoord",
              "westBoundCoord", "wktString", "startDate", "endDate")

  out <- biblio_eml[, fields[fields %in% colnames(biblio_eml)]]

  if (!is.null(path)) {
    if (!dir.exists(path)) {
      # recursive so that a nested output folder can be created in one call
      dir.create(path, recursive = TRUE)
    }
    readr::write_csv(out, file.path(path, "biblio.csv"))
  }

  out
}
|
||
#' Get creators from EML
#'
#' Return EML creators in the dataspice creators.csv format.
#'
#' @param eml (emld) an EML object
#' @param path (character) folder path for saving the table to disk. The
#'   folder is created (including parent folders) if it does not exist.
#'
#' @return A data frame of distinct people with whichever of the columns
#'   `id`, `givenName`, `familyName`, `affiliation` and `email` could be
#'   extracted, in that order.
#'
#' @importFrom purrr discard
#' @importFrom tibble enframe
#' @importFrom tidyr spread
#'
#' @export
#'
#' @examples
#' \dontrun{
#' eml_path <- system.file("example-dataset/broodTable_metadata.xml", package = "dataspice")
#' eml <- read_eml(eml_path)
#' es_creators(eml)
#' }
es_creators <- function(eml, path = NULL) {
  people <- get_entities(eml,
                         entities = c("creator", "contact", "associatedParty", "metadataProvider"),
                         level_id = c("individualName", "organizationName"))

  # Keep only the unnamed elements of the flattened list (presumably the
  # named ones are stray fields rather than whole person records -- TODO
  # confirm).
  if (!is.null(names(people))) {
    people <- people[names(people) == ""]
  }

  parse_person <- function(x) {
    x %>%
      unlist() %>%
      tibble::enframe() %>%
      dplyr::mutate(name = dplyr::case_when(
        grepl("userId.userId", name) ~ "id",
        grepl("givenName", name) ~ "givenName",
        grepl("surName", name) ~ "familyName",
        grepl("organizationName", name) ~ "affiliation",
        grepl("electronicMailAddress", name) ~ "email"
      )) %>%
      stats::na.omit() %>%
      # merge fields together if duplicated (ex: givenName1 & givenName2)
      dplyr::group_by(name) %>%
      dplyr::summarize(value = paste(value, collapse = " ")) %>%
      tidyr::spread(name, value)
  }

  people_parsed <- lapply(people, parse_person)

  out <- dplyr::bind_rows(people_parsed) %>%
    dplyr::distinct()

  # reorder, keeping only the creator columns that are actually present
  fields <- c("id", "givenName", "familyName", "affiliation", "email")
  out <- out[, fields[fields %in% colnames(out)]]

  if (!is.null(path)) {
    if (!dir.exists(path)) {
      # recursive so that a nested output folder can be created in one call
      dir.create(path, recursive = TRUE)
    }
    readr::write_csv(out, file.path(path, "creators.csv"))
  }

  out
}
|
||
#' Get dataspice tabular formats from EML
#'
#' Convert an EML object into the four dataspice tables, optionally writing
#' each of them to disk.
#'
#' @param eml (emld) an EML object
#' @param path (character) folder path for saving the tables to disk
#'
#' @return (invisibly) a named list with the elements `attributes`, `access`,
#'   `biblio` and `creators`.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' eml_path <- system.file("example-dataset/broodTable_metadata.xml", package = "dataspice")
#' eml <- read_eml(eml_path)
#' my_spice <- eml_to_spice(eml, ".")
#' }
eml_to_spice <- function(eml, path = NULL) {
  # build each table in turn; every helper also writes its csv when `path`
  # is given
  attributes_tbl <- es_attributes(eml, path)
  access_tbl <- es_access(eml, path)
  biblio_tbl <- es_biblio(eml, path)
  creators_tbl <- es_creators(eml, path)

  spice <- list(attributes = attributes_tbl,
                access = access_tbl,
                biblio = biblio_tbl,
                creators = creators_tbl)

  invisible(spice)
}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
#' Prepare access
#'
#' Find the csv and tsv files in the data folder and add an entry to the
#' access.csv file for each one that is not listed there yet.
#'
#' @param data_path path to the data folder. Defaults to "data".
#' @param access_path path to the access.csv file. Defaults to
#'   "data/metadata/access.csv".
#'
#' @return the function writes out the updated access.csv file to access_path.
#' @export
prep_access <- function(data_path = here::here("data"),
                        access_path = here::here("data", "metadata",
                                                 "access.csv")) {
  if (!dir.exists(data_path)) {
    stop("invalid path to data folder")
  }
  if (!file.exists(access_path)) {
    stop("access file does not exist. Check path or run create_spice?")
  }

  # Read every column as text so that empty columns are not guessed as
  # logical, which could clash with the character values added below.
  access <- readr::read_csv(
    access_path,
    col_types = readr::cols(.default = readr::col_character())
  )

  # read file info
  fileNames <- tools::list_files_with_exts(data_path,
                                           exts = c("csv", "tsv"),
                                           full.names = TRUE)
  if (length(fileNames) == 0) {
    stop("no csv or tsv files found in data folder")
  }

  # only files without an existing entry are added, so re-running the
  # function does not duplicate rows
  newFiles <- fileNames[!basename(fileNames) %in% access$fileName]
  if (length(newFiles) == 0) {
    stop("Entries already exist in access.csv for fileNames: ",
         paste(basename(fileNames), collapse = ", "))
  }

  access <- tibble::add_row(access,
                            fileName = basename(newFiles),
                            name = basename(newFiles),
                            contentUrl = NA_character_,
                            # file_ext() is vectorised over the paths
                            fileFormat = tools::file_ext(newFiles))

  # positional file argument works across readr versions (`path` was renamed)
  readr::write_csv(access, access_path)
  message("The following fileNames have been added to the access file: ",
          paste(basename(newFiles), collapse = ", "))
}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.