Skip to content

Commit

Permalink
Commit to merge shiny_filepath
Browse files Browse the repository at this point in the history
  • Loading branch information
annakrystalli committed Sep 8, 2018
1 parent af90ee7 commit 3f03911
Show file tree
Hide file tree
Showing 14 changed files with 723 additions and 313 deletions.
11 changes: 11 additions & 0 deletions .travis.yml
Expand Up @@ -6,3 +6,14 @@ r:
- oldrel
- release
- devel

# needed for eml2:
addons:
apt:
sources:
- sourceline: 'ppa:opencpu/jq'
packages:
- librdf0-dev
- libv8-dev
- libjq-dev
- libudunits2-dev
4 changes: 3 additions & 1 deletion R/dataspice.R
@@ -1 +1,3 @@
utils::globalVariables("variableName")
# Register names that are used as bare column names inside dplyr/tidyr calls
# (e.g. `name` and `value` in the EML conversion helpers). The trailing
# comments name the code each group presumably belongs to.
utils::globalVariables(c("variableName",
"value", "name", #eml_to_spice
"long", "lat", "region")) #edit_biblio
294 changes: 294 additions & 0 deletions R/eml_to_spice.R
@@ -0,0 +1,294 @@
# Collect EML elements of the requested kinds into one flat list.
#
# eml:      an EML object; only element kinds that appear in
#           names(eml$dataset) are looked up.
# entities: character vector of element names to fetch with eml2::eml_get().
# level_id: character vector of field names. A fetched element that carries at
#           least one of these fields is treated as a single record and is
#           wrapped in a list, so that single and repeated elements end up at
#           the same nesting level before flattening.
#
# Returns the fetched elements with one level of nesting removed.
get_entities <- function(eml,
                         entities = c("dataTable", "spatialRaster",
                                      "spatialVector", "storedProcedure",
                                      "view", "otherEntity"),
                         level_id = "entityName") {
  entities <- entities[entities %in% names(eml$dataset)]

  # Plain closure instead of a formula assembled as text and passed through
  # eval(parse()); fields are looked up by exact name.
  is_single_record <- function(x) {
    any(vapply(level_id, function(id) !is.null(x[[id]]), logical(1)))
  }

  purrr::map(entities, function(entity) eml2::eml_get(eml, entity)) %>%
    # box single records so that all entities sit at the same level
    purrr::map_if(is_single_record, list) %>%
    unlist(recursive = FALSE)
}

# Reshape one EML entity into a one-row table of dataspice access fields.
#
# x: a single entity (nested list). It is flattened with unlist() and the
#    flattened names are matched against the patterns below.
#
# Returns a tibble with whichever of the columns fileName, name, contentUrl
# and fileFormat were found in the entity.
get_access_spice <- function(x) {
  x %>%
    unlist() %>%
    tibble::enframe() %>%
    dplyr::mutate(name = dplyr::case_when(
      grepl("objectName", name) ~ "fileName",
      grepl("entityName", name) ~ "name",
      grepl("url", name) ~ "contentUrl",
      grepl("formatName", name) ~ "fileFormat"
    )) %>%
    # rows whose name matched none of the patterns are NA and dropped here
    stats::na.omit() %>%
    # "download" is often also included under a url element; it is not a link.
    # Namespace-qualified so this never falls back to stats::filter().
    dplyr::filter(value != "download") %>%
    tidyr::spread(name, value)
}

#' Get access from EML
#'
#' Return EML access in the dataspice access.csv format.
#'
#' @param eml (emld) an EML object
#' @param path (character) folder path for saving the table to disk. The
#'   folder is created (including missing parent folders) if it does not
#'   exist. If `NULL` (the default) nothing is written.
#'
#' @return A tibble holding whichever of the columns `fileName`, `name`,
#'   `contentUrl` and `fileFormat` could be extracted from the entities.
#'
#' @export
#' @import eml2
#'
#' @examples
#' \dontrun{
#' eml_path <- system.file("example-dataset/broodTable_metadata.xml", package = "dataspice")
#' eml <- read_eml(eml_path)
#' es_access(eml)
#' }
es_access <- function(eml, path = NULL) {
  entities <- get_entities(eml)
  access_entities <- lapply(entities, get_access_spice)

  out <- dplyr::bind_rows(access_entities)

  # keep the dataspice columns that are present, in a fixed order
  fields <- c("fileName", "name", "contentUrl", "fileFormat")
  out <- out[, fields[fields %in% colnames(out)]]

  if (!is.null(path)) {
    if (!dir.exists(path)) {
      # recursive = TRUE so a nested, not-yet-existing folder can be created
      dir.create(path, recursive = TRUE)
    }
    readr::write_csv(out, file.path(path, "access.csv"))
  }

  out
}

# Reformat the attributes of one entity into the tabular layout specified by
# dataspice.
#
# x: a single entity, e.g. a dataTable or otherEntity
#
# Returns a tibble with the columns fileName, variableName, description and
# unitText; when no attribute list is found it is a single row of NA
# placeholders next to the file name.
get_attributes_spice <- function(x) {
  file_name <- eml2::eml_get(x, "objectName")
  file_name <- ifelse(length(file_name) == 2, file_name[[1]], NA)

  attribute_list <- eml2::eml_get(x, "attributeList")

  # guard clause: nothing to tabulate for this entity
  if (length(attribute_list) <= 1) {
    return(dplyr::tibble(fileName = file_name,
                         variableName = NA,
                         description = NA,
                         unitText = NA))
  }

  attribute_table <- eml2::get_attributes(attribute_list)$attributes

  if (is.null(attribute_table$unit)) {
    attribute_table$unit <- NA
  }

  # use the datetime format as unitText where no unit is available
  format_string <- attribute_table$formatString
  if (!is.null(format_string)) {
    unit_missing <- is.na(attribute_table$unit)
    attribute_table$unit[unit_missing] <- format_string[unit_missing]
  }

  # missing value info in text form, e.g. "code = explanation; ..."
  codes <- dplyr::tibble(
    missingValueCode = c(attribute_table$missingValueCode, "NA"),
    missingValueCodeExplanation = c(attribute_table$missingValueCodeExplanation,
                                    "NA")
  )
  codes <- stats::na.omit(dplyr::distinct(codes))

  codes_text <- paste(codes$missingValueCode,
                      codes$missingValueCodeExplanation,
                      sep = " = ",
                      collapse = "; ")

  dplyr::tibble(fileName = file_name,
                variableName = attribute_table$attributeName,
                description = paste0(attribute_table$attributeDefinition,
                                     "; missing values: ", codes_text),
                unitText = attribute_table$unit)
}

#' Get attributes from EML
#'
#' Return EML attributes in the dataspice attributes.csv format.
#'
#' @param eml (emld) an EML object
#' @param path (character) folder path for saving the table to disk. The
#'   folder is created (including missing parent folders) if it does not
#'   exist. If `NULL` (the default) nothing is written.
#'
#' @return A tibble of the attributes found across all entities; rows without
#'   a `variableName` are dropped.
#'
#' @export
#'
#' @import dplyr
#' @importFrom readr write_csv
#'
#' @examples
#' \dontrun{
#' eml_path <- system.file("example-dataset/broodTable_metadata.xml", package = "dataspice")
#' eml <- read_eml(eml_path)
#' es_attributes(eml)
#' }
es_attributes <- function(eml, path = NULL) {
  entities <- get_entities(eml)
  attrTables <- lapply(entities, get_attributes_spice)

  out <- dplyr::bind_rows(attrTables)

  # With no entities the bound table has no columns at all, and filtering on
  # variableName would fail; only drop placeholder rows when the column exists.
  if ("variableName" %in% colnames(out)) {
    out <- dplyr::filter(out, !is.na(variableName))
  }

  if (!is.null(path)) {
    if (!dir.exists(path)) {
      # recursive = TRUE so a nested, not-yet-existing folder can be created
      dir.create(path, recursive = TRUE)
    }
    readr::write_csv(out, file.path(path, "attributes.csv"))
  }

  out
}

#' Get biblio from EML
#'
#' Return EML biblio in the dataspice biblio.csv format.
#'
#' @param eml (emld) an EML object
#' @param path (character) folder path for saving the table to disk. The
#'   folder is created (including missing parent folders) if it does not
#'   exist. If `NULL` (the default) nothing is written.
#'
#' @return A tibble with the biblio fields that could be extracted; values
#'   that map to the same field are joined with `"; "`.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' eml_path <- system.file("example-dataset/broodTable_metadata.xml", package = "dataspice")
#' eml <- read_eml(eml_path)
#' es_biblio(eml)
#' }
es_biblio <- function(eml, path = NULL) {
  biblio_eml <- eml %>%
    unlist() %>%
    tibble::enframe() %>%
    # map flattened EML element names onto dataspice field names; the first
    # matching pattern wins and unmatched names become NA
    dplyr::mutate(name = dplyr::case_when(
      grepl("dataset.title", name) ~ "title",
      grepl("abstract", name) ~ "description",
      grepl("pubDate", name) ~ "datePublished",
      grepl("packageId", name) ~ "identifier",
      grepl("keyword", name) ~ "keywords",
      grepl("intellectual", name) ~ "license",
      grepl("fund", name) ~ "funder",
      grepl("geographicDescription", name) ~ "geographicDescription",
      grepl("northBoundingCoordinate", name) ~ "northBoundCoord",
      grepl("eastBoundingCoordinate", name) ~ "eastBoundCoord",
      grepl("southBoundingCoordinate", name) ~ "southBoundCoord",
      grepl("westBoundingCoordinate", name) ~ "westBoundCoord",
      #wktString?
      grepl("beginDate|singleDateTime", name) ~ "startDate",
      grepl("endDate", name) ~ "endDate"
    )) %>%
    stats::na.omit() %>%
    # collapse repeated fields (e.g. several keywords) into one cell
    dplyr::group_by(name) %>%
    dplyr::summarize(value = paste(value, collapse = "; ")) %>%
    tidyr::spread(name, value)

  # keep the listed columns that are present, in this order
  fields <- c("title", "description", "datePublished", "citation",
              "keywords", "license", "funder", "geographicDescription",
              "northBoundCoord", "eastBoundCoord", "southBoundCoord",
              "westBoundCoord", "wktString", "startDate", "endDate")

  out <- biblio_eml[, fields[fields %in% colnames(biblio_eml)]]

  if (!is.null(path)) {
    if (!dir.exists(path)) {
      # recursive = TRUE so a nested, not-yet-existing folder can be created
      dir.create(path, recursive = TRUE)
    }
    readr::write_csv(out, file.path(path, "biblio.csv"))
  }

  out
}

#' Get creators from EML
#'
#' Return EML creators in the dataspice creators.csv format.
#'
#' @param eml (emld) an EML object
#' @param path (character) folder path for saving the table to disk. The
#'   folder is created (including missing parent folders) if it does not
#'   exist. If `NULL` (the default) nothing is written.
#'
#' @return A tibble of distinct people with whichever of the columns `id`,
#'   `givenName`, `familyName`, `affiliation` and `email` could be extracted.
#'
#' @importFrom purrr discard
#' @importFrom tibble enframe
#' @importFrom tidyr spread
#'
#' @export
#'
#' @examples
#' \dontrun{
#' eml_path <- system.file("example-dataset/broodTable_metadata.xml", package = "dataspice")
#' eml <- read_eml(eml_path)
#' es_creators(eml)
#' }
es_creators <- function(eml, path = NULL) {
  people <- get_entities(eml,
                         entities = c("creator", "contact", "associatedParty",
                                      "metadataProvider"),
                         level_id = c("individualName", "organizationName"))
  # when the flattened list carries names, keep only the unnamed elements
  if (!is.null(names(people))) {
    people <- people[names(people) == ""]
  }

  people_parsed <- lapply(people, function(x) {
    x %>%
      unlist() %>%
      tibble::enframe() %>%
      dplyr::mutate(name = dplyr::case_when(
        grepl("userId.userId", name) ~ "id",
        grepl("givenName", name) ~ "givenName",
        grepl("surName", name) ~ "familyName",
        grepl("organizationName", name) ~ "affiliation",
        grepl("electronicMailAddress", name) ~ "email"
      )) %>%
      stats::na.omit() %>%
      # merge fields together if duplicated (ex: givenName1 & givenName2)
      dplyr::group_by(name) %>%
      dplyr::summarize(value = paste(value, collapse = " ")) %>%
      tidyr::spread(name, value)
  })

  out <- dplyr::bind_rows(people_parsed) %>%
    dplyr::distinct()

  # keep the dataspice columns that are present, in a fixed order
  fields <- c("id", "givenName", "familyName", "affiliation", "email")
  out <- out[, fields[fields %in% colnames(out)]]

  if (!is.null(path)) {
    if (!dir.exists(path)) {
      # recursive = TRUE so a nested, not-yet-existing folder can be created
      dir.create(path, recursive = TRUE)
    }
    readr::write_csv(out, file.path(path, "creators.csv"))
  }

  out
}

#' Get dataspice tabular formats from EML
#'
#' Convert an EML object into the four dataspice tables in one call.
#'
#' @param eml (emld) an EML object
#' @param path (character) folder path for saving the tables to disk; passed
#'   on unchanged to each of the individual converters
#'
#' @return Invisibly, a named list with the elements `attributes`, `access`,
#'   `biblio` and `creators`.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' eml_path <- system.file("example-dataset/broodTable_metadata.xml", package = "dataspice")
#' eml <- read_eml(eml_path)
#' my_spice <- eml_to_spice(eml, ".")
#' }
eml_to_spice <- function(eml, path = NULL) {
  # the converters run in this order; the list names become the result names
  converters <- list(attributes = es_attributes,
                     access = es_access,
                     biblio = es_biblio,
                     creators = es_creators)

  spice <- lapply(converters, function(convert) convert(eml, path))

  invisible(spice)
}

42 changes: 42 additions & 0 deletions R/prep_access.R
@@ -0,0 +1,42 @@
#' Prepare access
#'
#' List the csv and tsv files in the data folder and add a row to access.csv
#' for every file that is not listed there yet.
#'
#' @param data_path path to the data folder. Defaults to "data". Only files
#'   with the extensions csv and tsv are picked up.
#' @param access_path path to the access.csv file. Defaults to "data/metadata/access.csv".
#'
#' @return the functions writes out the updated access.csv file to access_path.
#' @export
prep_access <- function(data_path = here::here("data"),
                        access_path = here::here("data", "metadata",
                                                 "access.csv")
) {

  if (!dir.exists(data_path)) {
    stop("invalid path to data folder")
  }
  if (!file.exists(access_path)) {
    stop("access file does not exist. Check path or run create_spice?")
  }

  access <- readr::read_csv(access_path)

  # read file info
  fileNames <- tools::list_files_with_exts(data_path,
                                           exts = c("csv", "tsv"),
                                           full.names = TRUE)

  # all() of an empty vector is TRUE, so an empty folder would otherwise be
  # reported as "entries already exist"
  if (length(fileNames) == 0) {
    stop("no csv or tsv files found in data folder: ", data_path)
  }

  is_new <- !basename(fileNames) %in% access$fileName
  if (!any(is_new)) {
    stop("Entries already exist in access.csv for fileNames: ",
         paste(basename(fileNames), collapse = ", "))
  }

  # only add files that are not listed yet, so reruns do not duplicate rows
  newFiles <- basename(fileNames[is_new])

  access <- tibble::add_row(access,
                            fileName = newFiles,
                            name = newFiles,
                            contentUrl = NA,
                            # file_ext() is vectorised; no vapply() needed
                            fileFormat = tools::file_ext(newFiles))

  # output location passed by position: the argument was renamed from `path`
  # to `file` in newer readr releases
  readr::write_csv(access, access_path)
  message("The following fileNames have been added to the access file: ",
          paste(newFiles, collapse = ", "))
}

11 changes: 6 additions & 5 deletions README.Rmd
Expand Up @@ -115,27 +115,28 @@ Completed metadata tables in this example will look like this:
`access.csv` has one row for each file

```{r, echo=FALSE, message=FALSE}
readr::read_csv(system.file("metadata-tables/access.csv", package = "dataspice")) %>% head() %>% kable()
readr::read_csv(system.file("metadata-tables/access.csv", package = "dataspice")) %>% head() %>% kable(format = "markdown")
```

`attributes.csv` has one row for each variable in each file

```{r, echo=FALSE, message=FALSE}
readr::read_csv(system.file("metadata-tables/attributes.csv", package = "dataspice")) %>% head() %>% kable()
readr::read_csv(system.file("metadata-tables/attributes.csv", package = "dataspice")) %>% head() %>% kable(format = "markdown")
```

`biblio.csv` is one row containing descriptors including spatial and temporal coverage

```{r, echo=FALSE, message=FALSE}
```{r, echo=FALSE, message=FALSE, warning=FALSE}
readr::read_csv(system.file("metadata-tables/biblio.csv", package = "dataspice")) %>%
dplyr::mutate(description = str_trunc(description, 200, side = "right")) %>%
kable()
kable(format = "markdown")
```

`creators.csv` has one row for each of the dataset authors

```{r, echo=FALSE, message=FALSE}
readr::read_csv(system.file("metadata-tables/creators.csv", package = "dataspice")) %>% kable()
readr::read_csv(system.file("metadata-tables/creators.csv", package = "dataspice")) %>%
kable(format = "markdown")
```


Expand Down

0 comments on commit 3f03911

Please sign in to comment.