Commit to merge shiny_filepath

ropensci · Sep 8, 2018 · 3f03911 · 3f03911
1 parent af90ee7
commit 3f03911
Show file tree

Hide file tree

Showing 14 changed files with 723 additions and 313 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -6,3 +6,14 @@ r:
   - oldrel
   - release
   - devel
+
+# needed for eml2:
+addons:
+  apt:
+    sources:
+      - sourceline: 'ppa:opencpu/jq'
+    packages:
+      - librdf0-dev
+      - libv8-dev
+      - libjq-dev
+      - libudunits2-dev
diff --git a/R/dataspice.R b/R/dataspice.R
@@ -1 +1,3 @@
-utils::globalVariables("variableName")
+# Register names that the package code references unquoted (for example `name`
+# and `value` inside the eml_to_spice helpers) with utils::globalVariables().
+# The trailing comments name the source file each group of names belongs to.
+utils::globalVariables(c("variableName",
+                         "value", "name", #eml_to_spice
+                         "long", "lat", "region")) #edit_biblio
diff --git a/R/eml_to_spice.R b/R/eml_to_spice.R
@@ -0,0 +1,294 @@
+# Collect EML elements of the requested types as one flat list.
+#
+# eml:      an EML (emld) object; element types are looked up in
+#           `names(eml$dataset)`.
+# entities: names of the element types to collect; types that are not present
+#           in `names(eml$dataset)` are skipped.
+# level_id: field name(s) that mark a single element. A result of
+#           `eml2::eml_get()` carrying at least one of these fields is wrapped
+#           in a list, so that single and repeated elements sit at the same
+#           depth once one level is flattened.
+#
+# Returns the per-type results flattened by one level.
+get_entities <- function(eml,
+                         entities = c("dataTable", "spatialRaster", "spatialVector", "storedProcedure", "view", "otherEntity"),
+                         level_id = "entityName"){
+    entities <- entities[entities %in% names(eml$dataset)]
+
+    # TRUE when `x` has at least one of the `level_id` fields, i.e. it is a
+    # single element that still needs to be listed ("boxed"). A plain predicate
+    # function is used instead of building code with eval(parse(text = ...)).
+    is_single <- function(x) {
+        any(vapply(level_id, function(id) !is.null(x[[id]]), logical(1)))
+    }
+
+    purrr::map(entities, ~eml2::eml_get(eml, .x)) %>%
+        # restructure so that all entities are at the same level
+        purrr::map_if(is_single, list) %>%
+        unlist(recursive = FALSE)
+}
+
+# Reformat one EML entity into the dataspice access fields.
+#
+# x: a (nested) list describing one entity. It is flattened with unlist() and
+#    the flattened element names are matched against the patterns below.
+#
+# Returns the wide tibble produced by tidyr::spread(), with one column for each
+# matched field among fileName, name, contentUrl and fileFormat.
+get_access_spice <- function(x){
+    x %>%
+        unlist() %>%
+        tibble::enframe() %>%
+        # rename matched entries; unmatched ones become NA and are dropped below
+        dplyr::mutate(name = dplyr::case_when(
+            grepl("objectName", name) ~ "fileName",
+            grepl("entityName", name) ~ "name",
+            grepl("url", name) ~ "contentUrl",
+            grepl("formatName", name) ~ "fileFormat"
+        )) %>%
+        stats::na.omit() %>%
+        # namespace-qualified so the call cannot resolve to stats::filter()
+        dplyr::filter(value != "download") %>% #often also included as url
+        tidyr::spread(name, value)
+}
+
+#' Get access from EML
+#'
+#' Return EML access in the dataspice access.csv format.
+#'
+#' @param eml (emld) an EML object
+#' @param path (character) folder path for saving the table to disk
+#'
+#' @return A table with one row per entity and the columns `fileName`, `name`,
+#'   `contentUrl` and `fileFormat` (only those found in `eml`). If `path` is
+#'   given, the table is also written to `access.csv` in that folder.
+#'
+#' @export
+#' @import eml2
+#'
+#' @examples
+#' \dontrun{
+#' eml_path <- system.file("example-dataset/broodTable_metadata.xml", package = "dataspice")
+#' eml <- read_eml(eml_path)
+#' es_access(eml)
+#' }
+es_access <- function(eml, path = NULL) {
+    entities <- get_entities(eml)
+    access_entities <- lapply(entities, get_access_spice)
+
+    out <- dplyr::bind_rows(access_entities)
+
+    #reorder, keeping only the fields that are present
+    fields <- c("fileName", "name", "contentUrl", "fileFormat")
+    out <- out[, fields[fields %in% colnames(out)]]
+
+    if(!is.null(path)){
+        if(!dir.exists(path)){
+            # recursive so that a nested output folder can be created in one go
+            dir.create(path, recursive = TRUE)
+        }
+        readr::write_csv(out, file.path(path, "access.csv"))
+    }
+
+    return(out)
+}
+
+# Build the dataspice attributes table for a single entity.
+#
+# x: one entity (per the note in the body: a dataTable or otherEntity).
+#
+# Returns a tibble with the columns fileName, variableName, description and
+# unitText. When no attribute list is found it is a single placeholder row with
+# NA in every column except fileName; otherwise the columns are filled from the
+# result of eml2::get_attributes().
+get_attributes_spice <- function(x) {
+  #reformat attributes to tabular format specified in dataspice
+  #input a dataTable or otherEntity
+
+  objName <- eml2::eml_get(x, "objectName")
+  # the first element is used as file name only when the lookup has length 2
+  # (presumably the value plus one extra entry -- confirm against eml_get());
+  # any other length yields NA
+  objName <- ifelse(length(objName) == 2, objName[[1]], NA)
+
+  attrList <- eml2::eml_get(x, "attributeList")
+
+  # a lookup of length 0 or 1 is treated as "no attributes"
+  if(length(attrList) <= 1){
+    out <- dplyr::tibble(fileName = objName,
+                         variableName = NA,
+                         description = NA,
+                         unitText = NA)
+  } else {
+    attr <- eml2::get_attributes(attrList)
+
+    # make sure a unit column exists before it is indexed below
+    if(is.null(attr$attributes$unit)){
+      attr$attributes$unit <- NA
+    }
+
+    #set datetime format as unitText if available
+    if(!is.null(attr$attributes$formatString)){
+      # only fill units that are still missing
+      na_units <- is.na(attr$attributes$unit)
+      attr$attributes$unit[na_units] <- attr$attributes$formatString[na_units]
+    }
+
+    #get missing value info in text form:
+    # a literal "NA" code/explanation pair is always appended; duplicate pairs
+    # and pairs containing a real NA are then removed
+    missing_val <- dplyr::tibble(missingValueCode = c(attr$attributes$missingValueCode, "NA"),
+                                 missingValueCodeExplanation = c(attr$attributes$missingValueCodeExplanation, "NA")) %>%
+      dplyr::distinct() %>%
+      stats::na.omit()
+
+    # "code = explanation" pairs joined with "; "
+    missing_val_text <- paste(missing_val$missingValueCode,
+                              missing_val$missingValueCodeExplanation,
+                              sep = " = ",
+                              collapse = "; ")
+
+    # the same missing-value text is appended to every attribute definition
+    out <- dplyr::tibble(fileName = objName,
+                         variableName = attr$attributes$attributeName,
+                         description = paste0(attr$attributes$attributeDefinition,
+                                              "; missing values: ", missing_val_text),
+                         unitText = attr$attributes$unit)
+  }
+
+  return(out)
+}
+
+#' Get attributes from EML
+#'
+#' Return EML attributes in the dataspice attributes.csv format.
+#'
+#' @param eml (emld) an EML object
+#' @param path (character) folder path for saving the table to disk
+#'
+#' @return A table with the columns `fileName`, `variableName`, `description`
+#'   and `unitText`, holding the rows whose `variableName` is not missing. If
+#'   `path` is given, the table is also written to `attributes.csv` in that
+#'   folder.
+#'
+#' @export
+#'
+#' @import dplyr
+#' @importFrom readr write_csv
+#'
+#' @examples
+#' \dontrun{
+#' eml_path <- system.file("example-dataset/broodTable_metadata.xml", package = "dataspice")
+#' eml <- read_eml(eml_path)
+#' es_attributes(eml)
+#' }
+es_attributes <- function(eml, path = NULL) {
+  entities <- get_entities(eml)
+  attrTables <- lapply(entities, get_attributes_spice)
+
+  # entities without attributes contribute a placeholder row with a missing
+  # variableName; drop those rows here
+  out <- dplyr::bind_rows(attrTables) %>%
+    dplyr::filter(!is.na(variableName))
+
+  if(!is.null(path)){
+    if(!dir.exists(path)){
+      # recursive so that a nested output folder can be created in one go
+      dir.create(path, recursive = TRUE)
+    }
+    readr::write_csv(out, file.path(path, "attributes.csv"))
+  }
+
+  return(out)
+}
+
+#' Get biblio from EML
+#'
+#' Return EML biblio in the dataspice biblio.csv format.
+#'
+#' @param eml (emld) an EML object
+#' @param path (character) folder path for saving the table to disk
+#'
+#' @return A one-row table with the biblio fields found in `eml`; values that
+#'   map to the same field are joined with `"; "`. If `path` is given, the
+#'   table is also written to `biblio.csv` in that folder.
+#'
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' eml_path <- system.file("example-dataset/broodTable_metadata.xml", package = "dataspice")
+#' eml <- read_eml(eml_path)
+#' es_biblio(eml)
+#' }
+es_biblio <- function(eml, path = NULL) {
+  # flatten the whole document and rename the entries whose flattened name
+  # matches one of the patterns; everything else becomes NA and is dropped
+  biblio_eml <- eml %>%
+    unlist() %>%
+    tibble::enframe() %>%
+    dplyr::mutate(name = dplyr::case_when(
+      grepl("dataset.title", name) ~ "title",
+      grepl("abstract", name) ~ "description",
+      grepl("pubDate", name) ~ "datePublished",
+      grepl("packageId", name) ~ "identifier",
+      grepl("keyword", name) ~ "keywords",
+      grepl("intellectual", name) ~ "license",
+      grepl("fund", name) ~ "funder",
+      grepl("geographicDescription", name) ~ "geographicDescription",
+      grepl("northBoundingCoordinate", name) ~ "northBoundCoord",
+      grepl("eastBoundingCoordinate", name) ~ "eastBoundCoord",
+      grepl("southBoundingCoordinate", name) ~ "southBoundCoord",
+      grepl("westBoundingCoordinate", name) ~ "westBoundCoord",
+      #wktString?
+      grepl("beginDate|singleDateTime", name) ~ "startDate",
+      grepl("endDate", name) ~ "endDate"
+    )) %>%
+    stats::na.omit() %>%
+    # several values for the same field are collapsed into one string
+    dplyr::group_by(name) %>%
+    dplyr::summarize(value = paste(value, collapse = "; ")) %>%
+    tidyr::spread(name, value)
+
+  #reorder
+  # note: "identifier" is mapped above but is not listed here, so it is not
+  # part of the returned table
+  fields <- c("title", "description", "datePublished", "citation", "keywords", "license", "funder", "geographicDescription", "northBoundCoord", "eastBoundCoord", "southBoundCoord", "westBoundCoord", "wktString", "startDate", "endDate")
+
+
+  out <- biblio_eml[, fields[fields %in% colnames(biblio_eml)]]
+
+  if(!is.null(path)){
+    if(!dir.exists(path)){
+      # recursive so that a nested output folder can be created in one go
+      dir.create(path, recursive = TRUE)
+    }
+    readr::write_csv(out, file.path(path, "biblio.csv"))
+  }
+
+  return(out)
+}
+
+#' Get creators from EML
+#'
+#' Return EML creators in the dataspice creators.csv format.
+#'
+#' @param eml (emld) an EML object
+#' @param path (character) folder path for saving the table to disk
+#'
+#' @return A table with one row per distinct person or organization and the
+#'   columns `id`, `givenName`, `familyName`, `affiliation` and `email` (only
+#'   those found in `eml`). If `path` is given, the table is also written to
+#'   `creators.csv` in that folder.
+#'
+#' @importFrom purrr discard
+#' @importFrom tibble enframe
+#' @importFrom tidyr spread
+#'
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' eml_path <- system.file("example-dataset/broodTable_metadata.xml", package = "dataspice")
+#' eml <- read_eml(eml_path)
+#' es_creators(eml)
+#' }
+es_creators <- function(eml, path = NULL) {
+  people <- get_entities(eml,
+                         entities = c("creator", "contact", "associatedParty", "metadataProvider"),
+                         level_id = c("individualName", "organizationName"))
+  # when the flattened list carries names, keep only the unnamed elements
+  # (presumably the named ones are not person records -- confirm)
+  if(!is.null(names(people))){
+    people <- people[names(people) == ""]
+  }
+
+  people_parsed <- lapply(people, function(x){x %>%
+      unlist() %>%
+      tibble::enframe() %>%
+      dplyr::mutate(name = dplyr::case_when(
+        grepl("userId.userId", name) ~ "id",
+        grepl("givenName", name) ~ "givenName",
+        grepl("surName", name) ~ "familyName",
+        grepl("organizationName", name) ~ "affiliation",
+        grepl("electronicMailAddress", name) ~ "email"
+      )) %>%
+      stats::na.omit() %>%
+      # merge fields together if duplicated (ex: givenName1 & givenName2)
+      dplyr::group_by(name) %>%
+      dplyr::summarize(value = paste(value, collapse = " ")) %>%
+      tidyr::spread(name, value)
+  })
+
+  # the same person can appear under several roles; keep each one once
+  out <- dplyr::bind_rows(people_parsed) %>%
+    dplyr::distinct()
+
+  fields <- c("id", "givenName", "familyName", "affiliation", "email")
+  out <- out[, fields[fields %in% colnames(out)]]
+
+  if(!is.null(path)){
+    if(!dir.exists(path)){
+      # recursive so that a nested output folder can be created in one go
+      dir.create(path, recursive = TRUE)
+    }
+    readr::write_csv(out, file.path(path, "creators.csv"))
+  }
+
+  return(out)
+}
+
+#' Convert EML to the dataspice tables
+#'
+#' Run all four EML converters (attributes, access, biblio, creators) on the
+#' same EML object and collect their results.
+#'
+#' @param eml (emld) an EML object
+#' @param path (character) folder path passed on to each converter for saving
+#'   its table to disk
+#'
+#' @return Invisibly, a list with the elements `attributes`, `access`,
+#'   `biblio` and `creators`.
+#'
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' eml_path <- system.file("example-dataset/broodTable_metadata.xml", package = "dataspice")
+#' eml <- read_eml(eml_path)
+#' my_spice <- eml_to_spice(eml, ".")
+#' }
+eml_to_spice <- function(eml, path = NULL) {
+  # converters run in this order: attributes, access, biblio, creators
+  spice_attributes <- es_attributes(eml, path)
+  spice_access <- es_access(eml, path)
+  spice_biblio <- es_biblio(eml, path)
+  spice_creators <- es_creators(eml, path)
+
+  spice_tables <- list(attributes = spice_attributes,
+                       access = spice_access,
+                       biblio = spice_biblio,
+                       creators = spice_creators)
+
+  invisible(spice_tables)
+}
+
diff --git a/R/prep_access.R b/R/prep_access.R
@@ -0,0 +1,42 @@
+#' Prepare access
+#'
+#' Find the csv and tsv files in the data folder and add one row per file that
+#' is not listed yet to the access.csv file.
+#' @param data_path path to the data folder. Defaults to "data".
+#' @param access_path path to the access.csv file. Defaults to "data/metadata/access.csv".
+#'
+#' @return the function writes out the updated access.csv file to access_path.
+#' @export
+prep_access <- function(data_path = here::here("data"),
+                        access_path = here::here("data", "metadata",
+                                                 "access.csv")
+                        ){
+
+  if(!file.exists(data_path)){stop("invalid path to data folder")}
+  if(!file.exists(access_path)){
+    stop("access file does not exist. Check path or run create_spice?")}
+
+  access <- readr::read_csv(access_path)
+
+  # read file info
+  fileNames <- tools::list_files_with_exts(data_path,
+                                           exts = c("csv", "tsv"),
+                                           full.names = TRUE)
+
+  if(all(basename(fileNames) %in% unique(access$fileName))){
+    stop("Entries already exist in access.csv for fileNames: ",
+         paste(basename(fileNames), collapse = ", "))
+  }
+
+  # only append files that are not listed yet, so files that are already in
+  # access.csv are not duplicated when just some of the files are new
+  newFiles <- fileNames[!basename(fileNames) %in% access$fileName]
+
+  # tools::file_ext() is vectorised, so it is called on the whole vector
+  # (the previous vapply() call had no FUN.VALUE and could not run)
+  access <- tibble::add_row(access,
+                            fileName = basename(newFiles),
+                            name = basename(newFiles),
+                            contentUrl = NA,
+                            fileFormat = tools::file_ext(newFiles))
+
+
+  # output file passed by position, matching the other write_csv() calls
+  readr::write_csv(access, access_path)
+  message("The following fileNames have been added to the access file: ",
+          paste(basename(newFiles), collapse = ", "))
+}
+
diff --git a/README.Rmd b/README.Rmd
@@ -115,27 +115,28 @@ Completed metadata tables in this example will look like this:
 `access.csv` has one row for each file
 
 ```{r, echo=FALSE, message=FALSE}
-readr::read_csv(system.file("metadata-tables/access.csv", package = "dataspice")) %>% head() %>% kable()
+readr::read_csv(system.file("metadata-tables/access.csv", package = "dataspice")) %>% head() %>% kable(format = "markdown")
 ```
 
 `attributes.csv` has one row for each variable in each file
 
 ```{r, echo=FALSE, message=FALSE}
-readr::read_csv(system.file("metadata-tables/attributes.csv", package = "dataspice")) %>% head() %>% kable()
+readr::read_csv(system.file("metadata-tables/attributes.csv", package = "dataspice")) %>% head() %>% kable(format = "markdown")
 ```
 
 `biblio.csv` is one row containing descriptors including spatial and temporal coverage
 
-```{r, echo=FALSE, message=FALSE}
+```{r, echo=FALSE, message=FALSE, warning=FALSE}
 readr::read_csv(system.file("metadata-tables/biblio.csv", package = "dataspice")) %>% 
   dplyr::mutate(description = str_trunc(description, 200, side = "right")) %>%
-  kable()
+  kable(format = "markdown")
 ```
 
 `creators.csv` has one row for each of the dataset authors
 
 ```{r, echo=FALSE, message=FALSE}
-readr::read_csv(system.file("metadata-tables/creators.csv", package = "dataspice")) %>% kable() 
+readr::read_csv(system.file("metadata-tables/creators.csv", package = "dataspice")) %>% 
+  kable(format = "markdown") 
 ```