Commit 19f5d97
reimplement from scratch based on new knowledge/spec
jeroen committed May 13, 2016
1 parent 2335f82
Showing 13 changed files with 290 additions and 226 deletions.
2 changes: 2 additions & 0 deletions .Rbuildignore
@@ -3,3 +3,5 @@
^appveyor\.yml$
^\.travis\.yml$
^data$
^tests/testsuite-py$
^datapackage.json$
8 changes: 5 additions & 3 deletions DESCRIPTION
@@ -1,4 +1,4 @@
Package: datapackage
Package: datapkg
Type: Package
Title: Read and Write Data Packages
Version: 0.1
@@ -7,11 +7,13 @@ Authors@R: c(
person("Karthik", "Ram", email = "karthik.ram@gmail.com", role = "aut"))
Description: Convenience functions for reading and writing datasets following
  the 'data package' format.
URL: http://dataprotocols.org/data-packages/
URL: http://frictionlessdata.io/data-packages/
License: MIT + file LICENSE
Imports:
methods,
readr,
jsonlite
jsonlite,
curl
Suggests:
ggplot2
RoxygenNote: 5.0.1
12 changes: 4 additions & 8 deletions NAMESPACE
@@ -1,10 +1,6 @@
# Generated by roxygen2: do not edit by hand

S3method("$",jeroen)
S3method("[",jeroen)
S3method("[[",jeroen)
S3method(print,jeroen)
export(data_package)
importFrom(readr,write_csv)
importFrom(readr,write_tsv)
importFrom(tools,md5sum)
S3method(print,datapkg_data)
S3method(print,datapkg_resources)
export(datapkg_read)
import(readr)
127 changes: 127 additions & 0 deletions R/datapkg_read.R
@@ -0,0 +1,127 @@
#' Open data-package
#'
#' Loads a data and meta-data from a 'data-package' directory or URL.
#'
#' @import readr
#' @param path file path or URL to the data package directory
#' @rdname datapackage
#' @name datapackage
#' @aliases datapkg
#' @references \url{http://frictionlessdata.io/data-packages}, \url{https://github.com/datasets}
#' @export
#' @examples # Example data from https://github.com/datasets
#' datapkg_read("https://raw.githubusercontent.com/datasets/ex-tabular-multiple-resources-fk/master")
#' datapkg_read("https://raw.githubusercontent.com/datasets/gini-index/master")
#' datapkg_read("https://raw.githubusercontent.com/datasets/euribor/master")
datapkg_read <- function(path){
root <- sub("datapackage.json$", "", path)
root <- sub("/$", "", root)
json_path <- file.path(root, "/datapackage.json")
json <- if(is_url(root)){
con <- curl::curl(json_path, "r")
on.exit(close(con))
readLines(con, warn = FALSE)
} else {
readLines(normalizePath(json_path, mustWork = TRUE), warn = FALSE)
}
pkg_info <- jsonlite::fromJSON(json, simplifyVector = TRUE)
if(is.data.frame(pkg_info$resources))
class(pkg_info$resources) <- c("datapkg_resources", class(pkg_info$resources))
if(is.data.frame(pkg_info$sources))
class(pkg_info$sources) <- c("datapkg_sources", class(pkg_info$sources))
  pkg_info$data <- vector("list", nrow(pkg_info$resources))
data_names <- pkg_info$resources$name
for(i in seq_len(nrow(pkg_info$resources))){
target <- as.list(pkg_info$resources[i, ])
pkg_info$data[[i]] <- read_data_package(get_data_path(target, root),
dialect = as.list(target$dialect), hash = target$hash, target$schema$fields[[1]])
}
class(pkg_info$data) <- c("datapkg_data")
if(length(data_names))
names(pkg_info$data) <- ifelse(is.na(data_names), "", data_names)
pkg_info
}
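
A quick usage sketch of the new reader (assuming the example URLs from the roxygen block above are still reachable):

# Read a published data package and inspect what came back (sketch):
gini <- datapkg_read("https://raw.githubusercontent.com/datasets/gini-index/master")
gini$name                # package metadata parsed from datapackage.json
gini$resources           # resource index, printed via print.datapkg_resources below
head(gini$data[[1]])     # first resource, parsed into a data frame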

get_data_path <- function(x, root){
if(length(x$path)){
data_path <- normalizePath(file.path(root, x$path), mustWork = FALSE)
if(is_url(data_path) || file.exists(data_path)){
return(data_path)
} else {
if(length(x$url)){
message("File not found: ", data_path)
return(x$url)
} else {
stop("File not found: ", data_path)
}
}
}
}
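
To illustrate the fallback above: a hypothetical resource listing both a local path and a url resolves to the url when the local file is absent (all values below are made up):

# The local file is missing, so this messages "File not found: ..."
# and returns the remote url instead of stopping:
get_data_path(list(path = "data/missing.csv",
                   url  = "https://example.com/data.csv"),
              root = "/tmp/pkg")
#> [1] "https://example.com/data.csv"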

is_url <- function(x){
grepl("^[a-zA-Z]+://", x)
}
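
The scheme check is a plain regex, so any "scheme://" prefix counts as remote:

is_url("https://example.com/pkg")  # TRUE
is_url("ftp://host/data")          # TRUE
is_url("C:/Users/me/pkg")          # FALSE, drive letters lack "://"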

read_data_package <- function(path, dialect = list(), hash = NULL, fields = NULL) {
if(!length(fields))
return(data.frame())
col_types <- list()
for(i in seq_len(nrow(fields)))
col_types[[i]] <- do.call(make_field, as.list(fields[i,]))
do.call(parse_data_file, c(list(file = path, col_types = col_types), dialect))
}

make_field <- function(name = "", type = "string", description = "", format = "%Y-%m-%d", ...){
switch(type,
string = col_character(),
number = col_number(),
integer = col_integer(),
boolean = col_logical(),
object = col_character(),
array = col_character(),
date = col_date(sub("^fmt:", "", format)),
datetime = col_datetime(),
time = col_time(),
col_character()
)
}
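
make_field() maps JSON Table Schema types onto readr collectors. A couple of spot checks (a sketch; readr is attached via @import, and unrecognized types fall through to the unnamed default):

make_field(name = "day", type = "date", format = "fmt:%d/%m/%Y")  # col_date("%d/%m/%Y")
make_field(name = "flag", type = "boolean")                       # col_logical()
make_field(type = "geojson")                                      # col_character() fallback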

## Defaults from http://dataprotocols.org/csv-dialect/
parse_data_file <- function(file, col_types = NULL, delimiter = ",", doubleQuote = TRUE,
lineTerminator = "\r\n", quoteChar = '"', escapeChar = "", skipInitialSpace = TRUE,
header = TRUE, caseSensitiveHeader = FALSE){

# unused: lineTerminator, skipInitialSpace, caseSensitiveHeader
message("Reading file ", file)
readr::read_delim(
col_types = col_types,
file = file,
delim = delimiter,
escape_double = doubleQuote,
quote = quoteChar,
escape_backslash = identical(escapeChar, "\\"),
col_names = header
)
}
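
read_data_package() splats a resource's dialect into this function via do.call(), so a resource whose dialect declares {"delimiter": ";"} ends up as the equivalent of (hypothetical file path):

parse_data_file("data/table.csv", delimiter = ";")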

#' @export
print.datapkg_resources <- function(x, ...){
print_names <- names(x) %in% c("name", "path", "format")
print(as.data.frame(x)[print_names])
}

#' @export
print.datapkg_data <- function(x, ...){
for(i in seq_along(x)){
data_name <- names(x[i])
if(length(data_name) && !is.na(data_name)){
cat(" $", data_name, "\n", sep = "")
} else {
cat(" [[", i, "]]\n", sep = "")
}
mydata <- x[[i]]
for(j in seq_along(mydata)){
cat(" [", j, "] ", names(mydata)[j], " (", methods::is(mydata[[j]])[1], ")\n", sep = "")
}
}
}
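
Together the two print methods give a compact per-package index; roughly (shape only, resource and column names here are placeholders):

pkg <- datapkg_read("https://raw.githubusercontent.com/datasets/gini-index/master")
pkg$data
#>  $some-resource
#>   [1] some_column (character)
#>   [2] other_column (numeric)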
113 changes: 87 additions & 26 deletions R/dpkg.R → R/old/datapkg_new.R
@@ -7,9 +7,7 @@
#' ambiguous format and natively supported by R via \code{\link{read.table}}
#' or \code{\link[readr:read_tsv]{readr::read_tsv}}.
#'
#' @export
#' @aliases datapackage
#' @importFrom readr write_csv write_tsv
#' @importFrom tools md5sum
#' @param path root directory of the data package
#' @param verbose emits some debugging messages
@@ -31,9 +29,9 @@
#'
#' # Parse data
#' pkg$resources$read("iris")
data_package <- function(path = ".", verbose = TRUE){
datapkg_new <- function(path = ".", verbose = TRUE){
pkg_file <- function(x, exists = TRUE) {
normalizePath(file.path(path, x), mustWork = exists)
normalizePath(file.path(path, x), mustWork = exists && !is_url(x))
}

pkg_json <- function(){
@@ -141,39 +139,46 @@ data_package <- function(path = ".", verbose = TRUE){

# Resources object
pkg_resources <- function(){
find <- function(title = "", folder = NULL){
find <- function(name = "", folder = NULL){
data <- Filter(function(x){
if(length(folder) && !(grepl(paste0("^", folder, "/"), x$path))){
res_path <- paste0("", x$path)
res_name <- paste0("", x$name)
if(length(folder) && !(grepl(paste0("^", folder, "/"), res_path)))
return(FALSE)
}
grepl(title, x$title, fixed = TRUE)
grepl(name, res_name, fixed = TRUE)
}, pkg_read()$resources)
jsonlite:::simplifyDataFrame(data, c("title", "path", "format"), flatten = FALSE, simplifyMatrix = FALSE)
for(i in seq_along(data)){
data[[i]]$read = function(){
target <- data[[i]]
read_data_package(pkg_file(target$path), dialect = target$dialect, hash = target$hash, target$schema)
}
}
jsonlite:::simplifyDataFrame(data, c("name", "path", "format", "read"), flatten = FALSE, simplifyMatrix = FALSE)
}
info <- function(title){
info <- function(name){
data <- Filter(function(x){
(x$title == title)
(x$name == name)
}, pkg_read()$resources)
if(!length(data))
stop("Resource not found: ", title)
stop("Resource not found: ", name)
data[[1]]
}
add <- function(data, title, folder = "data", format = "csv"){
add <- function(data, name, folder = "data", format = "csv"){
stopifnot(is.data.frame(data))
if(missing(title))
title <- deparse(substitute(data))
if(missing(name))
name <- deparse(substitute(data))
format <- match.arg(format)
if(nrow(find(title)))
stop("Resource with title '", title, "' already exists.")
file_name <- paste(title, format, sep = ".")
if(nrow(find(name)))
stop("Resource with name '", name, "' already exists.")
file_name <- paste(name, format, sep = ".")
file_path <- file.path(folder, file_name)
abs_path <- pkg_file(file_path, exists = FALSE)
dir.create(pkg_file(folder, exists = FALSE), showWarnings = FALSE)
write_data <- prepare_data(data)
readr::write_delim(write_data, abs_path, delim = ";", col_names = TRUE)
hash <- tools::md5sum(abs_path)
rec <- base::list(
title = title,
name = name,
path = file_path,
format = "tsv",
hash = unname(hash),
@@ -186,17 +191,17 @@ data_package <- function(path = ".", verbose = TRUE){
pkg_update(resources = c(pkg_read()$resources, base::list(rec)))
find()
}
remove <- function(title, folder = "data"){
stopifnot(is_string(title))
target <- info(title)
remove <- function(name, folder = "data"){
stopifnot(is_string(name))
target <- info(name)
unlink(pkg_file(target$path))
pkg_update(resources = Filter(function(x){
(x$title != title)
(x$name != name)
}, pkg_read()$resources))
find()
}
read <- function(title, folder = "data"){
target <- info(title)
read <- function(name){
target <- info(name)
data_path <- pkg_file(target$path)
read_data_package(data_path, dialect = target$dialect, hash = target$hash, target$schema)
}
@@ -282,7 +287,7 @@ make_schema <- function(data){
type = get_type(data[[i]])
)
}
out
list(fields = out)
}
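
With the fields wrapper added above, make_schema() emits the top-level shape the JSON Table Schema spec expects. Roughly (a sketch; the rest of the function body sits outside this hunk):

# make_schema(data.frame(id = 1:2, who = c("a", "b"))) should serialize as:
# {"fields": [{"name": "id",  "type": "integer"},
#             {"name": "who", "type": "string"}]}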

from_json <- function(path){
@@ -298,3 +303,59 @@ is_string <- function(x){
is.character(x) && identical(length(x), 1L)
}

is_url <- function(x){
grepl("^[a-zA-Z]+://", x)
}


# Implements: http://dataprotocols.org/json-table-schema/#schema
coerce_type <- function(x, type){
switch(type,
string = as.character(x),
number = as.numeric(x),
integer = as.integer(x),
boolean = parse_bool(x),
object = lapply(x, from_json),
array = lapply(x, from_json),
date = parse_date(x),
datetime = parse_datetime(x),
    time = parse_time(x),
as.character(x)
)
}
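
A couple of spot checks, using the parsers defined below:

coerce_type(c("2016-05-13", "2016-05-14"), "date")  # Date vector
coerce_type(c("1", "0", "NA"), "boolean")           # TRUE FALSE NA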

get_type <- function(x){
if(inherits(x, "Date")) return("date")
if(inherits(x, "POSIXt")) return("datetime")
if(is.character(x)) return("string")
if(is.integer(x)) return("integer")
if(is.numeric(x)) return("number")
if(is.logical(x)) return("boolean")
return("string")
}
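
Spot checks of the reverse mapping:

get_type(Sys.Date())      # "date"
get_type(Sys.time())      # "datetime"
get_type(1:3)             # "integer"
get_type(c(1.5, 2))       # "number"
get_type(c(TRUE, FALSE))  # "boolean"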

parse_bool <- function(x){
is_true <- (x %in% c("yes", "y", "true", "t", "1"))
is_false <- (x %in% c("no", "n", "false", "f", "0"))
is_na <- is.na(x) | (x %in% c("NA", "na", ""))
is_none <- (!is_true & !is_false & !is_na)
if(any(is_none))
stop("Failed to parse boolean values: ", paste(head(x[is_none], 5), collapse = ", "))
out <- rep(FALSE, length(x))
out[is_na] <- NA
out[is_true] <- TRUE
out
}
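
parse_bool() accepts the spec's truthy/falsy spellings and fails loudly on anything else:

parse_bool(c("yes", "no", "t", "0", "NA"))
#> [1]  TRUE FALSE  TRUE FALSE    NA
parse_bool("maybe")
#> Error: Failed to parse boolean values: maybe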

parse_date <- function(x){
as.Date(x)
}

parse_datetime <- function(x){
as.POSIXct(x)
}

parse_time <- function(x){
as.POSIXct(x)
}

File renamed without changes.
