Commit 19f5d97
reimplement from scratch based on new knowledge/spec
jeroen committed May 13, 2016
1 parent 2335f82
Showing 13 changed files with 290 additions and 226 deletions.
2 changes: 2 additions & 0 deletions .Rbuildignore
@@ -3,3 +3,5 @@
^appveyor\.yml$
^\.travis\.yml$
^data$
^tests/testsuite-py$
^datapackage.json$
8 changes: 5 additions & 3 deletions DESCRIPTION
@@ -1,4 +1,4 @@
Package: datapackage
Package: datapkg
Type: Package
Title: Read and Write Data Packages
Version: 0.1
@@ -7,11 +7,13 @@ Authors@R: c(
person("Karthik", "Ram", email = "karthik.ram@gmail.com", role = "aut"))
Description: Convenience functions for reading and writing datasets following
  the 'data package' format.
URL: http://dataprotocols.org/data-packages/
URL: http://frictionlessdata.io/data-packages/
License: MIT + file LICENSE
Imports:
methods,
readr,
jsonlite
jsonlite,
curl
Suggests:
ggplot2
RoxygenNote: 5.0.1
12 changes: 4 additions & 8 deletions NAMESPACE
@@ -1,10 +1,6 @@
# Generated by roxygen2: do not edit by hand

S3method("$",jeroen)
S3method("[",jeroen)
S3method("[[",jeroen)
S3method(print,jeroen)
export(data_package)
importFrom(readr,write_csv)
importFrom(readr,write_tsv)
importFrom(tools,md5sum)
S3method(print,datapkg_data)
S3method(print,datapkg_resources)
export(datapkg_read)
import(readr)
127 changes: 127 additions & 0 deletions R/datapkg_read.R
@@ -0,0 +1,127 @@
#' Open data-package
#'
#' Loads a data and meta-data from a 'data-package' directory or URL.
#'
#' @import readr
#' @param path file path or URL to the data package directory
#' @rdname datapackage
#' @name datapackage
#' @aliases datapkg
#' @references \url{http://frictionlessdata.io/data-packages}, \url{https://github.com/datasets}
#' @export
#' @examples # Example data from https://github.com/datasets
#' datapkg_read("https://raw.githubusercontent.com/datasets/ex-tabular-multiple-resources-fk/master")
#' datapkg_read("https://raw.githubusercontent.com/datasets/gini-index/master")
#' datapkg_read("https://raw.githubusercontent.com/datasets/euribor/master")
datapkg_read <- function(path){
root <- sub("datapackage.json$", "", path)
root <- sub("/$", "", root)
json_path <- file.path(root, "/datapackage.json")
json <- if(is_url(root)){
con <- curl::curl(json_path, "r")
on.exit(close(con))
readLines(con, warn = FALSE)
} else {
readLines(normalizePath(json_path, mustWork = TRUE), warn = FALSE)
}
pkg_info <- jsonlite::fromJSON(json, simplifyVector = TRUE)
if(is.data.frame(pkg_info$resources))
class(pkg_info$resources) <- c("datapkg_resources", class(pkg_info$resources))
if(is.data.frame(pkg_info$sources))
class(pkg_info$sources) <- c("datapkg_sources", class(pkg_info$sources))
  pkg_info$data <- vector("list", nrow(pkg_info$resources))
data_names <- pkg_info$resources$name
for(i in seq_len(nrow(pkg_info$resources))){
target <- as.list(pkg_info$resources[i, ])
pkg_info$data[[i]] <- read_data_package(get_data_path(target, root),
dialect = as.list(target$dialect), hash = target$hash, target$schema$fields[[1]])
}
class(pkg_info$data) <- c("datapkg_data")
if(length(data_names))
names(pkg_info$data) <- ifelse(is.na(data_names), "", data_names)
pkg_info
}
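
A quick usage sketch of the new reader (assuming the example URLs from the roxygen block above are still reachable):

# Read a published data package and inspect what came back (sketch):
gini <- datapkg_read("https://raw.githubusercontent.com/datasets/gini-index/master")
gini$name                # package metadata parsed from datapackage.json
gini$resources           # resource index, printed via print.datapkg_resources below
head(gini$data[[1]])     # first resource, parsed into a data frame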

get_data_path <- function(x, root){
if(length(x$path)){
data_path <- normalizePath(file.path(root, x$path), mustWork = FALSE)
if(is_url(data_path) || file.exists(data_path)){
return(data_path)
} else {
if(length(x$url)){
message("File not found: ", data_path)
return(x$url)
} else {
stop("File not found: ", data_path)
}
}
}
}
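
To illustrate the fallback above: a hypothetical resource listing both a local path and a url resolves to the url when the local file is absent (all values below are made up):

# The local file is missing, so this messages "File not found: ..."
# and returns the remote url instead of stopping:
get_data_path(list(path = "data/missing.csv",
                   url  = "https://example.com/data.csv"),
              root = "/tmp/pkg")
#> [1] "https://example.com/data.csv"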

is_url <- function(x){
grepl("^[a-zA-Z]+://", x)
}
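
The scheme check is a plain regex, so any "scheme://" prefix counts as remote:

is_url("https://example.com/pkg")  # TRUE
is_url("ftp://host/data")          # TRUE
is_url("C:/Users/me/pkg")          # FALSE, drive letters lack "://"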

read_data_package <- function(path, dialect = list(), hash = NULL, fields = NULL) {
if(!length(fields))
return(data.frame())
col_types <- list()
for(i in seq_len(nrow(fields)))
col_types[[i]] <- do.call(make_field, as.list(fields[i,]))
do.call(parse_data_file, c(list(file = path, col_types = col_types), dialect))
}

make_field <- function(name = "", type = "string", description = "", format = "%Y-%m-%d", ...){
switch(type,
string = col_character(),
number = col_number(),
integer = col_integer(),
boolean = col_logical(),
object = col_character(),
array = col_character(),
date = col_date(sub("^fmt:", "", format)),
datetime = col_datetime(),
time = col_time(),
col_character()
)
}
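
make_field() maps JSON Table Schema types onto readr collectors. A couple of spot checks (a sketch; readr is attached via @import, and unrecognized types fall through to the unnamed default):

make_field(name = "day", type = "date", format = "fmt:%d/%m/%Y")  # col_date("%d/%m/%Y")
make_field(name = "flag", type = "boolean")                       # col_logical()
make_field(type = "geojson")                                      # col_character() fallback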

## Defaults from http://dataprotocols.org/csv-dialect/
parse_data_file <- function(file, col_types = NULL, delimiter = ",", doubleQuote = TRUE,
lineTerminator = "\r\n", quoteChar = '"', escapeChar = "", skipInitialSpace = TRUE,
header = TRUE, caseSensitiveHeader = FALSE){

# unused: lineTerminator, skipInitialSpace, caseSensitiveHeader
message("Reading file ", file)
readr::read_delim(
col_types = col_types,
file = file,
delim = delimiter,
escape_double = doubleQuote,
quote = quoteChar,
escape_backslash = identical(escapeChar, "\\"),
col_names = header
)
}
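
read_data_package() splats a resource's dialect into this function via do.call(), so a resource whose dialect declares {"delimiter": ";"} ends up as the equivalent of (hypothetical file path):

parse_data_file("data/table.csv", delimiter = ";")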

#' @export
print.datapkg_resources <- function(x, ...){
print_names <- names(x) %in% c("name", "path", "format")
print(as.data.frame(x)[print_names])
}

#' @export
print.datapkg_data <- function(x, ...){
for(i in seq_along(x)){
data_name <- names(x[i])
if(length(data_name) && !is.na(data_name)){
cat(" $", data_name, "\n", sep = "")
} else {
cat(" [[", i, "]]\n", sep = "")
}
mydata <- x[[i]]
for(j in seq_along(mydata)){
cat(" [", j, "] ", names(mydata)[j], " (", methods::is(mydata[[j]])[1], ")\n", sep = "")
}
}
}
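
Together the two print methods give a compact per-package index; roughly (shape only, resource and column names here are placeholders):

pkg <- datapkg_read("https://raw.githubusercontent.com/datasets/gini-index/master")
pkg$data
#>  $some-resource
#>   [1] some_column (character)
#>   [2] other_column (numeric)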
113 changes: 87 additions & 26 deletions R/dpkg.R → R/old/datapkg_new.R
@@ -7,9 +7,7 @@
#' ambiguous format and natively supported by R via \code{\link{read.table}}
#' or \code{\link[readr:read_tsv]{readr::read_tsv}}.
#'
#' @export
#' @aliases datapackage
#' @importFrom readr write_csv write_tsv
#' @importFrom tools md5sum
#' @param path root directory of the data package
#' @param verbose emits some debugging messages
@@ -31,9 +29,9 @@
#'
#' # Parse data
#' pkg$resources$read("iris")
data_package <- function(path = ".", verbose = TRUE){
datapkg_new <- function(path = ".", verbose = TRUE){
pkg_file <- function(x, exists = TRUE) {
normalizePath(file.path(path, x), mustWork = exists)
normalizePath(file.path(path, x), mustWork = exists && !is_url(x))
}

pkg_json <- function(){
@@ -141,39 +139,46 @@ data_package <- function(path = ".", verbose = TRUE){

# Resources object
pkg_resources <- function(){
find <- function(title = "", folder = NULL){
find <- function(name = "", folder = NULL){
data <- Filter(function(x){
if(length(folder) && !(grepl(paste0("^", folder, "/"), x$path))){
res_path <- paste0("", x$path)
res_name <- paste0("", x$name)
if(length(folder) && !(grepl(paste0("^", folder, "/"), res_path)))
return(FALSE)
}
grepl(title, x$title, fixed = TRUE)
grepl(name, res_name, fixed = TRUE)
}, pkg_read()$resources)
jsonlite:::simplifyDataFrame(data, c("title", "path", "format"), flatten = FALSE, simplifyMatrix = FALSE)
for(i in seq_along(data)){
data[[i]]$read = function(){
target <- data[[i]]
read_data_package(pkg_file(target$path), dialect = target$dialect, hash = target$hash, target$schema)
}
}
jsonlite:::simplifyDataFrame(data, c("name", "path", "format", "read"), flatten = FALSE, simplifyMatrix = FALSE)
}
info <- function(title){
info <- function(name){
data <- Filter(function(x){
(x$title == title)
(x$name == name)
}, pkg_read()$resources)
if(!length(data))
stop("Resource not found: ", title)
stop("Resource not found: ", name)
data[[1]]
}
add <- function(data, title, folder = "data", format = "csv"){
add <- function(data, name, folder = "data", format = "csv"){
stopifnot(is.data.frame(data))
if(missing(title))
title <- deparse(substitute(data))
if(missing(name))
name <- deparse(substitute(data))
format <- match.arg(format)
if(nrow(find(title)))
stop("Resource with title '", title, "' already exists.")
file_name <- paste(title, format, sep = ".")
if(nrow(find(name)))
stop("Resource with name '", name, "' already exists.")
file_name <- paste(name, format, sep = ".")
file_path <- file.path(folder, file_name)
abs_path <- pkg_file(file_path, exists = FALSE)
dir.create(pkg_file(folder, exists = FALSE), showWarnings = FALSE)
write_data <- prepare_data(data)
readr::write_delim(write_data, abs_path, delim = ";", col_names = TRUE)
hash <- tools::md5sum(abs_path)
rec <- base::list(
title = title,
name = name,
path = file_path,
format = "tsv",
hash = unname(hash),
@@ -186,17 +191,17 @@ data_package <- function(path = ".", verbose = TRUE){
pkg_update(resources = c(pkg_read()$resources, base::list(rec)))
find()
}
remove <- function(title, folder = "data"){
stopifnot(is_string(title))
target <- info(title)
remove <- function(name, folder = "data"){
stopifnot(is_string(name))
target <- info(name)
unlink(pkg_file(target$path))
pkg_update(resources = Filter(function(x){
(x$title != title)
(x$name != name)
}, pkg_read()$resources))
find()
}
read <- function(title, folder = "data"){
target <- info(title)
read <- function(name){
target <- info(name)
data_path <- pkg_file(target$path)
read_data_package(data_path, dialect = target$dialect, hash = target$hash, target$schema)
}
@@ -282,7 +287,7 @@ make_schema <- function(data){
type = get_type(data[[i]])
)
}
out
list(fields = out)
}
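
With the fields wrapper added above, make_schema() emits the top-level shape the JSON Table Schema spec expects. Roughly (a sketch; the rest of the function body sits outside this hunk):

# make_schema(data.frame(id = 1:2, who = c("a", "b"))) should serialize as:
# {"fields": [{"name": "id",  "type": "integer"},
#             {"name": "who", "type": "string"}]}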

from_json <- function(path){
@@ -298,3 +303,59 @@ is_string <- function(x){
is.character(x) && identical(length(x), 1L)
}

is_url <- function(x){
grepl("^[a-zA-Z]+://", x)
}


# Implements: http://dataprotocols.org/json-table-schema/#schema
coerce_type <- function(x, type){
switch(type,
string = as.character(x),
number = as.numeric(x),
integer = as.integer(x),
boolean = parse_bool(x),
object = lapply(x, from_json),
array = lapply(x, from_json),
date = parse_date(x),
datetime = parse_datetime(x),
    time = parse_time(x),
as.character(x)
)
}
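
A couple of spot checks, using the parsers defined below:

coerce_type(c("2016-05-13", "2016-05-14"), "date")  # Date vector
coerce_type(c("1", "0", "NA"), "boolean")           # TRUE FALSE NA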

get_type <- function(x){
if(inherits(x, "Date")) return("date")
if(inherits(x, "POSIXt")) return("datetime")
if(is.character(x)) return("string")
if(is.integer(x)) return("integer")
if(is.numeric(x)) return("number")
if(is.logical(x)) return("boolean")
return("string")
}
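
Spot checks of the reverse mapping:

get_type(Sys.Date())      # "date"
get_type(Sys.time())      # "datetime"
get_type(1:3)             # "integer"
get_type(c(1.5, 2))       # "number"
get_type(c(TRUE, FALSE))  # "boolean"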

parse_bool <- function(x){
is_true <- (x %in% c("yes", "y", "true", "t", "1"))
is_false <- (x %in% c("no", "n", "false", "f", "0"))
is_na <- is.na(x) | (x %in% c("NA", "na", ""))
is_none <- (!is_true & !is_false & !is_na)
if(any(is_none))
stop("Failed to parse boolean values: ", paste(head(x[is_none], 5), collapse = ", "))
out <- rep(FALSE, length(x))
out[is_na] <- NA
out[is_true] <- TRUE
out
}
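
parse_bool() accepts the spec's truthy/falsy spellings and fails loudly on anything else:

parse_bool(c("yes", "no", "t", "0", "NA"))
#> [1]  TRUE FALSE  TRUE FALSE    NA
parse_bool("maybe")
#> Error: Failed to parse boolean values: maybe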

parse_date <- function(x){
as.Date(x)
}

parse_datetime <- function(x){
as.POSIXct(x)
}

parse_time <- function(x){
as.POSIXct(x)
}

File renamed without changes.
