rsheets · jennybc · May 18, 2016 · May 18, 2016 · May 18, 2016 · May 19, 2016
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -13,8 +13,14 @@ Imports:
     linen (>= 0.0.3),
     progress,
     tibble,
-    xml2 (>= 0.1.2.9000)
+    xml2 (>= 0.1.2.9000),
+    dplyr
 Suggests:
+    knitr,
+    rmarkdown,
+    rprojroot,
     testthat
 RoxygenNote: 5.0.1.9000
-Remotes: hadley/xml2
+VignetteBuilder: knitr
+Remotes:
+    hadley/xml2
diff --git a/NAMESPACE b/NAMESPACE
@@ -3,3 +3,5 @@
 export(rexcel_read)
 export(rexcel_read_workbook)
 export(rexcel_readxl)
+export(rexcel_register)
+importFrom(dplyr,"%>%")
diff --git a/R/read.R b/R/read.R
@@ -32,9 +32,7 @@ rexcel_read <- function(path, sheet=1L) {
 ##' @param progress Display a progress bar?
 ##' @export
 rexcel_read_workbook <- function(path, sheets=NULL, progress=TRUE) {
-  if (!file.exists(path)) {
-    stop(sprintf("%s does not exist", path))
-  }
+  is_xlsx(path)
 
   dat <- xlsx_read_workbook(path)
 
@@ -69,11 +67,11 @@ rexcel_read_workbook <- function(path, sheets=NULL, progress=TRUE) {
   } else {
     fmt <- xlsx_format_codes()
   }
-  num_fmt <- tibble::data_frame(num_fmt=fmt)
-  style <- linen::linen_style(lookup, font=style_xlsx$fonts,
-                              fill=style_xlsx$fills,
-                              border=style_xlsx$borders,
-                              num_fmt=num_fmt)
+  num_fmt <- tibble::data_frame(num_fmt = fmt)
+  style <- linen::linen_style(lookup, font = style_xlsx$fonts,
+                              fill = style_xlsx$fills,
+                              border = style_xlsx$borders,
+                              num_fmt = num_fmt)
 
   workbook <- linen::workbook(sheets, style, dat$defined_names)
   for (s in sheets) {
@@ -132,16 +130,34 @@ xlsx_read_sheet <- function(path, sheet, workbook_dat) {
   xml
 }
 
+
+#' Read XML for a specific file
+#'
+#' Read in the XML for a specific file within the xlsx, e.g. the file
+#' corresponding to a specific worksheet.
+#'
+#' @param path path to xlsx
+#' @param file xml file corresponding to a specific worksheet
+#'
+#' @return an XML document
+#'
+#' @keywords internal
 xlsx_read_file <- function(path, file) {
   tmp <- tempfile()
   dir.create(tmp)
   ## Oh boy more terrible default behaviour.
-  filename <- tryCatch(utils::unzip(path, file, exdir=tmp),
-                       warning=function(e) stop(e))
-  on.exit(unlink(tmp, recursive=TRUE))
+  filename <- tryCatch(utils::unzip(path, file, exdir = tmp),
+                       warning = function(e) stop(e))
+  on.exit(unlink(tmp, recursive = TRUE))
   xml2::read_xml(filename)
 }
 
+xlsx_list_files <- function(path) {
+  ret <- tibble::as_data_frame(utils::unzip(path, list = TRUE))
+  names(ret) <- tolower(names(ret))
+  ret[order(ret$name), ]
+}
+
 xlsx_read_file_if_exists <- function(path, file, missing=NULL) {
   ## TODO: Appropriate error handling here is difficult; we should
   ## check that `path` exists, but by the time that this is called we

diff --git a/R/read_style.R b/R/read_style.R
@@ -132,8 +132,8 @@ xlsx_read_theme <- function(path) {
 
 ## 18.8.23 fonts
 xlsx_ct_fonts <- function(xml, ns, theme, index) {
-  process_container(xml, "d1:fonts", ns, xlsx_ct_font,
-                    theme, index)
+  process_container(xml, xpath = "d1:fonts", ns = ns,
+                    fun = xlsx_ct_font, theme, index)
 }
 
 ## 18.8.22 font

diff --git a/R/read_workbook.R b/R/read_workbook.R
@@ -1,3 +1,122 @@
+xlsx_read_Content_Types <- function(path) {
+  ct <- xlsx_read_file(path, "[Content_Types].xml")
+  node_att <- lapply(xml2::xml_contents(ct), xml_attrs_list)
+  tibble::data_frame(
+    part_name = vcapply2(node_att, "PartName"),
+    extension = vcapply2(node_att, "Extension"),
+    content_type = vcapply2(node_att, "ContentType")
+  )
+}
+
+xlsx_read_workbook_sheets <- function(path) {
+  xml <- xlsx_read_file(path, "xl/workbook.xml")
+  ## why do I write this weird XPath?
+  ## namespace avoidance in order to handle xlsx like
+  ## Ekaterinburg_IP.xlsx from here:
+  ## https://github.com/hadley/readxl/issues/80
+  sheets <- xml2::xml_find_first(xml, ".//*[local-name() = 'sheets']")
+  sheets_att <- lapply(xml2::xml_contents(sheets), xml_attrs_list)
+  tibble::data_frame(
+    name = vcapply2(sheets_att, "name"),
+    state = vcapply2(sheets_att, "state"),
+    sheet_id = as.integer(vcapply2(sheets_att, "sheetId")),
+    id = vcapply2(sheets_att, "id")
+  )
+}
+
+xlsx_read_workbook_defined_names <- function(path) {
+  xml <- xlsx_read_file(path, "xl/workbook.xml")
+
+  ## 18.14.5 definedName
+  ## 18.14.6 definedNames
+
+  ## the definedName nodes can have different XML structure
+  ## Rich had one in mind during his initial pass (or just worked from spec)
+  ## --> all info found in attributes (name, refersTo, sheedId)
+  ## which motivated
+  ## xlsx_ct_external_defined_names()
+  ## Jenny sees a different structure in the example sheet she created
+  ## inst/sheets/defined_names.xlsx
+  ## --> node value gives cell ref (Rich found this in refersTo attr)
+  ## --> attributes refersTo, sheetId don't even exist
+  ## --> localSheetId attribute appears when there are duplicated names (gabe.xlsx)
+  ##
+  ## this is an attempt to accomodate all forms but Jenny doesn't have an actual
+  ## example of the first form to look at
+  dn_nodes <- xml2::xml_find_all(xml, "//*[local-name() = 'definedName']")
+  ## why do I write this weird XPath?
+  ## namespace avoidance in order to handle xlsx like
+  ## Ekaterinburg_IP.xlsx from here:
+  ## https://github.com/hadley/readxl/issues/80
+  if (length(dn_nodes) == 0) {
+    return(NULL)
+  }
+  dn_att <- lapply(dn_nodes, xml_attrs_list)
+
+  ## this is where the spec suggests you will find the cell refs/ranges
+  refers_to <- vcapply2(dn_att, "refersTo")
+  ## and yet Jenny finds them as node text ...
+  if (all(is.na(refers_to))) {
+    refers_to <- xml2::xml_text(dn_nodes)
+  }
+
+  ## may just be NAs
+  sheet_id <- as.integer(vcapply2(dn_att, "sheetId"))
+  local_sheet_id <- as.integer(vcapply2(dn_att, "localSheetId"))
+
+  tibble::data_frame(
+     name = vcapply2(dn_att, "name"),
+     refers_to,
+     sheet_id,
+     local_sheet_id
+  )
+}
+
+xlsx_read_workbook_rels <- function(path) {
+  ## do we really have to worry about this file not existing?
+  xml <- xlsx_read_file_if_exists(path, "xl/_rels/workbook.xml.rels")
+  if (is.null(xml)) {
+    return(NULL)
+  }
+  rel_nodes <- xml2::xml_children(xml)
+  rels <- rbind_df(lapply(rel_nodes, xml_attrs_list))
+
+  names(rels) <- tolower(names(rels))
+  rels[c("target", "id", "type")]
+  ## MAYBE TODOs, if decide to do more processing:
+  ##
+  ## prepend target with "xl/"
+  ## but check the type and don't do if it's an external reference
+  ##
+  ## just take the last bit of type, i.e.basename(type),
+}
+
+xlsx_read_worksheet_rels <- function(path) {
+  manifest <- xlsx_list_files(path)
+  holds_ws_rels <-
+    grepl("xl/worksheets/_rels/sheet[0-9]*.xml.rels", manifest$name)
+  if (none(holds_ws_rels)) {
+    return(NULL)
+  }
+  ws_rels_fnames <- manifest$name[holds_ws_rels]
+  nms <- gsub("xl/worksheets/_rels/(sheet[0-9]*).xml.rels", "\\1",
+              ws_rels_fnames)
+  worksheet_rels <- lapply(ws_rels_fnames, xlsx_read_file, path = path)
+  worksheet_rels <-
+    lapply(worksheet_rels, xml2::xml_find_all, xpath = "//d1:Relationship")
+  names(worksheet_rels) <- nms
+  f <- function(x) {
+    x %>%
+      xml2::xml_attrs() %>%
+      lapply(as.list) %>%
+      dplyr::bind_rows()
+  }
+  worksheet_rels <- lapply(worksheet_rels, f) %>%
+    dplyr::bind_rows(.id = "worksheet")
+  names(worksheet_rels) <- tolower(names(worksheet_rels))
+  worksheet_rels
+}
+
 xlsx_read_workbook <- function(path) {
   ## TODO: Consider what do do when rels is NULL; do we throw?
   rels <- xlsx_read_rels(path, "xl/workbook.xml")
@@ -34,7 +153,7 @@ xlsx_ct_sheet <- function(xml, ns) {
     ref = attr_character(at[["id"]]))
 }
 
-## 18.14.6 definedName
+## 18.14.6 definedNames
 xlsx_ct_external_defined_names <- function(xml, ns) {
   process_container(xml, "d1:definedNames", ns, xlsx_ct_external_defined_name,
                     classes=TRUE)

diff --git a/R/register.R b/R/register.R
@@ -0,0 +1,102 @@
+#' Register an Excel workbook
+#'
+#' Experimental function, in an experimental package! It covers alot of the same
+#' ground as \code{\link{rexcel_read_workbook}()} with several notable
+#' exceptions.
+#'
+#' \itemize{
+#' \item Currently returns a list, not a proper linen::workbook.
+#' \item Read one file at a time, make one or more objects from it, with only
+#' the processing necessary to make it a reasonable R object.
+#' \item For some clearly useful things, create downstream objects via
+#' processing and/or combining info across primary objects.
+#' }
+#'
+#' @param path path to xlsx
+#'
+#' @return a list
+#' @keywords internal
+#'
+#' @examples
+#' mini_gap_path <- system.file("sheets", "mini-gap.xlsx", package = "rexcel")
+#' rexcel_register(mini_gap_path)
+#'
+#' ff_path <- system.file("sheets", "gs-test-formula-formatting.xlsx",
+#'                        package = "rexcel")
+#' rexcel_register(ff_path)
+#' @export
+rexcel_register <- function(path) {
+  ## TO DO:
+  ## if path is actually a workbook
+  ## Recall(path$path)
+  ## i.e. refresh registration of the workbook
+  ## to be used when you are concerned the xlsx has changed
+  if (!is_xlsx(path)) {
+    stop("`path` does not appear to point to valid xlsx:\n", path,
+         call. = FALSE)
+  }
+  manifest      <- xlsx_list_files(path)
+  ct            <- xlsx_read_Content_Types(path) ## [Content_Types].xml
+  # rels        <- xlsx_read_file(path, "_rels/.rels") # always boring? skipped
+  sheets        <- xlsx_read_workbook_sheets(path) # xl/workbook.xml
+  defined_names <- xlsx_read_workbook_defined_names(path)
+  workbook_rels <- xlsx_read_workbook_rels(path)
+
+  ## allow ourselves to do some synthesis of the above
+  ## in workbook_rels, prepend xl/ to target
+  ## join to sheets on id
+  ## result has one row per worksheet
+  sheets_df <- join_sheets_workbook_rels(sheets, workbook_rels)
+
+  ## back to the slog
+  shared_strings <- xlsx_read_shared_strings(path)
+  ## reverting to Rich's work here and god bless him for it
+  styles <- xlsx_read_style(path)
+  ## xl/worksheets/_rels/sheet1.xml.rels and friends
+  worksheet_rels <- xlsx_read_worksheet_rels(path)
+
+  ## xl/worksheets/sheet1.xml etc.
+  #one_sheet <- xlsx_read_file(path, "xl/worksheets/sheet1.xml")
+  ## come back here and parse enough to learn worksheet extent
+  ## otherwise, I don't see anything here that belongs in top-level workbook
+  ## creation
+
+  ## xl/drawings/worksheetdrawing1.xml etc.
+  #one_drawing <- xlsx_read_file(path, "xl/drawings/worksheetdrawing1.xml")
+  ## I don't see anything here that belongs in top-level workbook creation
+  ## also, in my toy examples with no charts, this consists only of namespaces
+
+  dplyr::lst(xlsx_path = path,
+      reg_time = Sys.time(),
+      manifest,
+      content_types = ct,
+      sheets,
+      defined_names,
+      workbook_rels,
+      shared_strings,
+      styles,
+      worksheet_rels,
+      sheets_df
+  )
+  ## TODO: obviously this should return a workbook object!
+}
+
+join_sheets_workbook_rels <- function(sheets, workbook_rels) {
+  suppressMessages(
+    sheets_df <- workbook_rels %>%
+      dplyr::mutate(target = file.path("xl", target)) %>%
+      dplyr::right_join(sheets) %>%
+      ## use one_of() here because not all variables exist all the time
+      dplyr::select(
+        dplyr::one_of(c("sheet_id", "name", "id", "target", "state", "type"))
+      )
+  )
+  unique_type <- unique(sheets_df$type)
+  if (!identical(unique_type,
+                 "http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet")) {
+    message("New type found for a worksheet! LOOK INTO THIS.")
+  } else {
+    sheets_df$type <- NULL
+  }
+  sheets_df
+}
diff --git a/R/rexcel-package.r b/R/rexcel-package.r
@@ -2,4 +2,5 @@
 #'
 #' @name rexcel
 #' @docType package
+#' @importFrom dplyr %>%
 NULL
diff --git a/R/shared_strings.R b/R/shared_strings.R
@@ -22,7 +22,20 @@ xlsx_read_shared_strings <- function(path) {
   if (is.null(xml)) {
     return(character(0))
   }
-  vcapply(xml2::xml_children(xml), xlsx_ct_rst, xml2::xml_ns(xml))
+  ns <- xml2::xml_ns(xml)
+  ## to deal w/ less common namespacing, e.g. Ekaterinburg sheet
+  alt_ns <-
+    construct_xml_ns(x = "http://schemas.openxmlformats.org/spreadsheetml/2006/main")
+  if (ns_equal_to_ref(ns, alt_ns)) {
+    ns <- xml2::xml_ns_rename(ns, x = "d1")
+  }
+  string_items <- xml2::xml_children(xml)
+  ret <- vcapply(string_items, xlsx_ct_rst, ns)
+  ## these gymnastics are necessary to preserve attribute names
+  at <- as.list(xml2::xml_attrs(xml)[c("count", "uniqueCount")])
+  at <- lapply(at, as.integer)
+  attributes(ret) <- at
+  ret
 }
 
 ## 18.4.8 si