Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

extract worksheet reading code from functions #7

Open
wants to merge 34 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
3686b4a
extract worksheet reading code from functions
May 18, 2016
3be6672
add function to list files in the xlsx
May 18, 2016
989433e
stub for is_xlsx()
May 18, 2016
794a5d3
start of experiment with registration fxn
May 19, 2016
53ea8b3
I've poked around all files in xlsx now
May 20, 2016
1f60cb3
oops, correct import of %>%
May 20, 2016
36a7f21
toy registration fxn for jenny to get oriented
May 22, 2016
730dd77
beef up is_xlsx()
May 23, 2016
918297c
add xlsx_read_Content_Types(), @richfitz style :sunglasses:
May 23, 2016
aef3dc3
vcapply2()
May 23, 2016
966da4a
add xlsx_read_workbook_JENNY()
May 23, 2016
b89de6e
use existing xlsx_read_shared_strings()
May 23, 2016
6db4859
explain and add another version of ekaterinburg sheet
May 24, 2016
dcf7086
add more test/example sheets
May 25, 2016
5904e1c
peg xml2 at a specific commit
May 27, 2016
662ad53
utility fxns (temporary?) for namespaces and cell ref string parsing
May 27, 2016
9e54890
gitignore xlsx temp files
May 27, 2016
30c258f
add test file with vexing characters in worksheet names
May 27, 2016
bbe858a
xlsx_read_workbook_sheets()
May 27, 2016
48021a1
anticipate alternative namespaces for shared strings
May 27, 2016
9535469
let dep on gh xml2 float (to the tip)
May 30, 2016
ae6b988
Merge branch 'master' into discuss-read-fxns
May 30, 2016
f651550
xml2 now returns namespaces among the attributes
May 30, 2016
3a50db1
xml2::xml_find_one --> xml_find_first
May 30, 2016
f27d25c
use the new/updated fxns in register()
May 30, 2016
a20bfbf
update + render faux vignette
May 30, 2016
1cb4571
de-purrr
May 30, 2016
d8d62f9
WIP: clean up rexcel_register()
May 31, 2016
3ea4cd5
add sharedStrings to faux vignette
Jun 2, 2016
08168af
saner comparison of namespaces
Jun 9, 2016
7dcf4c1
specify arg names
Jun 9, 2016
cc7c1ed
more cleaning up rexcel_register
Jun 9, 2016
81ea5e6
WIP: faux vignette, for skype call
Jun 9, 2016
21f83df
Merge branch 'master' into discuss-read-fxns
Jun 9, 2016
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
10 changes: 8 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,14 @@ Imports:
linen (>= 0.0.3),
progress,
tibble,
xml2 (>= 0.1.2.9000)
xml2 (>= 0.1.2.9000),
dplyr
Suggests:
knitr,
rmarkdown,
rprojroot,
testthat
RoxygenNote: 5.0.1.9000
Remotes: hadley/xml2
VignetteBuilder: knitr
Remotes:
hadley/xml2
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@
export(rexcel_read)
export(rexcel_read_workbook)
export(rexcel_readxl)
export(rexcel_register)
importFrom(dplyr,"%>%")
38 changes: 27 additions & 11 deletions R/read.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,7 @@ rexcel_read <- function(path, sheet=1L) {
##' @param progress Display a progress bar?
##' @export
rexcel_read_workbook <- function(path, sheets=NULL, progress=TRUE) {
if (!file.exists(path)) {
stop(sprintf("%s does not exist", path))
}
is_xlsx(path)

dat <- xlsx_read_workbook(path)

Expand Down Expand Up @@ -69,11 +67,11 @@ rexcel_read_workbook <- function(path, sheets=NULL, progress=TRUE) {
} else {
fmt <- xlsx_format_codes()
}
num_fmt <- tibble::data_frame(num_fmt=fmt)
style <- linen::linen_style(lookup, font=style_xlsx$fonts,
fill=style_xlsx$fills,
border=style_xlsx$borders,
num_fmt=num_fmt)
num_fmt <- tibble::data_frame(num_fmt = fmt)
style <- linen::linen_style(lookup, font = style_xlsx$fonts,
fill = style_xlsx$fills,
border = style_xlsx$borders,
num_fmt = num_fmt)

workbook <- linen::workbook(sheets, style, dat$defined_names)
for (s in sheets) {
Expand Down Expand Up @@ -132,16 +130,34 @@ xlsx_read_sheet <- function(path, sheet, workbook_dat) {
xml
}


#' Read XML for a specific file
#'
#' Read in the XML for a specific file within the xlsx, e.g. the file
#' corresponding to a specific worksheet.
#'
#' @param path path to xlsx
#' @param file xml file corresponding to a specific worksheet
#'
#' @return an XML document
#'
#' @keywords internal
xlsx_read_file <- function(path, file) {
tmp <- tempfile()
dir.create(tmp)
## Oh boy more terrible default behaviour.
filename <- tryCatch(utils::unzip(path, file, exdir=tmp),
warning=function(e) stop(e))
on.exit(unlink(tmp, recursive=TRUE))
filename <- tryCatch(utils::unzip(path, file, exdir = tmp),
warning = function(e) stop(e))
on.exit(unlink(tmp, recursive = TRUE))
xml2::read_xml(filename)
}

xlsx_list_files <- function(path) {
ret <- tibble::as_data_frame(utils::unzip(path, list = TRUE))
names(ret) <- tolower(names(ret))
ret[order(ret$name), ]
}

xlsx_read_file_if_exists <- function(path, file, missing=NULL) {
## TODO: Appropriate error handling here is difficult; we should
## check that `path` exists, but by the time that this is called we
Expand Down
4 changes: 2 additions & 2 deletions R/read_style.R
Original file line number Diff line number Diff line change
Expand Up @@ -132,8 +132,8 @@ xlsx_read_theme <- function(path) {

## 18.8.23 fonts
xlsx_ct_fonts <- function(xml, ns, theme, index) {
process_container(xml, "d1:fonts", ns, xlsx_ct_font,
theme, index)
process_container(xml, xpath = "d1:fonts", ns = ns,
fun = xlsx_ct_font, theme, index)
}

## 18.8.22 font
Expand Down
121 changes: 120 additions & 1 deletion R/read_workbook.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,122 @@
xlsx_read_Content_Types <- function(path) {
ct <- xlsx_read_file(path, "[Content_Types].xml")
node_att <- lapply(xml2::xml_contents(ct), xml_attrs_list)
tibble::data_frame(
part_name = vcapply2(node_att, "PartName"),
extension = vcapply2(node_att, "Extension"),
content_type = vcapply2(node_att, "ContentType")
)
}

xlsx_read_workbook_sheets <- function(path) {
xml <- xlsx_read_file(path, "xl/workbook.xml")
## why do I write this weird XPath?
## namespace avoidance in order to handle xlsx like
## Ekaterinburg_IP.xlsx from here:
## https://github.com/hadley/readxl/issues/80
sheets <- xml2::xml_find_first(xml, ".//*[local-name() = 'sheets']")
sheets_att <- lapply(xml2::xml_contents(sheets), xml_attrs_list)
tibble::data_frame(
name = vcapply2(sheets_att, "name"),
state = vcapply2(sheets_att, "state"),
sheet_id = as.integer(vcapply2(sheets_att, "sheetId")),
id = vcapply2(sheets_att, "id")
)
}

xlsx_read_workbook_defined_names <- function(path) {
xml <- xlsx_read_file(path, "xl/workbook.xml")

## 18.14.5 definedName
## 18.14.6 definedNames

## the definedName nodes can have different XML structure
## Rich had one in mind during his initial pass (or just worked from spec)
## --> all info found in attributes (name, refersTo, sheedId)
## which motivated
## xlsx_ct_external_defined_names()
## Jenny sees a different structure in the example sheet she created
## inst/sheets/defined_names.xlsx
## --> node value gives cell ref (Rich found this in refersTo attr)
## --> attributes refersTo, sheetId don't even exist
## --> localSheetId attribute appears when there are duplicated names (gabe.xlsx)
##
## this is an attempt to accomodate all forms but Jenny doesn't have an actual
## example of the first form to look at
dn_nodes <- xml2::xml_find_all(xml, "//*[local-name() = 'definedName']")
## why do I write this weird XPath?
## namespace avoidance in order to handle xlsx like
## Ekaterinburg_IP.xlsx from here:
## https://github.com/hadley/readxl/issues/80
if (length(dn_nodes) == 0) {
return(NULL)
}
dn_att <- lapply(dn_nodes, xml_attrs_list)

## this is where the spec suggests you will find the cell refs/ranges
refers_to <- vcapply2(dn_att, "refersTo")
## and yet Jenny finds them as node text ...
if (all(is.na(refers_to))) {
refers_to <- xml2::xml_text(dn_nodes)
}

## may just be NAs
sheet_id <- as.integer(vcapply2(dn_att, "sheetId"))
local_sheet_id <- as.integer(vcapply2(dn_att, "localSheetId"))

tibble::data_frame(
name = vcapply2(dn_att, "name"),
refers_to,
sheet_id,
local_sheet_id
)
}

xlsx_read_workbook_rels <- function(path) {
## do we really have to worry about this file not existing?
xml <- xlsx_read_file_if_exists(path, "xl/_rels/workbook.xml.rels")
if (is.null(xml)) {
return(NULL)
}
rel_nodes <- xml2::xml_children(xml)
rels <- rbind_df(lapply(rel_nodes, xml_attrs_list))

names(rels) <- tolower(names(rels))
rels[c("target", "id", "type")]
## MAYBE TODOs, if decide to do more processing:
##
## prepend target with "xl/"
## but check the type and don't do if it's an external reference
##
## just take the last bit of type, i.e.basename(type),
}

xlsx_read_worksheet_rels <- function(path) {
manifest <- xlsx_list_files(path)
holds_ws_rels <-
grepl("xl/worksheets/_rels/sheet[0-9]*.xml.rels", manifest$name)
if (none(holds_ws_rels)) {
return(NULL)
}
ws_rels_fnames <- manifest$name[holds_ws_rels]
nms <- gsub("xl/worksheets/_rels/(sheet[0-9]*).xml.rels", "\\1",
ws_rels_fnames)
worksheet_rels <- lapply(ws_rels_fnames, xlsx_read_file, path = path)
worksheet_rels <-
lapply(worksheet_rels, xml2::xml_find_all, xpath = "//d1:Relationship")
names(worksheet_rels) <- nms
f <- function(x) {
x %>%
xml2::xml_attrs() %>%
lapply(as.list) %>%
dplyr::bind_rows()
}
worksheet_rels <- lapply(worksheet_rels, f) %>%
dplyr::bind_rows(.id = "worksheet")
names(worksheet_rels) <- tolower(names(worksheet_rels))
worksheet_rels
}

xlsx_read_workbook <- function(path) {
## TODO: Consider what do do when rels is NULL; do we throw?
rels <- xlsx_read_rels(path, "xl/workbook.xml")
Expand Down Expand Up @@ -34,7 +153,7 @@ xlsx_ct_sheet <- function(xml, ns) {
ref = attr_character(at[["id"]]))
}

## 18.14.6 definedName
## 18.14.6 definedNames
xlsx_ct_external_defined_names <- function(xml, ns) {
process_container(xml, "d1:definedNames", ns, xlsx_ct_external_defined_name,
classes=TRUE)
Expand Down
102 changes: 102 additions & 0 deletions R/register.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#' Register an Excel workbook
#'
#' Experimental function, in an experimental package! It covers alot of the same
#' ground as \code{\link{rexcel_read_workbook}()} with several notable
#' exceptions.
#'
#' \itemize{
#' \item Currently returns a list, not a proper linen::workbook.
#' \item Read one file at a time, make one or more objects from it, with only
#' the processing necessary to make it a reasonable R object.
#' \item For some clearly useful things, create downstream objects via
#' processing and/or combining info across primary objects.
#' }
#'
#' @param path path to xlsx
#'
#' @return a list
#' @keywords internal
#'
#' @examples
#' mini_gap_path <- system.file("sheets", "mini-gap.xlsx", package = "rexcel")
#' rexcel_register(mini_gap_path)
#'
#' ff_path <- system.file("sheets", "gs-test-formula-formatting.xlsx",
#' package = "rexcel")
#' rexcel_register(ff_path)
#' @export
rexcel_register <- function(path) {
## TO DO:
## if path is actually a workbook
## Recall(path$path)
## i.e. refresh registration of the workbook
## to be used when you are concerned the xlsx has changed
if (!is_xlsx(path)) {
stop("`path` does not appear to point to valid xlsx:\n", path,
call. = FALSE)
}
manifest <- xlsx_list_files(path)
ct <- xlsx_read_Content_Types(path) ## [Content_Types].xml
# rels <- xlsx_read_file(path, "_rels/.rels") # always boring? skipped
sheets <- xlsx_read_workbook_sheets(path) # xl/workbook.xml
defined_names <- xlsx_read_workbook_defined_names(path)
workbook_rels <- xlsx_read_workbook_rels(path)

## allow ourselves to do some synthesis of the above
## in workbook_rels, prepend xl/ to target
## join to sheets on id
## result has one row per worksheet
sheets_df <- join_sheets_workbook_rels(sheets, workbook_rels)

## back to the slog
shared_strings <- xlsx_read_shared_strings(path)
## reverting to Rich's work here and god bless him for it
styles <- xlsx_read_style(path)
## xl/worksheets/_rels/sheet1.xml.rels and friends
worksheet_rels <- xlsx_read_worksheet_rels(path)

## xl/worksheets/sheet1.xml etc.
#one_sheet <- xlsx_read_file(path, "xl/worksheets/sheet1.xml")
## come back here and parse enough to learn worksheet extent
## otherwise, I don't see anything here that belongs in top-level workbook
## creation

## xl/drawings/worksheetdrawing1.xml etc.
#one_drawing <- xlsx_read_file(path, "xl/drawings/worksheetdrawing1.xml")
## I don't see anything here that belongs in top-level workbook creation
## also, in my toy examples with no charts, this consists only of namespaces

dplyr::lst(xlsx_path = path,
reg_time = Sys.time(),
manifest,
content_types = ct,
sheets,
defined_names,
workbook_rels,
shared_strings,
styles,
worksheet_rels,
sheets_df
)
## TODO: obviously this should return a workbook object!
}

join_sheets_workbook_rels <- function(sheets, workbook_rels) {
suppressMessages(
sheets_df <- workbook_rels %>%
dplyr::mutate(target = file.path("xl", target)) %>%
dplyr::right_join(sheets) %>%
## use one_of() here because not all variables exist all the time
dplyr::select(
dplyr::one_of(c("sheet_id", "name", "id", "target", "state", "type"))
)
)
unique_type <- unique(sheets_df$type)
if (!identical(unique_type,
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet")) {
message("New type found for a worksheet! LOOK INTO THIS.")
} else {
sheets_df$type <- NULL
}
sheets_df
}
1 change: 1 addition & 0 deletions R/rexcel-package.r
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
#'
#' @name rexcel
#' @docType package
#' @importFrom dplyr %>%
NULL
15 changes: 14 additions & 1 deletion R/shared_strings.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,20 @@ xlsx_read_shared_strings <- function(path) {
if (is.null(xml)) {
return(character(0))
}
vcapply(xml2::xml_children(xml), xlsx_ct_rst, xml2::xml_ns(xml))
ns <- xml2::xml_ns(xml)
## to deal w/ less common namespacing, e.g. Ekaterinburg sheet
alt_ns <-
construct_xml_ns(x = "http://schemas.openxmlformats.org/spreadsheetml/2006/main")
if (ns_equal_to_ref(ns, alt_ns)) {
ns <- xml2::xml_ns_rename(ns, x = "d1")
}
string_items <- xml2::xml_children(xml)
ret <- vcapply(string_items, xlsx_ct_rst, ns)
## these gymnastics are necessary to preserve attribute names
at <- as.list(xml2::xml_attrs(xml)[c("count", "uniqueCount")])
at <- lapply(at, as.integer)
attributes(ret) <- at
ret
}

## 18.4.8 si
Expand Down