Skip to content

Commit

Permalink
Merge pull request #27 from mbojan/xml_errors
Browse files Browse the repository at this point in the history
Handling XML faults
  • Loading branch information
sckott committed Oct 6, 2015
2 parents 94d1d3c + 72bf319 commit ecbea70
Show file tree
Hide file tree
Showing 2 changed files with 120 additions and 16 deletions.
67 changes: 67 additions & 0 deletions R/read_xml_safely.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Read XML while fishing-out some errors

# Function `tryCatch` looks at conditions thrown by `read_xml`. If the condition
# is of class "Rcpp::exception" a handler function `handle_rcpp_exception` is
# executed. This function looks at the message of the original condition. If it
# looks like "PCDATA invalid Char value" it extracts the value of the invalid
# character and the error number (in square brackets in the original message). A
# new condition object is created of class `"invalid_char_value"` which extends
# the original class `"Rcpp::exception"`. Extracted offending character value
# and error number are added as attributes to the new condition. Finally, the
# condition is signalled.
#
# Conditions not caught by `tryCatch`, so all others apart from
# `"Rcpp::exception"`s, are signaled as usual.
#
# This allows for special handling of different conditions somewhere else

read_xml_with_errors <- function(x, ...) {
# Handle Rcpp exceptions,
# which include libxml errors
handle_rcpp_exception <- function(cond) {
# better error message
cond$message <- paste("xml2::read_xml says:", cond$message)
# enhance for selected errors
if(grepl("PCDATA invalid Char value", cond$message)) {
cond <- condition(
subclass=c("invalid_char_value", class(cond)),
message = cond$message,
call=cond$call,
error_no = as.numeric(stringr::str_extract(cond$message, "(?<=\\[)[0-9]+(?=\\]$)" )),
char_value = as.numeric(stringr::str_extract(cond$message, "(?<=value )[0-9]+(?= \\[)" ))
)
}
stop(cond)
}

# Catch!
tryCatch( xml2::read_xml(x, ...),
"Rcpp::exception" = handle_rcpp_exception
)
}




# Read XML safely
#
# Removes illegal characters
# @param invalid_as NA or character. If not NA, replace invalid characters with `invalid_as`
read_xml_safely <- function(x, ..., xml_invalid_as=getOption("oai.xml_invalid_as", "") ) {
repeat {
tryCatch( return(read_xml_with_errors(x, ...)),
# Removing offending characters
invalid_char_value = function(er, invalid_as=xml_invalid_as) {
charint <- attr(er, "char_value")
if(is.na(invalid_as)) {
stop(er)
} else {
stopifnot(is.character(xml_invalid_as))
x <<- gsub(intToUtf8(charint), invalid_as, x)
msg <- paste0(er$message, ", replacing offending characters with ", dQuote(invalid_as))
warning(msg)
}
}
)
}
}
69 changes: 53 additions & 16 deletions R/while_oai.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,26 +16,41 @@ while_oai <- function(url, args, token, as, dumper=NULL, dumper_args=NULL, ...)
res <- GET(url, query = args2, ...)
stop_for_status(res)
tt <- content(res, "text")
xml_orig <- xml2::read_xml(tt)
handle_errors(xml_orig)
xml <- xml2::xml_children(xml2::xml_children(xml_orig)[[3]])
trytok <- xml2::as_list(xml[sapply(xml, xml_name) == "resumptionToken"])
if (length(trytok) == 0) {
tok <- list(token = "")

# try parsing
parsed <- try( read_xml_safely(tt), silent=TRUE )
if(inherits(parsed, "try-error")) {
parsed <- try( xml2::read_html(tt), silent=TRUE )
warning("read_xml parsing failed, but read_html succeeded")
if( inherits(parsed, "try-error") ) {
fname <- tempfile(
pattern=paste0("oaidump_", chartr(" :", "_-", Sys.time())),
tmpdir=".", fileext="xml")
cat(tt, file=fname)
stop(paste0("cannot parse downloaded XML, dumped raw XML to file ", fname))
}
is_html <- TRUE
} else {
tok <- xml2::xml_text(trytok[[1]])
tok_atts <- xml2::xml_attrs(trytok[[1]])
tok <- c(token = tok, as.list(tok_atts))
is_html <- FALSE
}

handle_errors(parsed)
tok <- get_token(parsed, verb=args2$verb, is_html=is_html)
# `as` determines what the `dumper` gets
res <- if (as == "raw") {
tt
if (as == "raw") {
res <- tt
} else {
switch(args$verb,
ListRecords = get_data(xml, as = as),
ListIdentifiers = parse_listid(xml, as = as),
ListSets = get_sets(xml, as = as)
)
if(is_html) {
warning("malformed XML - keeping raw text even though `as` is ", dQuote(as))
res <- tt
} else {
xml_verb <- xml2::xml_children(xml2::xml_children(parsed)[[3]]) # TODO xpath here
res <- switch(args$verb,
ListRecords = get_data(xml_verb, as = as),
ListIdentifiers = parse_listid(xml_verb, as = as),
ListSets = get_sets(xml_verb, as = as)
)
}
}
# Collect values returned by `dumper` if they are not NULL
if (is.null(dumper)) {
Expand All @@ -59,3 +74,25 @@ while_oai <- function(url, args, token, as, dumper=NULL, dumper_args=NULL, ...)
ListSets = out
)
}

# Get resumptionToken from parsed XML/HTML
# @param x result of read_xml or read_html
# @param is_html is it XML or HTML (incl. malformed XML)
get_token <- function(x, verb, is_html=FALSE) {
xp <- paste0("/*[local-name()='OAI-PMH']/*[local-name()='", verb, "']/*[local-name()='",
if(is_html) "resumptiontoken" else "resumptionToken", "']" )
node <- xml2::xml_find_all(x, xp)
if(length(node) == 0) {
return( list(token="") )
}
else {
if(length(node) > 1) {
warning("more than one match - using last")
node <- node[length(node)]
}
return( c(
token=xml2::xml_text(node),
as.list(xml2::xml_attrs(node))
) )
}
}

0 comments on commit ecbea70

Please sign in to comment.