Skip to content

Commit

Permalink
Make xml_serialize()/xml_unserialize() work also for HTML documents (#…
Browse files Browse the repository at this point in the history
…408)

Fixes #407
  • Loading branch information
HenrikBengtsson committed Nov 10, 2023
1 parent e088a1c commit b9f65ba
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 2 deletions.
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# xml2 (development version)

* `xml_serialize()` now includes the document type so that `xml_unserialize()` works also for HTML documents (#407, @HenrikBengtsson).

* Remove unused dependencies on glue, withr and lifecycle (@mgirlich).

* `print()` is faster for very long `xml_nodeset` inputs (#366, @michaelchirico).

# xml2 1.3.5
Expand Down
10 changes: 8 additions & 2 deletions R/xml_serialize.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ xml_serialize.xml_document <- function(object, connection, ...) {
connection <- file(connection, "w", raw = TRUE)
on.exit(close(connection))
}
serialize(structure(as.character(object, ...), class = "xml_serialized_document"), connection)
serialize(structure(as.character(object, ...), doc_type = doc_type(object), class = "xml_serialized_document"), connection)
}

#' @export
Expand Down Expand Up @@ -64,7 +64,13 @@ xml_unserialize <- function(connection, ...) {
# Select only the root
res <- xml_find_first(x, "/node()")
} else if (inherits(object, "xml_serialized_document")) {
res <- read_xml(unclass(object), ...)
read_xml_int <- function(object, as_html = FALSE, ...) {
if (missing(as_html)) {
as_html <- identical(attr(object, "doc_type", exact = TRUE), "html")
}
read_xml(unclass(object), as_html = as_html, ...)
}
res <- read_xml_int(unclass(object), ...)
} else {
abort("Not a serialized xml2 object")
}
Expand Down
15 changes: 15 additions & 0 deletions tests/testthat/test-xml_serialize.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ x <- read_xml("<a>
<b><c>123</c></b>
<b><c>456</c></b>
</a>")

test_that("xml_serialize and xml_unserialize work with xml_document input", {
out <- xml_unserialize(xml_serialize(x, NULL))
expect_identical(as.character(x), as.character(out))
Expand Down Expand Up @@ -37,6 +38,20 @@ test_that("xml_serialize and xml_unserialize work with xml_nodeset input", {
expect_identical(as.character(xml_unserialize(f)), as.character(b))
})

test_that("xml_serialize and xml_unserialize work with HTML-based xml_document input", {
file <- system.file("extdata", "r-project.html", package = "xml2")
x <- read_html(file)

out <- xml_unserialize(xml_serialize(x, NULL))
expect_identical(as.character(x), as.character(out))

f <- tempfile()
on.exit(unlink(f))

xml_serialize(x, f)
expect_identical(as.character(xml_unserialize(f)), as.character(x))
})

test_that("xml_unserialize throws an error if given a invalid object", {
expect_error(xml_unserialize(serialize(1, NULL)), "Not a serialized xml2 object")
})

0 comments on commit b9f65ba

Please sign in to comment.