From c215e941bd543484da10003dd5051e78b72eca8b Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 29 Aug 2023 12:45:41 +0000 Subject: [PATCH] WIP `xpath_search()` --- R/xml_find.R | 30 +------------- src/init.c | 2 + src/xml2_xpath.cpp | 69 ++++++++++++++++++++++++++++++- tests/testthat/test-xml_missing.R | 2 +- tests/testthat/test-xml_nodeset.R | 2 +- 5 files changed, 72 insertions(+), 33 deletions(-) diff --git a/R/xml_find.R b/R/xml_find.R index 74791dd..7b603c2 100644 --- a/R/xml_find.R +++ b/R/xml_find.R @@ -111,37 +111,9 @@ xml_find_all.xml_nodeset <- function(x, xpath, ns = xml_ns(x), flatten = TRUE, . #' @export #' @rdname xml_find_all xml_find_first <- function(x, xpath, ns = xml_ns(x)) { - UseMethod("xml_find_first") -} - -#' @export -xml_find_first.xml_missing <- function(x, xpath, ns = xml_ns(x)) { - xml_missing() + .Call(xpath_search2, x, xpath, ns, 1) } -#' @export -xml_find_first.xml_node <- function(x, xpath, ns = xml_ns(x)) { - .Call(xpath_search, x$node, x$doc, xpath, ns, 1) -} - -#' @export -xml_find_first.xml_nodeset <- function(x, xpath, ns = xml_ns(x)) { - if (length(x) == 0) { - return(xml_nodeset()) - } - - xml_nodeset( - lapply( - x, - function(x) { - xml_find_first(x, xpath = xpath, ns = ns) - } - ), - deduplicate = FALSE - ) -} - - #' @export #' @rdname xml_find_all xml_find_num <- function(x, xpath, ns = xml_ns(x)) { diff --git a/src/init.c b/src/init.c index 9fcecfc..df5dfcb 100644 --- a/src/init.c +++ b/src/init.c @@ -74,6 +74,7 @@ extern SEXP url_unescape_(SEXP); extern SEXP xml_parse_options_(void); extern SEXP xml_save_options_(void); extern SEXP xpath_search(SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP xpath_search2(SEXP, SEXP, SEXP, SEXP); static const R_CallMethodDef CallEntries[] = { {"doc_has_root", (DL_FUNC) &doc_has_root, 1}, @@ -139,6 +140,7 @@ static const R_CallMethodDef CallEntries[] = { {"xml_parse_options_", (DL_FUNC) &xml_parse_options_, 0}, {"xml_save_options_", (DL_FUNC) &xml_save_options_, 0}, {"xpath_search", (DL_FUNC) &xpath_search, 5}, + {"xpath_search2", (DL_FUNC) &xpath_search2, 4}, {NULL, NULL, 0} }; diff --git a/src/xml2_xpath.cpp b/src/xml2_xpath.cpp index 3de58b6..f84ba0b 100644 --- a/src/xml2_xpath.cpp +++ b/src/xml2_xpath.cpp @@ -116,10 +116,74 @@ class XmlSeeker { }; +SEXP xpath_search_impl(SEXP x, const char* xpath, SEXP nsMap_sxp, double num_results) { + NodeType type = getNodeType(x); + + SEXP out; + + switch(type) { + case NodeType::missing: + Rprintf("-> missing\n"); + out = new_xml_missing(); + break; + case NodeType::node: { + SEXP node_sxp = VECTOR_ELT(x, 0); + XPtrNode node(node_sxp); + SEXP doc_sxp = VECTOR_ELT(x, 1); + XPtrDoc doc(doc_sxp); + + XmlSeeker seeker(doc, node.checked_get()); + seeker.registerNamespace(nsMap_sxp); + out = seeker.search(xpath, num_results); + break; + } + default: Rf_error("Unexpected node type"); + } + + return(out); +} + // [[export]] -extern "C" SEXP xpath_search(SEXP node_sxp, SEXP doc_sxp, SEXP xpath_sxp, SEXP nsMap_sxp, SEXP num_results_sxp) { +extern "C" SEXP xpath_search2(SEXP x, SEXP xpath_sxp, SEXP nsMap_sxp, SEXP num_results_sxp) { + if (TYPEOF(xpath_sxp) != STRSXP) { + Rf_error("XPath must be a string, received %s", Rf_type2char(TYPEOF(xpath_sxp))); + } + const char* xpath = CHAR(STRING_ELT(xpath_sxp, 0)); - XPtrNode node(node_sxp); + double num_results = REAL(num_results_sxp)[0]; + if (num_results == R_PosInf) { + num_results = INT_MAX; + } + + NodeType type = getNodeType(x); + + switch(type) + { + case NodeType::missing: + case NodeType::node : + return(xpath_search_impl(x, xpath, nsMap_sxp, num_results)); + break; + case NodeType::nodeset: { + int n = Rf_xlength(x); + + SEXP out = PROTECT(Rf_allocVector(VECSXP, n)); + + for (int i = 0; i < n; ++i) { + SEXP x_i = VECTOR_ELT(x, i); + SEXP res_i = xpath_search_impl(x_i, xpath, nsMap_sxp, num_results); + SET_VECTOR_ELT(out, i, res_i); + } + + Rf_setAttrib(out, R_ClassSymbol, Rf_mkString("xml_nodeset")); + + UNPROTECT(1); + return(out); + }; + } +} + +// [[export]] +extern "C" SEXP xpath_search(SEXP node_sxp, SEXP doc_sxp, SEXP xpath_sxp, SEXP nsMap_sxp, SEXP num_results_sxp) { XPtrDoc doc(doc_sxp); if (TYPEOF(xpath_sxp) != STRSXP) { Rf_error("XPath must be a string, received %s", Rf_type2char(TYPEOF(xpath_sxp))); @@ -128,6 +192,7 @@ extern "C" SEXP xpath_search(SEXP node_sxp, SEXP doc_sxp, SEXP xpath_sxp, SEXP n double num_results = REAL(num_results_sxp)[0]; + XPtrNode node(node_sxp); if (num_results == R_PosInf) { num_results = INT_MAX; } diff --git a/tests/testthat/test-xml_missing.R b/tests/testthat/test-xml_missing.R index fdc24f0..3583471 100644 --- a/tests/testthat/test-xml_missing.R +++ b/tests/testthat/test-xml_missing.R @@ -27,7 +27,7 @@ test_that("xml_missing methods return properly for all S3 methods", { expect_equal(xml_find_chr(mss), character()) expect_equal(xml_find_lgl(mss), logical()) expect_equal(xml_find_num(mss), numeric()) - expect_equal(xml_find_first(mss), xml_missing()) + expect_equal(xml_find_first(mss, "dummy"), xml_missing()) expect_equal(xml_length(mss), 0L) expect_equal(xml_name(mss), NA_character_) expect_equal(xml_parent(mss), xml_missing()) diff --git a/tests/testthat/test-xml_nodeset.R b/tests/testthat/test-xml_nodeset.R index 3a92c0b..77d9915 100644 --- a/tests/testthat/test-xml_nodeset.R +++ b/tests/testthat/test-xml_nodeset.R @@ -35,7 +35,7 @@ test_that("methods work on empty nodesets", { expect_identical(xml_double(empty), numeric()) expect_identical(xml_find_all(empty), empty) expect_identical(xml_find_chr(empty), character()) - expect_identical(xml_find_first(empty), empty) + expect_identical(xml_find_first(empty, "dummy"), empty) expect_identical(xml_find_lgl(empty), logical()) expect_identical(xml_find_num(empty), numeric()) expect_identical(xml_integer(empty), integer())