Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Tickbox and Math Protection #39

Merged
merged 34 commits into from
May 11, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
28cd450
add asis node; demonstrate math protection
zkamvar May 1, 2021
22f4464
fix example
zkamvar May 2, 2021
11d946b
add emph asis type
zkamvar May 2, 2021
acffb34
update asis node example
zkamvar May 2, 2021
6ad6945
add asis node testing
zkamvar May 2, 2021
6c9af60
simplify code and parse out reusable chunks
zkamvar May 3, 2021
e7846c0
store arguments in yarn object for resetting.
zkamvar May 4, 2021
365b861
make more progress towards inline math.
zkamvar May 4, 2021
44e05fb
use correct variable names
zkamvar May 5, 2021
55e16a7
add grouping for full regex
zkamvar May 5, 2021
e6d501a
update protect_math() and snapshot
zkamvar May 5, 2021
14655c5
update documentation
zkamvar May 5, 2021
282c360
add include = TRUE for block math
zkamvar May 6, 2021
fe955ee
update regex for inline math
zkamvar May 6, 2021
831c27f
update inline math parsing
zkamvar May 6, 2021
60d9a53
add new test with snapshot
zkamvar May 6, 2021
72a69b1
document; add condidtions for no math
zkamvar May 6, 2021
b6c91b4
export find_between(); fix parsing issue
zkamvar May 6, 2021
b058508
update test
zkamvar May 6, 2021
14e0cee
add nocov tags
zkamvar May 6, 2021
21d1688
add tickbox protection
zkamvar May 7, 2021
c9f711d
document
zkamvar May 7, 2021
df7cf0d
remove unneeded body
zkamvar May 7, 2021
284bff0
add not math dollar in example
maelle May 7, 2021
aae7811
Apply suggestions from code review
zkamvar May 7, 2021
589642c
Merge pull request #40 from ropensci/small
zkamvar May 7, 2021
0701022
update error message
zkamvar May 7, 2021
47e4d84
update documentation; delta internal function name
zkamvar May 7, 2021
d46e9ef
add test for error
zkamvar May 7, 2021
c2748ed
make tickboxes protected by default
zkamvar May 7, 2021
b9fd529
update README to explain math
zkamvar May 7, 2021
5060438
escape protection early if nothing needs protectin
zkamvar May 7, 2021
c00a12a
process tickboxes up front
zkamvar May 7, 2021
b3afe1e
polish code; add docs; add tests
zkamvar May 11, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Generated by roxygen2: do not edit by hand

export(find_between)
export(md_ns)
export(protect_math)
export(to_md)
export(to_xml)
export(yarn)
Expand Down
8 changes: 8 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,14 @@
* xml and yaml objects are now stored in an R6 class called `yarn`.
* testthat edition 3 is now being used with snapshot testing.
* Tables are now pretty after a full loop `to_xml()` + `to_md()` (@pdaengeli, #9)
* 2021-05-04: yarn objects remember the `sourcepos` and `encoding` options
zkamvar marked this conversation as resolved.
Show resolved Hide resolved
when using the `$reset()` method.
* 2021-05-06: `protect_math()` function and method protects LaTeX math (dollar
notation) from escaping by commonmark (@zkamvar, #39).
* 2021-05-06: GitHub-flavored markdown ticks/checkboxes are now protected by
default (@zkamvar, #39).
* 2021-05-11: `md_ns()` is a new convenience function to provide the `md`
namespace prefix for commonmark xml documents (@zkamvar, #39).

# tinkr 0.0.0.9000

Expand Down
54 changes: 54 additions & 0 deletions R/add_md.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#' Add markdown content to an XML object
#'
#' @param body an XML object generated via {tinkr}
#' @param md a string of new markdown to insert
#' @param where the position in the markdown document to insert the new markdown
#' @keywords internal
#'
#' @return a copy of the XML object with the markdown inserted.
add_md <- function(body, md, where = 0L) {
new <- md_to_xml(md)
add_nodes_to_body(body, new, where)
copy_xml(body)
}

# Add children to a specific location in the full document.
add_nodes_to_body <- function(body, nodes, where = 0L) {
for (child in rev(nodes)) {
maelle marked this conversation as resolved.
Show resolved Hide resolved
xml2::xml_add_child(body, child, .where = where)
}
}

# Add siblings to a node
add_node_siblings <- function(node, nodes, where = "after", remove = TRUE) {
for (sib in rev(nodes)) {
xml2::xml_add_sibling(node, sib, .where = where)
}
if (remove) xml2::xml_remove(node)
}
zkamvar marked this conversation as resolved.
Show resolved Hide resolved

#' Convert markdown to XML
#'
#' @param md a character vector of markdown text
#' @return an XML nodeset of the markdown text
#' @keywords internal
#' @examples
#' tinkr:::md_to_xml(c(
#' "## This is a new section of markdown",
#' "",
#' "Each new element",
#' "Is converted to a new line of markdown text",
#' "",
#' "```{r code-example, echo = FALSE}",
#' "cat('code blocks work well here, too')",
#' "```",
#' "",
#' "Neat, right?"
maelle marked this conversation as resolved.
Show resolved Hide resolved
#' ))
md_to_xml <- function(md) {
new <- clean_content(paste(md, collapse = "\n"))
new <- commonmark::markdown_xml(new, extensions = TRUE)
parse_rmd(new <- xml2::read_xml(new))
new <- xml2::xml_ns_strip(new)
xml2::xml_children(new)
}
247 changes: 247 additions & 0 deletions R/asis-nodes.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
#' Protect math elements from commonmark's character escape
#'
#' @param body an XML object
#' @param ns an XML namespace object (defaults: [md_ns()]).
#' @return a copy of the modified XML object
#' @details Commonmark does not know what LaTeX is and will LaTeX equations as
#' normal text. This means that content surrounded by underscores are
#' interpreted as `<emph>` elements and all backslashes are escaped by default.
#' This function protects inline and block math elements that use `$` and `$$`
#' for delimiters, respectively.
#'
#' @note this function is also a method in the [tinkr::yarn] object.
#'
#' @export
#' @examples
#' m <- tinkr::to_xml(system.file("extdata", "math-example.md", package = "tinkr"))
#' txt <- textConnection(tinkr::to_md(m))
#' cat(tail(readLines(txt)), sep = "\n") # broken math
#' close(txt)
#' m$body <- protect_math(m$body)
#' txt <- textConnection(tinkr::to_md(m))
#' cat(tail(readLines(txt)), sep = "\n") # fixed math
#' close(txt)
protect_math <- function(body, ns = md_ns()) {
# block math adds attributes, done in memory
protect_block_math(body, ns)
# inline math adds _nodes_, which means a new document
protect_inline_math(body, ns)
}

set_asis <- function(nodes) {
xml2::xml_set_attr(nodes[xml2::xml_name(nodes) != "softbreak"], "asis", "true")
}

# INLINE MATH ------------------------------------------------------------------

# finding inline math consists of searching for $ and excluding $$
find_inline_math <- function(body, ns) {
i <- ".//md:text[not(@asis) and contains(text(), '$') and not(contains(text(), '$$'))]"
xml2::xml_find_all(body, i, ns = ns)
}

# Helper function to return the proper regex for inline math.
# Having the start and stop type individually allows me to invert the
# union between them to find the incomplete cases.
inline_dollars_regex <- function(type = c("start", "stop", "full")) {
# any space
ace <- "[:space:]"
punks <- glue::glue("[{ace}[:punct:]]")
# Note about this regex: the first part is a lookahead (?=...) that searches
# for the line start, space, or punctuation. Importantly about lookaheads,
# they do not consume the string
# (https://junli.netlify.app/en/overlapping-regular-expression-in-python/)
#
# The rest of the regex looks for a dollar sign that does not butt up against
# a space or another dollar.
start <- glue::glue("(?=^|{punks})[$]?[$][^{ace}$]")
stop <- glue::glue("[^{ace}$][$][$]?(?={punks}|$)")
switch(type,
start = start,
stop = stop,
full = glue::glue('({start}.+?{stop})')
)
}

# Find incomplete cases for inline math
find_broken_math <- function(math) {
txt <- xml2::xml_text(math)
start <- grepl(inline_dollars_regex("start"), txt, perl = TRUE)
stop <- grepl(inline_dollars_regex("stop"), txt, perl = TRUE)
incomplete <- !(start & stop)
list(
no_end = start & incomplete,
no_beginning = stop & incomplete
)
}

#' Find and protect all inline math elements
#'
#' @param body an XML document
#' @param ns an XML namespace
#' @return a modified _copy_ of the original XML document
#' @keywords internal
#' @examples
#' txt <- commonmark::markdown_xml(
#' r"{This sentence contains $I_A$ $\frac{\pi}{2}$ inline $\LaTeX$ math.}"
#' )
#' txt <- xml2::read_xml(txt)
#' cat(to_md(list(body = txt, yaml = "")), sep = "\n")
#' ns <- tinkr::md_ns()
#' protxt <- tinkr:::protect_inline_math(txt, ns)
#' cat(to_md(list(body = protxt, yaml = "")), sep = "\n")
protect_inline_math <- function(body, ns) {
body <- copy_xml(body)
math <- find_inline_math(body, ns)
if (length(math) == 0) {
return(body)
}
broke <- find_broken_math(math)

bespoke <- !(broke$no_end | broke$no_beginning)
endless <- broke$no_end[!bespoke]
headless <- broke$no_beginning[!bespoke]

imath <- math[bespoke]
bmath <- math[!bespoke]

# protect math that is strictly inline
if (length(imath)) {
new_nodes <- lapply(
fix_fully_inline(imath),
FUN = function(n) xml2::xml_ns_strip(xml2::xml_children(n))
)
# since we split up the nodes, we have to do this node by node
for (i in seq(new_nodes)) {
add_node_siblings(imath[[i]], new_nodes[[i]], remove = TRUE)
}
}

# protect math that is broken across lines or markdown elements
if (length(bmath)) {
# If the lengths of the beginning and ending tags don't match, we throw
# an error.
if ((le <- length(bmath[endless])) != (lh <- length(bmath[headless]))) {
unbalanced_math_error(bmath, endless, headless, le, lh)
}
# assign sequential tags to the pairs of inline math elements
tags <- seq(length(bmath[endless]))
xml2::xml_set_attr(bmath[endless], "latex-pair", tags)
xml2::xml_set_attr(bmath[headless], "latex-pair", tags)
for (i in tags) {
fix_partial_inline(i, body, ns)
}
}
copy_xml(body)
}

# Partial inline math are math elements that are not entirely embedded in a
# single `<text>` element. There are two reasons for this:
#
# 1. Math is split across separate lines in the markdown document
# 2. There are elements like `_` that are interpreted as markdown elements.
#
# To use this function, an inline pair needs to be first tagged with a
# `latex-pair` attribute that uniquely identifies that pair of tags. It assumes
# that all of the content between that pair of tags belongs to the math element.
fix_partial_inline <- function(tag, body, ns) {
# find everything between the tagged pair
math_lines <- find_between_inlines(body, ns, tag)
# make sure everything between the tagged pair is labeled as 'asis'
filling <- math_lines[is.na(xml2::xml_attr(math_lines, "latex-pair"))]
set_asis(filling)
filling <- xml2::xml_find_all(filling, ".//node()")
set_asis(filling)
# paste the lines together and create new nodes
n <- length(math_lines)
char <- as.character(math_lines)
char[[1]] <- sub("[$]", "$</text><text asis='true'>", char[[1]])
char[[n]] <- sub("[<]text ", "<text asis='true' ", char[[n]])
nodes <- paste(char, collapse = "")
nodes <- xml2::xml_children(set_default_space(nodes))
nodes <- xml2::xml_ns_strip(nodes)
# add the new nodes to the bottom of the existing math lines
last_line <- math_lines[n]
to_remove <- math_lines[-n]
add_node_siblings(last_line, nodes, remove = TRUE)
# remove the duplicate lines
xml2::xml_remove(to_remove)
}

fix_fully_inline <- function(math) {
char <- as.character(math)
# Find inline math that is complete and wrap it in text with asis
# <text>this is $\LaTeX$ text</text>
# becomes
# <text>this is </text><text asis='true'>$\LaTeX$</text><text> text</text>
char <- gsub(
pattern = inline_dollars_regex("full"),
replacement = "</text><text asis='true'>\\1</text><text>",
x = char,
perl = TRUE
)
set_default_space(char)
}

set_default_space <- function(char) {
new_nodes <- char_to_nodelist(char)
n <- xml2::xml_find_all(new_nodes, ".//node()")
# set space to default to avoid weird formatting (this may change)
xml2::xml_set_attr(n, "xml:space", "default")
new_nodes
}

char_to_nodelist <- function(txt) {
doc <- glue::glue(commonmark::markdown_xml("{paste(txt, collapse = '\n')}"))
doc <- xml2::read_xml(doc)
nodes <- xml2::xml_children(xml2::xml_children(doc))
nodes[xml2::xml_name(nodes) != "softbreak"]
}


# BLOCK MATH ------------------------------------------------------------------

find_block_math <- function(body, ns) {
find_between(body, ns, pattern = "md:text[contains(text(), '$$')]", include = FALSE)
}

find_between_inlines <- function(body, ns, tag) {
to_find <- "md:text[@latex-pair='{tag}']"
find_between(body, ns, pattern = glue::glue(to_find), include = TRUE)
}

protect_block_math <- function(body, ns) {
bm <- find_block_math(body, ns)
# get all of the internal nodes
bm <- xml2::xml_find_all(bm, ".//descendant-or-self::md:*", ns = ns)
set_asis(bm)
}

# TICK BOXES -------------------------------------------------------------------

tick_check <- function(body, ns) {
predicate <- "starts-with(text(), '[ ]') or starts-with(text(), '[x]')"
zkamvar marked this conversation as resolved.
Show resolved Hide resolved
cascade <- glue::glue(".//md:item/md:paragraph/md:text[{predicate}]")
xml2::xml_find_all(body, cascade, ns = ns)
}

protect_tickbox <- function(body, ns) {
body <- copy_xml(body)
ticks <- tick_check(body, ns)
if (length(ticks) == 0) {
return(body)
}
# set the tickbox asis
set_asis(ticks)
char <- as.character(ticks)
char <- sub("(\\[.\\])", "\\1</text><text>", char, perl = TRUE)
new_nodes <- lapply(
set_default_space(char),
function(n) xml2::xml_ns_strip(xml2::xml_children(n))
)
# since we split up the nodes, we have to do this node by node
for (i in seq(new_nodes)) {
add_node_siblings(ticks[[i]], new_nodes[[i]], remove = TRUE)
}
copy_xml(body)
}
Loading