-
-
Notifications
You must be signed in to change notification settings - Fork 969
/
html_resources.R
449 lines (393 loc) · 17.5 KB
/
html_resources.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
#' Find External Resource References
#'
#' Given an R Markdown document or HTML file, attempt to determine the set of
#' additional files needed in order to render and display the document.
#'
#' This routine applies heuristics in order to scan a document for
#' possible resource references.
#'
#' In R Markdown documents, it looks for references to files implicitly
#' referenced in Markdown (e.g. \code{![alt](img.png)}), in the document's
#' YAML header, in raw HTML chunks, and as quoted strings in R code chunks
#' (e.g. \code{read.csv("data.csv")}).
#'
#' Resources specified explicitly in the YAML header for R Markdown
#' documents are also returned. To specify resources in YAML, use the
#' \code{resource_files} key:
#'
#' \preformatted{---
#' title: My Document
#' author: My Name
#' resource_files:
#' - data/mydata.csv
#' - images/figure.png
#' ---}
#'
#' Each item in the \code{resource_files} list can refer to:
#' \enumerate{
#' \item A single file, such as \code{images/figure.png}, or
#' \item A directory, such as \code{resources/data}, in which case all of the
#' directory's content will be recursively included, or
#' \item A wildcard pattern, such as \code{data/*.csv}, in which case all of
#' the files matching the pattern will be included. No recursion is done in
#' this case.
#' }
#'
#' In HTML files (and raw HTML chunks in R Markdown documents), this routine
#' searches for resources specified in common tag attributes, such as
#' \code{<img src="...">}, \code{<link href="...">}, etc.
#'
#' In all cases, only resources that exist on disk and are contained in the
#' document's directory (or a child thereof) are returned.
#' @param input_file path to the R Markdown document or HTML file to process
#' @inheritParams render
#' @return A data frame with the following columns:
#' \describe{
#' \item{path}{The relative path from the document to the resource}
#' \item{explicit}{Whether the resource was specified explicitly
#' (\code{TRUE}) or discovered implicitly (\code{FALSE})}
#' \item{web}{Whether the resource is needed to display a Web page rendered
#' from the document}
#' }
#' @export
find_external_resources <- function(input_file, encoding = 'UTF-8') {
# ensure we're working with valid input
ext <- tolower(xfun::file_ext(input_file))
if (!(ext %in% c("md", "rmd", "qmd", "html", "htm", "r", "css"))) {
stop("Resource discovery is only supported for R Markdown files or HTML files.")
}
if (!file.exists(input_file)) {
stop("The input file file '", input_file, "' does not exist.")
}
# set up the frame we'll use to report results
discovered_resources <- data.frame(
path = character(0), explicit = logical(0), web = logical(0)
)
input_dir <- dirname(normalize_path(input_file))
# discover a single resource--tests a string to see if it corresponds to a
# resource on disk; if so, adds it to the list of known resources and returns
# TRUE
discover_single_resource <- function(path, explicit, web) {
if (!(is.character(path) && length(path) == 1 && path != "." && path != ".." &&
file.exists(file.path(input_dir, path))))
return(FALSE)
ext <- tolower(xfun::file_ext(file.path(input_dir, path)))
if (identical(ext, "r")) {
# if this is a .R script, look for resources it contains, too
discover_r_resources(file.path(input_dir, path), discover_single_resource)
} else if (identical(ext, "css")) {
# if it's a CSS file, look for files it references (e.g. fonts/images)
discover_css_resources(file.path(input_dir, path), discover_single_resource)
}
# if this is an implicitly discovered resource, it needs to refer to
# a file rather than a directory
if (!explicit && dir_exists(file.path(input_dir, path))) {
return(FALSE)
}
# this looks valid; remember it
discovered_resources <<- rbind(discovered_resources, data.frame(
path = path, explicit = explicit, web = web, stringsAsFactors = FALSE
))
TRUE
}
# run the main resource discovery appropriate to the file type
if (ext %in% c("md", "rmd", "qmd")) {
# discover R Markdown doc resources--scans the document itself as described
# in comments above, renders as Markdown, and invokes HTML discovery
# on the result
discover_rmd_resources(input_file, discover_single_resource)
} else if (ext %in% c("htm", "html")) {
# discover HTML resources
discover_html_resources(input_file, discover_single_resource)
# if the HTML file represents a rendered R Markdown document, it may have a
# sidecar _files folder; include that if it's present
sidecar_files_dir <- knitr_files_dir(input_file)
files_dir_info <- file.info(sidecar_files_dir)
if (isTRUE(files_dir_info$isdir)) {
# we probably auto-discovered some resources from _files--exclude those
# since they'll be covered by the directory
files_dir_prefix <- file.path(basename(sidecar_files_dir), "")
files_dir_matches <- substr(
discovered_resources$path, 1, nchar(files_dir_prefix)
) == files_dir_prefix
discovered_resources <- discovered_resources[!files_dir_matches, , drop = FALSE]
# add the directory itself
discovered_resources <- rbind(discovered_resources, data.frame(
path = files_dir_prefix, explicit = FALSE, web = TRUE,
stringsAsFactors = FALSE)
)
}
} else if (ext == "r") {
discover_r_resources(input_file, discover_single_resource)
} else if (ext == "css") {
discover_css_resources(input_file, discover_single_resource)
}
# clean row names (they're not meaningful)
rownames(discovered_resources) <- NULL
# convert paths from factors if necssary, and clean any redundant ./ leaders
discovered_resources$path <- as.character(discovered_resources$path)
has_prefix <- grepl("^\\./", discovered_resources$path)
discovered_resources$path[has_prefix] <- substring(discovered_resources$path[has_prefix], 3)
discovered_resources
}
# discovers resources in a single HTML file
discover_html_resources <- function(html_file, discover_single_resource) {
# resource accumulator
discover_resource <- function(node, att, val, idx) {
res_file <- utils::URLdecode(val)
discover_single_resource(res_file, FALSE, TRUE)
}
# create a single string with all of the lines in the document
html_lines <- file_string(html_file)
# parse the HTML and invoke our resource discovery callbacks
call_resource_attrs(html_lines, discover_resource)
}
# discovers resources in a single R Markdown document
discover_rmd_resources <- function(rmd_file, discover_single_resource) {
# create a UTF-8 encoded Markdown file to serve as the resource discovery
# source
md_file <- tempfile(fileext = ".md")
input_dir <- dirname(normalize_path(rmd_file))
output_dir <- dirname(md_file)
rmd_content <- read_utf8(rmd_file)
if (length(i <- grep('^---\\s*$', rmd_content)) >= 2 && i[1] == 1) {
rmd_content <- append(rmd_content, 'citeproc: false', i[2] - 1)
}
write_utf8(rmd_content, md_file)
# create a vector of temporary files; anything in here
# will be cleaned up on exit
temp_files <- md_file
on.exit(unlink(temp_files, recursive = TRUE), add = TRUE)
# discovers render-time resources; if any are found, adds them to the list of
# discovered resources and copies them alongside the input document.
discover_render_resource <- function(output_render_file) {
if (discover_single_resource(output_render_file, FALSE, FALSE)) {
# mirror original directory structure so we don't need to mutate input
# prior to render
output_target_file <- file.path(output_dir, output_render_file)
if (!file.exists(dirname(output_target_file))) {
dir.create(dirname(output_target_file), showWarnings = FALSE, recursive = TRUE)
}
# copy the original resource to the temporary render folder
file.copy(file.path(input_dir, output_render_file), output_target_file)
# clean up this file when we're done
temp_files <<- c(temp_files, output_target_file)
}
}
# parse the YAML front matter to discover resources named there
front_matter <- yaml_front_matter(md_file)
# Check for content referred to by output format calls to the includes
# function (for generating headers/footers/etc. at render time), and for
# references to files in pandoc arguments.
#
# These will be needed to produce even a vanilla Markdown variant of the input
# document, so copy them to the temporary folder in preparation for rendering
# (in addition to marking them as required resources).
output_formats <- front_matter[["output"]]
if (is.list(output_formats)) {
for (output_format in output_formats) {
if (is.list(output_format)) {
output_render_files <- unlist(output_format[c(
'includes', 'pandoc_args', 'logo', 'reference_doc', 'reference_docx', 'template'
)])
lapply(output_render_files, discover_render_resource)
}
}
}
# check for explicitly named resources
if (!is.null(front_matter$resource_files)) {
lapply(front_matter$resource_files, function(res) {
explicit_res <- if (is.character(res)) {
list(path = res, explicit = TRUE, web = is_web_file(res))
} else if (is.list(res) && length(names(res)) > 0) {
# list--happens when web flag is specified explicitly in YAML.
list(path = names(res)[[1]],
explicit = TRUE,
web = if (is.null(res$web)) is_web_file(res) else res$web)
}
# check the extracted filename to see if it exists
if (!is.null(explicit_res)) {
if (grepl("*", explicit_res$path, fixed = TRUE)) {
# if the resource file spec includes a wildcard, list the files
# that match the pattern
files <- list.files(
file.path(input_dir, dirname(explicit_res$path)),
utils::glob2rx(basename(explicit_res$path))
)
lapply(files, function(f) discover_single_resource(
file.path(dirname(explicit_res$path), f), TRUE, web = is_web_file(f)
))
} else {
# no wildcard, see whether this resource refers to a directory or to
# an individual file
info <- file.info(file.path(input_dir, explicit_res$path))
if (is.na(info$isdir)) {
# implies that the file doesn't exist (should we warn here?)
NULL
} else if (isTRUE(info$isdir)) {
# if the resource file spec is a directory, include all the files in
# the directory, recursively
files <- list.files(
file.path(input_dir, explicit_res$path), recursive = TRUE
)
lapply(files, function(f) discover_single_resource(
file.path(explicit_res$path, f), TRUE, web = is_web_file(f)
))
} else {
# isdir is false--this is an individual file; return it
discover_single_resource(explicit_res$path, explicit_res$explicit, explicit_res$web)
}
}
} else {
discover_single_resource(explicit_res$path, explicit_res$explicit, explicit_res$web)
}
})
}
# check for a 'preview' yaml metadata entry
if (!is.null(front_matter[["preview"]])) {
discover_single_resource(front_matter[["preview"]], explicit = FALSE, web = TRUE)
}
# check for bibliography and csl files at the top level
for (bibfile in c("bibliography", "csl")) {
lapply(front_matter[[bibfile]], discover_render_resource)
}
# check for parameter values that look like files.
if (!is.null(front_matter$params)) {
# This is the raw parameter information and has not had any YAML tag
# processing performed. See `knitr:::resolve_params`.
lapply(front_matter$params, function(param) {
if (is.list(param)) {
if (identical(param$input, "file")) {
if (!is.null(param$value)) {
# We treat param filenames as non-web resources.
discover_single_resource(param$value, TRUE, FALSE)
}
}
}
})
}
# check for knitr child documents in R Markdown documents
if (tolower(xfun::file_ext(rmd_file)) %in% c("qmd", "rmd")) {
chunk_lines <- gregexpr(knitr::all_patterns$md$chunk.begin, rmd_content, perl = TRUE)
for (idx in seq_along(chunk_lines)) {
chunk_line <- chunk_lines[idx][[1]]
if (is.na(chunk_line) || chunk_line < 0) next
chunk_start <- attr(chunk_line, "capture.start", exact = TRUE) + 1
chunk_text <- substr(
rmd_content[idx], chunk_start,
chunk_start + attr(chunk_line, "capture.length", exact = TRUE) - 2
)
for (child_expr in c("\\bchild\\s*=\\s*'([^']+)'", "\\bchild\\s*=\\s*\"([^\"]+)\"")) {
child_match <- gregexpr(child_expr, chunk_text, perl = TRUE)[[1]]
if (child_match > 0) {
child_start <- attr(child_match, "capture.start", exact = TRUE)
child_text <- substr(
chunk_text, child_start,
child_start + attr(child_match, "capture.length", exact = TRUE) - 1
)
discover_render_resource(child_text)
}
}
}
}
# render "raw" markdown to HTML
html_file <- tempfile(fileext = ".html")
on.exit(unlink(html_file), add = TRUE)
# check to see what format this document is going to render as; if it's a
# format that produces HTML, let it render as-is, but if it isn't, render as
# html_document to pick up dependencies
output_format <- output_format_from_yaml_front_matter(rmd_content)
output_format_function <- eval(xfun::parse_only(output_format$name))
override_output_format <- if (!is_pandoc_to_html(output_format_function()$pandoc)) "html_document"
html_file <- render(
md_file, override_output_format, html_file, quiet = TRUE,
output_options = list(
self_contained = FALSE,
pandoc_args = c("--metadata", "pagetitle=PREVIEW")
)
)
# clean up output file and its supporting files directory
temp_files <- c(temp_files, html_file, knitr_files_dir(md_file), knitr_files_dir(html_file))
# run the HTML resource discovery mechanism on the rendered output
discover_html_resources(html_file, discover_single_resource)
# if this is an R Markdown file, purl the file to extract just the R code
if (tolower(xfun::file_ext(rmd_file)) %in% c("qmd", "rmd")) {
r_file <- tempfile(fileext = ".R")
# suppress possible try() errors https://github.com/rstudio/rmarkdown/issues/1247
try_file <- tempfile()
opts <- options(try.outFile = try_file)
on.exit({
unlink(c(r_file, try_file)); options(opts)
}, add = TRUE)
knitr::purl(md_file, output = r_file, quiet = TRUE, documentation = 0)
temp_files <- c(temp_files, r_file)
discover_r_resources(r_file, discover_single_resource)
}
}
discover_r_resources <- function(r_file, discover_single_resource) {
# read the lines from the R file
r_lines <- read_utf8(r_file)
# clean comments from the R code (simply; consider: # inside strings)
r_lines <- sub("#.*$", "", r_lines)
# find quoted strings in the code and attempt to ascertain whether they are
# files on disk
r_lines <- one_string(r_lines)
quoted_strs <- Reduce(c, lapply(c("\"[^\"\n]*\"", "'[^'\n]*'"), function(pat) {
matches <- unlist(regmatches(r_lines, gregexpr(pat, r_lines)))
substr(matches, 2, nchar(matches) - 1)
}))
# consider any quoted string containing a valid relative path to a file that
# exists on disk to be a reference
for (quoted_str in quoted_strs) {
if (nchar(quoted_str) > 0)
discover_single_resource(quoted_str, FALSE, is_web_file(quoted_str))
}
}
# copies the external resources needed to render original_input into
# intermediates_dir; with skip_web, skips web resources. returns a character
# vector containing paths to all resources copied.
copy_render_intermediates <- function(original_input, intermediates_dir, skip_web) {
# start with an empty set of intermediates
intermediates <- c()
# extract all the resources used by the input file; note that this actually
# runs another (non-knitting) render, and that recursion is avoided because
# we explicitly render with self-contained = FALSE while discovering
# resources
resources <- find_external_resources(original_input)
dest_dir <- normalize_path(intermediates_dir)
source_dir <- dirname(normalize_path(original_input))
# process each returned reosurce
by(resources, seq_len(nrow(resources)), function(res) {
# skip web resources if requested
if (skip_web && res$web) return()
dest <- copy_file_with_dir(res$path, dest_dir, source_dir)
intermediates <<- c(intermediates, dest)
})
# return the list of files we generated
intermediates
}
# copy a file from a relative path to a destination dir, and preserve its
# original dir structure, e.g., if we copy foo/bar.txt to /tmp, the destination
# file should be /tmp/foo/bar.txt instead of /tmp/bar.txt
copy_file_with_dir <- function(path, dest, from = '.') {
dest <- file.path(dest, path)
path <- file.path(from, path)
if (!file.exists(path)) return()
if (!dir_exists(dirname(dest))) dir.create(dirname(dest), recursive = TRUE)
file.copy(path, dest)
dest
}
discover_css_resources <- function(css_file, discover_single_resource) {
css_lines <- read_utf8(css_file)
discover_resource <- function(node, att, val, idx) {
res_file <- utils::URLdecode(val)
discover_single_resource(res_file, FALSE, TRUE)
}
call_css_resource_attrs(one_string(css_lines), discover_resource)
}
# given a filename, return true if the file appears to be a web file
is_web_file <- function(filename) {
tolower(xfun::file_ext(filename)) %in% c(
"css", "gif", "htm", "html", "jpeg", "jpg", "js", "mp3", "mp4", "png", "wav"
)
}