From b2004a65b3f3c74edd3c02e9b7768ba349b53366 Mon Sep 17 00:00:00 2001 From: George Stagg Date: Mon, 2 Sep 2024 10:19:58 +0100 Subject: [PATCH 01/12] Add and export make_tar_index function --- NAMESPACE | 1 + R/tar.R | 148 ++++++++++++++++++++++++++++++++++++++++++ man/make_tar_index.Rd | 25 +++++++ 3 files changed, 174 insertions(+) create mode 100644 R/tar.R create mode 100644 man/make_tar_index.Rd diff --git a/NAMESPACE b/NAMESPACE index b550c1d..a466b57 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,6 +6,7 @@ export(add_repo) export(build) export(file_packager) export(make_library) +export(make_tar_index) export(make_vfs_library) export(make_vfs_repo) export(rm_pkg) diff --git a/R/tar.R b/R/tar.R new file mode 100644 index 0000000..ebef738 --- /dev/null +++ b/R/tar.R @@ -0,0 +1,148 @@ +#' Create an Emscripten metadata file for a given `tar` archive +#' +#' Calculates file offsets and other metadata for content stored in an +#' (optionally gzip compressed) `tar` archive. Together the `tar` archive and +#' resulting metadata file can be mounted as an Emscripten filesystem image, +#' making the content of the archive available to the WebAssembly R process. +#' +#' Outputs a metadata file named by the base name of the `tar` archive with new +#' extension `".js.metadata"`. Both files should be hosted online so that their +#' URL can be provided to webR for mounting on the virtual filesystem. +#' +#' @param file Filename of the `tar` archive to be used as input. +#' @param strip Remove the specified number of leading path elements. Pathnames +#' with fewer elements are skipped. Defaults to `0`, meaning none. +#' @export +make_tar_index <- function(file, strip = 0) { + file_ext <- tolower(fs::path_ext(file)) + file_base <- fs::path_ext_remove(file) + + # Check if our tar is compatible + if (!any(file_ext == c("tgz", "gz", "tar"))) { + stop(paste0("Can't make index for \"", file, + "\". 
Only uncompressed or `gzip` compressed tar files can be indexed.")) + } + + # Handle two-component extensions + if (file_ext == "gz") { + file_base <- fs::path_ext_remove(file_base) + } + + # Should we decompress? + gzip <- any(file_ext == c("tgz", "gz")) + + # R seems to choke when seeking on a gzfile() connection, so we buffer it + data <- readBin(file, "raw", n = file.size(file)) + if (gzip) { + data <- memDecompress(data) + } + con <- rawConnection(data, open = "rb") + on.exit(close(con)) + + # Build metadata and write to .js.metadata file + entries <- read_tar_offsets(con, strip) + metadata <- list( + files = entries, + gzip = gzip, + ext = gsub(file_base, "", file, ignore.case = TRUE), + remote_package_size = length(data) + ) + metadata_file <- paste0(file_base, ".js.metadata") + jsonlite::write_json(metadata, metadata_file, auto_unbox = TRUE) +} + +read_tar_offsets <- function(con, strip) { + entries <- list() + next_filename <- NULL + + while (TRUE) { + # Read tar entry header block + header <- readBin(con, "raw", n = 512) + + # Empty header indicates end of archive + if (all(header == 0)) break + + # Entry size and offset + offset <- seek(con) + size <- strtoi(sub("\\s.*", "", rawToChar(header[125:136])), 8) + file_blocks <- ceiling(size / 512) + + # Skip directories, global, and vendor-specific extended headers + type <- rawToChar(header[157]) + if (grepl("5|g|[A-Z]", type)) { + next + } + + # Handle PAX extended header + if (type == "x") { + pax_data <- readBin(con, "raw", n = 512 * ceiling(size / 512)) + pax_data <- pax_data[1:max(which(pax_data != as.raw(0x00)))] + lines <- raw_split(pax_data, "\n") + for (line in lines) { + payload <- raw_split(line, " ")[[2]] + kv <- raw_split(payload, "=") + if (rawToChar(kv[[1]]) == "path") { + next_filename <- rawToChar(kv[[2]]) + break + } + } + next + } + + # Basic tar filename + filename <- rawToChar(header[1:100]) + + # Apply ustar formatted extended filename + magic <- rawToChar(header[258:263]) + if (magic 
== "ustar"){ + prefix <- rawToChar(header[346:501]) + filename <- paste(prefix, filename, sep = "/") + } + + # Apply PAX formatted extended filename + if (!is.null(next_filename)) { + filename <- next_filename + next_filename <- NULL + } + + # Strip path elements, ignoring leading slash, skip if no path remains + if (strip > 0) { + filename <- gsub("^/", "", filename) + parts <- fs::path_split(filename)[[1]] + parts <- parts[-strip:-1] + if (length(parts) == 0) { + seek(con, 512 * file_blocks, origin = "current") + next + } + filename <- fs::path_join(c("/", parts)) + } + + # Calculate file offsets + entry <- list(filename = filename, start = offset, end = offset + size) + entries <- append(entries, list(entry)) + + # Skip to next entry header + seek(con, 512 * file_blocks, origin = "current") + } + entries +} + +# Split the elements of a raw vector x according to matches of element `split` +raw_split <- function(x, split) { + if (is.character(split)) { + split <- charToRaw(split) + } + + start <- 1 + out <- list() + for (end in which(x == split)) { + out <- c(out, list(x[start:(end - 1)])) + start <- end + 1 + } + + if (start <= length(x)) { + out <- c(out, list(x[start:length(x)])) + } + + out +} diff --git a/man/make_tar_index.Rd b/man/make_tar_index.Rd new file mode 100644 index 0000000..94cfb97 --- /dev/null +++ b/man/make_tar_index.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tar.R +\name{make_tar_index} +\alias{make_tar_index} +\title{Create an Emscripten metadata file for a given \code{tar} archive} +\usage{ +make_tar_index(file, strip = 0) +} +\arguments{ +\item{file}{Filename of the \code{tar} archive to be used as input.} + +\item{strip}{Remove the specified number of leading path elements. Pathnames +with fewer elements are skipped. 
Defaults to \code{0}, meaning none.} +} +\description{ +Calculates file offsets and other metadata for content stored in an +(optionally gzip compressed) \code{tar} archive. Together the \code{tar} archive and +resulting metadata file can be mounted as an Emscripten filesystem image, +making the content of the archive available to the WebAssembly R process. +} +\details{ +Outputs a metadata file named by the base name of the \code{tar} archive with new +extension \code{".js.metadata"}. Both files should be hosted online so that their +URL can be provided to webR for mounting on the virtual filesystem. +} From 36994cf5a9edb25dcb1f3c758b0be350fba09b4a Mon Sep 17 00:00:00 2001 From: George Stagg Date: Mon, 2 Sep 2024 11:11:28 +0100 Subject: [PATCH 02/12] Use `make_tar_index()` when building R packages --- R/build.R | 27 ++++++++++++++++----------- R/lib.R | 14 +++++++++----- R/repo.R | 9 ++++++--- R/tar.R | 3 +++ man/add_list.Rd | 6 ++++-- man/add_pkg.Rd | 8 +++++--- man/add_repo.Rd | 6 ++++-- man/build.Rd | 8 +++++--- man/make_vfs_library.Rd | 9 +++++++-- man/make_vfs_repo.Rd | 17 ++++++++++------- 10 files changed, 69 insertions(+), 38 deletions(-) diff --git a/R/build.R b/R/build.R index 088625e..29a39ad 100644 --- a/R/build.R +++ b/R/build.R @@ -18,7 +18,7 @@ build <- function(packages, out_dir = ".", remotes = NULL, dependencies = FALSE, - compress = FALSE) { + compress = TRUE) { tmp_dir <- tempfile() on.exit(unlink(tmp_dir, recursive = TRUE)) dir.create(tmp_dir) @@ -215,16 +215,21 @@ wasm_build <- function(pkg, tarball_path, contrib_bin, compress) { bin_dest <- fs::path(contrib_bin, paste0(pkg, "_", bin_ver, ".tgz")) fs::file_copy(bin_path, bin_dest, overwrite = TRUE) - # Build an Emscripten filesystem image for the package - tmp_bin_dir <- fs::path(tempfile()) - on.exit(unlink(tmp_bin_dir, recursive = TRUE), add = TRUE) - untar(bin_dest, exdir = tmp_bin_dir) - file_packager( - fs::dir_ls(tmp_bin_dir)[[1]], - contrib_bin, - fs::path_file(bin_dest), - compress - 
) + if (compress) { + # Use binary .tgz file to build Emscripten filesystem image metadata + make_tar_index(bin_dest, strip = 1) + } else { + # Build an uncompressed Emscripten filesystem image for the package + tmp_bin_dir <- fs::path(tempfile()) + on.exit(unlink(tmp_bin_dir, recursive = TRUE), add = TRUE) + untar(bin_dest, exdir = tmp_bin_dir) + file_packager( + fs::dir_ls(tmp_bin_dir)[[1]], + contrib_bin, + fs::path_file(bin_dest), + compress = FALSE + ) + } invisible(NULL) } diff --git a/R/lib.R b/R/lib.R index 5df4b3f..f37b651 100644 --- a/R/lib.R +++ b/R/lib.R @@ -48,13 +48,14 @@ make_library <- function(repo_dir = "./repo", lib_dir = "./lib", strip = NULL) { #' #' Each filesystem image is generated using Emscripten's [file_packager()] tool #' and the output `.data` and `.js.metadata` filesystem image files are written -#' to the repository in the same directory as the package binary `.tar.gz` -#' files. +#' to the repository in the same directory as the package binary `.tgz` files. #' #' The resulting filesystem images may then be used by webR to download and -#' install R packages faster by mounting the `.data` images to the Emscripten -#' virtual filesystem, rather than decompressing and extracting the equivalent -#' `.tar.gz` files. +#' install R packages by mounting the `.data` images to the Emscripten virtual +#' filesystem. +#' +#' When `compress` is `TRUE`, an additional file with extension `".data.gz"` is +#' also output containing a compressed version of the filesystem data. #' #' @inheritParams add_pkg #' @@ -100,6 +101,9 @@ make_vfs_repo <- function(repo_dir = "./repo", compress = FALSE) { #' tool and the output `.data` and `.js.metadata` filesystem image files are #' written to the directory `out_dir`. #' +#' When `compress` is `TRUE`, an additional file with extension `".data.gz"` is +#' also output containing a compressed version of the filesystem data. 
+#' #' The resulting image can be downloaded by webR and mounted on the Emscripten #' virtual filesystem as an efficient way to provide a pre-configured R library, #' without installing each R package individually. diff --git a/R/repo.R b/R/repo.R index 16a1313..a179037 100644 --- a/R/repo.R +++ b/R/repo.R @@ -80,7 +80,10 @@ add_list <- function(list_file, ...) { #' Use `NA` to install only hard dependencies whereas `TRUE` installs all #' optional dependencies as well. See [pkgdepends::as_pkg_dependencies] #' for details. -#' @inheritParams file_packager +#' @param compress If `TRUE`, use R binary `.tgz` files when creating Emscripten +#' filesystem image metadata. Otherwise, an additional `.data` filesystem image +#' file is created by [file_packager()] and included in the output repository +#' binary directory. Defaults to `TRUE`. #' #' @importFrom dplyr rows_update select #' @importFrom pkgdepends new_pkg_download_proposal @@ -89,7 +92,7 @@ add_pkg <- function(packages, repo_dir = "./repo", remotes = NA, dependencies = FALSE, - compress = FALSE) { + compress = TRUE) { # Set up pkgdepends configuration config <- ppm_config config$dependencies <- dependencies @@ -185,7 +188,7 @@ prefer_remotes <- function(package_info, remotes = NA) { update_repo <- function(package_info, remotes = NA, repo_dir = "./repo", - compress = FALSE) { + compress = TRUE) { r_version <- R_system_version(getOption("rwasm.webr_version")) writeLines(sprintf("Processing %d package(s).", nrow(package_info))) diff --git a/R/tar.R b/R/tar.R index ebef738..342f3b9 100644 --- a/R/tar.R +++ b/R/tar.R @@ -14,9 +14,12 @@ #' with fewer elements are skipped. Defaults to `0`, meaning none. 
#' @export make_tar_index <- function(file, strip = 0) { + file <- fs::path_norm(file) file_ext <- tolower(fs::path_ext(file)) file_base <- fs::path_ext_remove(file) + message(paste("Building metadata index for:", file)) + # Check if our tar is compatible if (!any(file_ext == c("tgz", "gz", "tar"))) { stop(paste0("Can't make index for \"", file, diff --git a/man/add_list.Rd b/man/add_list.Rd index a536b91..3aa984e 100644 --- a/man/add_list.Rd +++ b/man/add_list.Rd @@ -21,8 +21,10 @@ add to the repository. Defaults to \code{FALSE}, meaning no additional packages. Use \code{NA} to install only hard dependencies whereas \code{TRUE} installs all optional dependencies as well. See \link[pkgdepends:as_pkg_dependencies]{pkgdepends::as_pkg_dependencies} for details.} - \item{\code{compress}}{Logical. If \code{TRUE}, a compressed version of the filesystem -data is included in the output. Defaults to \code{FALSE}.} + \item{\code{compress}}{If \code{TRUE}, use R binary \code{.tgz} files when creating Emscripten +filesystem image metadata. Otherwise, an additional \code{.data} filesystem image +file is created by \code{\link[=file_packager]{file_packager()}} and included in the output repository +binary directory. Defaults to \code{TRUE}.} }} } \description{ diff --git a/man/add_pkg.Rd b/man/add_pkg.Rd index 2610456..e0964d0 100644 --- a/man/add_pkg.Rd +++ b/man/add_pkg.Rd @@ -9,7 +9,7 @@ add_pkg( repo_dir = "./repo", remotes = NA, dependencies = FALSE, - compress = FALSE + compress = TRUE ) } \arguments{ @@ -27,8 +27,10 @@ Use \code{NA} to install only hard dependencies whereas \code{TRUE} installs all optional dependencies as well. See \link[pkgdepends:as_pkg_dependencies]{pkgdepends::as_pkg_dependencies} for details.} -\item{compress}{Logical. If \code{TRUE}, a compressed version of the filesystem -data is included in the output. Defaults to \code{FALSE}.} +\item{compress}{If \code{TRUE}, use R binary \code{.tgz} files when creating Emscripten +filesystem image metadata. 
Otherwise, an additional \code{.data} filesystem image +file is created by \code{\link[=file_packager]{file_packager()}} and included in the output repository +binary directory. Defaults to \code{TRUE}.} } \description{ Downloads and builds the \link[pkgdepends:pkg_refs]{R package references} given diff --git a/man/add_repo.Rd b/man/add_repo.Rd index e9ce42e..1abff78 100644 --- a/man/add_repo.Rd +++ b/man/add_repo.Rd @@ -25,8 +25,10 @@ add to the repository. Defaults to \code{FALSE}, meaning no additional packages. Use \code{NA} to install only hard dependencies whereas \code{TRUE} installs all optional dependencies as well. See \link[pkgdepends:as_pkg_dependencies]{pkgdepends::as_pkg_dependencies} for details.} - \item{\code{compress}}{Logical. If \code{TRUE}, a compressed version of the filesystem -data is included in the output. Defaults to \code{FALSE}.} + \item{\code{compress}}{If \code{TRUE}, use R binary \code{.tgz} files when creating Emscripten +filesystem image metadata. Otherwise, an additional \code{.data} filesystem image +file is created by \code{\link[=file_packager]{file_packager()}} and included in the output repository +binary directory. Defaults to \code{TRUE}.} }} } \description{ diff --git a/man/build.Rd b/man/build.Rd index 027e5a6..065eaad 100644 --- a/man/build.Rd +++ b/man/build.Rd @@ -9,7 +9,7 @@ build( out_dir = ".", remotes = NULL, dependencies = FALSE, - compress = FALSE + compress = TRUE ) } \arguments{ @@ -28,8 +28,10 @@ Use \code{NA} to install only hard dependencies whereas \code{TRUE} installs all optional dependencies as well. See \link[pkgdepends:as_pkg_dependencies]{pkgdepends::as_pkg_dependencies} for details.} -\item{compress}{Logical. If \code{TRUE}, a compressed version of the filesystem -data is included in the output. Defaults to \code{FALSE}.} +\item{compress}{If \code{TRUE}, use R binary \code{.tgz} files when creating Emscripten +filesystem image metadata. 
Otherwise, an additional \code{.data} filesystem image +file is created by \code{\link[=file_packager]{file_packager()}} and included in the output repository +binary directory. Defaults to \code{TRUE}.} } \description{ Downloads and builds the \link[pkgdepends:pkg_refs]{R package references} given diff --git a/man/make_vfs_library.Rd b/man/make_vfs_library.Rd index 0381fe5..e4ed523 100644 --- a/man/make_vfs_library.Rd +++ b/man/make_vfs_library.Rd @@ -20,8 +20,10 @@ to \code{"./vfs"}.} \item{repo_dir}{The package repository directory. Defaults to \code{"./repo"}.} -\item{compress}{Logical. If \code{TRUE}, a compressed version of the filesystem -data is included in the output. Defaults to \code{FALSE}.} +\item{compress}{If \code{TRUE}, use R binary \code{.tgz} files when creating Emscripten +filesystem image metadata. Otherwise, an additional \code{.data} filesystem image +file is created by \code{\link[=file_packager]{file_packager()}} and included in the output repository +binary directory. Defaults to \code{TRUE}.} \item{...}{ Arguments passed on to \code{\link[=make_library]{make_library}} @@ -39,6 +41,9 @@ A single filesystem image is generated using Emscripten's \code{\link[=file_pack tool and the output \code{.data} and \code{.js.metadata} filesystem image files are written to the directory \code{out_dir}. +When \code{compress} is \code{TRUE}, an additional file with extension \code{".data.gz"} is +also output containing a compressed version of the filesystem data. + The resulting image can be downloaded by webR and mounted on the Emscripten virtual filesystem as an efficient way to provide a pre-configured R library, without installing each R package individually. diff --git a/man/make_vfs_repo.Rd b/man/make_vfs_repo.Rd index 3e43386..bf1df1d 100644 --- a/man/make_vfs_repo.Rd +++ b/man/make_vfs_repo.Rd @@ -9,8 +9,10 @@ make_vfs_repo(repo_dir = "./repo", compress = FALSE) \arguments{ \item{repo_dir}{The package repository directory. 
Defaults to \code{"./repo"}.} -\item{compress}{Logical. If \code{TRUE}, a compressed version of the filesystem -data is included in the output. Defaults to \code{FALSE}.} +\item{compress}{If \code{TRUE}, use R binary \code{.tgz} files when creating Emscripten +filesystem image metadata. Otherwise, an additional \code{.data} filesystem image +file is created by \code{\link[=file_packager]{file_packager()}} and included in the output repository +binary directory. Defaults to \code{TRUE}.} } \description{ Creates an Emscripten filesystem image for each R package that exists in the @@ -19,11 +21,12 @@ package repository directory \code{repo_dir}. \details{ Each filesystem image is generated using Emscripten's \code{\link[=file_packager]{file_packager()}} tool and the output \code{.data} and \code{.js.metadata} filesystem image files are written -to the repository in the same directory as the package binary \code{.tar.gz} -files. +to the repository in the same directory as the package binary \code{.tgz} files. The resulting filesystem images may then be used by webR to download and -install R packages faster by mounting the \code{.data} images to the Emscripten -virtual filesystem, rather than decompressing and extracting the equivalent -\code{.tar.gz} files. +install R packages by mounting the \code{.data} images to the Emscripten virtual +filesystem. + +When \code{compress} is \code{TRUE}, an additional file with extension \code{".data.gz"} is +also output containing a compressed version of the filesystem data. 
} From d9b50bd9b0172f02b6194eb91eb643aca6f1282c Mon Sep 17 00:00:00 2001 From: George Stagg Date: Mon, 2 Sep 2024 12:07:42 +0100 Subject: [PATCH 03/12] Update NEWS.md --- NEWS.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 0d7ca50..96e914c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,12 @@ # rwasm (development version) -* Support for a new `compression` argument in `build()`, `add_pkg()`, `make_vfs_library()`, and other related functions. When enabled, VFS images will be compressed using `gzip`. Note: Loading compressed VFS images requires at least version 0.4.1 of webR (#39). +## New features + +* When building R packages with `compress` set to `TRUE`, use the binary R package `.tgz` file for the Emscripten filesystem image data and generate custom metadata rather than using Emscripten's `file_packager` tool. + +* Support for a new `compress` argument in `file_packager()`, `make_vfs_library()`, and other related functions. When enabled, VFS images will be compressed using `gzip` (#39). + +Note: Mounting processed `.tgz` archives or compressed VFS images requires at least version 0.4.2 of webR. # rwasm 0.1.0 From 82dfebe3b3ac4dc52ff186c8de93931d14f12f96 Mon Sep 17 00:00:00 2001 From: George Stagg Date: Mon, 2 Sep 2024 15:41:24 +0100 Subject: [PATCH 04/12] Append VFS metadata to R package .tgz output --- R/repo.R | 17 +++++------ R/tar.R | 66 ++++++++++++++++++++++++++++------------- man/add_list.Rd | 8 ++--- man/add_pkg.Rd | 8 ++--- man/add_repo.Rd | 8 ++--- man/build.Rd | 8 ++--- man/make_tar_index.Rd | 27 ++++++++++------- man/make_vfs_library.Rd | 8 ++--- man/make_vfs_repo.Rd | 8 ++--- 9 files changed, 94 insertions(+), 64 deletions(-) diff --git a/R/repo.R b/R/repo.R index a179037..3e1bd4e 100644 --- a/R/repo.R +++ b/R/repo.R @@ -76,15 +76,14 @@ add_list <- function(list_file, ...) { #' source. Defaults to `NA`, meaning prefer a built-in list of references to #' packages pre-modified for use with webR. 
#' @param dependencies Dependency specification for packages to additionally -#' add to the repository. Defaults to `FALSE`, meaning no additional packages. -#' Use `NA` to install only hard dependencies whereas `TRUE` installs all -#' optional dependencies as well. See [pkgdepends::as_pkg_dependencies] -#' for details. -#' @param compress If `TRUE`, use R binary `.tgz` files when creating Emscripten -#' filesystem image metadata. Otherwise, an additional `.data` filesystem image -#' file is created by [file_packager()] and included in the output repository -#' binary directory. Defaults to `TRUE`. -#' +#' add to the repository. Defaults to `FALSE`, meaning no additional packages. +#' Use `NA` to install only hard dependencies whereas `TRUE` installs all +#' optional dependencies as well. See [pkgdepends::as_pkg_dependencies] +#' for details. +#' @param compress When `TRUE`, add and compress Emscripten virtual filesystem +#' metadata in the resulting R package binary `.tgz` files. Otherwise, +#' [file_packager()] is used to create uncompressed virtual filesystem images +#' included in the output binary package repository. Defaults to `TRUE`. #' @importFrom dplyr rows_update select #' @importFrom pkgdepends new_pkg_download_proposal #' @export diff --git a/R/tar.R b/R/tar.R index 342f3b9..7177f42 100644 --- a/R/tar.R +++ b/R/tar.R @@ -1,24 +1,31 @@ -#' Create an Emscripten metadata file for a given `tar` archive +#' Add Emscripten virtual filesystem metadata to a given `tar` archive #' #' Calculates file offsets and other metadata for content stored in an -#' (optionally gzip compressed) `tar` archive. Together the `tar` archive and -#' resulting metadata file can be mounted as an Emscripten filesystem image, -#' making the content of the archive available to the WebAssembly R process. +#' (optionally gzip compressed) `tar` archive. 
Once added, the `tar` archive +#' with metadata can be mounted as an Emscripten filesystem image, making the +#' contents of the archive available to the WebAssembly R process. #' -#' Outputs a metadata file named by the base name of the `tar` archive with new -#' extension `".js.metadata"`. Both files should be hosted online so that their -#' URL can be provided to webR for mounting on the virtual filesystem. +#' The virtual filesystem metadata is appended to the end of the `tar` archive, +#' with the output replacing the original file. The resulting archive should be +#' hosted online so that its URL can be provided to webR for mounting on the +#' virtual filesystem. #' -#' @param file Filename of the `tar` archive to be used as input. -#' @param strip Remove the specified number of leading path elements. Pathnames -#' with fewer elements are skipped. Defaults to `0`, meaning none. +#' If `strip` is greater than `0` the virtual filesystem metadata is generated +#' such that when mounted by webR the specified number of leading path elements +#' are removed. Useful for R package binaries where data files are stored in the +#' original `.tgz` file under a subdirectory. Files with fewer path name +#' elements than the specified amount are skipped. +#' +#' @param file Filename of the `tar` archive for which metadata is to be added. +#' @param strip Remove the specified number of leading path elements when +#' mounting with webR. Defaults to `0`. #' @export make_tar_index <- function(file, strip = 0) { file <- fs::path_norm(file) file_ext <- tolower(fs::path_ext(file)) file_base <- fs::path_ext_remove(file) - message(paste("Building metadata index for:", file)) + message(paste("Appending virtual filesystem metadata for:", file)) # Check if our tar is compatible if (!any(file_ext == c("tgz", "gz", "tar"))) { @@ -31,27 +38,41 @@ make_tar_index <- function(file, strip = 0) { file_base <- fs::path_ext_remove(file_base) } - # Should we decompress? 
+ # Read archive contents, decompressing if necessary gzip <- any(file_ext == c("tgz", "gz")) - - # R seems to choke when seeking on a gzfile() connection, so we buffer it data <- readBin(file, "raw", n = file.size(file)) if (gzip) { data <- memDecompress(data) } - con <- rawConnection(data, open = "rb") - on.exit(close(con)) - # Build metadata and write to .js.metadata file + # Build metadata from source .tar file + con <- rawConnection(data, open = "rb") + on.exit(close(con), add = TRUE) entries <- read_tar_offsets(con, strip) + tar_end <- seek(con) + metadata <- list( files = entries, gzip = gzip, - ext = gsub(file_base, "", file, ignore.case = TRUE), remote_package_size = length(data) ) - metadata_file <- paste0(file_base, ".js.metadata") - jsonlite::write_json(metadata, metadata_file, auto_unbox = TRUE) + + # Append metadata to .tar data + json <- charToRaw(jsonlite::toJSON(metadata, auto_unbox = TRUE)) + length(json) <- 4 * ceiling(length(json) / 4) # pad to 4 byte boundary + marker <- writeBin(as.integer(tar_end / 512), raw(), size = 4, endian = "big") + data <- c(data[1:tar_end], json, marker) + + # Write output and move into place + out <- tempfile() + out_con <- if (gzip) { + gzfile(out, open = "wb") + } else { + file(out, open = "wb") + } + writeBin(data, out_con, size = 1L) + close(out_con) + fs::file_copy(out, file, overwrite = TRUE) } read_tar_offsets <- function(con, strip) { @@ -63,7 +84,10 @@ read_tar_offsets <- function(con, strip) { header <- readBin(con, "raw", n = 512) # Empty header indicates end of archive - if (all(header == 0)) break + if (all(header == 0)) { + seek(con, 512, origin = "current") + break + } # Entry size and offset offset <- seek(con) diff --git a/man/add_list.Rd b/man/add_list.Rd index 3aa984e..fe31084 100644 --- a/man/add_list.Rd +++ b/man/add_list.Rd @@ -21,10 +21,10 @@ add to the repository. Defaults to \code{FALSE}, meaning no additional packages. 
Use \code{NA} to install only hard dependencies whereas \code{TRUE} installs all optional dependencies as well. See \link[pkgdepends:as_pkg_dependencies]{pkgdepends::as_pkg_dependencies} for details.} - \item{\code{compress}}{If \code{TRUE}, use R binary \code{.tgz} files when creating Emscripten -filesystem image metadata. Otherwise, an additional \code{.data} filesystem image -file is created by \code{\link[=file_packager]{file_packager()}} and included in the output repository -binary directory. Defaults to \code{TRUE}.} + \item{\code{compress}}{When \code{TRUE}, add and compress Emscripten virtual filesystem +metadata in the resulting R package binary \code{.tgz} files. Otherwise, +\code{\link[=file_packager]{file_packager()}} is used to create uncompressed virtual filesystem images +included in the output binary package repository. Defaults to \code{TRUE}.} }} } \description{ diff --git a/man/add_pkg.Rd b/man/add_pkg.Rd index e0964d0..d26cb5a 100644 --- a/man/add_pkg.Rd +++ b/man/add_pkg.Rd @@ -27,10 +27,10 @@ Use \code{NA} to install only hard dependencies whereas \code{TRUE} installs all optional dependencies as well. See \link[pkgdepends:as_pkg_dependencies]{pkgdepends::as_pkg_dependencies} for details.} -\item{compress}{If \code{TRUE}, use R binary \code{.tgz} files when creating Emscripten -filesystem image metadata. Otherwise, an additional \code{.data} filesystem image -file is created by \code{\link[=file_packager]{file_packager()}} and included in the output repository -binary directory. Defaults to \code{TRUE}.} +\item{compress}{When \code{TRUE}, add and compress Emscripten virtual filesystem +metadata in the resulting R package binary \code{.tgz} files. Otherwise, +\code{\link[=file_packager]{file_packager()}} is used to create uncompressed virtual filesystem images +included in the output binary package repository. 
Defaults to \code{TRUE}.} } \description{ Downloads and builds the \link[pkgdepends:pkg_refs]{R package references} given diff --git a/man/add_repo.Rd b/man/add_repo.Rd index 1abff78..9597119 100644 --- a/man/add_repo.Rd +++ b/man/add_repo.Rd @@ -25,10 +25,10 @@ add to the repository. Defaults to \code{FALSE}, meaning no additional packages. Use \code{NA} to install only hard dependencies whereas \code{TRUE} installs all optional dependencies as well. See \link[pkgdepends:as_pkg_dependencies]{pkgdepends::as_pkg_dependencies} for details.} - \item{\code{compress}}{If \code{TRUE}, use R binary \code{.tgz} files when creating Emscripten -filesystem image metadata. Otherwise, an additional \code{.data} filesystem image -file is created by \code{\link[=file_packager]{file_packager()}} and included in the output repository -binary directory. Defaults to \code{TRUE}.} + \item{\code{compress}}{When \code{TRUE}, add and compress Emscripten virtual filesystem +metadata in the resulting R package binary \code{.tgz} files. Otherwise, +\code{\link[=file_packager]{file_packager()}} is used to create uncompressed virtual filesystem images +included in the output binary package repository. Defaults to \code{TRUE}.} }} } \description{ diff --git a/man/build.Rd b/man/build.Rd index 065eaad..1ec5b75 100644 --- a/man/build.Rd +++ b/man/build.Rd @@ -28,10 +28,10 @@ Use \code{NA} to install only hard dependencies whereas \code{TRUE} installs all optional dependencies as well. See \link[pkgdepends:as_pkg_dependencies]{pkgdepends::as_pkg_dependencies} for details.} -\item{compress}{If \code{TRUE}, use R binary \code{.tgz} files when creating Emscripten -filesystem image metadata. Otherwise, an additional \code{.data} filesystem image -file is created by \code{\link[=file_packager]{file_packager()}} and included in the output repository -binary directory. 
Defaults to \code{TRUE}.} +\item{compress}{When \code{TRUE}, add and compress Emscripten virtual filesystem +metadata in the resulting R package binary \code{.tgz} files. Otherwise, +\code{\link[=file_packager]{file_packager()}} is used to create uncompressed virtual filesystem images +included in the output binary package repository. Defaults to \code{TRUE}.} } \description{ Downloads and builds the \link[pkgdepends:pkg_refs]{R package references} given diff --git a/man/make_tar_index.Rd b/man/make_tar_index.Rd index 94cfb97..73ea74a 100644 --- a/man/make_tar_index.Rd +++ b/man/make_tar_index.Rd @@ -2,24 +2,31 @@ % Please edit documentation in R/tar.R \name{make_tar_index} \alias{make_tar_index} -\title{Create an Emscripten metadata file for a given \code{tar} archive} +\title{Add Emscripten virtual filesystem metadata to a given \code{tar} archive} \usage{ make_tar_index(file, strip = 0) } \arguments{ -\item{file}{Filename of the \code{tar} archive to be used as input.} +\item{file}{Filename of the \code{tar} archive for which metadata is to be added.} -\item{strip}{Remove the specified number of leading path elements. Pathnames -with fewer elements are skipped. Defaults to \code{0}, meaning none.} +\item{strip}{Remove the specified number of leading path elements when +mounting with webR. Defaults to \code{0}.} } \description{ Calculates file offsets and other metadata for content stored in an -(optionally gzip compressed) \code{tar} archive. Together the \code{tar} archive and -resulting metadata file can be mounted as an Emscripten filesystem image, -making the content of the archive available to the WebAssembly R process. +(optionally gzip compressed) \code{tar} archive. Once added, the \code{tar} archive +with metadata can be mounted as an Emscripten filesystem image, making the +contents of the archive available to the WebAssembly R process. 
} \details{ -Outputs a metadata file named by the base name of the \code{tar} archive with new -extension \code{".js.metadata"}. Both files should be hosted online so that their -URL can be provided to webR for mounting on the virtual filesystem. +The virtual filesystem metadata is appended to the end of the \code{tar} archive, +with the output replacing the original file. The resulting archive should be +hosted online so that its URL can be provided to webR for mounting on the +virtual filesystem. + +If \code{strip} is greater than \code{0} the virtual filesystem metadata is generated +such that when mounted by webR the specified number of leading path elements +are removed. Useful for R package binaries where data files are stored in the +original \code{.tgz} file under a subdirectory. Files with fewer path name +elements than the specified amount are skipped. } diff --git a/man/make_vfs_library.Rd b/man/make_vfs_library.Rd index e4ed523..ef7e6a9 100644 --- a/man/make_vfs_library.Rd +++ b/man/make_vfs_library.Rd @@ -20,10 +20,10 @@ to \code{"./vfs"}.} \item{repo_dir}{The package repository directory. Defaults to \code{"./repo"}.} -\item{compress}{If \code{TRUE}, use R binary \code{.tgz} files when creating Emscripten -filesystem image metadata. Otherwise, an additional \code{.data} filesystem image -file is created by \code{\link[=file_packager]{file_packager()}} and included in the output repository -binary directory. Defaults to \code{TRUE}.} +\item{compress}{When \code{TRUE}, add and compress Emscripten virtual filesystem +metadata in the resulting R package binary \code{.tgz} files. Otherwise, +\code{\link[=file_packager]{file_packager()}} is used to create uncompressed virtual filesystem images +included in the output binary package repository. 
Defaults to \code{TRUE}.} \item{...}{ Arguments passed on to \code{\link[=make_library]{make_library}} diff --git a/man/make_vfs_repo.Rd b/man/make_vfs_repo.Rd index bf1df1d..650e226 100644 --- a/man/make_vfs_repo.Rd +++ b/man/make_vfs_repo.Rd @@ -9,10 +9,10 @@ make_vfs_repo(repo_dir = "./repo", compress = FALSE) \arguments{ \item{repo_dir}{The package repository directory. Defaults to \code{"./repo"}.} -\item{compress}{If \code{TRUE}, use R binary \code{.tgz} files when creating Emscripten -filesystem image metadata. Otherwise, an additional \code{.data} filesystem image -file is created by \code{\link[=file_packager]{file_packager()}} and included in the output repository -binary directory. Defaults to \code{TRUE}.} +\item{compress}{When \code{TRUE}, add and compress Emscripten virtual filesystem +metadata in the resulting R package binary \code{.tgz} files. Otherwise, +\code{\link[=file_packager]{file_packager()}} is used to create uncompressed virtual filesystem images +included in the output binary package repository. 
Defaults to \code{TRUE}.} } \description{ Creates an Emscripten filesystem image for each R package that exists in the From 0908275640dd37108e5a91374e12e87e2d1798c4 Mon Sep 17 00:00:00 2001 From: George Stagg Date: Mon, 2 Sep 2024 17:00:23 +0100 Subject: [PATCH 05/12] Improve VFS metadata encoding in .tgz file --- R/tar.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/R/tar.R b/R/tar.R index 7177f42..1df934e 100644 --- a/R/tar.R +++ b/R/tar.R @@ -59,9 +59,12 @@ make_tar_index <- function(file, strip = 0) { # Append metadata to .tar data json <- charToRaw(jsonlite::toJSON(metadata, auto_unbox = TRUE)) + magic <- charToRaw('webR') + reserved <- raw(4) # reserved for future use + block <- writeBin(as.integer(tar_end / 512), raw(), size = 4, endian = "big") + len <- writeBin(length(json), raw(), size = 4, endian = "big") length(json) <- 4 * ceiling(length(json) / 4) # pad to 4 byte boundary - marker <- writeBin(as.integer(tar_end / 512), raw(), size = 4, endian = "big") - data <- c(data[1:tar_end], json, marker) + data <- c(data[1:tar_end], json, magic, reserved, block, len) # Write output and move into place out <- tempfile() From d6ef78c3096a002969ebfc8a0ae1e1144e3654c8 Mon Sep 17 00:00:00 2001 From: George Stagg Date: Tue, 3 Sep 2024 13:19:19 +0100 Subject: [PATCH 06/12] Rename make_tar_index to add_tar_index --- NAMESPACE | 2 +- R/build.R | 2 +- R/tar.R | 2 +- man/{make_tar_index.Rd => add_tar_index.Rd} | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) rename man/{make_tar_index.Rd => add_tar_index.Rd} (94%) diff --git a/NAMESPACE b/NAMESPACE index a466b57..e0a016d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,10 +3,10 @@ export(add_list) export(add_pkg) export(add_repo) +export(add_tar_index) export(build) export(file_packager) export(make_library) -export(make_tar_index) export(make_vfs_library) export(make_vfs_repo) export(rm_pkg) diff --git a/R/build.R b/R/build.R index 29a39ad..0059cdf 100644 --- a/R/build.R +++ b/R/build.R 
@@ -217,7 +217,7 @@ wasm_build <- function(pkg, tarball_path, contrib_bin, compress) { if (compress) { # Use binary .tgz file to build Emscripten filesystem image metadata - make_tar_index(bin_dest, strip = 1) + add_tar_index(bin_dest, strip = 1) } else { # Build an uncompressed Emscripten filesystem image for the package tmp_bin_dir <- fs::path(tempfile()) diff --git a/R/tar.R b/R/tar.R index 1df934e..fca66b2 100644 --- a/R/tar.R +++ b/R/tar.R @@ -20,7 +20,7 @@ #' @param strip Remove the specified number of leading path elements when #' mounting with webR. Defaults to `0`. #' @export -make_tar_index <- function(file, strip = 0) { +add_tar_index <- function(file, strip = 0) { file <- fs::path_norm(file) file_ext <- tolower(fs::path_ext(file)) file_base <- fs::path_ext_remove(file) diff --git a/man/make_tar_index.Rd b/man/add_tar_index.Rd similarity index 94% rename from man/make_tar_index.Rd rename to man/add_tar_index.Rd index 73ea74a..c3ba1d6 100644 --- a/man/make_tar_index.Rd +++ b/man/add_tar_index.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/tar.R -\name{make_tar_index} -\alias{make_tar_index} +\name{add_tar_index} +\alias{add_tar_index} \title{Add Emscripten virtual filesystem metadata to a given \code{tar} archive} \usage{ -make_tar_index(file, strip = 0) +add_tar_index(file, strip = 0) } \arguments{ \item{file}{Filename of the \code{tar} archive for which metadata is to be added.} From 00dfe4e85aad012b9c66cc1d63e26838cc156ede Mon Sep 17 00:00:00 2001 From: George Stagg Date: Wed, 4 Sep 2024 10:01:41 +0100 Subject: [PATCH 07/12] Update pkgdown documentation --- _pkgdown.yml | 3 +- inst/pkgdown.yml | 13 +++++++ vignettes/mount-fs-image.Rmd | 75 +++++++++++++++++++++++++----------- vignettes/mount-host-dir.Rmd | 2 + vignettes/tar-metadata.Rmd | 27 +++++++++++++ 5 files changed, 97 insertions(+), 23 deletions(-) create mode 100644 inst/pkgdown.yml create mode 100644 vignettes/tar-metadata.Rmd diff --git 
a/_pkgdown.yml b/_pkgdown.yml index 5340db7..d9aed8a 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,4 +1,5 @@ url: https://r-wasm.github.io/rwasm/ template: bootstrap: 5 - +deploy: + install_metadata: true diff --git a/inst/pkgdown.yml b/inst/pkgdown.yml new file mode 100644 index 0000000..90b7770 --- /dev/null +++ b/inst/pkgdown.yml @@ -0,0 +1,13 @@ +pandoc: '3.2' +pkgdown: 2.0.9.9000 +pkgdown_sha: 34ee692e4ce10c8abfb863cc782da771838558f7 +articles: + github-actions: github-actions.html + mount-fs-image: mount-fs-image.html + mount-host-dir: mount-host-dir.html + rwasm: rwasm.html + tar-metadata: tar-metadata.html +last_built: 2024-09-04T08:58Z +urls: + reference: https://r-wasm.github.io/rwasm/reference + article: https://r-wasm.github.io/rwasm/articles diff --git a/vignettes/mount-fs-image.Rmd b/vignettes/mount-fs-image.Rmd index f60ecdc..9231011 100644 --- a/vignettes/mount-fs-image.Rmd +++ b/vignettes/mount-fs-image.Rmd @@ -7,47 +7,82 @@ vignette: > %\VignetteEncoding{UTF-8} --- -## Introduction +The Emscripten WebAssembly (Wasm) environment provides a virtual filesystem (VFS) which supports the concept of *mounting*. With this, an entire file and directory structure can be packaged into a filesystem image, efficiently making individual files or entire R package libraries available for use in webR. -The Emscripten WebAssembly environment provides a virtual filesystem (VFS) which supports the concept of *mounting*. With this, an entire file and directory structure can be packaged into a filesystem image to be loaded and mounted at runtime by WebAssembly (Wasm) applications. We can take advantage of this interface to efficiently mount R package libraries, pre-packaged and containing potentially many related R packages, in the VFS accessible to webR. +## Create filesystem images -## Building an R package library +### Emscripten's `file_packager` tool -To build an R package library image we must first build one or more Wasm R packages using `add_pkg()`. 
As an example, let's build a package with a few hard dependencies. Ensure that you are running R in an environment with access to Wasm development tools^[See the "Setting up the WebAssembly toolchain" section in `vignette("rwasm")` for further details.], then run: +The [`file_packager`](https://emscripten.org/docs/porting/files/packaging_files.html#packaging-using-the-file-packager-tool) tool, provided by Emscripten, takes in a directory structure as input and produces a webR compatible filesystem image as output. The [`file_packager`](https://emscripten.org/docs/porting/files/packaging_files.html#packaging-using-the-file-packager-tool) tool may be invoked from the [rwasm](https://r-wasm.github.io/rwasm/) package: ```{r eval=FALSE} -rwasm::add_pkg("dplyr") +> rwasm::file_packager("./input", out_dir = ".", out_name = "output") ``` -After the build process has completed, the new `repo` directory contains a CRAN-like package repository with R packages build for Wasm. +It can also be invoked directly using its CLI^[See the [`file_packager`](https://emscripten.org/docs/porting/files/packaging_files.html#packaging-using-the-file-packager-tool) Emscripten documentation for details.], if you prefer: -Next, run the following to build an Emscripten VFS image: +```bash +$ file_packager output.data --preload ./input@/ \ + --separate-metadata --js-output=output.js +``` + +In the above examples, the files in the directory `./input` are packaged and an output filesystem image is created^[When using the `file_packager` CLI, a third file named `output.js` will also be created. If you only plan to mount the image using webR, this file may be discarded.] consisting of a data file, `output.data`, and a metadata file, `output.js.metadata`. + +To prepare for mounting the filesystem image with webR, ensure that both files have the same basename (in this example, `output`). The resulting URLs or relative paths for the two files should differ only by the file extension. 
+ +#### Compression + +Filesystem image `.data` files may optionally be `gzip` compressed prior to deployment. The file extension for compressed filesystem images should be `.data.gz`, and compression should be indicated by setting the property `gzip: true` on the metadata JSON stored in the `.js.metadata` file. + +**NOTE**: Loading compressed VFS images requires at least version 0.4.1 of webR. + +### Mount `.tar` archives as a filesystem image + +Archives in `.tar` format, optionally gzip compressed as `.tar.gz` or `.tgz` files, can also be used as filesystem images by pre-processing the `.tar` archive using the `rwasm::add_tar_index()` function. The function reads archive contents and appends the required filesystem metadata to the end of the `.tar` archive data in a way that is understood by webR. For further information about the format see the [Technical details for .tar archive metadata](tar-metadata.html) article. ```{r eval=FALSE} -rwasm::make_vfs_library() +> rwasm::add_tar_index("./path/to/archive.tar.gz") +# Appending virtual filesystem metadata for: ./path/to/archive.tar.gz ``` -By default, this function will create a new directory named `vfs` if it does not exist. The files `vfs/library.data` and `vfs/library.js.metadata` together form an Emscripten filesystem image containing an R package library consisting of all the packages previously added to the CRAN-like repository in `repo` using `add_pkg()`. +Once processed by `rwasm::add_tar_index()`, the `.tar` archive can be deployed and used directly as a filesystem image. -### Packaging arbitrary data +## Mounting filesystem images -It is also possible to package an arbitrary data directory as an Emscripten filesystem image using the `file_packager()` function: +When running in a web browser, the [`webr::mount()`](https://docs.r-wasm.org/webr/latest/api/r.html#mount) function downloads and mounts a filesystem image from a URL source, using the `WORKERFS` filesystem type. 
```{r eval=FALSE} -rwasm::file_packager("./some/data/directory", out_name = "output_image.data") +webr::mount( + mountpoint = "/data", + source = "https://example.com/output.data" +) ``` -Again, this function writes output filesystem images to the `vfs` directory by default. +Filesystem images should be deployed to static file hosting^[e.g. GitHub Pages, Netlify, AWS S3, etc.] and the resulting URL provided as the source argument. The image will be mounted in the virtual filesystem under the path given by the `mountpoint` argument. If the `mountpoint` directory does not exist, it will be created prior to mounting. -### Compression +When running under Node.js, the source may also be provided as a relative path to a filesystem image on disk. -The `add_pkg()`, `make_vfs_library()`, `file_packager()` and other related functions support the `compression` argument. The default value is `FALSE`, but when `TRUE` VFS images will be `gzip` compressed for deployment. For some types of package content, the savings in file size with compression can be significant. +To test filesystem images before deployment, serve them using a local static webserver. See the Local Testing section below for an example using `httpuv::runStaticServer()` in R. -**NOTE**: Loading compressed VFS images requires at least version 0.4.1 of webR. +## Building an R package library image -## Mounting filesystem images +A collection of R packages can be bundled into a single filesystem image for mounting. + +To build an R package library image we must first build one or more Wasm R packages using `add_pkg()`. As an example, let's build a package with a few hard dependencies. 
Ensure that you are running R in an environment with access to Wasm development tools^[See the "Setting up the WebAssembly toolchain" section in `vignette("rwasm")` for further details.], then run: + +```{r eval=FALSE} +rwasm::add_pkg("dplyr") +``` + +After the build process has completed, the new `repo` directory contains a CRAN-like package repository with R packages built for Wasm. -The filesystem image(s) should now be hosted by a web server so that it is available at some URL. Such a URL can then be passed to `webr::mount()` to be made available on the virtual filesystem for the Wasm R process. +Next, run the following to build an Emscripten VFS image: + +```{r eval=FALSE} +rwasm::make_vfs_library() +``` + +By default, this function will create a new directory named `vfs` if it does not exist. The files `vfs/library.data` and `vfs/library.js.metadata` together form an Emscripten filesystem image containing an R package library consisting of all the packages previously added to the CRAN-like repository in `repo` using `add_pkg()`. ### Local testing @@ -92,7 +127,3 @@ library(dplyr) #> #> intersect, setdiff, setequal, union ``` - -### Deployment - -The filesystem image files should be deployed to the static file hosting service of your choice, so that they are available for download anywhere. See the "Deployment to static hosting" section in `vignette("rwasm")` for an example of how to host static files with GitHub pages, substituting the `repo` directory for the `vfs` directory containing Emscripten filesystem images. diff --git a/vignettes/mount-host-dir.Rmd b/vignettes/mount-host-dir.Rmd index 8b13e75..d40ec82 100644 --- a/vignettes/mount-host-dir.Rmd +++ b/vignettes/mount-host-dir.Rmd @@ -11,6 +11,8 @@ vignette: > When running under Node.js, the Emscripten WebAssembly environment can make available the contents of a directory on the host filesystem. 
In addition to providing webR access to external data files, a pre-prepared R package library can be mounted from the host filesystem. This avoids the need to download potentially large R packages or filesystem images over the network. +See the [webR documentation for more details](https://docs.r-wasm.org/webr/latest/mounting.html#mount-an-existing-host-directory) on mounting host directories under Node.js. + ## Building an R package library To build an R package library, we must first build one or more Wasm R packages using `add_pkg()`. As an example, let's build a package with a few hard dependencies. Ensure that you are running R in an environment with access to Wasm development tools^[See the "Setting up the WebAssembly toolchain" section in `vignette("rwasm")` for further details.], then run: diff --git a/vignettes/tar-metadata.Rmd b/vignettes/tar-metadata.Rmd new file mode 100644 index 0000000..403b6f8 --- /dev/null +++ b/vignettes/tar-metadata.Rmd @@ -0,0 +1,27 @@ +--- +title: "Technical details for .tar archive metadata" +output: rmarkdown::html_document +vignette: > + %\VignetteIndexEntry{Technical details for .tar archive metadata} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +The `rwasm::add_tar_index()` function appends Emscripten filesystem metadata to an (optionally gzip compressed) `.tar` archive. +The resulting output can be directly mounted by webR to the virtual filesystem, making the content of the archive available to the WebAssembly R process. + +See the [Mounting filesystem images](mount-fs-image.html) article for more information about mounting filesystem images. + +## Archive data layout + +A `.tar` archive that includes Emscripten filesystem metadata has the data layout given below. The resulting `.tar` file may be gzip compressed, with file extension `.tar.gz` or `.tgz`. + +| Field | Size | Description | +|-|---|-------------| +| 0 | Variable | Standard `.tar` data, including end-of-archive marker. 
| +| 1 | Variable | JSON metadata, UTF8 encoded, padded with `0x00` to 4 byte boundary. | +| 2 | 4 bytes | Magic bytes: The string `"webR"`, UTF8 encoded (`0x77656252`). | +| 3 | 4 bytes | Reserved, currently `0x00000000`. | +| 4 | 4 bytes | Offset of JSON metadata (field 1), in units of 512-byte blocks. Signed integer, big endian. | +| 5 | 4 bytes | Length of JSON metadata, in bytes. Signed integer, big endian. | +Table: Data layout for a `.tar` archive with filesystem metadata. From 7bc5c5244bfe1df420f0d50c94254af36c40310f Mon Sep 17 00:00:00 2001 From: George Stagg Date: Mon, 9 Sep 2024 11:02:23 +0100 Subject: [PATCH 08/12] Explicitly write metadata values as integer type --- R/tar.R | 4 ++-- inst/pkgdown.yml | 2 +- vignettes/tar-metadata.Rmd | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/R/tar.R b/R/tar.R index fca66b2..63684b8 100644 --- a/R/tar.R +++ b/R/tar.R @@ -61,8 +61,8 @@ add_tar_index <- function(file, strip = 0) { json <- charToRaw(jsonlite::toJSON(metadata, auto_unbox = TRUE)) magic <- charToRaw('webR') reserved <- raw(4) # reserved for future use - block <- writeBin(as.integer(tar_end / 512), raw(), size = 4, endian = "big") - len <- writeBin(length(json), raw(), size = 4, endian = "big") + block <- writeBin(tar_end / 512, "integer", size = 4, endian = "big") + len <- writeBin(length(json), "integer", size = 4, endian = "big") length(json) <- 4 * ceiling(length(json) / 4) # pad to 4 byte boundary data <- c(data[1:tar_end], json, magic, reserved, block, len) diff --git a/inst/pkgdown.yml b/inst/pkgdown.yml index 90b7770..b182e83 100644 --- a/inst/pkgdown.yml +++ b/inst/pkgdown.yml @@ -7,7 +7,7 @@ articles: mount-host-dir: mount-host-dir.html rwasm: rwasm.html tar-metadata: tar-metadata.html -last_built: 2024-09-04T08:58Z +last_built: 2024-09-09T10:04Z urls: reference: https://r-wasm.github.io/rwasm/reference article: https://r-wasm.github.io/rwasm/articles diff --git a/vignettes/tar-metadata.Rmd b/vignettes/tar-metadata.Rmd 
index 403b6f8..368b099 100644 --- a/vignettes/tar-metadata.Rmd +++ b/vignettes/tar-metadata.Rmd @@ -22,6 +22,6 @@ A `.tar` archive that includes Emscripten filesystem metadata has the data layou | 1 | Variable | JSON metadata, UTF8 encoded, padded with `0x00` to 4 byte boundary. | | 2 | 4 bytes | Magic bytes: The string `"webR"`, UTF8 encoded (`0x77656252`). | | 3 | 4 bytes | Reserved, currently `0x00000000`. | -| 4 | 4 bytes | Offset of JSON metadata (field 1), in units of 512-byte blocks. Signed integer, big endian. | -| 5 | 4 bytes | Length of JSON metadata, in bytes. Signed integer, big endian. | +| 4 | 4 bytes | Offset of JSON metadata, in units of 512-byte blocks. Signed integer, big endian. | +| 5 | 4 bytes | Length of JSON metadata, not including trailing null characters, in bytes. Signed integer, big endian. | Table: Data layout for a `.tar` archive with filesystem metadata. From ea40804143d8c52dfe50c31aa0a3bf67cfc3a4ba Mon Sep 17 00:00:00 2001 From: George Stagg Date: Tue, 10 Sep 2024 16:31:07 +0100 Subject: [PATCH 09/12] Embed filesystem metadata as a tar entry --- R/tar.R | 52 +++++++++++++++++++++++++++++++++----- inst/pkgdown.yml | 2 +- vignettes/tar-metadata.Rmd | 31 +++++++++++++++++------ 3 files changed, 69 insertions(+), 16 deletions(-) diff --git a/R/tar.R b/R/tar.R index 63684b8..f976bab 100644 --- a/R/tar.R +++ b/R/tar.R @@ -57,14 +57,19 @@ add_tar_index <- function(file, strip = 0) { remote_package_size = length(data) ) - # Append metadata to .tar data - json <- charToRaw(jsonlite::toJSON(metadata, auto_unbox = TRUE)) + # Add metadata as additional .tar entry + entry <- create_metadata_entry(metadata) + json_block <- as.integer(tar_end / 512) + 1L + + # Append additional metadata hint for webR magic <- charToRaw('webR') reserved <- raw(4) # reserved for future use - block <- writeBin(tar_end / 512, "integer", size = 4, endian = "big") - len <- writeBin(length(json), "integer", size = 4, endian = "big") - length(json) <- 4 * 
ceiling(length(json) / 4) # pad to 4 byte boundary - data <- c(data[1:tar_end], json, magic, reserved, block, len) + block <- writeBin(json_block, raw(), size = 4, endian = "big") + len <- writeBin(entry$length, raw(), size = 4, endian = "big") + hint <- c(magic, reserved, block, len) + + # Build new .tar archive data + data <- c(data[1:tar_end], entry$data, raw(1024), hint) # Write output and move into place out <- tempfile() @@ -78,6 +83,38 @@ add_tar_index <- function(file, strip = 0) { fs::file_copy(out, file, overwrite = TRUE) } +create_metadata_entry <- function(metadata) { + # metadata contents + json <- charToRaw(jsonlite::toJSON(metadata, auto_unbox = TRUE)) + len <- length(json) + blocks <- ceiling(len/512) + length(json) <- 512 * blocks + + # entry header + timestamp <- as.integer(Sys.time()) + header <- raw(512) + header[1:15] <- charToRaw('.vfs-index.json') # filename + header[101:108] <- charToRaw('0000644 ') # mode + header[109:116] <- charToRaw('0000000 ') # uid + header[117:124] <- charToRaw('0000000 ') # gid + header[125:136] <- charToRaw(sprintf("%011o ", len)) # length + header[137:148] <- charToRaw(sprintf("%011o ", timestamp)) # timestamp + header[149:156] <- charToRaw(' ') # placeholder + header[157:157] <- charToRaw('0') # type + header[258:262] <- charToRaw('ustar') # ustar magic + header[264:265] <- charToRaw('00') # ustar version + header[266:269] <- charToRaw('root') # user + header[298:302] <- charToRaw('wheel') # group + + # populate checksum field + checksum <- raw(8) + checksum[1:6] <- charToRaw(sprintf("%06o", sum(as.integer(header)))) + checksum[8] <- charToRaw(' ') + header[149:156] <- checksum + + list(data = c(header, json), length = len) +} + read_tar_offsets <- function(con, strip) { entries <- list() next_filename <- NULL @@ -88,7 +125,8 @@ read_tar_offsets <- function(con, strip) { # Empty header indicates end of archive if (all(header == 0)) { - seek(con, 512, origin = "current") + # Return connection position to just 
before this header + seek(con, -512, origin = "current") break } diff --git a/inst/pkgdown.yml b/inst/pkgdown.yml index b182e83..91f6cbf 100644 --- a/inst/pkgdown.yml +++ b/inst/pkgdown.yml @@ -7,7 +7,7 @@ articles: mount-host-dir: mount-host-dir.html rwasm: rwasm.html tar-metadata: tar-metadata.html -last_built: 2024-09-09T10:04Z +last_built: 2024-09-10T15:29Z urls: reference: https://r-wasm.github.io/rwasm/reference article: https://r-wasm.github.io/rwasm/articles diff --git a/vignettes/tar-metadata.Rmd b/vignettes/tar-metadata.Rmd index 368b099..5052469 100644 --- a/vignettes/tar-metadata.Rmd +++ b/vignettes/tar-metadata.Rmd @@ -12,16 +12,31 @@ The resulting output can be directly mounted by webR to the virtual filesystem, See the [Mounting filesystem images](mount-fs-image.html) article for more information about mounting filesystem images. +## Filesystem metadata + +Virtual filesystem metadata is a JavaScript object, encoded as a JSON string. The format is defined and output by Emscripten's `file_packager` tool and understood by [webR's mounting API](mount-fs-image.html). The metadata object gives the location of each file in the archive to be mounted, and takes the following format: + +```javascript +{ + files: { + filename: string; + start: number; + end: number; + }[], +}; +``` + ## Archive data layout -A `.tar` archive that includes Emscripten filesystem metadata has the data layout given below. The resulting `.tar` file may be gzip compressed, with file extension `.tar.gz` or `.tgz`. +A `.tar` archive that can be directly mounted by webR includes filesystem metadata as a file named `.vfs-index.json` at the top level of the archive. The `.tar` archive may also include a "metadata hint" at the very end of the file, after the end-of-archive marker. Appending additional hint data is optional, but allows for more efficient mounting of archive contents to the virtual filesystem. 
+ +The resulting `.tar` file may be gzip compressed, with file extension `.tar.gz` or `.tgz`. | Field | Size | Description | |-|---|-------------| -| 0 | Variable | Standard `.tar` data, including end-of-archive marker. | -| 1 | Variable | JSON metadata, UTF8 encoded, padded with `0x00` to 4 byte boundary. | -| 2 | 4 bytes | Magic bytes: The string `"webR"`, UTF8 encoded (`0x77656252`). | -| 3 | 4 bytes | Reserved, currently `0x00000000`. | -| 4 | 4 bytes | Offset of JSON metadata, in units of 512-byte blocks. Signed integer, big endian. | -| 5 | 4 bytes | Length of JSON metadata, not including trailing null characters, in bytes. Signed integer, big endian. | -Table: Data layout for a `.tar` archive with filesystem metadata. +| 0 | Variable | Standard `.tar` data, including the end-of-archive marker. | +| 1 | 4 bytes | Magic bytes: The string `"webR"`, UTF8 encoded (`0x77656252`). | +| 2 | 4 bytes | Reserved, currently `0x00000000`. | +| 3 | 4 bytes | Offset of `.vfs-index.json`, in units of 512-byte blocks. Signed integer, big endian. | +| 4 | 4 bytes | Length of `.vfs-index.json`, in bytes. Signed integer, big endian. | +Table: Data layout for a `.tar` archive containing filesystem metadata. 
From 6b4aed761aaec8378c7ed4049c1962e258f75a63 Mon Sep 17 00:00:00 2001 From: George Stagg Date: Wed, 11 Sep 2024 11:56:28 +0100 Subject: [PATCH 10/12] Set highest compression level when repacking tar --- R/tar.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/tar.R b/R/tar.R index f976bab..7939928 100644 --- a/R/tar.R +++ b/R/tar.R @@ -74,7 +74,7 @@ add_tar_index <- function(file, strip = 0) { # Write output and move into place out <- tempfile() out_con <- if (gzip) { - gzfile(out, open = "wb") + gzfile(out, open = "wb", compression = 9) } else { file(out, open = "wb") } From 4d9954f5d74462711a97f5da561ab769cdc974e7 Mon Sep 17 00:00:00 2001 From: George Stagg Date: Wed, 11 Sep 2024 11:56:59 +0100 Subject: [PATCH 11/12] Early exit tar processing on existing metadata --- R/tar.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R/tar.R b/R/tar.R index 7939928..bc1454d 100644 --- a/R/tar.R +++ b/R/tar.R @@ -123,8 +123,11 @@ read_tar_offsets <- function(con, strip) { # Read tar entry header block header <- readBin(con, "raw", n = 512) - # Empty header indicates end of archive - if (all(header == 0)) { + # Basic tar filename + filename <- rawToChar(header[1:100]) + + # Empty header indicates end of archive, early exit for existing metadata + if (all(header == 0) || filename == ".vfs-index.json") { # Return connection position to just before this header seek(con, -512, origin = "current") break @@ -157,9 +160,6 @@ read_tar_offsets <- function(con, strip) { next } - # Basic tar filename - filename <- rawToChar(header[1:100]) - # Apply ustar formatted extended filename magic <- rawToChar(header[258:263]) if (magic == "ustar"){ From 866d553465111a36f9235a3546e92c4e24b987df Mon Sep 17 00:00:00 2001 From: George Stagg Date: Wed, 11 Sep 2024 11:57:25 +0100 Subject: [PATCH 12/12] Deal with hard and symbolic links in tar indexing --- R/tar.R | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/R/tar.R b/R/tar.R index 
bc1454d..108137d 100644 --- a/R/tar.R +++ b/R/tar.R @@ -187,6 +187,19 @@ read_tar_offsets <- function(con, strip) { # Calculate file offsets entry <- list(filename = filename, start = offset, end = offset + size) + + # Deal with hard and symbolic links + if (grepl("1|2", type)) { + link_name <- rawToChar(header[158:257]) + if (type == "2") { + link_name <- fs::path_norm(fs::path(fs::path_dir(filename), link_name)) + } + link_entry <- Find(\(e) e$filename == link_name, entries) + entry$start = link_entry$start + entry$end = link_entry$end + file_blocks <- 0 + } + entries <- append(entries, list(entry)) # Skip to next entry header