diff --git a/DESCRIPTION b/DESCRIPTION index 2e44920..a38eb60 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: memoise Title: Memoisation of Functions -Version: 1.1.0.9000 -Authors@R: +Version: 1.1.0.9001 +Authors@R: c(person(given = "Hadley", family = "Wickham", role = "aut", @@ -10,6 +10,10 @@ Authors@R: family = "Hester", role = c("aut", "cre"), email = "jim.hester@rstudio.com"), + person(given = "Winston", + family = "Chang", + role = "aut", + email = "winston@rstudio.com"), person(given = "Kirill", family = "Müller", role = "aut", @@ -29,13 +33,17 @@ License: MIT + file LICENSE URL: https://github.com/r-lib/memoise BugReports: https://github.com/r-lib/memoise/issues Imports: - digest (>= 0.6.3) + rlang (>= 0.4.10), + cachem Suggests: + digest, aws.s3, covr, googleAuthR, googleCloudStorageR, httr, testthat +Remotes: + r-lib/cachem Encoding: UTF-8 RoxygenNote: 7.1.1 diff --git a/NAMESPACE b/NAMESPACE index 159abbb..82acdd3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -13,5 +13,5 @@ export(is.memoized) export(memoise) export(memoize) export(timeout) -importFrom(digest,digest) +importFrom(cachem,key_missing) importFrom(stats,setNames) diff --git a/NEWS.md b/NEWS.md index adec4f2..3169a72 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,6 @@ -# Version 1.1.0.9000 +# Version 1.1.0.9001 + +* Memoise now uses caching objects from the cachem package by default. These caches support automatic pruning, so that they won't grow indefinitely. The older-style cache objects in the memoise package are still supported, but we suggest using new-style caches from cachem. (#112) * Name clashes between function arguments and variables defined when memoising no longer occur (@egnha, #43). @@ -7,10 +9,10 @@ * Add `compress` option for non-memory caches (@coolbutuseless, #71). 
-* Use absolute path in cache file system backend, so user can change working +* Use absolute path in cache file system backend, so user can change working directory after using relative path (@xhdong-umd, #51, #65) -* Add `drop_cache()` to drop the cached result for particular arguments +* Add `drop_cache()` to drop the cached result for particular arguments (@richardkunze, #78) * Suppress messages of `aws.s3::head_object` within `cache_s3`'s `cache_has_key` diff --git a/R/cache_filesystem.R b/R/cache_filesystem.R index 1fb4b0f..153f8d4 100644 --- a/R/cache_filesystem.R +++ b/R/cache_filesystem.R @@ -26,6 +26,7 @@ #' @export #' @inheritParams cache_memory cache_filesystem <- function(path, algo = "xxhash64", compress = FALSE) { + if (!(requireNamespace("digest"))) { stop("Package `digest` must be installed for `cache_filesystem()`.") } # nocov if (!dir.exists(path)) { dir.create(path, showWarnings = FALSE) diff --git a/R/cache_gcs.R b/R/cache_gcs.R index 701e9fb..7addb2d 100644 --- a/R/cache_gcs.R +++ b/R/cache_gcs.R @@ -22,6 +22,7 @@ cache_gcs <- function(cache_name = googleCloudStorageR::gcs_get_global_bucket(), algo = "sha512", compress = FALSE) { + if (!(requireNamespace("digest"))) { stop("Package `digest` must be installed for `cache_gcs()`.") } # nocov if (!(requireNamespace("googleCloudStorageR"))) { stop("Package `googleCloudStorageR` must be installed for `cache_gcs()`.") } # nocov path <- tempfile("memoise-") diff --git a/R/cache_memory.R b/R/cache_memory.R index 8f8f065..3baecf1 100644 --- a/R/cache_memory.R +++ b/R/cache_memory.R @@ -5,6 +5,7 @@ #' \code{\link[digest]{digest}} for available algorithms. 
#' @export cache_memory <- function(algo = "sha512") { + if (!(requireNamespace("digest"))) { stop("Package `digest` must be installed for `cache_memory()`.") } # nocov cache <- NULL cache_reset <- function() { diff --git a/R/cache_s3.R b/R/cache_s3.R index 975d277..fd2e74f 100644 --- a/R/cache_s3.R +++ b/R/cache_s3.R @@ -22,6 +22,7 @@ cache_s3 <- function(cache_name, algo = "sha512", compress = FALSE) { + if (!(requireNamespace("digest"))) { stop("Package `digest` must be installed for `cache_s3()`.") } # nocov if (!(requireNamespace("aws.s3"))) { stop("Package `aws.s3` must be installed for `cache_s3()`.") } # nocov if (!(aws.s3::bucket_exists(cache_name))) { @@ -61,7 +62,7 @@ cache_s3 <- function(cache_name, algo = "sha512", compress = FALSE) { cache_keys <- function() { items <- lapply(aws.s3::get_bucket(bucket = cache_name), `[[`, "Key") - unlist(Filter(Negate(is.null), items)) + as.character(unlist(Filter(Negate(is.null), items))) } list( diff --git a/R/memoise.R b/R/memoise.R index e9347ab..0b51674 100644 --- a/R/memoise.R +++ b/R/memoise.R @@ -1,3 +1,5 @@ +#' Memoise a function +#' #' \code{mf <- memoise(f)} creates \code{mf}, a memoised copy of #' \code{f}. A memoised copy is basically a #' lazier version of the same function: it saves the answers of @@ -49,18 +51,19 @@ #' } #' } #' @name memoise -#' @title Memoise a function. #' @param f Function of which to create a memoised copy. #' @param ... optional variables to use as additional restrictions on #' caching, specified as one-sided formulas (no LHS). See Examples for usage. #' @param envir Environment of the returned function. -#' @param cache Cache function. +#' @param cache Cache object. The default is a [cachem::cache_mem()] with a max +#' size of 1024 MB. +#' @param hash A function which takes an R object as input and returns a string +#' which is used as a cache key. #' @param omit_args Names of arguments to ignore when calculating hash. 
#' @seealso \code{\link{forget}}, \code{\link{is.memoised}}, #' \code{\link{timeout}}, \url{http://en.wikipedia.org/wiki/Memoization} #' @aliases memoise memoize #' @export memoise memoize -#' @importFrom digest digest #' @examples #' # a() is evaluated anew each time. memA() is only re-evaluated #' # when you call it with a new set of parameters. @@ -99,6 +102,11 @@ #' memA(2) # Still the same outcome #' memA2(2) # Different cache, different outcome #' +#' # Multiple memoized functions can share a cache. +#' cm <- cachem::cache_mem(max_size = 50 * 1024^2) +#' memA <- memoise(a, cache = cm) +#' memB <- memoise(b, cache = cm) +#' #' # Don't do the same memoisation assignment twice: a brand-new #' # memoised function also means a brand-new cache, and *that* #' # you could as easily and more legibly achieve using forget(). @@ -107,15 +115,19 @@ #' memA(2) #' memA <- memoise(a) #' memA(2) -#' # Making a memoized automatically time out after 10 seconds. -#' memA3 <- memoise(a, ~{current <- as.numeric(Sys.time()); (current - current %% 10) %/% 10 }) -#' memA3(2) #' -#' # The timeout function is an easy way to do the above. -#' memA4 <- memoise(a, ~timeout(10)) -#' memA4(2) +#' # Make a memoized result automatically time out after 10 seconds. +#' memA3 <- memoise(a, cache = cachem::cache_mem(max_age = 10)) +#' memA3(2) #' @importFrom stats setNames -memoise <- memoize <- function(f, ..., envir = environment(f), cache = cache_memory(), omit_args = c()) { +memoise <- memoize <- function( + f, + ..., + envir = environment(f), + cache = cachem::cache_mem(max_size = 1024 * 1024^2), + omit_args = c(), + hash = rlang::hash) +{ f_formals <- formals(args(f)) if(is.memoised(f)) { stop("`f` must not be memoised.", call. 
= FALSE) @@ -130,7 +142,7 @@ memoise <- memoize <- function(f, ..., envir = environment(f), cache = cache_mem called_args <- as.list(mc)[-1] # Formals with a default - default_args <- Filter(function(x) !identical(x, quote(expr = )), as.list(formals())) + default_args <- encl$`_default_args` # That has not been called default_args <- default_args[setdiff(names(default_args), names(called_args))] @@ -142,18 +154,20 @@ memoise <- memoize <- function(f, ..., envir = environment(f), cache = cache_mem args <- c(lapply(called_args, eval, parent.frame()), lapply(default_args, eval, envir = environment())) - hash <- encl$`_cache`$digest( - c(as.character(body(encl$`_f`)), args, - lapply(encl$`_additional`, function(x) eval(x[[2L]], environment(x)))) + key <- `_hash`( + c( + encl$`_f_hash`, + args, + lapply(encl$`_additional`, function(x) eval(x[[2L]], environment(x))) + ) ) - if (encl$`_cache`$has_key(hash)) { - res <- encl$`_cache`$get(hash) - } else { + res <- encl$`_cache`$get(key) + if (inherits(res, "key_missing")) { # modify the call to use the original function and evaluate it mc[[1L]] <- encl$`_f` res <- withVisible(eval(mc, parent.frame())) - encl$`_cache`$set(hash, res) + encl$`_cache`$set(key, res) } if (res$visible) { @@ -170,11 +184,28 @@ memoise <- memoize <- function(f, ..., envir = environment(f), cache = cache_mem envir <- baseenv() } + # Handle old-style memoise cache objects + if (is_old_cache(cache)) { + # Old-style caches include their own digest algorithm, so use that instead + # of whatever is passed in. + hash <- cache$digest + cache <- wrap_old_cache(cache) + } + memo_f_env <- new.env(parent = envir) + memo_f_env$`_hash` <- hash memo_f_env$`_cache` <- cache memo_f_env$`_f` <- f + # Precompute hash of function. This saves work because when this is added to + # the list of objects to hash, it doesn't need to serialize and hash the + # entire function. This does not include the environment or source refs. 
+ # The as.character() is there to ensure source refs are not included. + memo_f_env$`_f_hash` <- rlang::hash(list(formals(f), as.character(body(f)))) memo_f_env$`_additional` <- additional memo_f_env$`_omit_args` <- omit_args + # Formals with a default value + memo_f_env$`_default_args` <- Filter(function(x) !identical(x, quote(expr = )), f_formals) + environment(memo_f) <- memo_f_env class(memo_f) <- c("memoised", "function") @@ -282,7 +313,7 @@ has_cache <- function(f) { # Modify the function body of the function to simply return TRUE and FALSE # rather than get or set the results of the cache body <- body(f) - body[[10]] <- quote(if (encl$`_cache`$has_key(hash)) return(TRUE) else return(FALSE)) + body[[11]] <- quote(return(encl$`_cache`$exists(key))) body(f) <- body f @@ -309,8 +340,8 @@ drop_cache <- function(f) { # Modify the function body of the function to simply drop the key # and return TRUE if successfully removed body <- body(f) - body[[10]] <- quote(if (encl$`_cache`$has_key(hash)) { - encl$`_cache`$drop_key(hash) + body[[10]] <- quote(if (encl$`_cache`$exists(key)) { + encl$`_cache`$remove(key) return(TRUE) } else { return(FALSE) diff --git a/R/old_cache.R b/R/old_cache.R new file mode 100644 index 0000000..24a3320 --- /dev/null +++ b/R/old_cache.R @@ -0,0 +1,34 @@ +# Wrap an old-style cache so that the external API is consistent with that from +# the cache package. + +#' @importFrom cachem key_missing +wrap_old_cache <- function(x) { + if (!is_old_cache(x)) { + stop("`x` must be an old-style cache.", call. = FALSE) + } + + list( + digest = x$digest, + reset = x$reset, + set = x$set, + get = function(key) { + if (!x$has_key(key)) { + return(key_missing()) + } + x$get(key) + }, + exists = x$has_key, + remove = x$drop_key, + keys = x$keys + ) +} + +# Returns TRUE if it's an old-style cache. 
+is_old_cache <- function(x) { + is.function(x$reset) && + is.function(x$digest) && + is.function(x$set) && + is.function(x$get) && + is.function(x$has_key) && + is.function(x$drop_key) +} diff --git a/README.Rmd b/README.Rmd index d6efc5a..a12cf69 100644 --- a/README.Rmd +++ b/README.Rmd @@ -1,5 +1,7 @@ --- output: github_document +editor_options: + chunk_output_type: console --- @@ -42,14 +44,20 @@ f <- function(x) { mean(x) } mf <- memoise(f) +``` +```{r eval=FALSE} system.time(mf(1:10)) +#> user system elapsed +#> 0.002 0.000 1.003 system.time(mf(1:10)) +#> user system elapsed +#> 0.000 0.000 0.001 ``` You can clear `mf`'s cache with: -```{r} +```{r eval=FALSE} forget(mf) ``` @@ -57,16 +65,50 @@ And you can test whether a function is memoised with `is.memoised()`. ## Caches -By default, memoise uses an in-memory cache. But you can customise this with the `cache` arugment and another built-in cache: +By default, memoise uses an in-memory cache, using `cache_mem()` from the [cachem](https://github.com/r-lib/cachem) package. `cachem::cache_disk()` allows caching using files on a local filesystem. -- `cache_filesystem()` allows caching using files on a local filesystem. This is useful for preserving the cache between R sessions as well as sharing between systems when using a shared or synced files system such as Dropbox or Google Drive. +Both `cachem::cache_mem()` and `cachem::cache_disk()` support automatic pruning by default; this means that they will not keep growing past a certain size, and eventually older items will be removed from the cache. 
The default size for a `cache_mem()` is 512 MB, and the default size for a `cache_disk()` is 1 GB, but this can be customized by specifying `max_size`: - ```{r, eval = FALSE} - fc <- cache_filesystem("~/.cache") +```{r} +# 100 MB limit +cm <- cachem::cache_mem(max_size = 100 * 1024^2) - # Store in Dropbox - dbc <- cache_filesystem("~/Dropbox/.rcache") - ``` +mf <- memoise(f, cache = cm) +``` + +You can also change the maximum age of items in the cache with `max_age`: + +```{r} +# Expire items in cache after 15 minutes +cm <- cachem::cache_mem(max_age = 15 * 60) + +mf <- memoise(f, cache = cm) +``` + +By default, a `cache_disk()` uses a subdirectory of the R process's temp directory, but it is possible to specify the directory. This is useful for persisting a cache across R sessions, sharing a cache among different processes, or even for synchronizing across the network. + +```{r, eval = FALSE} +# Store in "R-myapp" directory inside of user-level cache directory +cd <- cachem::cache_disk(rappdirs::user_cache_dir("R-myapp")) + +# Store in Dropbox +cdb <- cachem::cache_disk("~/Dropbox/.rcache") +``` + +A single cache object can be shared among multiple memoised functions. By default, the cache key includes not only the arguments to the function, but also the body of the function. This essentially eliminates the possibility of a cache collision, even if two memoised functions are called with the same arguments. + +```{r} +m <- cachem::cache_mem() + +times2 <- memoise(function(x) { x * 2 }, cache = m) +times4 <- memoise(function(x) { x * 4 }, cache = m) + +times2(10) +times4(10) +``` + + +Before version 1.2, memoise used different caching objects, which did not have automatic pruning and had a slightly different API. These caching objects can still be used, but we recommend using the caching objects from cachem when possible. The following cache objects do not currently have an equivalent in cachem. 
- `cache_s3()` allows caching on [Amazon S3](https://aws.amazon.com/s3/) Requires you to specify a bucket using `cache_name`. When creating buckets, they must be unique among all s3 users when created. diff --git a/README.md b/README.md index 5e7eada..47db549 100644 --- a/README.md +++ b/README.md @@ -37,40 +37,91 @@ f <- function(x) { mean(x) } mf <- memoise(f) +``` +``` r system.time(mf(1:10)) -#> user system elapsed -#> 0.000 0.000 1.003 +#> user system elapsed +#> 0.002 0.000 1.003 system.time(mf(1:10)) -#> user system elapsed -#> 0.031 0.001 0.032 +#> user system elapsed +#> 0.000 0.000 0.001 ``` You can clear `mf`’s cache with: ``` r forget(mf) -#> [1] TRUE ``` And you can test whether a function is memoised with `is.memoised()`. ## Caches -By default, memoise uses an in-memory cache. But you can customise this -with the `cache` arugment and another built-in cache: +By default, memoise uses an in-memory cache, using `cache_mem()` from +the [cachem](https://github.com/r-lib/cachem) package. +`cachem::cache_disk()` allows caching using files on a local filesystem. - - `cache_filesystem()` allows caching using files on a local - filesystem. This is useful for preserving the cache between R - sessions as well as sharing between systems when using a shared or - synced files system such as Dropbox or Google Drive. - - ``` r - fc <- cache_filesystem("~/.cache") - - # Store in Dropbox - dbc <- cache_filesystem("~/Dropbox/.rcache") - ``` +Both `cachem::cache_mem()` and `cachem::cache_disk()` support automatic +pruning by default; this means that they will not keep growing past a +certain size, and eventually older items will be removed from the cache. 
+The default size for a `cache_mem()` is 512 MB, and the default size for a +`cache_disk()` is 1 GB, but this can be customized by specifying +`max_size`: + +``` r +# 100 MB limit +cm <- cachem::cache_mem(max_size = 100 * 1024^2) + +mf <- memoise(f, cache = cm) +``` + +You can also change the maximum age of items in the cache with +`max_age`: + +``` r +# Expire items in cache after 15 minutes +cm <- cachem::cache_mem(max_age = 15 * 60) + +mf <- memoise(f, cache = cm) +``` + +By default, a `cache_disk()` uses a subdirectory of the R process’s temp +directory, but it is possible to specify the directory. This is useful +for persisting a cache across R sessions, sharing a cache among +different processes, or even for synchronizing across the network. + +``` r +# Store in "R-myapp" directory inside of user-level cache directory +cd <- cachem::cache_disk(rappdirs::user_cache_dir("R-myapp")) + +# Store in Dropbox +cdb <- cachem::cache_disk("~/Dropbox/.rcache") +``` + +A single cache object can be shared among multiple memoised functions. +By default, the cache key includes not only the arguments to the +function, but also the body of the function. This essentially eliminates +the possibility of a cache collision, even if two memoised functions are +called with the same arguments. + +``` r +m <- cachem::cache_mem() + +times2 <- memoise(function(x) { x * 2 }, cache = m) +times4 <- memoise(function(x) { x * 4 }, cache = m) + +times2(10) +#> [1] 20 +times4(10) +#> [1] 40 +``` + +Before version 1.2, memoise used different caching objects, which did +not have automatic pruning and had a slightly different API. These +caching objects can still be used, but we recommend using the caching +objects from cachem when possible. The following cache objects do not +currently have an equivalent in cachem. 
- `cache_s3()` allows caching on [Amazon S3](https://aws.amazon.com/s3/) Requires you to specify a bucket diff --git a/man/memoise.Rd b/man/memoise.Rd index 883ef0a..341b316 100644 --- a/man/memoise.Rd +++ b/man/memoise.Rd @@ -3,14 +3,15 @@ \name{memoise} \alias{memoise} \alias{memoize} -\title{Memoise a function.} +\title{Memoise a function} \usage{ memoise( f, ..., envir = environment(f), - cache = cache_memory(), - omit_args = c() + cache = cachem::cache_mem(max_size = 1024 * 1024^2), + omit_args = c(), + hash = rlang::hash ) } \arguments{ @@ -21,9 +22,13 @@ caching, specified as one-sided formulas (no LHS). See Examples for usage.} \item{envir}{Environment of the returned function.} -\item{cache}{Cache function.} +\item{cache}{Cache object. The default is a [cachem::cache_mem()] with a max +size of 1024 MB.} \item{omit_args}{Names of arguments to ignore when calculating hash.} + +\item{hash}{A function which takes an R object as input and returns a string +which is used as a cache key.} } \description{ \code{mf <- memoise(f)} creates \code{mf}, a memoised copy of @@ -116,6 +121,11 @@ memA2 <- memoise(a) memA(2) # Still the same outcome memA2(2) # Different cache, different outcome +# Multiple memoized functions can share a cache. +cm <- cachem::cache_mem(max_size = 50 * 1024^2) +memA <- memoise(a, cache = cm) +memB <- memoise(b, cache = cm) + # Don't do the same memoisation assignment twice: a brand-new # memoised function also means a brand-new cache, and *that* # you could as easily and more legibly achieve using forget(). @@ -124,13 +134,10 @@ memA2(2) # Different cache, different outcome memA(2) memA <- memoise(a) memA(2) -# Making a memoized automatically time out after 10 seconds. -memA3 <- memoise(a, ~{current <- as.numeric(Sys.time()); (current - current \%\% 10) \%/\% 10 }) -memA3(2) -# The timeout function is an easy way to do the above. -memA4 <- memoise(a, ~timeout(10)) -memA4(2) +# Make a memoized result automatically time out after 10 seconds. 
+memA3 <- memoise(a, cache = cachem::cache_mem(max_age = 10)) +memA3(2) } \seealso{ \code{\link{forget}}, \code{\link{is.memoised}}, diff --git a/tests/testthat/test-memoise.R b/tests/testthat/test-memoise.R index 46edb01..bb26a4a 100644 --- a/tests/testthat/test-memoise.R +++ b/tests/testthat/test-memoise.R @@ -111,6 +111,69 @@ test_that("symbol collision", { expect_equal(cachem(), 5) }) +test_that("different body avoids collisions", { + # Same args, different body + m <- cachem::cache_mem() + times2 <- memoise(function(x) { x * 2 }, cache = m) + times4 <- memoise(function(x) { x * 4 }, cache = m) + + expect_identical(times2(10), 20) + expect_equal(m$size(), 1) + expect_identical(times4(10), 40) + expect_equal(m$size(), 2) +}) + +test_that("different formals avoids collisions", { + # Different formals (even if not used) avoid collisions, because formals + # are used in key. + m <- cachem::cache_mem() + f <- function(x, y) { x * 2 } + times2 <- memoise(function(x, y) { x * 2 }, cache = m) + times2a <- memoise(function(x, y = 1) { x * 2 }, cache = m) + + expect_identical(times2(10), 20) + expect_equal(m$size(), 1) + expect_identical(times2a(10), 20) + expect_equal(m$size(), 2) +}) + +test_that("same body results in collisions", { + # Two identical memoised functions should result in cache hits so that cache + # can be shared more easily. + # https://github.com/r-lib/memoise/issues/58 + m <- cachem::cache_mem() + times2 <- memoise(function(x, y) { x * 2 }, cache = m) + times2a <- memoise(function(x, y) { x * 2 }, cache = m) + + expect_identical(times2(10), 20) + expect_identical(times2a(10), 20) + expect_equal(m$size(), 1) +}) + +test_that("same body results in collisions", { + # Even though t2 and t4 produce different results, the memoised versions, + # times2 and times4, have cache collisions because the functions have the same + # body and formals. It would be nice if we could somehow avoid this. 
+ m <- cachem::cache_mem() + + t2 <- local({ + n <- 2 + function(x) x * n + }) + t4 <- local({ + n <- 4 + function(x) x * n + }) + + times2 <- memoise(t2, cache = m) + times4 <- memoise(t4, cache = m) + + expect_identical(times2(10), 20) + expect_identical(times4(10), 20) # Bad (but expected) cache collision! + expect_equal(m$size(), 1) +}) + + test_that("visibility", { vis <- function() NULL invis <- function() invisible()