Merge pull request #48 from reconhub/fix-default-warning

Fix default warning
reconhub · Mar 1, 2019 · 6d10845 · 6d10845
2 parents 8d527ae + 4cb86f0
commit 6d10845
Show file tree

Hide file tree

Showing 13 changed files with 192 additions and 18 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: linelist
 Title: Tools to Import and Tidy Case Linelist Data
-Version: 0.8.0.9000
+Version: 0.8.1.9000
 Authors@R: c(person("Thibaut", "Jombart", email = "thibautjombart@gmail.com", role = c("aut", "cre")),
     person("Zhian N.", "Kamvar", email = "zkamvar@gmail.com", role = c("aut")))
 Description: A collection of wrappers for importing case linelist data from usual formats, and tools for cleaning data, detecting dates, and storing meta-information on the content of the resulting \code{data.frame}.  

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,12 @@
+# linelist 0.8.1.9000
+
+* `clean_spelling()` will no longer throw a warning if there is no value for
+  .default to replace.
+* `clean_variable_spelling()`, `clean_variables()`, and `clean_data()` gain the
+  `warn` and `warn_spelling` arguments which will capture all errors and
+  warnings issued from `clean_spelling()` for each variable. 
+  See https://github.com/reconhub/linelist/pull/48 for details).  
+
 # linelist 0.8.0.9000
 
 * `compare_data()` allows users to compare _structural changes_ to data frames

diff --git a/R/clean_data.R b/R/clean_data.R
@@ -74,8 +74,9 @@
 
 
 clean_data <- function(x, sep = "_", force_Date = TRUE, guess_dates = TRUE, 
-                       error_tolerance = 0.5, wordlists = NULL, spelling_vars = 3, 
-                       sort_by = NULL, protect = FALSE, ...) {
+                       error_tolerance = 0.5, wordlists = NULL, 
+                       spelling_vars = 3, sort_by = NULL, warn_spelling = FALSE,
+                       protect = FALSE, ...) {
 
   xname <- deparse(substitute(x))
   if (!is.data.frame(x)) {
@@ -100,6 +101,7 @@ clean_data <- function(x, sep = "_", force_Date = TRUE, guess_dates = TRUE,
                          wordlists = wordlists,
                          spelling_vars = spelling_vars,
                          sort_by = sort_by,
+                         warn_spelling = warn_spelling,
                          classes = classes)
 
   # Cleaning and guessing dates ------------------------------------------------

diff --git a/R/clean_spelling.R b/R/clean_spelling.R
@@ -14,6 +14,11 @@
 #' @param quiet a `logical` indicating if warnings should be issued if no
 #'   replacement is made; if `FALSE`, these warnings will be disabled
 #' 
+#' @param warn_default a `logical`. When a `.default` keyword is set and 
+#'   `warn_default = TRUE`, a warning will be issued listing the variables
+#'   that were changed to the default value. This can be used to update your
+#'   wordlist.
+#' 
 #'
 #' @details 
 #'
@@ -57,8 +62,13 @@
 #' # You can also set a default value
 #' corrections_with_default <- rbind(corrections, c(bad = ".default", good = "unknown"))
 #' corrections_with_default
+#' 
+#' # a warning will be issued about the data that were converted
 #' clean_spelling(my_data, corrections_with_default)
 #'
+#' # use the warn_default = FALSE, if you are absolutely sure you don't want it.
+#' clean_spelling(my_data, corrections_with_default, warn_default = FALSE)
+#'
 #' # The function will give you a warning if the wordlist does not
 #' # match the data
 #' clean_spelling(letters, corrections)
@@ -76,7 +86,7 @@
 #' @importFrom rlang "!!!"
 
 clean_spelling <- function(x = character(), wordlist = data.frame(),
-                           quiet = FALSE) {
+                           quiet = FALSE, warn_default = TRUE) {
 
   if (length(x) == 0 || !is.atomic(x)) {
     stop("x must be coerceable to a character")
@@ -176,7 +186,15 @@ clean_spelling <- function(x = character(), wordlist = data.frame(),
 
   # Replace any untranslated variables if .default is defined -----------------
   if (length(default) > 0) {
-    x <- forcats::fct_other(x, keep = c(names(dict), names(nas)), other = names(default))
+    default_vars <- levels(x)[!levels(x) %in% c(names(dict), names(nas))]
+    if (warn_default && length(default_vars) > 0) {
+      was <- if (length(default_vars) > 1) "were" else "was"
+      msg <- "'%s' %s changed to the default value ('%s')"
+      warning(sprintf(msg, paste(default_vars, collapse = "', '"), was, names(default)))
+    }
+    suppressWarnings({
+      x <- forcats::fct_other(x, keep = c(names(dict), names(nas)), other = names(default))
+    })
   }
 
   # Make sure order is preserved if it's a factor -----------------------------

diff --git a/R/clean_variable_spelling.R b/R/clean_variable_spelling.R
@@ -20,6 +20,9 @@
 #'   each data frame. If the incoming variables are factors, this determines how
 #'   the resulting factors will be sorted.
 #' 
+#' @param warn if `TRUE`, warnings and errors from [clean_spelling()] will be 
+#'   shown as a single warning. Defaults to `FALSE`, which shows nothing.
+#'
 #' @inheritParams clean_variable_labels
 #' 
 #'
@@ -131,7 +134,7 @@
 #' head(res)
 #' as.list(head(res))
 
-clean_variable_spelling <- function(x = data.frame(), wordlists = list(), spelling_vars = 3, sort_by = NULL, classes = NULL) {
+clean_variable_spelling <- function(x = data.frame(), wordlists = list(), spelling_vars = 3, sort_by = NULL, classes = NULL, warn = FALSE) {
 
   if (length(x) == 0 || !is.data.frame(x)) {
     stop("x must be a data frame")
@@ -154,11 +157,11 @@ clean_variable_spelling <- function(x = data.frame(), wordlists = list(), spelli
     # There is a spelling_varsing column ----------------------------------------
     if (!is.null(spelling_vars) && length(spelling_vars) == 1) {
       is_number <- is.numeric(spelling_vars) &&          # spelling_vars is a number
-                   as.integer(spelling_vars) == spelling_vars && # ... and an integer
-                   spelling_vars <= ncol(wordlists)      # ... and is within the bounds
+        as.integer(spelling_vars) == spelling_vars && # ... and an integer
+          spelling_vars <= ncol(wordlists)      # ... and is within the bounds
 
       is_name   <- is.character(spelling_vars) &&         # spelling_vars is a name
-                   any(names(wordlists) == spelling_vars) # ... in the wordlists
+        any(names(wordlists) == spelling_vars) # ... in the wordlists
       if (is_number || is_name) {
         wordlists <- split(wordlists, wordlists[[spelling_vars]])
       } else {
@@ -223,11 +226,29 @@ clean_variable_spelling <- function(x = data.frame(), wordlists = list(), spelli
       to_iterate <- unique(c(to_iterate, unprotected))
     }
   }
+
+  # Prepare warning/error labels ---------------------------------------------
+  warns <- vector(mode = "list", length = length(to_iterate)) -> errs
+  iter_print <- gsub(" ", "_", format(to_iterate))
+  names(iter_print) <- to_iterate
+
   # Loop over the variables and clean spelling --------------------------------
   for (i in to_iterate) {
     d <- if (ddf) wordlists else wordlists[[i]] 
     d <- if (is.null(d)) global_words else d
-    try(x[[i]] <- clean_spelling(x[[i]], d, quiet = TRUE))
+    # Evaluate and collect any warnings/errors that pop up
+    w          <- withWarnings(clean_spelling(x[[i]], d, quiet = FALSE))
+    x[[i]]     <- if(is.null(w$val)) x[[i]] else w$val
+    if (warn) {
+      warns[[i]] <- collect_ya_errs(w$warnings, iter_print[i])
+      errs[[i]]  <- collect_ya_errs(w$errors, iter_print[i])
+    }
+  }
+
+  # Process warnings and errors and give a warning if there were any
+  if (warn) {
+    wemsg <- process_werrors(warns, errs)
+    if (!is.null(wemsg)) warning(wemsg)
   }
 
   x

diff --git a/R/clean_variables.R b/R/clean_variables.R
@@ -8,11 +8,17 @@
 #' @param protect a logical or numeric vector defining the columns to protect
 #'   from any manipulation. Note: columns in `protect` will override any columns
 #'   in either `force_Date` or `guess_dates`.
+#' 
+#' @param warn_spelling if `TRUE`, errors and warnings from [clean_spelling()]
+#'   will be aggregated and presented for each column that issues them. The
+#'   default value is `FALSE`, which means that all errors and warnings will be
+#'   ignored. 
 #'
 #' @export 
 #' @author Zhian N. Kamvar
 #' @seealso [clean_variable_labels()] to standardise text, 
 #'   [clean_variable_spelling()] to correct spelling with a wordlist.
+#'
 #' @inheritParams clean_variable_spelling
 #'
 #' @examples
@@ -45,7 +51,8 @@
 #'                 spelling_vars = "variable"
 #'                )
 clean_variables <- function(x, sep = "_", wordlists = NULL, spelling_vars = 3, 
-                            sort_by = NULL, protect = FALSE, classes = NULL) {
+                            sort_by = NULL, protect = FALSE, classes = NULL,
+                            warn_spelling = FALSE) {
 
   xname <- deparse(substitute(x))
   if (!is.data.frame(x)) {
@@ -71,6 +78,7 @@ clean_variables <- function(x, sep = "_", wordlists = NULL, spelling_vars = 3,
                                    wordlists = wordlists, 
                                    spelling_vars = spelling_vars, 
                                    sort_by = sort_by,
+                                   warn    = warn_spelling,
                                    classes = classes)
   }
 

diff --git a/R/warnings_errors.R b/R/warnings_errors.R
@@ -0,0 +1,45 @@
+
+withWarnings <- function(expr) {
+  myWarnings <- NULL
+  myErrors   <- NULL
+  wHandler <- function(w) {
+    myWarnings <<- c(myWarnings, list(w))
+    invokeRestart("muffleWarning")
+  }
+  eHandler <- function(e) {
+    myErrors <<- c(myErrors, list(e))
+    NULL
+  }
+  val <- withCallingHandlers(tryCatch(expr, error = eHandler), warning = wHandler)
+  list(value = val, warnings = myWarnings, errors = myErrors)
+}
+
+collect_ya_errs <- function(e, fmt) {
+  if (is.null(e)) return(NULL)
+  warn <- vapply(e, "[[", character(1), "message")
+  warn <- paste0(" ", warn, collapse = "\n  ....")
+  paste(sprintf("  %s__:\n  ....%s", fmt, warn), collapse = "\n")
+}
+
+process_werrors <- function(warns, errs) {
+  warns <- warns[lengths(warns) > 0]
+  errs  <- errs[lengths(errs) > 0]
+  warned <- length(warns) > 0
+  errored <- length(errs) > 0
+  if (warned || errored) {
+
+    wrn <- "" -> err
+    if (warned) {
+      wngs <- do.call("paste", c(warns, sep = "\n"))
+      wrn <- sprintf("The following warnings were found...\n%s", wngs)
+    }
+    if (errored) {
+      errs <- do.call("paste", c(errs, sep = "\n"))
+      err  <- sprintf("The following errors were found:\n%s", errs)
+    }
+    res <- sprintf("%s\n%s", wrn, err)
+    if (res == "\n") NULL else res
+  } else {
+    NULL
+  }
+}
diff --git a/man/clean_data.Rd b/man/clean_data.Rd
diff --git a/man/clean_spelling.Rd b/man/clean_spelling.Rd
diff --git a/man/clean_variable_spelling.Rd b/man/clean_variable_spelling.Rd
diff --git a/man/clean_variables.Rd b/man/clean_variables.Rd
diff --git a/tests/testthat/test-clean_spelling.R b/tests/testthat/test-clean_spelling.R
@@ -71,10 +71,25 @@ test_that("clean_spelling will throw a warning if there are duplicated keys", {
 
 test_that("clean_spelling will take in a default value", {
 
-  with_default <- rbind(corrections, c(bad = ".default", good = "unknown"))
+  with_default <- rbind(corrections, c(bad = ".default", good = "check me"))
   cleaned_default <- cleaned_data
-  cleaned_default[!cleaned_default %in% with_default$good] <- "unknown"
-  expect_identical(clean_spelling(my_data, with_default), cleaned_default)
+  cleaned_default[!cleaned_default %in% with_default$good] <- "check me"
+  d_warn <- "'a', 'b', 'c', 'd', 'e', 'fumar' were changed to the default value \\('check me'\\)"
+  expect_warning({
+    my_cleaned <- clean_spelling(my_data, with_default) 
+  }, d_warn)
+  expect_identical(my_cleaned, cleaned_default)
+
+})
+
+
+test_that("nothing to default will not throw a warning", {
+
+
+  with_default <- rbind(corrections, c(bad = ".default", good = "unknown"))
+  cleaned_default <- cleaned_data[6:7]
+  expect_failure(expect_warning(clean_spelling(my_data[6:7], with_default)))
+
 
 })