Add support for spell checking roxygen comments

`roxygen2::parse_file()` parses the roxygen comments in each file. The text of the relevant tags is then checked with `hunspell::hunspell()` to find misspelled words. Because roxygen does not store the original positions of parsed tags, we then need to locate the misspelled words in the original roxygen comment lines of the source. This is done by `find_word_positions()`.
ropensci · Sep 7, 2017 · 46cf627 · 46cf627
1 parent 7f5e3f6
commit 46cf627
Show file tree

Hide file tree

Showing 11 changed files with 151 additions and 7 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -14,10 +14,16 @@ Encoding: UTF-8
 LazyData: true
 URL: https://github.com/ropensci/spelling#readme
 BugReports: https://github.com/ropensci/spelling/issues
-Imports:
-  commonmark,
-  xml2, 
-  hunspell,
-  knitr
+Imports: 
+    commonmark,
+    xml2,
+    hunspell,
+    knitr,
+    roxygen2,
+    Rcpp
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 6.0.1
+LinkingTo: 
+    Rcpp
+Remotes:
+  klutometis/roxygen
diff --git a/NAMESPACE b/NAMESPACE
@@ -8,3 +8,5 @@ export(spell_check_setup)
 export(spell_check_test)
 export(spell_check_text)
 export(update_wordlist)
+importFrom(Rcpp,sourceCpp)
+useDynLib(spelling, .registration = TRUE)
diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -0,0 +1,7 @@
+# Generated by using Rcpp::compileAttributes() -> do not edit by hand
+# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+# R-level entry point for the compiled `find_word_positions` routine: hands
+# `lines` and `words` unchanged to `_spelling_find_word_positions` through
+# `.Call()` and returns whatever that routine returns.
+find_word_positions <- function(lines, words) {
+    .Call(`_spelling_find_word_positions`, lines, words)
+}
+
diff --git a/R/check-files.R b/R/check-files.R
@@ -89,3 +89,54 @@ spell_check_file_plain <- function(path, format, dict){
   text <- vapply(words, paste, character(1), collapse = " ")
   spell_check_plain(text, dict = dict)
 }
+
+# Spell check the roxygen comments of a single R source file.
+#
+# `path` is parsed with `roxygen2::parse_file()`, the text of the prose tags
+# of every block is checked with `hunspell::hunspell()`, and each misspelled
+# word is then located in the file's roxygen comment lines with
+# `find_word_positions()`.
+#
+# Arguments:
+#   path           - path of the R file to check.
+#   dict           - dictionary passed on to `hunspell::hunspell()`.
+#   global_options - options passed on to `roxygen2::parse_file()`.
+#
+# Returns a character vector named by misspelled word; each value is the
+# comma-separated list of line numbers in `path` where the word occurs in a
+# roxygen comment ("NA" when the word could not be located).
+#' @useDynLib spelling, .registration = TRUE
+#' @importFrom Rcpp sourceCpp
+spell_check_file_roxygen <- function(path, dict, global_options = list()) {
+
+  parsed <- roxygen2::parse_file(file = path, global_options = global_options)
+
+  lines <- readLines(path)
+  is_roxygen <- grep("^[[:space:]]*#+'", lines)
+  roxygen_lines <- lines[is_roxygen]
+
+  # Some roxygen tags (such as param) have a name and a description, we only
+  # want to spell check the latter. The names are tested directly because
+  # `exists()` would first coerce the list to an environment, which fails for
+  # lists with missing or empty names.
+  extract_text <- function(x) {
+    if (is.list(x) && "description" %in% names(x)) {
+      return(x[["description"]])
+    }
+    x
+  }
+
+  # roxygen tags that contain text
+  text_tags <- c(
+    "concept", "describeIn", "description", "details", "field", "note",
+    "param", "return", "section", "slot", "title"
+  )
+
+  # rd tags to blank out, tag list derived from RdTextFilter
+  # https://github.com/wch/r-source/blob/89ec1150299f7be62b839d5d5eb46bd9a63653bd/src/library/tools/R/Rdtools.R#L113-L126
+  rd_tags <- c(
+    "S3method", "S4method", "command", "code", "docType", "email",
+    "encoding", "file", "keyword", "link", "linkS4class", "method", "pkg",
+    "var"
+  )
+  re <- paste0("\\\\(", paste0(collapse = "|", rd_tags), ")[^}]+}")
+
+  # Misspelled words found in the text tags of one roxygen block.
+  block_bad_words <- function(tags) {
+    text <- unlist(lapply(tags[names(tags) %in% text_tags], extract_text))
+    if (length(text) == 0) {
+      return(character())
+    }
+    unlist(hunspell::hunspell(blank_matches(text, re), dict = dict))
+  }
+
+  # Every block's words are looked up in the same set of roxygen lines, so
+  # gather the words of all blocks first and search once. This also keeps a
+  # word flagged in several blocks from being reported more than once.
+  bad_words <- unique(unlist(lapply(parsed, block_bad_words), use.names = FALSE))
+  if (length(bad_words) == 0) {
+    return(character())
+  }
+  res <- find_word_positions(roxygen_lines, sort(bad_words))
+
+  # Fix line numbers for real file.
+  res$line <- is_roxygen[res$line]
+
+  vapply(split(res$line, res$word), paste, character(1), collapse = ", ")
+}
+
+# Overwrite every match of `re` in each element of `str` with spaces, one
+# space per matched character, so each string keeps its original length.
+blank_matches <- function(str, re) {
+  hits <- gregexpr(re, str)
+  spaces_for <- function(found) strrep(" ", nchar(found))
+  regmatches(str, hits) <- lapply(regmatches(str, hits), spaces_for)
+  str
+}
diff --git a/R/spell-check.R b/R/spell-check.R
@@ -40,15 +40,19 @@ spell_check_package <- function(pkg = ".", vignettes = TRUE, lang = "en_GB", use
   rd_files <- list.files(file.path(pkg$path, "man"), "\\.rd$", ignore.case = TRUE, full.names = TRUE)
   rd_lines <- lapply(sort(rd_files), spell_check_file_rd, dict = dict)
 
+  # Check Roxygen comments
+  r_files <- list.files(file.path(pkg$path, "R"), "\\.R$", ignore.case = TRUE, full.names = TRUE)
+  r_lines <- lapply(sort(r_files), spell_check_file_roxygen, dict = dict, global_options = roxygen2::load_options(pkg$path))
+
   # Check 'DESCRIPTION' fields
   pkg_fields <- c("title", "description")
   pkg_lines <- lapply(pkg_fields, function(x){
     spell_check_file_text(textConnection(pkg[[x]]), dict = dict)
   })
 
   # Combine
-  all_sources <- c(rd_files, pkg_fields)
-  all_lines <- c(rd_lines, pkg_lines)
+  all_sources <- c(r_files, rd_files, pkg_fields)
+  all_lines <- c(r_lines, rd_lines, pkg_lines)
 
   if(isTRUE(vignettes)){
     # Markdown vignettes

diff --git a/man/spell_check_files.Rd b/man/spell_check_files.Rd
diff --git a/man/spell_check_package.Rd b/man/spell_check_package.Rd
diff --git a/man/wordlist.Rd b/man/wordlist.Rd
diff --git a/src/.gitignore b/src/.gitignore
@@ -0,0 +1,3 @@
+*.o
+*.so
+*.dll
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
@@ -0,0 +1,29 @@
+// Generated by using Rcpp::compileAttributes() -> do not edit by hand
+// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+#include <Rcpp.h>
+
+using namespace Rcpp;
+
+// find_word_positions
+// Forward declaration of the C++ implementation (defined in
+// find_word_positions.cpp), followed by the shim that R reaches through
+// .Call(): it converts the two incoming SEXPs to CharacterVector, invokes the
+// implementation and wraps the resulting data frame back into a SEXP.
+Rcpp::DataFrame find_word_positions(CharacterVector lines, CharacterVector words);
+RcppExport SEXP _spelling_find_word_positions(SEXP linesSEXP, SEXP wordsSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< CharacterVector >::type lines(linesSEXP);
+    Rcpp::traits::input_parameter< CharacterVector >::type words(wordsSEXP);
+    rcpp_result_gen = Rcpp::wrap(find_word_positions(lines, words));
+    return rcpp_result_gen;
+END_RCPP
+}
+
+// Table of routines callable through .Call(): each entry gives the routine
+// name, its function pointer and its argument count (2 for
+// find_word_positions); a NULL entry terminates the table.
+static const R_CallMethodDef CallEntries[] = {
+    {"_spelling_find_word_positions", (DL_FUNC) &_spelling_find_word_positions, 2},
+    {NULL, NULL, 0}
+};
+
+// Initialisation routine for the `spelling` shared library: registers the
+// .Call table above and switches off dynamic symbol lookup for this DLL, so
+// only the registered routines can be called.
+RcppExport void R_init_spelling(DllInfo *dll) {
+    R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
+    R_useDynamicSymbols(dll, FALSE);
+}
diff --git a/src/find_word_positions.cpp b/src/find_word_positions.cpp
@@ -0,0 +1,39 @@
+#include <Rcpp.h>
+#include <cctype>
+#include <cstring>
+using namespace Rcpp;
+
+// Locate every whole-word occurrence of each entry of `words` in `lines`.
+//
+// An occurrence counts only when the bytes immediately before and after it
+// are not alphanumeric (or the occurrence touches the start or end of the
+// line), so "cat" is not reported inside "concatenate".
+//
+// Returns a data frame with one row per occurrence:
+//   word  - the word that was searched for
+//   line  - 1-based index into `lines`
+//   start - 1-based byte offset of the occurrence within that line
+// A word that never occurs (including an empty word) yields a single row
+// whose `line` and `start` are NA.
+// [[Rcpp::export]]
+Rcpp::DataFrame find_word_positions(CharacterVector lines,
+                                    CharacterVector words) {
+  // The stored pointers refer to the strings held by `words`, which outlive
+  // this function call.
+  std::vector<const char*> found_words;
+  std::vector<int> found_lines;
+  std::vector<int> found_starts;
+
+  for (int i = 0; i < words.size(); ++i) {
+    const char* word = words.at(i);
+    const size_t len = strlen(word);
+    bool found = false;
+    // An empty word would match at every position, so it is never scanned
+    // for and is reported as not found below.
+    for (int j = 0; len > 0 && j < lines.size(); ++j) {
+      const char* line = lines.at(j);
+      for (const char* p = line; (p = strstr(p, word)) != NULL; ++p) {
+        // isalnum() is only defined for values representable as unsigned
+        // char, so bytes of multi-byte characters must be cast first.
+        const bool starts_word =
+            (p == line) || !isalnum(static_cast<unsigned char>(p[-1]));
+        if (!starts_word) {
+          continue;
+        }
+        if (!isalnum(static_cast<unsigned char>(p[len]))) {
+          found = true;
+          found_words.push_back(word);
+          found_lines.push_back(j + 1);
+          found_starts.push_back(static_cast<int>(p - line) + 1);
+        }
+        p += len;
+        // The loop increment must not step over the terminating NUL,
+        // otherwise the next strstr() would read past the end of the line.
+        if (*p == '\0') {
+          break;
+        }
+      }
+    }
+    if (!found) {
+      found_words.push_back(word);
+      found_lines.push_back(NA_INTEGER);
+      found_starts.push_back(NA_INTEGER);
+    }
+  }
+  return DataFrame::create(_["word"] = found_words, _["line"] = found_lines,
+                           _["start"] = found_starts,
+                           Rcpp::_["stringsAsFactors"] = false);
+}