From 46cf6276b65dddbea220ee8a38d59a7fa2d48cfe Mon Sep 17 00:00:00 2001
From: Jim Hester <james.f.hester@gmail.com>
Date: Thu, 7 Sep 2017 15:57:18 -0400
Subject: [PATCH] Add support for spell checking roxygen comments

`roxygen2::parse_file()` parses the roxygen comments in each file. The
text of the relevant tags is then checked with `hunspell::hunspell()` to
find misspelled words. Because roxygen does not store the original
positions of parsed tags, we then need to locate the misspelled words in
the original roxygen comment lines of the source. This is done by
`find_word_positions()`.
---
 DESCRIPTION                 | 16 ++++++++----
 NAMESPACE                   |  2 ++
 R/RcppExports.R             |  7 +++++
 R/check-files.R             | 51 +++++++++++++++++++++++++++++++++++++
 R/spell-check.R             |  8 ++++--
 man/spell_check_files.Rd    |  1 +
 man/spell_check_package.Rd  |  1 +
 man/wordlist.Rd             |  1 +
 src/.gitignore              |  3 +++
 src/RcppExports.cpp         | 29 +++++++++++++++++++++
 src/find_word_positions.cpp | 39 ++++++++++++++++++++++++++++
 11 files changed, 151 insertions(+), 7 deletions(-)
 create mode 100644 R/RcppExports.R
 create mode 100644 src/.gitignore
 create mode 100644 src/RcppExports.cpp
 create mode 100644 src/find_word_positions.cpp

diff --git a/DESCRIPTION b/DESCRIPTION
index c351f98..4fb82c5 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -14,10 +14,16 @@ Encoding: UTF-8
 LazyData: true
 URL: https://github.com/ropensci/spelling#readme
 BugReports: https://github.com/ropensci/spelling/issues
-Imports:
-  commonmark,
-  xml2, 
-  hunspell,
-  knitr
+Imports: 
+    commonmark,
+    xml2,
+    hunspell,
+    knitr,
+    roxygen2,
+    Rcpp
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 6.0.1
+LinkingTo: 
+    Rcpp
+Remotes:
+  klutometis/roxygen
diff --git a/NAMESPACE b/NAMESPACE
index 78d0a37..f8f1409 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -8,3 +8,5 @@ export(spell_check_setup)
 export(spell_check_test)
 export(spell_check_text)
 export(update_wordlist)
+importFrom(Rcpp,sourceCpp)
+useDynLib(spelling, .registration = TRUE)
diff --git a/R/RcppExports.R b/R/RcppExports.R
new file mode 100644
index 0000000..66ca933
--- /dev/null
+++ b/R/RcppExports.R
@@ -0,0 +1,7 @@
+# Generated by using Rcpp::compileAttributes() -> do not edit by hand
+# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+find_word_positions <- function(lines, words) {
+    .Call(`_spelling_find_word_positions`, lines, words)  # forwards both arguments unchanged to the native routine
+}
+
diff --git a/R/check-files.R b/R/check-files.R
index 3cf3322..f87215e 100644
--- a/R/check-files.R
+++ b/R/check-files.R
@@ -89,3 +89,54 @@ spell_check_file_plain <- function(path, format, dict){
   text <- vapply(words, paste, character(1), collapse = " ")
   spell_check_plain(text, dict = dict)
 }
+
+#' @useDynLib spelling, .registration = TRUE
+#' @importFrom Rcpp sourceCpp
+spell_check_file_roxygen <- function(path, dict, global_options = list()) {
+  # Spell check the roxygen comments of one R source file.
+  #   path: file to check; dict: dictionary handed to hunspell::hunspell();
+  #   global_options: passed unchanged to roxygen2::parse_file().
+  # Returns a character vector with one entry per misspelled word per parsed
+  # block: the name is the word, the value its comma-separated line numbers.
+  parsed <- roxygen2::parse_file(file = path, global_options = global_options)
+  lines <- readLines(path)
+  is_roxygen <- grep("^[[:space:]]*#+'", lines)
+  roxygen_lines <- lines[is_roxygen]
+
+  # Some roxygen tags (such as param) have a name and a description, we only
+  # want to spell check the latter.
+  extract_text <- function(x) {
+    # Test the names directly: exists() would first coerce the list to an
+    # environment, which is an error for lists with unnamed elements.
+    if (is.list(x) && "description" %in% names(x)) x[["description"]] else x
+  }
+
+  # roxygen tags that contain text
+  text_tags <- c("concept", "describeIn", "description", "details", "field",
+    "note", "param", "return", "section", "slot", "title")
+  # rd tags to blank out, tag list derived from RdTextFilter
+  # https://github.com/wch/r-source/blob/89ec1150299f7be62b839d5d5eb46bd9a63653bd/src/library/tools/R/Rdtools.R#L113-L126
+  rd_tags <- c("S3method", "S4method", "command", "code", "docType", "email",
+    "encoding", "file", "keyword", "link", "linkS4class", "method", "pkg", "var")
+  re <- paste0("\\\\(", paste0(collapse = "|", rd_tags), ")[^}]+}")
+
+  parse_block <- function(tags) {
+    text <- unlist(lapply(tags[names(tags) %in% text_tags], extract_text))
+    # Same type as the non-empty result: a character vector.
+    if (length(text) == 0) return(character())
+    bad_words <- hunspell::hunspell(blank_matches(text, re), dict = dict)
+    res <- find_word_positions(roxygen_lines, unique(sort(unlist(bad_words))))
+    # Fix line numbers for real file (NA for words that were not located).
+    res$line <- is_roxygen[res$line]
+    vapply(split(res$line, res$word), paste, character(1), collapse = ", ")
+  }
+
+  unlist(lapply(parsed, parse_block))
+}
+
+blank_matches <- function(str, re) {  # overwrite every match of `re` with spaces
+  hits <- gregexpr(re, str)
+  widths <- lapply(regmatches(str, hits), nchar)  # character count of each match
+  regmatches(str, hits) <- lapply(widths, function(w) strrep(" ", w))
+  str  # unmatched text is untouched, so character positions are preserved
+}
diff --git a/R/spell-check.R b/R/spell-check.R
index da69236..4f9184f 100644
--- a/R/spell-check.R
+++ b/R/spell-check.R
@@ -40,6 +40,10 @@ spell_check_package <- function(pkg = ".", vignettes = TRUE, lang = "en_GB", use
   rd_files <- list.files(file.path(pkg$path, "man"), "\\.rd$", ignore.case = TRUE, full.names = TRUE)
   rd_lines <- lapply(sort(rd_files), spell_check_file_rd, dict = dict)
 
+  # Check Roxygen comments
+  r_files <- list.files(file.path(pkg$path, "R"), "\\.R$", ignore.case = TRUE, full.names = TRUE)
+  r_lines <- lapply(sort(r_files), spell_check_file_roxygen, dict = dict, global_options = roxygen2::load_options(pkg$path))
+
   # Check 'DESCRIPTION' fields
   pkg_fields <- c("title", "description")
   pkg_lines <- lapply(pkg_fields, function(x){
@@ -47,8 +51,8 @@ spell_check_package <- function(pkg = ".", vignettes = TRUE, lang = "en_GB", use
   })
 
   # Combine
-  all_sources <- c(rd_files, pkg_fields)
-  all_lines <- c(rd_lines, pkg_lines)
+  all_sources <- c(r_files, rd_files, pkg_fields)
+  all_lines <- c(r_lines, rd_lines, pkg_lines)
 
   if(isTRUE(vignettes)){
     # Markdown vignettes
diff --git a/man/spell_check_files.Rd b/man/spell_check_files.Rd
index 0b589ad..8ae8581 100644
--- a/man/spell_check_files.Rd
+++ b/man/spell_check_files.Rd
@@ -32,3 +32,4 @@ spell_check_files(files)
 Other spelling: \code{\link{spell_check_package}},
   \code{\link{wordlist}}
 }
+\concept{spelling}
diff --git a/man/spell_check_package.Rd b/man/spell_check_package.Rd
index 3be27d5..afe8512 100644
--- a/man/spell_check_package.Rd
+++ b/man/spell_check_package.Rd
@@ -45,3 +45,4 @@ require installation of a custom dictionary, see \link[hunspell:hunspell]{hunspe
 Other spelling: \code{\link{spell_check_files}},
   \code{\link{wordlist}}
 }
+\concept{spelling}
diff --git a/man/wordlist.Rd b/man/wordlist.Rd
index df2e94e..35db462 100644
--- a/man/wordlist.Rd
+++ b/man/wordlist.Rd
@@ -37,3 +37,4 @@ they have been removed from the documentation or added to the \code{lang} dictio
 Other spelling: \code{\link{spell_check_files}},
   \code{\link{spell_check_package}}
 }
+\concept{spelling}
diff --git a/src/.gitignore b/src/.gitignore
new file mode 100644
index 0000000..22034c4
--- /dev/null
+++ b/src/.gitignore
@@ -0,0 +1,3 @@
+*.o
+*.so
+*.dll
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
new file mode 100644
index 0000000..b977f7f
--- /dev/null
+++ b/src/RcppExports.cpp
@@ -0,0 +1,29 @@
+// Generated by using Rcpp::compileAttributes() -> do not edit by hand
+// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+#include <Rcpp.h>
+
+using namespace Rcpp;
+
+// find_word_positions: .Call-level wrapper around the C++ function declared next
+Rcpp::DataFrame find_word_positions(CharacterVector lines, CharacterVector words);  // defined outside this file
+RcppExport SEXP _spelling_find_word_positions(SEXP linesSEXP, SEXP wordsSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;  // holds the value handed back to R
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< CharacterVector >::type lines(linesSEXP);  // linesSEXP as CharacterVector
+    Rcpp::traits::input_parameter< CharacterVector >::type words(wordsSEXP);  // wordsSEXP as CharacterVector
+    rcpp_result_gen = Rcpp::wrap(find_word_positions(lines, words));  // run the search, wrap its data frame
+    return rcpp_result_gen;
+END_RCPP
+}
+
+static const R_CallMethodDef CallEntries[] = {  // routines callable through .Call()
+    {"_spelling_find_word_positions", (DL_FUNC) &_spelling_find_word_positions, 2},  // name, pointer, argument count
+    {NULL, NULL, 0}  // terminating entry
+};
+// Library init hook: registers CallEntries for this DLL, then switches dynamic symbol lookup off.
+RcppExport void R_init_spelling(DllInfo *dll) {
+    R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
+    R_useDynamicSymbols(dll, FALSE);
+}
diff --git a/src/find_word_positions.cpp b/src/find_word_positions.cpp
new file mode 100644
index 0000000..6100a92
--- /dev/null
+++ b/src/find_word_positions.cpp
@@ -0,0 +1,39 @@
+#include <Rcpp.h>
+#include <cctype>
+#include <cstring>
+using namespace Rcpp;
+// Find whole-word hits of every `words` entry in `lines`. One row per hit with
+// 1-based `line` and 1-based byte `start`; a word never found gets one NA row.
+// [[Rcpp::export]]
+Rcpp::DataFrame find_word_positions(CharacterVector lines,
+                                    CharacterVector words) {
+  std::vector<const char*> found_words;
+  std::vector<int> found_lines, found_starts;
+  for (int i = 0; i < words.size(); ++i) {
+    const char* word = words.at(i);
+    const size_t len = strlen(word);
+    bool found = false;
+    for (int j = 0; len > 0 && j < lines.size(); ++j) {  // "" is never scanned
+      const char* line = lines.at(j);
+      for (const char* p = line; (p = strstr(p, word)) != NULL; ++p) {
+        if (p != line && isalnum((unsigned char)p[-1])) continue;  // mid-word
+        if (!isalnum((unsigned char)p[len])) {  // casts: char may be negative
+          found = true;
+          found_words.push_back(word);
+          found_lines.push_back(j + 1);
+          found_starts.push_back((int)(p - line) + 1);
+        }
+        p += len;
+        if (*p == '\0') break;  // at the NUL: ++p would overrun the line
+      }
+    }
+    if (!found) {
+      found_words.push_back(word);
+      found_lines.push_back(NA_INTEGER);
+      found_starts.push_back(NA_INTEGER);
+    }
+  }
+  return DataFrame::create(_["word"] = found_words, _["line"] = found_lines,
+                           _["start"] = found_starts,
+                           _["stringsAsFactors"] = false);
+}