From 46cf6276b65dddbea220ee8a38d59a7fa2d48cfe Mon Sep 17 00:00:00 2001 From: Jim Hester Date: Thu, 7 Sep 2017 15:57:18 -0400 Subject: [PATCH] Add support for spell checking roxygen comments `roxygen2::parse_file()` parses the roxygen comments in each file. Text from relevant tags is then searched for spelling errors with `hunspell::hunspell()` to find misspelled words. Because roxygen does not store the original positions of parsed tags we then need to find the misspelled word locations in the original roxygen comment lines of the source. This is done by `find_word_positions()`. --- DESCRIPTION | 16 ++++++++---- NAMESPACE | 2 ++ R/RcppExports.R | 7 +++++ R/check-files.R | 51 +++++++++++++++++++++++++++++++++++++ R/spell-check.R | 8 ++++-- man/spell_check_files.Rd | 1 + man/spell_check_package.Rd | 1 + man/wordlist.Rd | 1 + src/.gitignore | 3 +++ src/RcppExports.cpp | 29 +++++++++++++++++++++ src/find_word_positions.cpp | 39 ++++++++++++++++++++++++++++ 11 files changed, 151 insertions(+), 7 deletions(-) create mode 100644 R/RcppExports.R create mode 100644 src/.gitignore create mode 100644 src/RcppExports.cpp create mode 100644 src/find_word_positions.cpp diff --git a/DESCRIPTION b/DESCRIPTION index c351f98..4fb82c5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -14,10 +14,16 @@ Encoding: UTF-8 LazyData: true URL: https://github.com/ropensci/spelling#readme BugReports: https://github.com/ropensci/spelling/issues -Imports: - commonmark, - xml2, - hunspell, - knitr +Imports: + commonmark, + xml2, + hunspell, + knitr, + roxygen2, + Rcpp Roxygen: list(markdown = TRUE) RoxygenNote: 6.0.1 +LinkingTo: + Rcpp +Remotes: + klutometis/roxygen diff --git a/NAMESPACE b/NAMESPACE index 78d0a37..f8f1409 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,3 +8,5 @@ export(spell_check_setup) export(spell_check_test) export(spell_check_text) export(update_wordlist) +importFrom(Rcpp,sourceCpp) +useDynLib(spelling, .registration = TRUE) diff --git a/R/RcppExports.R b/R/RcppExports.R new file 
mode 100644 index 0000000..66ca933 --- /dev/null +++ b/R/RcppExports.R @@ -0,0 +1,7 @@ +# Generated by using Rcpp::compileAttributes() -> do not edit by hand +# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +find_word_positions <- function(lines, words) { + .Call(`_spelling_find_word_positions`, lines, words) +} + diff --git a/R/check-files.R b/R/check-files.R index 3cf3322..f87215e 100644 --- a/R/check-files.R +++ b/R/check-files.R @@ -89,3 +89,54 @@ spell_check_file_plain <- function(path, format, dict){ text <- vapply(words, paste, character(1), collapse = " ") spell_check_plain(text, dict = dict) } + +#' @useDynLib spelling, .registration = TRUE +#' @importFrom Rcpp sourceCpp +spell_check_file_roxygen <- function(path, dict, global_options = list()) { + + parsed <- roxygen2::parse_file(file = path, global_options = global_options) + + lines <- readLines(path) + is_roxygen <- grep("^[[:space:]]*#+'", lines) + roxygen_lines <- lines[is_roxygen] + + # Some roxygen tags (such as param) have a name and a description, we only + # want to spell check the latter. 
+ extract_text <- function(x) { + if (is.list(x) && exists("description", x)) { + return(x[["description"]]) + } + x + } + + # roxygen tags that contain text + text_tags <- c("concept", "describeIn", "description", "details", "field", "note", "param", "return", "section", "slot", "title") + parse_block <- function(tags) { + text <- unlist(lapply(tags[names(tags) %in% text_tags], extract_text)) + if (length(text) == 0) { + return(data.frame(word = character(), line = integer(), start = integer(), stringsAsFactors = FALSE)) + } + + # blank out rd tags, tag list derived from RdTextFilter + # https://github.com/wch/r-source/blob/89ec1150299f7be62b839d5d5eb46bd9a63653bd/src/library/tools/R/Rdtools.R#L113-L126 + rd_tags <- c("S3method", "S4method", "command", "code", "docType", "email", "encoding", "file", "keyword", "link", "linkS4class", "method", "pkg", "var") + re <- paste0("\\\\(", paste0(collapse = "|", rd_tags), ")[^}]+}") + text <- blank_matches(text, re) + bad_words <- hunspell::hunspell(text, dict = dict) + res <- find_word_positions(roxygen_lines, unique(sort(unlist(bad_words)))) + + # Fix line numbers for real file. 
+ res$line <- is_roxygen[res$line] + + vapply(split(res$line, res$word), paste, character(1), collapse = ", ") + } + + unlist(lapply(parsed, parse_block)) +} + +blank_matches <- function(str, re) { + m <- gregexpr(re, str) + blanks <- function(n) strrep(" ", n) + regmatches(str, m) <- Map(blanks, lapply(regmatches(str, m), nchar)) + str +} diff --git a/R/spell-check.R b/R/spell-check.R index da69236..4f9184f 100644 --- a/R/spell-check.R +++ b/R/spell-check.R @@ -40,6 +40,10 @@ spell_check_package <- function(pkg = ".", vignettes = TRUE, lang = "en_GB", use rd_files <- list.files(file.path(pkg$path, "man"), "\\.rd$", ignore.case = TRUE, full.names = TRUE) rd_lines <- lapply(sort(rd_files), spell_check_file_rd, dict = dict) + # Check Roxygen comments + r_files <- list.files(file.path(pkg$path, "R"), "\\.R$", ignore.case = TRUE, full.names = TRUE) + r_lines <- lapply(sort(r_files), spell_check_file_roxygen, dict = dict, global_options = roxygen2::load_options(pkg$path)) + # Check 'DESCRIPTION' fields pkg_fields <- c("title", "description") pkg_lines <- lapply(pkg_fields, function(x){ @@ -47,8 +51,8 @@ spell_check_package <- function(pkg = ".", vignettes = TRUE, lang = "en_GB", use }) # Combine - all_sources <- c(rd_files, pkg_fields) - all_lines <- c(rd_lines, pkg_lines) + all_sources <- c(r_files, rd_files, pkg_fields) + all_lines <- c(r_lines, rd_lines, pkg_lines) if(isTRUE(vignettes)){ # Markdown vignettes diff --git a/man/spell_check_files.Rd b/man/spell_check_files.Rd index 0b589ad..8ae8581 100644 --- a/man/spell_check_files.Rd +++ b/man/spell_check_files.Rd @@ -32,3 +32,4 @@ spell_check_files(files) Other spelling: \code{\link{spell_check_package}}, \code{\link{wordlist}} } +\concept{spelling} diff --git a/man/spell_check_package.Rd b/man/spell_check_package.Rd index 3be27d5..afe8512 100644 --- a/man/spell_check_package.Rd +++ b/man/spell_check_package.Rd @@ -45,3 +45,4 @@ require installation of a custom dictionary, see \link[hunspell:hunspell]{hunspe Other 
spelling: \code{\link{spell_check_files}}, \code{\link{wordlist}} } +\concept{spelling} diff --git a/man/wordlist.Rd b/man/wordlist.Rd index df2e94e..35db462 100644 --- a/man/wordlist.Rd +++ b/man/wordlist.Rd @@ -37,3 +37,4 @@ they have been removed from the documentation or added to the \code{lang} dictio Other spelling: \code{\link{spell_check_files}}, \code{\link{spell_check_package}} } +\concept{spelling} diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 0000000..22034c4 --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,3 @@ +*.o +*.so +*.dll diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp new file mode 100644 index 0000000..b977f7f --- /dev/null +++ b/src/RcppExports.cpp @@ -0,0 +1,29 @@ +// Generated by using Rcpp::compileAttributes() -> do not edit by hand +// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#include + +using namespace Rcpp; + +// find_word_positions +Rcpp::DataFrame find_word_positions(CharacterVector lines, CharacterVector words); +RcppExport SEXP _spelling_find_word_positions(SEXP linesSEXP, SEXP wordsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< CharacterVector >::type lines(linesSEXP); + Rcpp::traits::input_parameter< CharacterVector >::type words(wordsSEXP); + rcpp_result_gen = Rcpp::wrap(find_word_positions(lines, words)); + return rcpp_result_gen; +END_RCPP +} + +static const R_CallMethodDef CallEntries[] = { + {"_spelling_find_word_positions", (DL_FUNC) &_spelling_find_word_positions, 2}, + {NULL, NULL, 0} +}; + +RcppExport void R_init_spelling(DllInfo *dll) { + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); +} diff --git a/src/find_word_positions.cpp b/src/find_word_positions.cpp new file mode 100644 index 0000000..6100a92 --- /dev/null +++ b/src/find_word_positions.cpp @@ -0,0 +1,39 @@ +#include +#include +using namespace Rcpp; + +// [[Rcpp::export]] +Rcpp::DataFrame 
find_word_positions(CharacterVector lines,
                    CharacterVector words) {
  /*
   * Locate whole-word occurrences of each entry of `words` inside `lines`.
   *
   * lines  Text lines to search.
   * words  Words to look for; each one is searched in every line.
   *
   * Returns a data frame with columns `word`, `line` and `start`, one row per
   * occurrence. `line` is the 1-based index into `lines`; `start` is the
   * 1-based byte offset of the occurrence within that line. A word with no
   * occurrence at all (including an empty word) yields a single row whose
   * `line` and `start` are NA.
   *
   * An occurrence only counts when the bytes immediately before and after it
   * are not alphanumeric according to isalnum(), so "cat" does not match
   * inside "concatenate". Occurrences of the same word never overlap.
   */
  std::vector<std::string> found_words;
  std::vector<int> found_lines;
  std::vector<int> found_starts;

  for (int i = 0; i < words.size(); ++i) {
    const char* word = words.at(i);
    const size_t len = strlen(word);
    bool found = false;

    // strstr() matches an empty needle at every offset, which would walk the
    // scan past the terminator; treat an empty word as "not found" instead.
    if (len > 0) {
      for (int j = 0; j < lines.size(); ++j) {
        const char* line = lines.at(j);
        const char* p = line;

        while ((p = strstr(p, word)) != NULL) {
          // isalnum() is undefined for negative values, so bytes are widened
          // through unsigned char first (matters for non-ASCII text).
          const bool starts_word =
              (p == line) || !isalnum(static_cast<unsigned char>(p[-1]));
          // p[len] is at worst the terminating NUL, which is not alphanumeric.
          const bool ends_word = !isalnum(static_cast<unsigned char>(p[len]));

          if (starts_word && ends_word) {
            found = true;
            found_words.push_back(word);
            found_lines.push_back(j + 1);
            found_starts.push_back(static_cast<int>(p - line) + 1);
            // Resume right after the hit. This lands on the terminator at the
            // latest, so the next strstr() call never reads past the string.
            p += len;
          } else {
            // Not a whole-word hit; retry from the next byte.
            ++p;
          }
        }
      }
    }

    if (!found) {
      found_words.push_back(word);
      found_lines.push_back(NA_INTEGER);
      found_starts.push_back(NA_INTEGER);
    }
  }

  return DataFrame::create(_["word"] = found_words, _["line"] = found_lines,
                           _["start"] = found_starts,
                           Rcpp::_["stringsAsFactors"] = false);
}