
Commit

Comply with Text Interchange Format for inputs
Converts all tokenization functions to S3 generics with data.frame and default methods. After some basic checking, the data frame methods coerce the corpus data frame to a named character vector, then pass it to the default method. The default methods handle either character vectors or lists, as before.

Addresses #49
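
A minimal usage sketch of the new behavior, assuming a corpus data frame in the Text Interchange Format (doc_id and text columns); the sample documents below are invented for illustration:

library(tokenizers)

corpus <- data.frame(
  doc_id = c("doc1", "doc2"),
  text = c("The quick brown fox.", "It jumps over the lazy dog."),
  stringsAsFactors = FALSE
)

# The data frame method coerces the corpus to a named character vector
# and re-dispatches to the default method, so these two calls are equivalent.
tokenize_words(corpus)
tokenize_words(setNames(corpus$text, corpus$doc_id))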
lmullen committed Mar 13, 2018
1 parent 6ebbd22 commit bd3a11c
Showing 10 changed files with 611 additions and 198 deletions.
24 changes: 24 additions & 0 deletions NAMESPACE
@@ -1,5 +1,29 @@
# Generated by roxygen2: do not edit by hand

S3method(tokenize_character_shingles,data.frame)
S3method(tokenize_character_shingles,default)
S3method(tokenize_characters,data.frame)
S3method(tokenize_characters,default)
S3method(tokenize_lines,data.frame)
S3method(tokenize_lines,default)
S3method(tokenize_ngrams,data.frame)
S3method(tokenize_ngrams,default)
S3method(tokenize_paragraphs,data.frame)
S3method(tokenize_paragraphs,default)
S3method(tokenize_ptb,data.frame)
S3method(tokenize_ptb,default)
S3method(tokenize_regex,data.frame)
S3method(tokenize_regex,default)
S3method(tokenize_sentences,data.frame)
S3method(tokenize_sentences,default)
S3method(tokenize_skip_ngrams,data.frame)
S3method(tokenize_skip_ngrams,default)
S3method(tokenize_tweets,data.frame)
S3method(tokenize_tweets,default)
S3method(tokenize_word_stems,data.frame)
S3method(tokenize_word_stems,default)
S3method(tokenize_words,data.frame)
S3method(tokenize_words,default)
export(chunk_text)
export(count_characters)
export(count_sentences)
187 changes: 154 additions & 33 deletions R/basic-tokenizers.R
@@ -53,16 +53,38 @@ NULL

#' @export
#' @rdname basic-tokenizers
tokenize_characters <- function(x, lowercase = TRUE, strip_non_alphanum = TRUE,
simplify = FALSE) {
tokenize_characters <-
function(x,
lowercase = TRUE,
strip_non_alphanum = TRUE,
simplify = FALSE) {
UseMethod("tokenize_characters")
}

#' @export
tokenize_characters.data.frame <- function(x,
lowercase = TRUE,
strip_non_alphanum = TRUE,
simplify = FALSE) {
x <- corpus_df_as_corpus_vector(x)
tokenize_characters(x, lowercase, strip_non_alphanum, simplify)
}

#' @export
tokenize_characters.default <- function(x,
lowercase = TRUE,
strip_non_alphanum = TRUE,
simplify = FALSE) {
check_input(x)
named <- names(x)
if (lowercase)
x <- stri_trans_tolower(x)
if (strip_non_alphanum)
x <- stri_replace_all_charclass(x, "[[:punct:][:whitespace:]]", "")
x <-
stri_replace_all_charclass(x, "[[:punct:][:whitespace:]]", "")
out <- stri_split_boundaries(x, type = "character")
if (!is.null(named)) names(out) <- named
if (!is.null(named))
names(out) <- named
simplify_list(out, simplify)
}

@@ -71,6 +93,24 @@ tokenize_characters <- function(x, lowercase = TRUE, strip_non_alphanum = TRUE,
tokenize_words <- function(x, lowercase = TRUE, stopwords = NULL,
strip_punct = TRUE, strip_numeric = FALSE,
simplify = FALSE) {
UseMethod("tokenize_words")
}

#' @export
tokenize_words.data.frame <- function(x,
lowercase = TRUE,
stopwords = NULL,
strip_punct = TRUE,
strip_numeric = FALSE,
simplify = FALSE) {
x <- corpus_df_as_corpus_vector(x)
tokenize_words(x, lowercase, stopwords, strip_punct, strip_numeric, simplify)
}

#' @export
tokenize_words.default <- function(x, lowercase = TRUE, stopwords = NULL,
strip_punct = TRUE, strip_numeric = FALSE,
simplify = FALSE) {
check_input(x)
named <- names(x)
if (lowercase) x <- stri_trans_tolower(x)
@@ -87,47 +127,128 @@ tokenize_words <- function(x, lowercase = TRUE, stopwords = NULL,

#' @export
#' @rdname basic-tokenizers
tokenize_sentences <- function(x, lowercase = FALSE, strip_punct = FALSE,
simplify = FALSE) {
check_input(x)
named <- names(x)
x <- stri_replace_all_charclass(x, "[[:whitespace:]]", " ")
out <- stri_split_boundaries(x, type = "sentence", skip_word_none = FALSE)
out <- lapply(out, stri_trim_both)
if (lowercase) out <- lapply(out, stri_trans_tolower)
if (strip_punct)
out <- lapply(out, stri_replace_all_charclass, "[[:punct:]]", "")
if (!is.null(named)) names(out) <- named
simplify_list(out, simplify)
}
tokenize_sentences <-
function(x,
lowercase = FALSE,
strip_punct = FALSE,
simplify = FALSE) {
UseMethod("tokenize_sentences")
}

#' @export
tokenize_sentences.data.frame <-
function(x,
lowercase = FALSE,
strip_punct = FALSE,
simplify = FALSE) {
x <- corpus_df_as_corpus_vector(x)
tokenize_sentences(x, lowercase, strip_punct, simplify)
}

#' @export
tokenize_sentences.default <-
function(x,
lowercase = FALSE,
strip_punct = FALSE,
simplify = FALSE) {
check_input(x)
named <- names(x)
x <- stri_replace_all_charclass(x, "[[:whitespace:]]", " ")
out <-
stri_split_boundaries(x, type = "sentence", skip_word_none = FALSE)
out <- lapply(out, stri_trim_both)
if (lowercase)
out <- lapply(out, stri_trans_tolower)
if (strip_punct)
out <-
lapply(out, stri_replace_all_charclass, "[[:punct:]]", "")
if (!is.null(named))
names(out) <- named
simplify_list(out, simplify)
}

#' @export
#' @rdname basic-tokenizers
tokenize_lines <- function(x, simplify = FALSE) {
UseMethod("tokenize_lines")
}

#' @export
tokenize_lines.data.frame <- function(x, simplify = FALSE) {
x <- corpus_df_as_corpus_vector(x)
tokenize_lines(x, simplify)
}

#' @export
tokenize_lines.default <- function(x, simplify = FALSE) {
check_input(x)
named <- names(x)
out <- stri_split_lines(x, omit_empty = TRUE)
if (!is.null(named)) names(out) <- named
if (!is.null(named))
names(out) <- named
simplify_list(out, simplify)
}

#' @export
#' @rdname basic-tokenizers
tokenize_paragraphs <- function(x, paragraph_break = "\n\n", simplify = FALSE) {
check_input(x)
named <- names(x)
out <- stri_split_fixed(x, pattern = paragraph_break, omit_empty = TRUE)
out <- lapply(out, stri_replace_all_charclass, "[[:whitespace:]]", " ")
if (!is.null(named)) names(out) <- named
simplify_list(out, simplify)
}
tokenize_paragraphs <-
function(x,
paragraph_break = "\n\n",
simplify = FALSE) {
UseMethod("tokenize_paragraphs")
}

#' @export
tokenize_paragraphs.data.frame <-
function(x,
paragraph_break = "\n\n",
simplify = FALSE) {
x <- corpus_df_as_corpus_vector(x)
tokenize_paragraphs(x, paragraph_break, simplify)
}

#' @export
tokenize_paragraphs.default <-
function(x,
paragraph_break = "\n\n",
simplify = FALSE) {
check_input(x)
named <- names(x)
out <-
stri_split_fixed(x, pattern = paragraph_break, omit_empty = TRUE)
out <-
lapply(out, stri_replace_all_charclass, "[[:whitespace:]]", " ")
if (!is.null(named))
names(out) <- named
simplify_list(out, simplify)
}

#' @export
#' @rdname basic-tokenizers
tokenize_regex <- function(x, pattern = "\\s+", simplify = FALSE) {
check_input(x)
named <- names(x)
out <- stri_split_regex(x, pattern = pattern, omit_empty = TRUE)
if (!is.null(named)) names(out) <- named
simplify_list(out, simplify)
}
tokenize_regex <- function(x,
pattern = "\\s+",
simplify = FALSE) {
UseMethod("tokenize_regex")
}

#' @export
tokenize_regex.data.frame <-
function(x,
pattern = "\\s+",
simplify = FALSE) {
x <- corpus_df_as_corpus_vector(x)
tokenize_regex(x, pattern, simplify)
}

#' @export
tokenize_regex.default <-
function(x,
pattern = "\\s+",
simplify = FALSE) {
check_input(x)
named <- names(x)
out <- stri_split_regex(x, pattern = pattern, omit_empty = TRUE)
if (!is.null(named))
names(out) <- named
simplify_list(out, simplify)
}
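
Every tokenizer in this file (and in the rest of the package) now follows the same three-part shape. A condensed sketch of that pattern with a hypothetical tokenize_foo, reusing the internal helpers named in the diff (check_input, corpus_df_as_corpus_vector, simplify_list):

tokenize_foo <- function(x, simplify = FALSE) {
  UseMethod("tokenize_foo")
}

tokenize_foo.data.frame <- function(x, simplify = FALSE) {
  # validate the TIF corpus, coerce it to a named character vector,
  # then re-dispatch on that vector
  x <- corpus_df_as_corpus_vector(x)
  tokenize_foo(x, simplify)
}

tokenize_foo.default <- function(x, simplify = FALSE) {
  check_input(x)
  named <- names(x)
  out <- stri_split_regex(x, "\\s+", omit_empty = TRUE)  # stand-in tokenization step
  if (!is.null(named)) names(out) <- named
  simplify_list(out, simplify)
}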
56 changes: 44 additions & 12 deletions R/character-shingles-tokenizers.R
@@ -35,19 +35,51 @@
#'
#' @export
#' @rdname shingle-tokenizers
tokenize_character_shingles <- function(x, n = 3L, n_min = n,
tokenize_character_shingles <- function(x,
n = 3L,
n_min = n,
lowercase = TRUE,
strip_non_alphanum = TRUE,
simplify = FALSE) {
check_input(x)
named <- names(x)
if (n < n_min || n_min <= 0)
stop("n and n_min must be integers, and n_min must be less than ",
"n and greater than 1.")
chars <- tokenize_characters(x, lowercase = lowercase,
strip_non_alphanum = strip_non_alphanum)
out <- generate_ngrams_batch(chars, ngram_min = n_min, ngram_max = n,
stopwords = "", ngram_delim = "")
if (!is.null(named)) names(out) <- named
simplify_list(out, simplify)
UseMethod("tokenize_character_shingles")
}

#' @export
tokenize_character_shingles.data.frame <-
function(x,
n = 3L,
n_min = n,
lowercase = TRUE,
strip_non_alphanum = TRUE,
simplify = FALSE) {
x <- corpus_df_as_corpus_vector(x)
tokenize_character_shingles(x, n, n_min, lowercase, strip_non_alphanum, simplify)
}

#' @export
tokenize_character_shingles.default <-
function(x,
n = 3L,
n_min = n,
lowercase = TRUE,
strip_non_alphanum = TRUE,
simplify = FALSE) {
check_input(x)
named <- names(x)
if (n < n_min || n_min <= 0)
stop("n and n_min must be integers, and n_min must be less than ",
"n and greater than 1.")
chars <- tokenize_characters(x, lowercase = lowercase,
strip_non_alphanum = strip_non_alphanum)
out <-
generate_ngrams_batch(
chars,
ngram_min = n_min,
ngram_max = n,
stopwords = "",
ngram_delim = ""
)
if (!is.null(named))
names(out) <- named
simplify_list(out, simplify)
}
19 changes: 19 additions & 0 deletions R/coercion.R
@@ -0,0 +1,19 @@
is_corpus_df <- function(corpus) {
stopifnot(inherits(corpus, "data.frame"),
ncol(corpus) >= 2,
all(names(corpus)[1L:2L] == c("doc_id", "text")),
is.character(corpus$doc_id),
is.character(corpus$text),
nrow(corpus) > 0)
TRUE # if it doesn't fail from the tests above then it fits the standard
}

corpus_df_as_corpus_vector <- function(corpus) {
if (is_corpus_df(corpus)) {
out <- corpus$text
names(out) <- corpus$doc_id
} else {
stop("Not a corpus data.frame")
}
out
}
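
A short, hypothetical illustration of what the new helper returns for a well-formed corpus data frame (values invented for the example):

corpus <- data.frame(doc_id = c("a", "b"),
                     text = c("First document.", "Second document."),
                     stringsAsFactors = FALSE)

corpus_df_as_corpus_vector(corpus)
#>                  a                  b
#>  "First document." "Second document."

# A data frame that does not start with character doc_id and text columns
# fails the stopifnot() checks in is_corpus_df() and raises an error.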
