
Commit

Comply with Text Interchange Format for inputs
Converts all tokenization functions to S3 generics with data.frame and default methods. After some basic checking, the data frame methods coerce the corpus data frame to a named character vector, then pass it to the default method. The default methods handle either character vectors or lists, as before.

Addresses #49
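
A minimal usage sketch of the new behavior, assuming a corpus data frame in the Text Interchange Format (doc_id and text columns); the sample documents below are invented for illustration:

library(tokenizers)

corpus <- data.frame(
  doc_id = c("doc1", "doc2"),
  text = c("The quick brown fox.", "It jumps over the lazy dog."),
  stringsAsFactors = FALSE
)

# The data frame method coerces the corpus to a named character vector
# and re-dispatches to the default method, so these two calls are equivalent.
tokenize_words(corpus)
tokenize_words(setNames(corpus$text, corpus$doc_id))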
lmullen committed Mar 13, 2018
1 parent 6ebbd22 commit bd3a11c
Showing 10 changed files with 611 additions and 198 deletions.
24 changes: 24 additions & 0 deletions NAMESPACE
@@ -1,5 +1,29 @@
# Generated by roxygen2: do not edit by hand

S3method(tokenize_character_shingles,data.frame)
S3method(tokenize_character_shingles,default)
S3method(tokenize_characters,data.frame)
S3method(tokenize_characters,default)
S3method(tokenize_lines,data.frame)
S3method(tokenize_lines,default)
S3method(tokenize_ngrams,data.frame)
S3method(tokenize_ngrams,default)
S3method(tokenize_paragraphs,data.frame)
S3method(tokenize_paragraphs,default)
S3method(tokenize_ptb,data.frame)
S3method(tokenize_ptb,default)
S3method(tokenize_regex,data.frame)
S3method(tokenize_regex,default)
S3method(tokenize_sentences,data.frame)
S3method(tokenize_sentences,default)
S3method(tokenize_skip_ngrams,data.frame)
S3method(tokenize_skip_ngrams,default)
S3method(tokenize_tweets,data.frame)
S3method(tokenize_tweets,default)
S3method(tokenize_word_stems,data.frame)
S3method(tokenize_word_stems,default)
S3method(tokenize_words,data.frame)
S3method(tokenize_words,default)
export(chunk_text)
export(count_characters)
export(count_sentences)
187 changes: 154 additions & 33 deletions R/basic-tokenizers.R
@@ -53,16 +53,38 @@ NULL

#' @export
#' @rdname basic-tokenizers
tokenize_characters <- function(x, lowercase = TRUE, strip_non_alphanum = TRUE,
simplify = FALSE) {
tokenize_characters <-
function(x,
lowercase = TRUE,
strip_non_alphanum = TRUE,
simplify = FALSE) {
UseMethod("tokenize_characters")
}

#' @export
tokenize_characters.data.frame <- function(x,
lowercase = TRUE,
strip_non_alphanum = TRUE,
simplify = FALSE) {
x <- corpus_df_as_corpus_vector(x)
tokenize_characters(x, lowercase, strip_non_alphanum, simplify)
}

#' @export
tokenize_characters.default <- function(x,
lowercase = TRUE,
strip_non_alphanum = TRUE,
simplify = FALSE) {
check_input(x)
named <- names(x)
if (lowercase)
x <- stri_trans_tolower(x)
if (strip_non_alphanum)
x <- stri_replace_all_charclass(x, "[[:punct:][:whitespace:]]", "")
x <-
stri_replace_all_charclass(x, "[[:punct:][:whitespace:]]", "")
out <- stri_split_boundaries(x, type = "character")
if (!is.null(named)) names(out) <- named
if (!is.null(named))
names(out) <- named
simplify_list(out, simplify)
}

@@ -71,6 +93,24 @@ tokenize_characters <- function(x, lowercase = TRUE, strip_non_alphanum = TRUE,
tokenize_words <- function(x, lowercase = TRUE, stopwords = NULL,
strip_punct = TRUE, strip_numeric = FALSE,
simplify = FALSE) {
UseMethod("tokenize_words")
}

#' @export
tokenize_words.data.frame <- function(x,
lowercase = TRUE,
stopwords = NULL,
strip_punct = TRUE,
strip_numeric = FALSE,
simplify = FALSE) {
x <- corpus_df_as_corpus_vector(x)
tokenize_words(x, lowercase, stopwords, strip_punct, strip_numeric, simplify)
}

#' @export
tokenize_words.default <- function(x, lowercase = TRUE, stopwords = NULL,
strip_punct = TRUE, strip_numeric = FALSE,
simplify = FALSE) {
check_input(x)
named <- names(x)
if (lowercase) x <- stri_trans_tolower(x)
@@ -87,47 +127,128 @@ tokenize_words <- function(x, lowercase = TRUE, stopwords = NULL,

#' @export
#' @rdname basic-tokenizers
tokenize_sentences <- function(x, lowercase = FALSE, strip_punct = FALSE,
simplify = FALSE) {
check_input(x)
named <- names(x)
x <- stri_replace_all_charclass(x, "[[:whitespace:]]", " ")
out <- stri_split_boundaries(x, type = "sentence", skip_word_none = FALSE)
out <- lapply(out, stri_trim_both)
if (lowercase) out <- lapply(out, stri_trans_tolower)
if (strip_punct)
out <- lapply(out, stri_replace_all_charclass, "[[:punct:]]", "")
if (!is.null(named)) names(out) <- named
simplify_list(out, simplify)
}
tokenize_sentences <-
function(x,
lowercase = FALSE,
strip_punct = FALSE,
simplify = FALSE) {
UseMethod("tokenize_sentences")
}

#' @export
tokenize_sentences.data.frame <-
function(x,
lowercase = FALSE,
strip_punct = FALSE,
simplify = FALSE) {
x <- corpus_df_as_corpus_vector(x)
tokenize_sentences(x, lowercase, strip_punct, simplify)
}

#' @export
tokenize_sentences.default <-
function(x,
lowercase = FALSE,
strip_punct = FALSE,
simplify = FALSE) {
check_input(x)
named <- names(x)
x <- stri_replace_all_charclass(x, "[[:whitespace:]]", " ")
out <-
stri_split_boundaries(x, type = "sentence", skip_word_none = FALSE)
out <- lapply(out, stri_trim_both)
if (lowercase)
out <- lapply(out, stri_trans_tolower)
if (strip_punct)
out <-
lapply(out, stri_replace_all_charclass, "[[:punct:]]", "")
if (!is.null(named))
names(out) <- named
simplify_list(out, simplify)
}

#' @export
#' @rdname basic-tokenizers
tokenize_lines <- function(x, simplify = FALSE) {
UseMethod("tokenize_lines")
}

#' @export
tokenize_lines.data.frame <- function(x, simplify = FALSE) {
x <- corpus_df_as_corpus_vector(x)
tokenize_lines(x, simplify)
}

#' @export
tokenize_lines.default <- function(x, simplify = FALSE) {
check_input(x)
named <- names(x)
out <- stri_split_lines(x, omit_empty = TRUE)
if (!is.null(named)) names(out) <- named
if (!is.null(named))
names(out) <- named
simplify_list(out, simplify)
}

#' @export
#' @rdname basic-tokenizers
tokenize_paragraphs <- function(x, paragraph_break = "\n\n", simplify = FALSE) {
check_input(x)
named <- names(x)
out <- stri_split_fixed(x, pattern = paragraph_break, omit_empty = TRUE)
out <- lapply(out, stri_replace_all_charclass, "[[:whitespace:]]", " ")
if (!is.null(named)) names(out) <- named
simplify_list(out, simplify)
}
tokenize_paragraphs <-
function(x,
paragraph_break = "\n\n",
simplify = FALSE) {
UseMethod("tokenize_paragraphs")
}

#' @export
tokenize_paragraphs.data.frame <-
function(x,
paragraph_break = "\n\n",
simplify = FALSE) {
x <- corpus_df_as_corpus_vector(x)
tokenize_paragraphs(x, paragraph_break, simplify)
}

#' @export
tokenize_paragraphs.default <-
function(x,
paragraph_break = "\n\n",
simplify = FALSE) {
check_input(x)
named <- names(x)
out <-
stri_split_fixed(x, pattern = paragraph_break, omit_empty = TRUE)
out <-
lapply(out, stri_replace_all_charclass, "[[:whitespace:]]", " ")
if (!is.null(named))
names(out) <- named
simplify_list(out, simplify)
}

#' @export
#' @rdname basic-tokenizers
tokenize_regex <- function(x, pattern = "\\s+", simplify = FALSE) {
check_input(x)
named <- names(x)
out <- stri_split_regex(x, pattern = pattern, omit_empty = TRUE)
if (!is.null(named)) names(out) <- named
simplify_list(out, simplify)
}
tokenize_regex <- function(x,
pattern = "\\s+",
simplify = FALSE) {
UseMethod("tokenize_regex")
}

#' @export
tokenize_regex.data.frame <-
function(x,
pattern = "\\s+",
simplify = FALSE) {
x <- corpus_df_as_corpus_vector(x)
tokenize_regex(x, pattern, simplify)
}

#' @export
tokenize_regex.default <-
function(x,
pattern = "\\s+",
simplify = FALSE) {
check_input(x)
named <- names(x)
out <- stri_split_regex(x, pattern = pattern, omit_empty = TRUE)
if (!is.null(named))
names(out) <- named
simplify_list(out, simplify)
}
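
Every tokenizer in this file (and in the rest of the package) now follows the same three-part shape. A condensed sketch of that pattern with a hypothetical tokenize_foo, reusing the internal helpers named in the diff (check_input, corpus_df_as_corpus_vector, simplify_list):

tokenize_foo <- function(x, simplify = FALSE) {
  UseMethod("tokenize_foo")
}

tokenize_foo.data.frame <- function(x, simplify = FALSE) {
  # validate the TIF corpus, coerce it to a named character vector,
  # then re-dispatch on that vector
  x <- corpus_df_as_corpus_vector(x)
  tokenize_foo(x, simplify)
}

tokenize_foo.default <- function(x, simplify = FALSE) {
  check_input(x)
  named <- names(x)
  out <- stri_split_regex(x, "\\s+", omit_empty = TRUE)  # stand-in tokenization step
  if (!is.null(named)) names(out) <- named
  simplify_list(out, simplify)
}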
56 changes: 44 additions & 12 deletions R/character-shingles-tokenizers.R
@@ -35,19 +35,51 @@
#'
#' @export
#' @rdname shingle-tokenizers
tokenize_character_shingles <- function(x, n = 3L, n_min = n,
tokenize_character_shingles <- function(x,
n = 3L,
n_min = n,
lowercase = TRUE,
strip_non_alphanum = TRUE,
simplify = FALSE) {
check_input(x)
named <- names(x)
if (n < n_min || n_min <= 0)
stop("n and n_min must be integers, and n_min must be less than ",
"n and greater than 1.")
chars <- tokenize_characters(x, lowercase = lowercase,
strip_non_alphanum = strip_non_alphanum)
out <- generate_ngrams_batch(chars, ngram_min = n_min, ngram_max = n,
stopwords = "", ngram_delim = "")
if (!is.null(named)) names(out) <- named
simplify_list(out, simplify)
UseMethod("tokenize_character_shingles")
}

#' @export
tokenize_character_shingles.data.frame <-
function(x,
n = 3L,
n_min = n,
lowercase = TRUE,
strip_non_alphanum = TRUE,
simplify = FALSE) {
x <- corpus_df_as_corpus_vector(x)
tokenize_character_shingles(x, n, n_min, lowercase, strip_non_alphanum, simplify)
}

#' @export
tokenize_character_shingles.default <-
function(x,
n = 3L,
n_min = n,
lowercase = TRUE,
strip_non_alphanum = TRUE,
simplify = FALSE) {
check_input(x)
named <- names(x)
if (n < n_min || n_min <= 0)
stop("n and n_min must be integers, and n_min must be less than ",
"n and greater than 1.")
chars <- tokenize_characters(x, lowercase = lowercase,
strip_non_alphanum = strip_non_alphanum)
out <-
generate_ngrams_batch(
chars,
ngram_min = n_min,
ngram_max = n,
stopwords = "",
ngram_delim = ""
)
if (!is.null(named))
names(out) <- named
simplify_list(out, simplify)
}
19 changes: 19 additions & 0 deletions R/coercion.R
@@ -0,0 +1,19 @@
is_corpus_df <- function(corpus) {
stopifnot(inherits(corpus, "data.frame"),
ncol(corpus) >= 2,
all(names(corpus)[1L:2L] == c("doc_id", "text")),
is.character(corpus$doc_id),
is.character(corpus$text),
nrow(corpus) > 0)
TRUE # if it doesn't fail from the tests above then it fits the standard
}

corpus_df_as_corpus_vector <- function(corpus) {
if (is_corpus_df(corpus)) {
out <- corpus$text
names(out) <- corpus$doc_id
} else {
stop("Not a corpus data.frame")
}
out
}
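
A short, hypothetical illustration of what the new helper returns for a well-formed corpus data frame (values invented for the example):

corpus <- data.frame(doc_id = c("a", "b"),
                     text = c("First document.", "Second document."),
                     stringsAsFactors = FALSE)

corpus_df_as_corpus_vector(corpus)
#>                  a                  b
#>  "First document." "Second document."

# A data frame that does not start with character doc_id and text columns
# fails the stopifnot() checks in is_corpus_df() and raises an error.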
