Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Comply with Text Interchange Format for inputs
Converts all tokenization functions to methods. After some basic checking, data frame methods coerce the corpus data frame to a named character vector then pass it to the default method. The default method can handle either character vectors or lists as before. Addresses #49
- Loading branch information
Showing
10 changed files
with
611 additions
and
198 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# Check that `corpus` is a corpus data frame: a data frame with at least one
# row whose first two columns are named "doc_id" and "text" (in that order)
# and are both character vectors.
#
# corpus: the object to validate.
#
# Returns TRUE when every check passes. Otherwise `stopifnot()` raises an
# error naming the first condition that failed, so this function never
# returns FALSE.
is_corpus_df <- function(corpus) {
  stopifnot(inherits(corpus, "data.frame"),
            ncol(corpus) >= 2,
            all(names(corpus)[1L:2L] == c("doc_id", "text")),
            is.character(corpus$doc_id),
            # The text column must be checked too; a numeric or factor
            # column here must not pass validation.
            is.character(corpus$text),
            nrow(corpus) > 0)
  TRUE # if it doesn't fail from the tests above then it fits the standard
}
|
||
# Convert a corpus data frame into a named character vector: the values are
# the `text` column and the names are the `doc_id` column.
#
# corpus: a data frame accepted by `is_corpus_df()`.
#
# Returns the `text` column with `doc_id` as its names. Invalid input is
# rejected by `is_corpus_df()`, which raises an error rather than returning
# FALSE, so no separate failure branch is needed here.
corpus_df_as_corpus_vector <- function(corpus) {
  is_corpus_df(corpus) # errors on anything that is not a corpus data frame
  out <- corpus$text
  names(out) <- corpus$doc_id
  out
}
Oops, something went wrong.