preparing compliance with ROpenSci onboarding

ropenscilabs · Mar 14, 2018 · c495b51 · c495b51
1 parent bd1182c
commit c495b51
Show file tree

Hide file tree

Showing 11 changed files with 136 additions and 17 deletions.
diff --git a/CONDUCT.md b/CONDUCT.md
@@ -0,0 +1,25 @@
+# Contributor Code of Conduct
+
+As contributors and maintainers of this project, we pledge to respect all people who 
+contribute through reporting issues, posting feature requests, updating documentation,
+submitting pull requests or patches, and other activities.
+
+We are committed to making participation in this project a harassment-free experience for
+everyone, regardless of level of experience, gender, gender identity and expression,
+sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion.
+
+Examples of unacceptable behavior by participants include the use of sexual language or
+imagery, derogatory comments or personal attacks, trolling, public or private harassment,
+insults, or other unprofessional conduct.
+
+Project maintainers have the right and responsibility to remove, edit, or reject comments,
+commits, code, wiki edits, issues, and other contributions that are not aligned to this 
+Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed 
+from the project team.
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by 
+opening an issue or contacting one or more of the project maintainers.
+
+This Code of Conduct is adapted from the Contributor Covenant 
+(http://contributor-covenant.org), version 1.0.0, available at 
+http://contributor-covenant.org/version/1/0/0/
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,17 @@
 Package: tif
 Type: Package
 Title: Text Interchange Format
-Version: 0.2
-Author: rOpenSci Text Workshop
+Version: 0.3
+Authors@R: c(person("Taylor", "Arnold", role = c("aut", "cre"),
+                     email = "taylor.arnold@acm.org"),
+              person("Ken", "Benoit", role = "aut",
+                     email = "k.r.benoit@lse.ac.uk"),
+              person("Lincoln", "Mullen", role = "aut",
+                     email = "lmullen@gmu.edu"),
+              person("Adam", "Obeng", role = "aut",
+                     email = "contact@adamobeng.com"),
+              person("rOpenSci Text Workshop Participants (2017)",
+                     role = "aut"))
 Maintainer: Taylor B. Arnold <taylor.arnold@acm.org>
 Description: Provides validation functions for common
     interchange formats for representing text data in R.

diff --git a/NAMESPACE b/NAMESPACE
@@ -10,3 +10,4 @@ export(tif_is_dtm)
 export(tif_is_tokens_df)
 export(tif_is_tokens_list)
 import(Matrix)
+importFrom(Matrix,Matrix)
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,9 @@
+# tif 0.3.0
+
+* Further discussion has led us to simplify the corpus and token data frame
+formats. The doc_id, text, and token columns can be in any position within the
+data frame.
+
 # tif 0.2.0
 
 * After a round of input for the initial version of the specification,
@@ -30,11 +36,9 @@ a list-based tokens object.
 
 * The old validate functions have been renamed `tif_is_corpus_df`,
 `tif_is_dtm` and `tif_is_tokens_df`. This is more in line with base-R
-functions and seperates the "df" version of the corpus and tokens from
+functions and separates the "df" version of the corpus and tokens from
 the alternative new forms.
 
-
-
 # tif 0.1.0
 
 * This is the initial implementation of the ideas discussed at

diff --git a/R/coercion.R b/R/coercion.R
@@ -41,7 +41,7 @@ tif_as_corpus_df <- function(corpus) {
   if (!inherits(corpus, "data.frame")) {
     # Need to convert from character
     if (is.null(names(corpus))) {
-      doc_id <- sprintf("doc%d", 1:length(corpus))
+      doc_id <- sprintf("doc%d", seq_along(corpus))
     } else {
       doc_id <- names(corpus)
     }
@@ -60,7 +60,7 @@ tif_as_tokens_df <- function(tokens) {
   if (!inherits(tokens, "data.frame")) {
     # Need to convert from list to data frame
     if (is.null(names(tokens))) {
-      doc_id <- sprintf("doc%d", 1:length(tokens))
+      doc_id <- sprintf("doc%d", seq_along(tokens))
     } else {
       doc_id <- names(tokens)
     }

diff --git a/R/validators.R b/R/validators.R
@@ -114,8 +114,9 @@ tif_is_corpus_character <- function(corpus, warn = FALSE) {
     return(FALSE)
   }
 
-  if (!is.null(attributes(corpus)) && any(names(attributes(corpus)) != "names")) {
-    if (warn) warning("corpus object should have no attributes other than 'names'")
+  if (!is.null(attributes(corpus)) &&
+      any(names(attributes(corpus)) != "names")) {
+    if (warn) warning("corpus object should only have 'names' attribute")
     return(FALSE)
   }
 
@@ -166,6 +167,7 @@ tif_is_corpus_character <- function(corpus, warn = FALSE) {
 #' it may not contain row or column names.
 #'
 #' @example inst/examples/tif_is_dtm.R
+#' @importFrom Matrix Matrix
 #' @export
 tif_is_dtm <- function(dtm, warn = FALSE) {
 
@@ -323,8 +325,9 @@ tif_is_tokens_list <- function(tokens, warn = FALSE) {
     return(FALSE)
   }
 
-  if (!is.null(attributes(tokens)) && any(names(attributes(tokens)) != "names")) {
-    if (warn) warning("tokens object should have no attributes other than 'names'")
+  if (!is.null(attributes(tokens)) &&
+      any(names(attributes(tokens)) != "names")) {
+    if (warn) warning("tokens object should only have 'names' attribute")
     return(FALSE)
   }
 
@@ -333,18 +336,20 @@ tif_is_tokens_list <- function(tokens, warn = FALSE) {
     return(FALSE)
   }
 
-  if (any(sapply(tokens, is.null))) {
+  if (any(unlist(lapply(tokens, is.null)))) {
     if (warn) warning("no elements of tokens should be 'NULL'")
     return(FALSE)
   }
 
-  if (!all(sapply(tokens, is.character))) {
+  if (!all(unlist(lapply(tokens, is.character)))) {
     if (warn) warning("elements of tokens should all be a character vectors")
     return(FALSE)
   }
 
-  if (!all(sapply(lapply(tokens, attributes), is.null))) {
-    if (warn) warning("elements of tokens should have no additional attributes")
+  if (!all(unlist(lapply(lapply(tokens, attributes), is.null)))) {
+    if (warn) {
+      warning("elements of tokens should have no additional attributes")
+    }
     return(FALSE)
   }
 

diff --git a/README.md b/README.md
@@ -1,4 +1,64 @@
-### Text Interchange Formats
+## tif: Text Interchange Formats
+
+[![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/statsmaths/tif?branch=master&svg=true)](https://ci.appveyor.com/project/statsmaths/tif) [![Travis-CI Build Status](https://travis-ci.org/ropensci/tif.svg?branch=master)](https://travis-ci.org/ropensci/tif)
+
+This package describes and validates formats for storing
+common objects arising in text analysis as native R objects.
+Representations of a text corpus, document term matrix, and
+tokenized text are included. The tokenized text format is
+extensible to include other annotations. There are two versions
+of the corpus and tokens objects; packages should accept
+both and return or coerce to at least one of these.
+
+## Installation
+
+
+
+## Installation
+
+You can install the development version using devtools:
+
+```{r}
+devtools::install_github("ropensci/tif")
+```
+
+## Usage
+
+The package can be used to check that a particular object is in a valid 
+format. For example, here we see that the object `corpus` is a valid corpus
+data frame:
+
+```{r}
+library(tif)
+corpus <- data.frame(doc_id = c("doc1", "doc2", "doc3"),
+                     text = c("Aujourd'hui, maman est morte.",
+                      "It was a pleasure to burn.",
+                      "All this happened, more or less."),
+                     stringsAsFactors = FALSE)
+
+tif_is_corpus_df(corpus)
+```
+```
+TRUE
+```
+
+The package also has functions to convert between the list and data frame
+formats for corpus and token objects. For example:
+
+```{r}
+tif_as_corpus_character(corpus)
+```
+```
+                              doc1                               doc2 
+   "Aujourd'hui, maman est morte."       "It was a pleasure to burn." 
+                              doc3 
+"All this happened, more or less." 
+```
+
+Note that extra metadata columns will be lost in the conversion from a data
+frame to a named character vector.
+
+## Details
 
 This package describes and validates formats for storing
 common object arising in text analysis as native R objects.

diff --git a/inst/examples/tif_is_dtm.R b/inst/examples/tif_is_dtm.R
@@ -1,3 +1,4 @@
+#' @importFrom Matrix Matrix
 dtm <- Matrix::Matrix(0, ncol = 26, nrow = 5, sparse = TRUE)
 colnames(dtm) <- LETTERS
 rownames(dtm) <- sprintf("doc%d", 1:5)

diff --git a/man/tif-package.Rd b/man/tif-package.Rd
diff --git a/man/tif_is_dtm.Rd b/man/tif_is_dtm.Rd
diff --git a/tests/testthat/test-validators.R b/tests/testthat/test-validators.R
@@ -93,7 +93,8 @@ test_that("tif_is_corpus_df", {
 	)
 
 	# A corpus with duplicated doc_id s
-	tc <- data.frame(doc_id=c('1', '2', '1'), text=rep('foobar', 3), stringsAsFactors=F)
+	tc <- data.frame(doc_id=c('1', '2', '1'), text=rep('foobar', 3),
+									 stringsAsFactors=F)
 	expect_false(tif_is_corpus_df(tc))
 	expect_warning(
 		tif_is_corpus_df(tc, warn=T),