Merge pull request #9 from quanteda/mod_edits

Add nnseq function to quanteda.classifiers
quanteda · May 15, 2019 · 0e275a0 · 0e275a0
2 parents a26b930 + 5b57ddb
commit 0e275a0
Show file tree

Hide file tree

Showing 16 changed files with 364 additions and 142 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,9 +1,32 @@
 # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r
-
 language: R
+
+dist: trusty
 sudo: false
-cache: packages
+
+cache:
+  packages: true
+  directories:
+    - $HOME/.keras
+    - $HOME/.cache/pip
+
 warnings_are_errors: true
 
+matrix:
+  include:
+    - name: "Backend: TensorFlow | Implementation: Keras"
+      env:
+        - KERAS_BACKEND="tensorflow"
+        - KERAS_IMPLEMENTATION="tensorflow"
+        - TENSORFLOW_VERSION="default"
+
+before_script:
+  - sudo apt-get update
+  - sudo apt-get install python3
+  - pip install --upgrade --ignore-installed --user travis virtualenv
+  - R CMD INSTALL .
+  - R -e 'tensorflow::install_tensorflow(version = Sys.getenv("TENSORFLOW_VERSION"))'
+  - R -e 'tensorflow::tf_config()'
+
 after_success:
-  - Rscript -e 'covr::codecov()'
+  - Rscript -e 'covr::codecov()'
diff --git a/NAMESPACE b/NAMESPACE
@@ -4,20 +4,24 @@ S3method(coef,textmodel_svm)
 S3method(coef,textmodel_svmlin)
 S3method(coefficients,textmodel_svm)
 S3method(coefficients,textmodel_svmlin)
-S3method(predict,textmodel_slstm)
+S3method(predict,textmodel_nnseq)
 S3method(predict,textmodel_svm)
 S3method(predict,textmodel_svmlin)
+S3method(print,predict.textmodel_nnseq)
 S3method(print,predict.textmodel_svm)
 S3method(print,predict.textmodel_svmlin)
+S3method(print,textmodel_nnseq)
 S3method(print,textmodel_svm)
 S3method(print,textmodel_svmlin)
+S3method(summary,textmodel_nnseq)
 S3method(summary,textmodel_svm)
 S3method(summary,textmodel_svmlin)
+S3method(textmodel_nnseq,dfm)
 S3method(textmodel_svm,default)
 S3method(textmodel_svm,dfm)
 S3method(textmodel_svmlin,default)
 S3method(textmodel_svmlin,dfm)
-export(textmodel_slstm)
+export(textmodel_nnseq)
 export(textmodel_svm)
 export(textmodel_svmlin)
 import(quanteda)

diff --git a/R/textmodel_nnseq.R b/R/textmodel_nnseq.R
@@ -0,0 +1,175 @@
+#' sequential neural network model for text classification
+#'
+#' This function is a wrapper for a sequential neural network model with a
+#' single hidden layer network with two layers, implemented in the \pkg{keras}
+#' package.
+#' @inheritParams textmodel_svm
+#' @param units The number of network nodes used in the first layer of the
+#'   sequential model
+#' @param dropout A floating variable bound between 0 and 1. It determines the
+#'   rate at which units are dropped for the linear transformation of the
+#'   inputs.
+#' @param optimizer optimizer used to fit model to training data, see
+#'   \code{\link[keras]{compile.keras.engine.training.Model}}
+#' @param loss objective loss function, see
+#'   \code{\link[keras]{compile.keras.engine.training.Model}}
+#' @param metrics metric used to train algorithm, see
+#'   \code{\link[keras]{compile.keras.engine.training.Model}}
+#' @param ... additional options passed to
+#'   \code{\link[keras]{fit.keras.engine.training.Model}}
+#' @keywords textmodel
+#' @importFrom keras keras_model_sequential to_categorical
+#' @importFrom keras layer_dense layer_activation layer_dropout compile fit
+#' @export
+#' @examples 
+#' \dontrun{
+#' # create a dataset with evenly balanced coded and uncoded immigration sentences
+#' corpcoded <- corpus_subset(data_corpus_manifestosentsUK, !is.na(crowd_immigration_label))
+#' corpuncoded <- data_corpus_manifestosentsUK %>%
+#'     corpus_subset(is.na(crowd_immigration_label) & year > 1980) %>%
+#'     corpus_sample(size = ndoc(corpcoded))
+#' corp <- corpcoded + corpuncoded
+#' 
+#' # form a tf-idf-weighted dfm
+#' dfmat <- dfm(corp) %>%
+#'     dfm_tfidf()
+#' 
+#' set.seed(1000)
+#' tmod <- textmodel_nnseq(dfmat, y = docvars(dfmat, "crowd_immigration_label"),
+#'                         epochs = 5, verbose = 1)
+#' pred <- predict(tmod, newdata = dfm_subset(dfmat, is.na(crowd_immigration_label)))
+#' table(pred)
+#' tail(texts(corpuncoded)[pred == "Immigration"], 10)
+#' }
+textmodel_nnseq <- function(x, y, units = 512, dropout = .2, 
+                            optimizer = "adam",
+                            loss = "categorical_crossentropy", 
+                            metrics = "categorical_accuracy",
+                            ...) {
+    UseMethod("textmodel_nnseq")
+}
+
+#' @export
+textmodel_nnseq.dfm <- function(x, y, units = 512, dropout = .2, 
+                                optimizer = "adam",
+                                loss = "categorical_crossentropy", 
+                                metrics = "categorical_accuracy", ...) {
+    stopifnot(ndoc(x) == length(y))
+
+    x <- as.dfm(x)
+    y <- as.factor(y)
+    result <- list(x = x, y = y, call = match.call(), classnames = levels(y))
+
+    # trim missings for fitting model
+    na_ind <- which(is.na(y))
+    if (length(na_ind) > 0) {
+        # message(length(na_ind), "observations with the value 'NA' were removed.")
+        y <- y[-na_ind]
+        x <- x[-na_ind]
+    }
+
+    # "one-hot" encode y
+    y2 <- to_categorical(as.integer(y) - 1, num_classes = nlevels(y))
+
+    # use keras to fit the model
+    model <- keras_model_sequential()
+    model %>%
+        layer_dense(units = units, input_shape = nfeat(x)) %>%
+        layer_activation(activation = "relu") %>%
+        layer_dropout(rate = dropout) %>%
+        layer_dense(units = nlevels(y)) %>%
+        layer_activation(activation = "softmax")
+    compile(model, loss = loss, optimizer = optimizer, metrics = metrics)
+    history <- fit(model, x, y2, ...)
+
+    # compile, class, and return the result
+    result <- c(result, list(seqfitted = model))
+    class(result) <- c("textmodel_nnseq", "textmodel", "list")
+    return(result)
+}
+
+#' Prediction from a fitted textmodel_nnseq object
+#'
+#' \code{predict.textmodel_nnseq()} implements class predictions from a fitted
+#' sequential neural network model.
+#' @param object a fitted \link{textmodel_nnseq} model
+#' @param newdata dfm on which prediction should be made
+#' @param type the type of predicted values to be returned; see Value
+#' @param force make \code{newdata}'s feature set conformant to the model terms
+#' @param ... not used
+#' @return \code{predict.textmodel_nnseq} returns either a vector of class
+#'   predictions for each row of \code{newdata} (when \code{type = "class"}), or
+#'   a document-by-class matrix of class probabilities (when \code{type =
+#'   "probability"}).
+#' @seealso \code{\link{textmodel_nnseq}}
+#' @keywords textmodel internal
+#' @importFrom keras predict_classes predict_proba
+#' @export
+predict.textmodel_nnseq <- function(object, newdata = NULL,
+                                  type = c("class", "probability"),
+                                  force = TRUE,
+                                  ...) {
+    quanteda:::unused_dots(...)
+
+    type <- match.arg(type)
+
+    if (!is.null(newdata)) {
+        data <- as.dfm(newdata)
+    } else {
+        data <- as.dfm(object$x)
+    }
+
+    data <- if (is.null(newdata)) {
+        suppressWarnings(quanteda:::force_conformance(data, featnames(data), force))
+    } else {
+        quanteda:::force_conformance(data, featnames(data), force)
+    }
+
+    if (type == "class") {
+        pred_y <- predict_classes(object$seqfitted, x = data)
+        pred_y <- factor(pred_y, labels = object$classnames, levels = (seq_along(object$classnames) - 1))
+        names(pred_y) <- docnames(data)
+    } else if (type == "probability") {
+        pred_y <- predict_proba(object$seqfitted, x = data)
+        colnames(pred_y) <- object$classnames
+        rownames(pred_y) <- docnames(data)
+    }
+
+    pred_y
+}
+
+#' @export
+#' @method print textmodel_nnseq
+print.textmodel_nnseq <- function(x, ...) {
+    layer_names <- gsub(pattern = "_\\d*", "", lapply(x$seqfitted$layers, function(z) z$name))
+    cat("\nCall:\n")
+    print(x$call)
+    cat("\n",
+        format(length(na.omit(x$y)), big.mark = ","), " training documents; ",
+        format(length(x$weights), big.mark = ","), " fitted features",
+        ".\n",
+        "Structure: ", paste(layer_names, collapse = " -> "), "\n",
+        sep = "")
+}
+
+#' summary method for textmodel_svm objects
+#' @param object output from \code{\link{textmodel_svm}}
+#' @param ... additional arguments not used
+#' @keywords textmodel internal
+#' @method summary textmodel_nnseq
+#' @export
+summary.textmodel_nnseq <- function(object, ...) {
+    layer_names <- gsub(pattern = "_\\d*", "", lapply(object$seqfitted$layers, function(x) x$name))
+
+    result <- list(
+        "call" = object$call,
+        "model structure" = paste(layer_names, collapse = " -> ")
+    )
+    as.summary.textmodel(result)
+}
+
+#' @export
+#' @method print predict.textmodel_nnseq
+print.predict.textmodel_nnseq <- function(x, ...) {
+    print(unclass(x))
+}
diff --git a/R/textmodel_slstm.R b/R/textmodel_slstm.R
diff --git a/R/textmodel_svm.R b/R/textmodel_svm.R
@@ -134,7 +134,7 @@ predict.textmodel_svm <- function(object, newdata = NULL,
                       proba = (type == "probability"))
 
     if (type == "class") {
-        pred_y <- as.character(pred_y$predictions)
+        pred_y <- pred_y$predictions
         names(pred_y) <- docnames(data)
     } else if (type == "probability") {
         pred_y <- pred_y$probabilities

diff --git a/R/textmodel_svmlin.R b/R/textmodel_svmlin.R
@@ -4,6 +4,11 @@
 #' svmlin code by Vikas Sindhwani and S. Sathiya Keerthi for fast linear
 #' transductive SVMs. This is passed through to \code{\link[RSSL]{svmlin}} as
 #' implemented by the \pkg{RSSL} package.
+#' 
+#' @description This function has been retained for testing purposes only; 
+#'   we recommend that you use \code{\link{textmodel_svm}} instead.  That 
+#'   function is more efficient, and implements prediction for more than 
+#'   two classes.
 #' @param x the \link{dfm} on which the model will be fit.  Does not need to
 #'   contain only the training documents.
 #' @param y vector of training labels associated with each document identified 
@@ -21,7 +26,7 @@
 #' V. Sindhwani and S. Sathiya Keerthi (2006).  Newton Methods for Fast Solution of Semi-supervised
 #' Linear SVMs. Book Chapter in \emph{Large Scale Kernel Machines}, MIT Press, 2006.
 #' 
-#' @seealso \code{\link[RSSL]{svmlin}}
+#' @seealso \code{\link[RSSL]{svmlin}}, \code{text{textmodel_svm}}
 #' @examples
 #' # use Lenihan for govt class and Bruton for opposition
 #' docvars(data_corpus_irishbudget2010, "govtopp") <- c("Govt", "Opp", rep(NA, 12))
@@ -34,6 +39,7 @@
 #'                       pos_frac = 5/14))
 #' @import quanteda
 #' @importFrom stats na.omit predict
+#' @keywords textmodel internal
 #' @export
 textmodel_svmlin <- function(x, y, intercept = TRUE, ...) {
     UseMethod("textmodel_svmlin")

diff --git a/R/zzz.R b/R/zzz.R
@@ -0,0 +1,4 @@
+.onAttach <- function(...) {
+    options(keras.fit_verbose = 0)
+    options(keras.view_metrics = FALSE)
+}