Merge 3ffca24 into 788be81

syberia · Aug 2, 2015 · 2ade865 · 2ade865
2 parents 788be81 + 3ffca24
commit 2ade865
Show file tree

Hide file tree

Showing 30 changed files with 719 additions and 27 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -4,7 +4,7 @@ Description: Tundra provides a standardized container format for classifiers
     developed in R. This allows easier deployment by keeping both the data
     preparation procedure and the statistics in one place with an easy
     interface.
-Version: 0.2.3
+Version: 0.3.0
 Author: Robert Krzyzanowski <technoguyrob@gmail.com>
 Maintainer: Robert Krzyzanowski <technoguyrob@gmail.com>
 Authors@R: c(person("Robert", "Krzyzanowski",
@@ -13,10 +13,11 @@ Depends:
     R (>= 3.0.1)
 Imports:
     mungebits,
-    stagerunner
+    stagerunner,
+    R6,
+    crayon
 License: MIT
 LazyData: true
 Suggests:
-    knitr,
-    microbenchmark
+    testthatsomemore
 Roxygen: list(wrap = FALSE)
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,7 +1,8 @@
-# Generated by roxygen2 (4.0.1): do not edit by hand
+# Generated by roxygen2 (4.1.1): do not edit by hand
 
 S3method(print,tundraContainer)
 S3method(summary,tundraContainer)
+export(tundraContainer)
 export(tundra_container)
 export(tundra_gbm)
 export(tundra_random_forest)

diff --git a/R/hooks.R b/R/hooks.R
@@ -0,0 +1,45 @@
+#' Add a hook to a tundraContainer.
+#'
+#' Hooks are useful for defining additional checks that should be
+#' performed prior to and during training and prediction. For example,
+#' one might want to issue a warning if the user is predicting on 
+#' rows that were used for training, or a sanity check might be 
+#' present prior to training to ensure a dependent variable is present.
+#'
+#' The following hooks are available.
+#'
+#' \enumerate{
+#'   \item{train_pre_munge}{This hook runs during a call to the
+#'     container's \code{train} method, just prior to invoking the
+#'     \code{munge_procedure} to clean up the dataset. It could be
+#'     useful for defining pre-conditions on the dataset to ensure
+#'     it can be munged successfully.}
+#'   \item{train_post_munge}{This hook runs during a call to the
+#'     container's \code{train} method, just after invoking the
+#'     \code{munge_procedure} to clean up the dataset. It could be
+#'     useful for defining post-conditions on the dataset to ensure
+#'     it was munged successfully.}
+#'   \item{train_finalize}{This hook runs just after the \code{train}
+#'     method calls the \code{train_function}. It could be used to
+#'     verify presence or validate properties of the trained model.}
+#'   \item{predict_pre_munge}{This hook runs during a call to the
+#'     container's \code{predict} method, just prior to invoking the
+#'     \code{munge_procedure} to clean up the dataset. It could be
+#'     useful for defining pre-conditions on the dataset to ensure
+#'     it can be munged successfully.}
+#'   \item{predict_post_munge}{This hook runs during a call to the
+#'     container's \code{predict} method, just after invoking the
+#'     \code{munge_procedure} to clean up the dataset. It could be
+#'     useful for defining post-conditions on the dataset to ensure
+#'     it was munged successfully.}
+#' }
+#'
+#' Each hook will be provided the \code{tundraContainer} as input
+#' (unless it has no arguments, in which case it will simply be called).
+#'
+#' @param hook_name character. The hook to run. Must be one of the available
+#'    hooks.
+run_hooks <- function(hook_name) {
+
+}
+
diff --git a/R/package.tundra.R b/R/package.tundra.R
@@ -0,0 +1,37 @@
+#' Tundra is a standardized classifier container format for R.
+#'
+#' Deploying models in production systems is generally a cumbersome process.
+#' If analysis is performed in a language like R or SAS, the coefficients of the
+#' model are usually extracted and translated to a "production-ready" language like
+#' R or Java.
+#'
+#' However, this approach is flawed. The translation process is time consuming
+#' and error-prone. R is demonstrably capable of serving models
+#' in production environments as long as submillisecond latency is not a
+#' requirement. This means it should be possible to push analysis performed in
+#' R to directly score records in production systems without an intermediary.
+#' This significantly decreases the cost of iterating on machine learning
+#' models.
+#'
+#' A tundraContainer is a simple bundling of the two critical components of 
+#' any machine learning model.
+#'
+#' \itemize{
+#'   \item{The data preparation required to convert raw production data to
+#'     a record that is acceptable to a trained classifier. For example,
+#'     a regression-based model may need discretization of non-categorical
+#'     variables or imputation of missing values.}
+#'   \item{The trained classifier, usually a native R S3 object with
+#'     a \code{train} method.}
+#' }
+#'
+#' The former is provided by the \href{https://github.com/robertzk/mungebits}{mungebits}
+#' package, while the latter is fully customizable to any R function. This
+#' approach allows arbitrary data preparation and statistical methods, unlike
+#' attempts such as PMML (Predictive Modeling Markup Language) which constrain
+#' the space of possible data preparation methodologies and statistical
+#' methodologies to a very limited subset.
+#'
+#' @name tundra
+#' @docType package
+NULL
diff --git a/R/tundra-package.r b/R/tundra-package.r
diff --git a/R/tundraContainer-initialize.R b/R/tundraContainer-initialize.R
@@ -0,0 +1,48 @@
+#' Initialize a tundraContainer object.
+#'
+#' @param keyword character. The name of the classifier; for example,
+#'    "lm" or "knn".
+#' @param train_function function. The function used to train the model.
+#'    Its first argument will be a data.frame, and the second argument
+#'    a list of additional parameters used for training the model.
+#' @param predict_function function. The function used to predict
+#'    on new datasets. Its first argument will be a data.frame,
+#'    the dataset to predict on, and its second (optional)
+#'    argument will be additional parameters used for prediction
+#'    output (such as whether to return a probabilistic or absolute
+#'    value).
+#' @param munge_procedure list. A list of trained
+#'    \code{\link[mungebits]{mungepiece}}s to apply to data sets
+#'    during prediction. 
+#' @param default_args list. A list of default arguments to provide to
+#'    the second argument of the \code{train_function}. The additional
+#'    arguments provided to the \code{tundraContainer}'s \code{train}
+#'    method will be merged on top of these defaults.
+#' @param internal list. Internal metadata that should accompany the
+#'    model. Usually this is domain/organization specific, and can
+#'    include things such as a list of primary keys used for training
+#'    the model, identifiers or names of data sources used for
+#'    training the model, etc. It is a playground entirely under
+#'    your control, and can be used by other packages or a production
+#'    server hosting the model to achieve additional behavior.
+initialize <- function(keyword, train_function = identity,
+                       predict_function = identity, munge_procedure = list(),
+                       default_args = list(), internal = list()) {
+  if (!(is.list(munge_procedure) || is(munge_procedure, "stageRunner"))) {
+    stop("The ", sQuote("munge_procedure"), " parameter must be a list or ",
+         "stageRunner object.")
+  }
+
+  self$.keyword          <<- keyword
+  self$.train_function   <<- train_function
+  self$.predict_function <<- predict_function
+  self$.munge_procedure  <<- munge_procedure
+  self$.default_args     <<- default_args
+  self$.internal         <<- internal
+
+  self$.input            <<- list_to_env(list())
+  lockEnvironment(self$.input)
+  self$.output           <<- list_to_env(list())
+  self$.internal         <<- list_to_env(list())
+}
+
diff --git a/R/tundraContainer-predict.R b/R/tundraContainer-predict.R
@@ -0,0 +1,53 @@
+#' Predict on a dataset using a trained tundraContainer.
+#'
+#' @param dataframe data.frame. The dataset to generate predictions on
+#'    with the trained model. The data will be preprocessed with the
+#'    \code{tundraContainer}'s trained \code{munge_procedure} and
+#'    then passed as the first argument to the \code{tundraContainer}'s
+#'    \code{predict_function}.
+#' @param predict_args list. A list of arguments to pass to the
+#'    \code{tundraContainer}'s \code{predict_function} as its second argument.
+#' @param verbose logical. Either \code{TRUE} or \code{FALSE}, by
+#'    default the latter. If \code{TRUE}, then output produced by
+#'    running the \code{munge_procedure} or the \code{predict_function}
+#'    will not be silenced.
+#' @param munge logical. Either \code{TRUE} or \code{FALSE}, by
+#'    default the former. If \code{TRUE}, the \code{munge_procedure}
+#'    provided to the container during initialization will be used to
+#'    preprocess the given \code{dataframe}.
+#' @return The value returned by the \code{tundraContainer}'s
+#'    \code{predict_function}, usually a numeric vector or
+#'    \code{data.frame} of predictions.
+predict <- function(dataframe, predict_args = list(), verbose = FALSE, munge = TRUE) {
+  if (!isTRUE(self$.trained)) {
+    stop("Tundra model ", sQuote(self$.keyword), " has not been trained yet.")
+  }
+
+  force(verbose)
+  force(munge)
+  force(predict_args)
+
+  private$run_hooks("predict_pre_munge")
+  if (isTRUE(munge) && length(self$.munge_procedure) > 0) {
+    initial_nrow <- NROW(datafram)
+    dataframe <- munge(dataframe, self$.munge_procedure, verbose)
+    if (NROW(dataframe) != initial_nrow) {
+      warning("Some rows were removed during data preparation. ",
+              "Predictions will not match input dataframe.")
+    }
+  }
+  private$run_hooks("predict_post_munge")
+
+  if (length(formals(self$.predict_function) < 2 || missing(predict_args))) {
+    args <- list(dataframe)
+  } else {
+    args <- list(dataframe, predict_args)
+  }
+
+  call_with(
+    self$.predict_function,
+    args,
+    list(input = self$.input, output = self$.output)
+  )
+}
+
diff --git a/R/tundraContainer-train.R b/R/tundraContainer-train.R
@@ -0,0 +1,61 @@
+#' Train a model encapsulated within a tundraContainer.
+#'
+#' @param dataframe data.frame. The dataset to train the model on. This
+#'    will be preprocessed with the \code{tundraContainer}'s 
+#'    \code{munge_procedure} and then passed as the first argument to
+#'    the \code{tundraContainer}'s \code{train_function}.
+#' @param train_args list. A list of arguments to make available
+#'    to the \code{tundraContainer}'s \code{train_function} through
+#'    use of the \code{input} keyword. See the examples.
+#' @param verbose logical. Either \code{TRUE} or \code{FALSE}, by
+#'    default the latter. If \code{TRUE}, then output produced by
+#'    running the \code{munge_procedure} or the \code{train_function}
+#'    will not be silenced.
+#' @param munge logical. Either \code{TRUE} or \code{FALSE}, by
+#'    default the former. If \code{FALSE}, the \code{munge_procedure}
+#'    provided to the container during initialization will be assumed
+#'    to have been trained, and the \code{dataframe} provided will not
+#'    be run through it.
+#' @return The value returned by the \code{tundraContainer}'s
+#'    \code{train_function}. Since the \code{train_function} has side effects
+#'    on the container as its primary purpose, this can usually be
+#'    \code{invisible(NULL)}.
+train <- function(dataframe, train_args = list(), verbose = FALSE, munge = TRUE) {
+  if (isTRUE(self$.trained)) {
+    stop("The tundra ", sQuote(self$.keyword), " model has already been trained.")
+  }
+
+  force(train_args)
+  force(verbose)
+  force(munge)
+
+  private$run_hooks("train_pre_munge")
+  if (isTRUE(munge) && length(self$.munge_procedure) > 0) {
+    dataframe <- munge(dataframe, self$.munge_procedure, verbose)
+    attr(dataframe, "mungepieces") <- NULL
+  }
+  private$run_hooks("train_post_munge")
+
+  output <- call_with(
+    self$.train_function,
+    list(dataframe),
+    list(
+      input = list_to_env(list_merge(self$.default_args, train_args), self$.input),
+      output = self$.output
+    )
+  )
+
+  private$run_hooks("train_finalize")
+  self$.trained <<- TRUE
+
+  output
+}
+
+munge <- function(dataframe, munge_procedure, verbose) {
+  if (isTRUE(verbose)) {
+    capture.output(Recall(dataframe, munge_procedure, FALSE))
+  } else {
+    mungebits::munge(dataframe, munge_procedure)
+  }
+}
+
diff --git a/R/tundraContainer.R b/R/tundraContainer.R
@@ -0,0 +1,37 @@
+#' A standard container format for classifiers developed in R.
+#'
+#' @docType class
+#' @name tundraContainer
+#' @export
+tundraContainer <- R6::R6Class("tundraContainer",
+  public = list(
+    .keyword          = NULL, # character
+    .train_function   = NULL, # function
+    .predict_function = NULL, # function
+    .munge_procedure  = NULL, # list of mungepieces
+    .default_args     = NULL, # list
+    .trained          = FALSE, # logical
+    .input            = NULL, # environment
+    .output           = NULL, # environment
+    .internal         = NULL, # environment
+    .hooks            = NULL, # list
+
+    initialize = initialize,
+    train      = train,
+    predict    = predict,
+    #add_hook   = add_hook,
+
+    munge      = function(dataframe, steps = TRUE) {
+      mungebits::munge(dataframe, munge_procedure[steps])
+    },
+    show       = function() {
+      cat("A tundraContainer of type ", sQuote(self$.keyword), "\n")
+    }
+  ),
+
+  private = list(
+    run_hooks = run_hooks
+  )  
+
+)
+
diff --git a/R/utils.r b/R/utils.r
@@ -1,2 +1,68 @@
 `%||%` <- function(x, y) if (is.null(x)) y else x
 
+list_to_env <- function(obj, parent = emptyenv()) {
+  if (length(obj) == 0) {
+    new.env(parent = parent)
+  } else {
+    list2env(obj, parent = parent)
+  }
+}
+
+#' Evaluate a function while injecting some locals.
+#'
+#' Instead of modifying a closure's parent environment directly,
+#' sometimes it may be desirable to do a one-time injection that
+#' overrides what would normally be accessible through the closure.
+#' \code{call_with} allows this by extending the usual \code{do.call}
+#' to a third argument that is a list or environment temporarily
+#' injected during the course of the call.
+#'
+#' @param fn function.
+#' @param args list. The arguments to call the \code{fn} with.
+#' @param with list or environment. Additional locals to make available
+#'    during the call.
+#' @return The result of calling \code{fn} with the injection provided
+#'    by the \code{with} parameter.
+#' @examples \dontrun{
+#' fn <- local({ x <- 1; function(y) { x + y } })
+#' stopifnot(fn(1) == 2)
+#' stopifnot(call_with(fn, list(1), list(x = 2)) == 3)
+#' }
+call_with <- function(fn, args, with) {
+  stopifnot(is.list(with) || is.environment(with))
+  debugged <- isdebugged(fn)
+  copy_fn <- fn
+  if (debugged) debug(copy_fn)
+  env <- with
+  if (!is.environment(with)) {
+    with <- list_to_env(with, parent = environment(copy_fn))
+  }
+  environment(fn) <- with
+  do.call(fn, args)
+}
+
+#' Merge two lists and overwrite latter entries with former entries
+#' if names are the same.
+#'
+#' For example, \code{list_merge(list(a = 1, b = 2), list(b = 3, c = 4))}
+#' will be \code{list(a = 1, b = 3, c = 4)}.
+#' @param list1 list
+#' @param list2 list
+#' @return the merged list.
+#' @examples \dontrun{
+#' stopifnot(identical(list_merge(list(a = 1, b = 2), list(b = 3, c = 4)),
+#'                     list(a = 1, b = 3, c = 4)))
+#' stopifnot(identical(list_merge(NULL, list(a = 1)), list(a = 1)))
+#' }
+list_merge <- function(list1, list2) {
+  list1 <- list1 %||% list()
+  # Pre-allocate memory to make this slightly faster.
+  list1[Filter(function(x) nchar(x) > 0, names(list2) %||% c())] <- NULL
+  for (i in seq_along(list2)) {
+    name <- names(list2)[i]
+    if (!identical(name, NULL) && !identical(name, "")) list1[[name]] <- list2[[i]]
+    else list1 <- append(list1, list(list2[[i]]))
+  }
+  list1
+}
+