Skip to content

Commit

Permalink
Merge 8a175b0 into 788be81
Browse files Browse the repository at this point in the history
  • Loading branch information
robertzk committed Aug 2, 2015
2 parents 788be81 + 8a175b0 commit e929a99
Show file tree
Hide file tree
Showing 31 changed files with 845 additions and 27 deletions.
9 changes: 5 additions & 4 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ Description: Tundra provides a standardized container format for classifiers
developed in R. This allows easier deployment by keeping both the data
preparation procedure and the statistics in one place with an easy
interface.
Version: 0.2.3
Version: 0.3.0
Author: Robert Krzyzanowski <technoguyrob@gmail.com>
Maintainer: Robert Krzyzanowski <technoguyrob@gmail.com>
Authors@R: c(person("Robert", "Krzyzanowski",
Expand All @@ -13,10 +13,11 @@ Depends:
R (>= 3.0.1)
Imports:
mungebits,
stagerunner
stagerunner,
R6,
crayon
License: MIT
LazyData: true
Suggests:
knitr,
microbenchmark
testthatsomemore
Roxygen: list(wrap = FALSE)
3 changes: 2 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Generated by roxygen2 (4.0.1): do not edit by hand
# Generated by roxygen2 (4.1.1): do not edit by hand

S3method(print,tundraContainer)
S3method(summary,tundraContainer)
export(tundraContainer)
export(tundra_container)
export(tundra_gbm)
export(tundra_random_forest)
Expand Down
69 changes: 69 additions & 0 deletions R/hooks.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#' Add a hook to a tundraContainer.
#'
#' Hooks are useful for defining additional checks that should be
#' performed prior to and during training and prediction. For example,
#' one might want to issue a warning if the user is predicting on
#' rows that were used for training, or a sanity check might be
#' present prior to training to ensure a dependent variable is present.
#'
#' The following hooks are available.
#'
#' \describe{
#' \item{train_pre_munge}{This hook runs during a call to the
#' container's \code{train} method, just prior to invoking the
#' \code{munge_procedure} to clean up the dataset. It could be
#' useful for defining pre-conditions on the dataset to ensure
#' it can be munged successfully.}
#' \item{train_post_munge}{This hook runs during a call to the
#' container's \code{train} method, just after invoking the
#' \code{munge_procedure} to clean up the dataset. It could be
#' useful for defining post-conditions on the dataset to ensure
#' it was munged successfully.}
#' \item{train_finalize}{This hook runs just after the \code{train}
#' method calls the \code{train_function}. It could be used to
#' verify presence or validate properties of the trained model.}
#' \item{predict_pre_munge}{This hook runs during a call to the
#' container's \code{predict} method, just prior to invoking the
#' \code{munge_procedure} to clean up the dataset. It could be
#' useful for defining pre-conditions on the dataset to ensure
#' it can be munged successfully.}
#' \item{predict_post_munge}{This hook runs during a call to the
#' container's \code{predict} method, just after invoking the
#' \code{munge_procedure} to clean up the dataset. It could be
#' useful for defining post-conditions on the dataset to ensure
#' it was munged successfully.}
#' }
#'
#' Each hook will be provided the \code{tundraContainer} as input
#' (unless it has no arguments, in which case it will simply be called).
#'
#' @name hooks
#' @param hook_name character. The hook to run. Must be one of the available
#' hooks.
run_hooks <- function(hook_name) {
  # Hooks registered under this name; NULL (no iterations) when none exist.
  registered <- self$.hooks[[hook_name]]

  for (callback in registered) {
    takes_container <- length(formals(callback)) > 0

    # Zero-argument hooks are simply invoked; all others receive the
    # container itself as their first argument.
    if (!takes_container) {
      callback()
      next
    }
    callback(self)
  }
}

#' Add a hook to a tundraContainer.
#'
#' @param hook_function function. The hook to execute. It will be provided
#' the \code{tundraContainer} as its only argument.
#' @rdname hooks
add_hook <- function(hook_name, hook_function) {
  stopifnot(is.simple_string(hook_name), is.function(hook_function))

  # Resolve the requested name against the supported hook points
  # (match.arg errors on anything else).
  hook_name <- match.arg(
    hook_name,
    c("train_pre_munge", "predict_pre_munge",
      "train_post_munge", "predict_post_munge",
      "train_finalize")
  )

  # Append to whatever is already registered under this hook point.
  existing <- self$.hooks[[hook_name]]
  self$.hooks[[hook_name]] <- c(existing, hook_function)
}

37 changes: 37 additions & 0 deletions R/package.tundra.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#' Tundra is a standardized classifier container format for R.
#'
#' Deploying models in production systems is generally a cumbersome process.
#' If analysis is performed in a language like R or SAS, the coefficients of the
#' model are usually extracted and translated to a "production-ready" language like
#' C or Java.
#'
#' However, this approach is flawed. The translation process is time consuming
#' and error-prone. R is demonstrably capable of serving models
#' in production environments as long as submillisecond latency is not a
#' requirement. This means it should be possible to push analysis performed in
#' R to directly score records in production systems without an intermediary.
#' This significantly decreases the cost of iterating on machine learning
#' models.
#'
#' A tundraContainer is a simple bundling of the two critical components of
#' any machine learning model.
#'
#' \itemize{
#' \item{The data preparation required to convert raw production data to
#' a record that is acceptable to a trained classifier. For example,
#' a regression-based model may need discretization of non-categorical
#' variables or imputation of missing values.}
#' \item{The trained classifier, usually a native R S3 object with
#' a \code{train} method.}
#' }
#'
#' The former is provided by the \href{https://github.com/robertzk/mungebits}{mungebits}
#' package, while the latter is fully customizable to any R function. This
#' approach allows arbitrary data preparation and statistical methods, unlike
#' attempts such as PMML (Predictive Modeling Markup Language) which constrain
#' the space of possible data preparation methodologies and statistical
#' methodologies to a very limited subset.
#'
#' @name tundra
#' @docType package
NULL
5 changes: 0 additions & 5 deletions R/tundra-package.r

This file was deleted.

49 changes: 49 additions & 0 deletions R/tundraContainer-initialize.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#' Initialize a tundraContainer object.
#'
#' @param keyword character. The name of the classifier; for example,
#' "lm" or "knn".
#' @param train_function function. The function used to train the model.
#' Its first argument will be a data.frame, and the second argument
#' a list of additional parameters used for training the model.
#' @param predict_function function. The function used to predict
#' on new datasets. Its first argument will be a data.frame,
#' the dataset to predict on, and its second (optional)
#' argument will be additional parameters used for prediction
#' output (such as whether to return a probabilistic or absolute
#' value).
#' @param munge_procedure list. A list of trained
#' \code{\link[mungebits]{mungepiece}}s to apply to data sets
#' during prediction.
#' @param default_args list. A list of default arguments to provide to
#' the second argument of the \code{train_function}. The additional
#' arguments provided to the \code{tundraContainer}'s \code{train}
#' method will be merged on top of these defaults.
#' @param internal list. Internal metadata that should accompany the
#' model. Usually this is domain/organization specific, and can
#' include things such as a list of primary keys used for training
#' the model, identifiers or names of data sources used for
#' training the model, etc. It is a playground entirely under
#' your control, and can be used by other packages or a production
#' server hosting the model to achieve additional behavior.
initialize <- function(keyword, train_function = identity,
                       predict_function = identity, munge_procedure = list(),
                       default_args = list(), internal = list()) {
  # Validate before mutating any state on the container.
  if (!(is.list(munge_procedure) || is(munge_procedure, "stageRunner"))) {
    stop("The ", sQuote("munge_procedure"), " parameter must be a list or ",
         "stageRunner object.")
  }

  # `self` is an environment, so plain `<-` mutates the container in place;
  # superassignment (`<<-`) is unnecessary here.
  self$.keyword          <- keyword
  self$.train_function   <- train_function
  self$.predict_function <- predict_function
  self$.munge_procedure  <- munge_procedure
  self$.default_args     <- default_args

  # The input environment starts empty and is locked against new bindings.
  self$.input <- list_to_env(list())
  lockEnvironment(self$.input)
  self$.output <- list_to_env(list())

  # Keep the caller-supplied metadata instead of overwriting it with an
  # empty environment. An environment is used as-is; named entries of a
  # list are copied into a fresh environment (unnamed entries are skipped).
  if (is.environment(internal)) {
    internal_env <- internal
  } else {
    internal_env <- list_to_env(list())
    for (key in names(internal)) {
      if (!is.na(key) && nzchar(key)) {
        assign(key, internal[[key]], envir = internal_env)
      }
    }
  }
  self$.internal <- internal_env

  self$.hooks <- list()
}

53 changes: 53 additions & 0 deletions R/tundraContainer-predict.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#' Predict on a dataset using a trained tundraContainer.
#'
#' @param dataframe data.frame. The dataset to generate predictions on
#' with the trained model. The data will be preprocessed with the
#' \code{tundraContainer}'s trained \code{munge_procedure} and
#' then passed as the first argument to the \code{tundraContainer}'s
#' \code{predict_function}.
#' @param predict_args list. A list of arguments to pass to the
#' \code{tundraContainer}'s \code{predict_function} as its second argument.
#' @param verbose logical. Either \code{TRUE} or \code{FALSE}, by
#' default the latter. If \code{TRUE}, then output produced by
#' running the \code{munge_procedure} or the \code{predict_function}
#' will not be silenced.
#' @param munge logical. Either \code{TRUE} or \code{FALSE}, by
#' default the former. If \code{TRUE}, the \code{munge_procedure}
#' provided to the container during initialization will be used to
#' preprocess the given \code{dataframe}.
#' @return The value returned by the \code{tundraContainer}'s
#' \code{predict_function}, usually a numeric vector or
#' \code{data.frame} of predictions.
predict <- function(dataframe, predict_args = list(), verbose = FALSE, munge = TRUE) {
  if (!isTRUE(self$.trained)) {
    stop("Tundra model ", sQuote(self$.keyword), " has not been trained yet.")
  }

  force(verbose)
  force(munge)
  force(predict_args)

  private$run_hooks("predict_pre_munge")
  if (isTRUE(munge) && length(self$.munge_procedure) > 0) {
    # Remember the incoming row count so dropped rows can be reported.
    initial_nrow <- NROW(dataframe)
    # `munge` is also the logical argument above; R skips non-function
    # bindings when resolving a call, so this reaches the munge function
    # defined in an enclosing scope.
    dataframe <- munge(dataframe, self$.munge_procedure, verbose)
    if (NROW(dataframe) != initial_nrow) {
      warning("Some rows were removed during data preparation. ",
              "Predictions will not match input dataframe.")
    }
  }
  private$run_hooks("predict_post_munge")

  # Only forward predict_args when the predict function accepts a second
  # argument and the caller actually supplied one.
  accepts_args <- length(formals(self$.predict_function)) >= 2
  if (!accepts_args || missing(predict_args)) {
    args <- list(dataframe)
  } else {
    args <- list(dataframe, predict_args)
  }

  # call_with is defined outside this block; presumably it makes `input`
  # and `output` visible to the predict function -- confirm against helper.
  call_with(
    self$.predict_function,
    args,
    list(input = self$.input, output = self$.output)
  )
}

61 changes: 61 additions & 0 deletions R/tundraContainer-train.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#' Train a model encapsulated within a tundraContainer.
#'
#' @param dataframe data.frame. The dataset to train the model on. This
#' will be preprocessed with the \code{tundraContainer}'s
#' \code{munge_procedure} and then passed as the first argument to
#' the \code{tundraContainer}'s \code{train_function}.
#' @param train_args list. A list of arguments to make available
#' to the \code{tundraContainer}'s \code{train_function} through
#' use of the \code{input} keyword. See the examples.
#' @param verbose logical. Either \code{TRUE} or \code{FALSE}, by
#' default the latter. If \code{TRUE}, then output produced by
#' running the \code{munge_procedure} or the \code{train_function}
#' will not be silenced.
#' @param munge logical. Either \code{TRUE} or \code{FALSE}, by
#' default the former. If \code{FALSE}, the \code{munge_procedure}
#' provided to the container during initialization will be assumed
#' to have been trained, and the \code{dataframe} provided will not
#' be run through it.
#' @return The value returned by the \code{tundraContainer}'s
#' \code{train_function}. Since the \code{train_function} has side effects
#' on the container as its primary purpose, this can usually be
#' \code{invisible(NULL)}.
train <- function(dataframe, train_args = list(), verbose = FALSE, munge = TRUE) {
  # A container may only be trained once.
  if (isTRUE(self$.trained)) {
    stop("The tundra ", sQuote(self$.keyword), " model has already been trained.")
  }

  force(train_args)
  force(verbose)
  force(munge)

  private$run_hooks("train_pre_munge")
  if (isTRUE(munge) && length(self$.munge_procedure) > 0) {
    # `munge` is also the logical argument above; R skips non-function
    # bindings when resolving a call, so this reaches the munge function
    # defined in an enclosing scope.
    dataframe <- munge(dataframe, self$.munge_procedure, verbose)
    # Strip the "mungepieces" attribute before handing the data to the
    # train function.
    attr(dataframe, "mungepieces") <- NULL
  }
  private$run_hooks("train_post_munge")

  # Training arguments are the defaults with the caller's train_args merged
  # on top, placed in an environment whose parent is the container's input
  # environment (list_merge / list_to_env / call_with are defined outside
  # this block -- semantics assumed from their names and usage).
  output <- call_with(
    self$.train_function,
    list(dataframe),
    list(
      input = list_to_env(list_merge(self$.default_args, train_args), self$.input),
      output = self$.output
    )
  )

  private$run_hooks("train_finalize")
  # `self` is an environment: plain assignment mutates the container in
  # place, so superassignment is not needed.
  self$.trained <- TRUE

  output
}

# Run a munge procedure over a data frame and return the munged data frame.
#
# dataframe: the data to preprocess.
# munge_procedure: the procedure handed to mungebits::munge.
# verbose: when TRUE, printed output produced while munging is left visible;
#   otherwise it is captured and discarded. Either way the munged data
#   frame (not the captured text) is returned.
munge <- function(dataframe, munge_procedure, verbose) {
  if (isTRUE(verbose)) {
    return(mungebits::munge(dataframe, munge_procedure))
  }

  # The assignment is evaluated in this function's frame, so `munged`
  # holds the real result while anything printed is swallowed.
  utils::capture.output(
    munged <- mungebits::munge(dataframe, munge_procedure)
  )
  munged
}

37 changes: 37 additions & 0 deletions R/tundraContainer.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#' A standard container format for classifiers developed in R.
#'
#' @docType class
#' @name tundraContainer
#' @export
tundraContainer <- R6::R6Class("tundraContainer",
  public = list(
    .keyword          = NULL,  # character
    .train_function   = NULL,  # function
    .predict_function = NULL,  # function
    .munge_procedure  = NULL,  # list of mungepieces
    .default_args     = NULL,  # list
    .trained          = FALSE, # logical
    .input            = NULL,  # environment
    .output           = NULL,  # environment
    .internal         = NULL,  # environment
    .hooks            = NULL,  # list

    # Methods whose implementations are defined outside this block.
    initialize = initialize,
    train      = train,
    predict    = predict,
    add_hook   = add_hook,

    # Run `dataframe` through the container's munge procedure, optionally
    # restricted to the subset selected by `steps` (default: all steps).
    munge = function(dataframe, steps = TRUE) {
      # The procedure is a field on the container and must be reached
      # through `self`; a bare `munge_procedure` is not defined here.
      mungebits::munge(dataframe, self$.munge_procedure[steps])
    },

    # Print a one-line description of the container.
    show = function() {
      cat("A tundraContainer of type ", sQuote(self$.keyword), "\n")
    }
  ),

  private = list(
    run_hooks = run_hooks
  )
)

Loading

0 comments on commit e929a99

Please sign in to comment.