-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
31 changed files
with
845 additions
and
27 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
#' Add a hook to a tundraContainer. | ||
#' | ||
#' Hooks are useful for defining additional checks that should be | ||
#' performed prior to and during training and prediction. For example, | ||
#' one might want to issue a warning if the user is predicting on | ||
#' rows that were used for training, or a sanity check might be | ||
#' present prior to training to ensure a dependent variable is present. | ||
#' | ||
#' The following hooks are available. | ||
#' | ||
#' \enumerate{ | ||
#' \item{train_pre_munge}{This hook runs during a call to the | ||
#' container's \code{train} method, just prior to invoking the | ||
#' \code{munge_procedure} to clean up the dataset. It could be | ||
#' useful for defining pre-conditions on the dataset to ensure | ||
#' it can be munged successfully.} | ||
#' \item{train_post_munge}{This hook runs during a call to the | ||
#' container's \code{train} method, just after invoking the | ||
#' \code{munge_procedure} to clean up the dataset. It could be | ||
#' useful for defining post-conditions on the dataset to ensure | ||
#' it was munged successfully.} | ||
#' \item{train_finalize}{This hook runs just after the \code{train} | ||
#' method calls the \code{train_function}. It could be used to | ||
#' verify presence or validate properties of the trained model.} | ||
#' \item{predict_pre_munge}{This hook runs during a call to the | ||
#' container's \code{predict} method, just prior to invoking the | ||
#' \code{munge_procedure} to clean up the dataset. It could be | ||
#' useful for defining pre-conditions on the dataset to ensure | ||
#' it can be munged successfully.} | ||
#' \item{predict_post_munge}{This hook runs during a call to the | ||
#' container's \code{predict} method, just after invoking the | ||
#' \code{munge_procedure} to clean up the dataset. It could be | ||
#' useful for defining post-conditions on the dataset to ensure | ||
#' it was munged successfully.} | ||
#' } | ||
#' | ||
#' Each hook will be provided the \code{tundraContainer} as input | ||
#' (unless it has no arguments, in which case it will simply be called). | ||
#' | ||
#' @name hooks | ||
#' @param hook_name character. The hook to run. Must be one of the available | ||
#' hooks. | ||
run_hooks <- function(hook_name) { | ||
for (hook in self$.hooks[[hook_name]]) { | ||
if (length(formals(hook)) > 0) { | ||
hook(self) | ||
} else { | ||
hook() | ||
} | ||
} | ||
} | ||
|
||
#' Add a hook to a tundraContainer. | ||
#' | ||
#' @param hook_function function. The hook to execute. It will be provided | ||
#' the \code{tundraContainer} as its only argument. | ||
#' @rdname hooks | ||
add_hook <- function(hook_name, hook_function) { | ||
stopifnot(is.simple_string(hook_name), | ||
is.function(hook_function)) | ||
|
||
allowed_types <- c("train_pre_munge", "predict_pre_munge", | ||
"train_post_munge", "predict_post_munge", | ||
"train_finalize") | ||
hook_name <- match.arg(hook_name, allowed_types) | ||
|
||
self$.hooks[[hook_name]] <- c(self$.hooks[[hook_name]], hook_function) | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#' Tundra is a standardized classifier container format for R. | ||
#' | ||
#' Deploying models in production systems is generally a cumbersome process. | ||
#' If analysis is performed in a language like R or SAS, the coefficients of the | ||
#' model are usually extracted and translated to a "production-ready" language like | ||
#' R or Java. | ||
#' | ||
#' However, this approach is flawed. The translation process is time consuming | ||
#' and error-prone. R is demonstrably capable of serving models | ||
#' in production environments as long as submillisecond latency is not a | ||
#' requirement. This means it should be possible to push analysis performed in | ||
#' R to directly score records in production systems without an intermediary. | ||
#' This significantly decreases the cost of iterating on machine learning | ||
#' models. | ||
#' | ||
#' A tundraContainer is a simple bundling of the two critical components of | ||
#' any machine learning model. | ||
#' | ||
#' \itemize{ | ||
#' \item{The data preparation required to convert raw production data to | ||
#' a record that is acceptable to a trained classifier. For example, | ||
#' a regression-based model may need discretization of non-categorical | ||
#' variables or imputation of missing values.} | ||
#' \item{The trained classifier, usually a native R S3 object with | ||
#' a \code{train} method.} | ||
#' } | ||
#' | ||
#' The former is provided by the \href{https://github.com/robertzk/mungebits}{mungebits} | ||
#' package, while the latter is fully customizable to any R function. This | ||
#' approach allows arbitrary data preparation and statistical methods, unlike | ||
#' attempts such as PMML (Predictive Modeling Markup Language) which constrain | ||
#' the space of possible data preparation methodologies and statistical | ||
#' methodologies to a very limited subset. | ||
#' | ||
#' @name tundra | ||
#' @docType package | ||
NULL |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
#' Initialize a tundraContainer object. | ||
#' | ||
#' @param keyword character. The name of the classifier; for example, | ||
#' "lm" or "knn". | ||
#' @param train_function function. The function used to train the model. | ||
#' Its first argument will be a data.frame, and the second argument | ||
#' a list of additional parameters used for training the model. | ||
#' @param predict_function function. The function used to predict | ||
#' on new datasets. Its first argument will be a data.frame, | ||
#' the dataset to predict on, and its second (optional) | ||
#' argument will be additional parameters used for prediction | ||
#' output (such as whether to return a probabilistic or absolute | ||
#' value). | ||
#' @param munge_procedure list. A list of trained | ||
#' \code{\link[mungebits]{mungepiece}}s to apply to data sets | ||
#' during prediction. | ||
#' @param default_args list. A list of default arguments to provide to | ||
#' the second argument of the \code{train_function}. The additional | ||
#' arguments provided to the \code{tundraContainer}'s \code{train} | ||
#' method will be merged on top of these defaults. | ||
#' @param internal list. Internal metadata that should accompany the | ||
#' model. Usually this is domain/organization specific, and can | ||
#' include things such as a list of primary keys used for training | ||
#' the model, identifiers or names of data sources used for | ||
#' training the model, etc. It is a playground entirely under | ||
#' your control, and can be used by other packages or a production | ||
#' server hosting the model to achieve additional behavior. | ||
initialize <- function(keyword, train_function = identity, | ||
predict_function = identity, munge_procedure = list(), | ||
default_args = list(), internal = list()) { | ||
if (!(is.list(munge_procedure) || is(munge_procedure, "stageRunner"))) { | ||
stop("The ", sQuote("munge_procedure"), " parameter must be a list or ", | ||
"stageRunner object.") | ||
} | ||
|
||
self$.keyword <<- keyword | ||
self$.train_function <<- train_function | ||
self$.predict_function <<- predict_function | ||
self$.munge_procedure <<- munge_procedure | ||
self$.default_args <<- default_args | ||
self$.internal <<- internal | ||
|
||
self$.input <<- list_to_env(list()) | ||
lockEnvironment(self$.input) | ||
self$.output <<- list_to_env(list()) | ||
self$.internal <<- list_to_env(list()) | ||
self$.hooks <<- list() | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
#' Predict on a dataset using a trained tundraContainer. | ||
#' | ||
#' @param dataframe data.frame. The dataset to generate predictions on | ||
#' with the trained model. The data will be preprocessed with the | ||
#' \code{tundraContainer}'s trained \code{munge_procedure} and | ||
#' then passed as the first argument to the \code{tundraContainer}'s | ||
#' \code{predict_function}. | ||
#' @param predict_args list. A list of arguments to pass to the | ||
#' \code{tundraContainer}'s \code{predict_function} as its second argument. | ||
#' @param verbose logical. Either \code{TRUE} or \code{FALSE}, by | ||
#' default the latter. If \code{TRUE}, then output produced by | ||
#' running the \code{munge_procedure} or the \code{predict_function} | ||
#' will not be silenced. | ||
#' @param munge logical. Either \code{TRUE} or \code{FALSE}, by | ||
#' default the former. If \code{TRUE}, the \code{munge_procedure} | ||
#' provided to the container during initialization will be used to | ||
#' preprocess the given \code{dataframe}. | ||
#' @return The value returned by the \code{tundraContainer}'s | ||
#' \code{predict_function}, usually a numeric vector or | ||
#' \code{data.frame} of predictions. | ||
predict <- function(dataframe, predict_args = list(), verbose = FALSE, munge = TRUE) { | ||
if (!isTRUE(self$.trained)) { | ||
stop("Tundra model ", sQuote(self$.keyword), " has not been trained yet.") | ||
} | ||
|
||
force(verbose) | ||
force(munge) | ||
force(predict_args) | ||
|
||
private$run_hooks("predict_pre_munge") | ||
if (isTRUE(munge) && length(self$.munge_procedure) > 0) { | ||
initial_nrow <- NROW(datafram) | ||
dataframe <- munge(dataframe, self$.munge_procedure, verbose) | ||
if (NROW(dataframe) != initial_nrow) { | ||
warning("Some rows were removed during data preparation. ", | ||
"Predictions will not match input dataframe.") | ||
} | ||
} | ||
private$run_hooks("predict_post_munge") | ||
|
||
if (length(formals(self$.predict_function) < 2 || missing(predict_args))) { | ||
args <- list(dataframe) | ||
} else { | ||
args <- list(dataframe, predict_args) | ||
} | ||
|
||
call_with( | ||
self$.predict_function, | ||
args, | ||
list(input = self$.input, output = self$.output) | ||
) | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
#' Train a model encapsulated within a tundraContainer. | ||
#' | ||
#' @param dataframe data.frame. The dataset to train the model on. This | ||
#' will be preprocessed with the \code{tundraContainer}'s | ||
#' \code{munge_procedure} and then passed as the first argument to | ||
#' the \code{tundraContainer}'s \code{train_function}. | ||
#' @param train_args list. A list of arguments to make available | ||
#' to the \code{tundraContainer}'s \code{train_function} through | ||
#' use of the \code{input} keyword. See the examples. | ||
#' @param verbose logical. Either \code{TRUE} or \code{FALSE}, by | ||
#' default the latter. If \code{TRUE}, then output produced by | ||
#' running the \code{munge_procedure} or the \code{train_function} | ||
#' will not be silenced. | ||
#' @param munge logical. Either \code{TRUE} or \code{FALSE}, by | ||
#' default the former. If \code{FALSE}, the \code{munge_procedure} | ||
#' provided to the container during initialization will be assumed | ||
#' to have been trained, and the \code{dataframe} provided will not | ||
#' be run through it. | ||
#' @return The value returned by the \code{tundraContainer}'s | ||
#' \code{train_function}. Since the \code{train_function} has side effects | ||
#' on the container as its primary purpose, this can usually be | ||
#' \code{invisible(NULL)}. | ||
train <- function(dataframe, train_args = list(), verbose = FALSE, munge = TRUE) { | ||
if (isTRUE(self$.trained)) { | ||
stop("The tundra ", sQuote(self$.keyword), " model has already been trained.") | ||
} | ||
|
||
force(train_args) | ||
force(verbose) | ||
force(munge) | ||
|
||
private$run_hooks("train_pre_munge") | ||
if (isTRUE(munge) && length(self$.munge_procedure) > 0) { | ||
dataframe <- munge(dataframe, self$.munge_procedure, verbose) | ||
attr(dataframe, "mungepieces") <- NULL | ||
} | ||
private$run_hooks("train_post_munge") | ||
|
||
output <- call_with( | ||
self$.train_function, | ||
list(dataframe), | ||
list( | ||
input = list_to_env(list_merge(self$.default_args, train_args), self$.input), | ||
output = self$.output | ||
) | ||
) | ||
|
||
private$run_hooks("train_finalize") | ||
self$.trained <<- TRUE | ||
|
||
output | ||
} | ||
|
||
munge <- function(dataframe, munge_procedure, verbose) { | ||
if (isTRUE(verbose)) { | ||
capture.output(Recall(dataframe, munge_procedure, FALSE)) | ||
} else { | ||
mungebits::munge(dataframe, munge_procedure) | ||
} | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#' A standard container format for classifiers developed in R. | ||
#' | ||
#' @docType class | ||
#' @name tundraContainer | ||
#' @export | ||
tundraContainer <- R6::R6Class("tundraContainer", | ||
public = list( | ||
.keyword = NULL, # character | ||
.train_function = NULL, # function | ||
.predict_function = NULL, # function | ||
.munge_procedure = NULL, # list of mungepieces | ||
.default_args = NULL, # list | ||
.trained = FALSE, # logical | ||
.input = NULL, # environment | ||
.output = NULL, # environment | ||
.internal = NULL, # environment | ||
.hooks = NULL, # list | ||
|
||
initialize = initialize, | ||
train = train, | ||
predict = predict, | ||
add_hook = add_hook, | ||
|
||
munge = function(dataframe, steps = TRUE) { | ||
mungebits::munge(dataframe, munge_procedure[steps]) | ||
}, | ||
show = function() { | ||
cat("A tundraContainer of type ", sQuote(self$.keyword), "\n") | ||
} | ||
), | ||
|
||
private = list( | ||
run_hooks = run_hooks | ||
) | ||
|
||
) | ||
|
Oops, something went wrong.