Merge pull request #272 from schalkdaniel/automatic_oob_splitting
add inbag vs oob
Daniel Schalk committed Dec 13, 2018
2 parents f77368e + 6a949e0 commit 25c2d29
Showing 9 changed files with 160 additions and 22 deletions.
7 changes: 5 additions & 2 deletions R/boost_linear.R
@@ -40,6 +40,8 @@
#' @param data.target [\code{S4 Data}]\cr
#' Uninitialized \code{S4 Data} object which is used to store the data. At the moment
#' just in memory training is supported.
+ #' @param oob.fraction [\code{numeric(1)}]\cr
+ #'   Fraction of the data that is held out to track the out-of-bag risk.
#' @examples
#' mod = boostLinear(data = iris, target = "Sepal.Length", loss = LossQuadratic$new())
#' mod$getBaselearnerNames()
@@ -50,9 +52,10 @@
#' @export
boostLinear = function(data, target, optimizer = OptimizerCoordinateDescent$new(), loss,
learning.rate = 0.05, iterations = 100, trace = -1, intercept = TRUE,
- data.source = InMemoryData, data.target = InMemoryData)
+ data.source = InMemoryData, data.target = InMemoryData, oob.fraction = NULL)
{
- model = Compboost$new(data = data, target = target, optimizer = optimizer, loss = loss, learning.rate = learning.rate)
+ model = Compboost$new(data = data, target = target, optimizer = optimizer, loss = loss,
+   learning.rate = learning.rate, oob.fraction = oob.fraction)
features = setdiff(colnames(data), target)

# This loop could be replaced with foreach???
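
For reference, a minimal usage sketch of the new argument (not part of the diff; it reuses the iris example from the roxygen docs above, and oob.fraction = 0.3 is an arbitrary illustrative value):

  # Hold out 30% of the rows; their risk is tracked as out-of-bag risk during training.
  mod = boostLinear(data = iris, target = "Sepal.Length",
    loss = LossQuadratic$new(), oob.fraction = 0.3)
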
7 changes: 5 additions & 2 deletions R/boost_splines.R
@@ -50,6 +50,8 @@
#' @param data.target [\code{S4 Data}]\cr
#' Uninitialized \code{S4 Data} object which is used to store the data. At the moment
#' just in memory training is supported.
+ #' @param oob.fraction [\code{numeric(1)}]\cr
+ #'   Fraction of the data that is held out to track the out-of-bag risk.
#' @examples
#' mod = boostSplines(data = iris, target = "Sepal.Length", loss = LossQuadratic$new())
#' mod$getBaselearnerNames()
@@ -60,9 +62,10 @@
#' @export
boostSplines = function(data, target, optimizer = OptimizerCoordinateDescent$new(), loss,
learning.rate = 0.05, iterations = 100, trace = -1, degree = 3, n.knots = 20,
- penalty = 2, differences = 2, data.source = InMemoryData, data.target = InMemoryData)
+ penalty = 2, differences = 2, data.source = InMemoryData, data.target = InMemoryData, oob.fraction = NULL)
{
- model = Compboost$new(data = data, target = target, optimizer = optimizer, loss = loss, learning.rate = learning.rate)
+ model = Compboost$new(data = data, target = target, optimizer = optimizer, loss = loss,
+   learning.rate = learning.rate, oob.fraction = oob.fraction)
features = setdiff(colnames(data), target)

# This loop could be replaced with foreach???
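
The same argument is available for boostSplines. A short sketch (again not part of the diff; oob.fraction = 0.3 is illustrative) that also reads the logged risk back out, assuming the logger id "oob_risk" that the internal addOobLogger further below registers:

  mod = boostSplines(data = iris, target = "Sepal.Length",
    loss = LossQuadratic$new(), oob.fraction = 0.3)
  str(mod$getLoggerData())  # should contain an "oob_risk" element
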
72 changes: 64 additions & 8 deletions R/compboost.R
@@ -303,7 +303,10 @@ NULL
Compboost = R6::R6Class("Compboost",
public = list(
data = NULL,
+ data.oob = NULL,
+ oob.fraction = NULL,
response = NULL,
+ response.oob = NULL,
target = NULL,
id = NULL,
optimizer = NULL,
@@ -313,11 +316,12 @@
bl.factory.list = NULL,
positive.category = NULL,
stop.if.all.stoppers.fulfilled = FALSE,
- initialize = function(data, target, optimizer = OptimizerCoordinateDescent$new(), loss, learning.rate = 0.05) {
+ initialize = function(data, target, optimizer = OptimizerCoordinateDescent$new(), loss, learning.rate = 0.05, oob.fraction = NULL) {
checkmate::assertDataFrame(data, any.missing = FALSE, min.rows = 1)
checkmate::assertCharacter(target)
- checkmate::assertNumeric(learning.rate, lower = 0, upper = 1, len = 1)

+ checkmate::assertNumeric(learning.rate, lower = 0, upper = 1, any.missing = FALSE, len = 1)
+ checkmate::assertNumeric(oob.fraction, lower = 0, upper = 1, any.missing = FALSE, len = 1, null.ok = TRUE)

if (! target %in% names(data)) {
stop ("The target ", target, " is not present within the data")
}
@@ -342,14 +346,24 @@ Compboost = R6::R6Class("Compboost",
# Transform to vector with -1 and 1:
response = as.integer(response) * (1 - as.integer(response)) + 1
}


+ if (! is.null(oob.fraction)) {
+   private$oob.idx = sample(x = seq_len(nrow(data)), size = floor(oob.fraction * nrow(data)), replace = FALSE)
+ }
+ private$train.idx = setdiff(seq_len(nrow(data)), private$oob.idx)
+
+ self$oob.fraction = oob.fraction
self$target = target
- self$response = response
- self$data = data[, !colnames(data) %in% target, drop = FALSE]
+ self$response = response[private$train.idx]
+ self$data = data[private$train.idx, !colnames(data) %in% target, drop = FALSE]
self$optimizer = optimizer
self$loss = loss
self$learning.rate = learning.rate

+ if (! is.null(self$oob.fraction)) {
+   self$data.oob = data[private$oob.idx, !colnames(data) %in% target, drop = FALSE]
+   self$response.oob = response[private$oob.idx]
+ }

# Initialize new base-learner factory list. All factories which are defined in
# `addBaselearners` are registered here:
self$bl.factory.list = BlearnerFactoryList$new()
@@ -407,14 +421,15 @@
# If iteration is NULL, then there is no new iteration logger defined. This could be
# used, for example, to train the algorithm and break it after a defined number of
# hours or minutes.
- if (!is.null(iteration)) {
+ if (! is.null(iteration)) {
# Add new logger in the case that there isn't already a custom defined one:
if ("Rcpp_LoggerIteration" %in% vapply(private$l.list, class, character(1))) {
warning("Training iterations are ignored since custom iteration logger is already defined")
} else {
self$addLogger(LoggerIteration, TRUE, logger.id = "_iterations", iter.max = iteration)
}
}
+ if (! is.null(self$oob.fraction)) private$addOobLogger()
# After calling `initializeModel` it isn't possible to add base-learner or logger.
private$initializeModel()
}
@@ -660,6 +675,37 @@
} else {
warning("Train the model to get logger data.")
}
},
+ plotInbagVsOobRisk = function () {
+   if (! is.null(self$model)) {
+     if (requireNamespace("ggplot2", quietly = TRUE)) {
+       inbag.trace = self$getInbagRisk()
+       oob.data = self$getLoggerData()
+       if ("oob_risk" %in% names(oob.data)) {
+         oob.trace = oob.data[["oob_risk"]]
+
+         risk.data = data.frame(
+           risk = c(inbag.trace, oob.trace),
+           type = rep(c("inbag", "oob"), times = c(length(inbag.trace), length(oob.trace))),
+           iter = c(seq_along(inbag.trace), seq_along(oob.trace))
+         )
+
+         gg = ggplot2::ggplot(risk.data, ggplot2::aes(x = iter, y = risk, color = type)) +
+           ggplot2::geom_line(size = 1.1) +
+           ggplot2::xlab("Iteration") +
+           ggplot2::ylab("Risk")
+
+         return(gg)
+       } else {
+         stop("Model was not trained with an out-of-bag risk logger called 'oob_risk'.")
+       }
+     } else {
+       message("Please install ggplot2 to create plots.")
+       return(NULL)
+     }
+   } else {
+     warning("Train the model to get logger data.")
+   }
+ }
),
private = list(
@@ -668,6 +714,8 @@ Compboost = R6::R6Class("Compboost",
l.list = list(),
bl.list = list(),
logger.list = list(),
+ oob.idx = NULL,
+ train.idx = NULL,

initializeModel = function() {

@@ -679,6 +727,14 @@
self$model = Compboost_internal$new(self$response, self$learning.rate,
self$stop.if.all.stoppers.fulfilled, self$bl.factory.list, self$loss, private$logger.list, self$optimizer)
},
+ addOobLogger = function () {
+   if (! is.null(self$oob.fraction)) {
+     self$addLogger(logger = LoggerOobRisk, logger.id = "oob_risk",
+       used.loss = self$loss, eps.for.break = 0, oob.data = self$prepareData(self$data.oob),
+       oob.response = self$response.oob)
+   }
+ },
addSingleNumericBl = function(data.columns, feature, id.fac, id, bl.factory, data.source, data.target, ...) {

private$bl.list[[id]] = list()
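
Taken together, a minimal end-to-end sketch of the new workflow (not part of the diff; mtcars and the 0.25 fraction mirror the test below, the seed value is arbitrary, and ggplot2 must be installed for the plot):

  set.seed(314)  # the oob split is drawn with sample(), so seed it for reproducibility
  cboost = boostSplines(data = mtcars, target = "mpg",
    loss = LossQuadratic$new(), oob.fraction = 0.25)
  gg = cboost$plotInbagVsOobRisk()  # ggplot object: inbag vs. oob risk per iteration
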
6 changes: 6 additions & 0 deletions man/Compboost.Rd

(Generated documentation file; diff not rendered.)

5 changes: 4 additions & 1 deletion man/boostLinear.Rd

(Generated documentation file; diff not rendered.)

6 changes: 5 additions & 1 deletion man/boostSplines.Rd

(Generated documentation file; diff not rendered.)

6 changes: 3 additions & 3 deletions tests/testthat/test_feature_importance.R
@@ -1,9 +1,9 @@
context("Feature importance of 'compboost'")

test_that("feature importance works", {

- cboost = boostSplines(data = mtcars, target = "mpg", loss = LossQuadratic$new())

+ nuisance = capture.output(suppressWarnings({
+   cboost = boostSplines(data = mtcars, target = "mpg", loss = LossQuadratic$new())
+ }))
expect_error(cboost$calculateFeatureImportance(20L))
expect_silent(cboost$calculateFeatureImportance())

57 changes: 57 additions & 0 deletions tests/testthat/test_inbag_vs_oob.R
@@ -0,0 +1,57 @@
context("Intrinsic inbag vs. oob works")

test_that("Internal oob is the same as the logger", {

df = mtcars
target_var = "mpg"
char_vars = c("cyl", "vs", "am", "gear", "carb")

for (feature in char_vars) {
df[[feature]] = as.factor(df[[feature]])
}

n_data = nrow(df)

set.seed(31415)
idx_test = sample(x = seq_len(n_data), size = floor(n_data * 0.25))
idx_train = setdiff(x = seq_len(n_data), idx_test)

cboost = Compboost$new(data = df[idx_train, ], target = target_var,
loss = LossQuadratic$new(), learning.rate = 0.005)

for (feature_name in setdiff(names(df), target_var)) {
if (feature_name %in% char_vars) {
cboost$addBaselearner(feature = feature_name, id = "category",
bl.factory = BaselearnerPolynomial, intercept = FALSE)
} else {
cboost$addBaselearner(feature = feature_name, id = "spline",
bl.factory = BaselearnerPSpline, degree = 3, n.knots = 10)
}
}

oob_data = cboost$prepareData(df[idx_test,])
oob_response = df[[target_var]][idx_test]

cboost$addLogger(logger = LoggerOobRisk, logger.id = "oob_risk",
used.loss = LossQuadratic$new(), eps.for.break = 0,
oob.data = oob_data, oob.response = oob_response)

nuisance = capture.output(suppressWarnings({
cboost$train(6000)
}))
set.seed(31415)
nuisance = capture.output(suppressWarnings({
cboost1 = boostSplines(data = df, target = target_var, loss = LossQuadratic$new(), learning.rate = 0.005,
iterations = 6000L, degree = 3, n.knots = 10, oob.fraction = 0.25)
}))
expect_equal(rownames(df)[idx_train], rownames(cboost1$data))
expect_equal(rownames(df)[idx_test], rownames(cboost1$data.oob))
expect_equal(cboost$getLoggerData(), cboost1$getLoggerData())

gg = cboost$plotInbagVsOobRisk()
gg1 = cboost1$plotInbagVsOobRisk()

expect_true(inherits(gg, "ggplot"))
expect_true(inherits(gg1, "ggplot"))
expect_equal(gg, gg1)
})
16 changes: 11 additions & 5 deletions tests/testthat/test_optimizer.R
@@ -11,18 +11,24 @@ test_that("Coordinate Descent with line search works", {
cboost$addBaselearner("disp", "linear", BaselearnerPolynomial)
cboost$addBaselearner("hp", "linear", BaselearnerPolynomial)

- cboost$train(n.train)
+ nuisance = capture.output(suppressWarnings({
+   cboost$train(n.train)
+ }))

used.optimizer.ls = OptimizerCoordinateDescentLineSearch$new()

- cboost1 = Compboost$new(data = mtcars, target = "mpg", optimizer = used.optimizer.ls, loss = LossQuadratic$new(), learning.rate = 0.05)

+ nuisance = capture.output(suppressWarnings({
+   cboost1 = Compboost$new(data = mtcars, target = "mpg", optimizer = used.optimizer.ls, loss = LossQuadratic$new(), learning.rate = 0.05)
+ }))

cboost1$addBaselearner("wt", "linear", BaselearnerPolynomial)
cboost1$addBaselearner("disp", "linear", BaselearnerPolynomial)
cboost1$addBaselearner("hp", "linear", BaselearnerPolynomial)

- cboost1$train(n.train)

+ nuisance = capture.output(suppressWarnings({
+   cboost1$train(n.train)
+ }))

expect_equal(cboost$predict(), cboost1$predict())
expect_equal(cboost$getInbagRisk(), cboost1$getInbagRisk())
expect_true(all(abs(used.optimizer.ls$getStepSize() - 1) < 1e-10))
Expand Down
