Skip to content

Commit

Permalink
add visualization for base-learner traces
Browse files Browse the repository at this point in the history
  • Loading branch information
schalkdaniel committed Oct 31, 2019
1 parent 1d5f527 commit 6aed9ef
Show file tree
Hide file tree
Showing 5 changed files with 157 additions and 45 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
Expand Up @@ -40,7 +40,8 @@ Suggests:
rmarkdown,
titanic,
mlr,
gridExtra
gridExtra,
ggrepel
RcppModules:
baselearner_module,
compboost_module,
Expand Down
62 changes: 40 additions & 22 deletions R/compboost.R
Expand Up @@ -49,6 +49,8 @@
#'
#' cboost$plotInbagVsOobRisk()
#'
#' cboost$plotBlearnerTraces(value = 1, n_legend = 5L)
#'
#' }
#' @section Arguments:
#' \strong{For Compboost$new()}:
Expand Down Expand Up @@ -137,34 +139,44 @@
#'
#' \strong{For cboost$predict()}:
#' \describe{
#' \item{\code{newdata}}{[\code{data.frame()}]\cr
#' Data to predict on. If newdata equals \code{NULL} predictions on the training data are returned.
#' }
#' \item{\code{newdata}}{[\code{data.frame()}]\cr
#' Data to predict on. If newdata equals \code{NULL} predictions on the training data are returned.
#' }
#' }
#' \strong{For cboost$plot()}:
#' \describe{
#' \item{\code{blearner_name}}{[\code{character(1)}]\cr
#' Character name of the base-learner to plot the contribution to the response. Available choices for
#' \code{blearner_name} use \code{cboost$getBaselearnerNames()}.
#' }
#' \item{\code{iters}}{[\code{integer()}]\cr
#' Integer vector containing the iterations the user wants to visualize.
#' }
#' \item{\code{from}}{[\code{numeric(1)}]\cr
#' Lower bound for the x axis (should be smaller than \code{to}).
#' }
#' \item{\code{to}}{[\code{numeric(1)}]\cr
#' Upper bound for the x axis (should be greater than \code{from}).
#' }
#' \item{\code{length_out}}{[\code{integer(1)}]\cr
#' Number of equidistant points between \code{from} and \code{to} used for plotting.
#' }
#' \item{\code{blearner_name}}{[\code{character(1)}]\cr
#' Character name of the base-learner to plot the contribution to the response. Available choices for
#' \code{blearner_name} use \code{cboost$getBaselearnerNames()}.
#' }
#' \item{\code{iters}}{[\code{integer()}]\cr
#' Integer vector containing the iterations the user wants to visualize.
#' }
#' \item{\code{from}}{[\code{numeric(1)}]\cr
#' Lower bound for the x axis (should be smaller than \code{to}).
#' }
#' \item{\code{to}}{[\code{numeric(1)}]\cr
#' Upper bound for the x axis (should be greater than \code{from}).
#' }
#' \item{\code{length_out}}{[\code{integer(1)}]\cr
#' Number of equidistant points between \code{from} and \code{to} used for plotting.
#' }
#' }
#' \strong{For cboost$calculateFeatureImportance() and cboost$plotFeatureImportance()}:
#' \describe{
#' \item{\code{num_feats}}{[\code{integer(1)}]\cr
#' Number of features for which the Importance will be returned.
#' \item{\code{num_feats}}{[\code{integer(1)}]\cr
#' Number of features for which the Importance will be returned.
#' }
#' }
#' \strong{For cboost$plotBlearnerTraces}:
#' \describe{
#' \item{\code{value}}{[\code{numeric()}]\cr
#' Numeric value of length 1 or of the same length as the number of iterations, which is accumulated for the selected base-learner.
#' }
#' \item{\code{n_legend}}{[\code{integer(1L)}]\cr
#' Number of base-learners to highlight (the base-learners with the top \code{n_legend}
#' accumulated values are highlighted).
#' }
#' }
#' @section Details:
#' \strong{Loss}\cr
Expand Down Expand Up @@ -333,17 +345,20 @@
#' \item{\code{calculateFeatureImportance}}{method to calculate feature importance.}
#' \item{\code{plotFeatureImportance}}{method to plot the feature importance calculated by \code{calculateFeatureImportance}.}
#' \item{\code{plotInbagVsOobRisk}}{method to plot the inbag vs the out of bag behavior. This is just applicable if a logger with name \code{oob_logger} was registered. This is automatically done if the \code{oob_fraction} is set.}
#' \item{\code{plotBlearnerTraces}}{method to plot traces of how the base-learners are selected in combination with a measure of interest, e.g. how the empirical risk was minimized throughout the selection process.}
#' }
#'
#' @examples
#' cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new(), oob_fraction = 0.3)
#' cboost$addBaselearner("hp", "spline", BaselearnerPSpline, degree = 3,
#' n.knots = 10, penalty = 2, differences = 2)
#' n_knots = 10, penalty = 2, differences = 2)
#' cboost$addBaselearner("wt", "spline", BaselearnerPSpline)
#' cboost$train(1000)
#'
#' table(cboost$getSelectedBaselearner())
#' cboost$plot("hp_spline")
#' cboost$plotInbagVsOobRisk()
#' cboost$plotBlearnerTraces()
NULL

#' @export
Expand Down Expand Up @@ -649,6 +664,9 @@ Compboost = R6::R6Class("Compboost",
ggplot2::ylab("Risk")

return(gg)
},
plotBlearnerTraces = function (value = 1L, n_legend = 5L) {
  # Hand this object over to the package-level plotting routine of the same
  # name and pass its result back to the caller.
  gg_traces = plotBlearnerTraces(cboost_obj = self, value = value, n_legend = n_legend)
  return(gg_traces)
}
),
private = list(
Expand Down
56 changes: 56 additions & 0 deletions R/plot_cboost.R
Expand Up @@ -104,3 +104,59 @@ plotFeatEffect = function (cboost_obj, bl_list, blearner_name, iters, from, to,
return(gg)
}

# Plot the traces of the selected base-learner.
#
# For every base-learner, `value` is accumulated over the iterations in which
# that base-learner was selected (cumulative sum divided by the total number of
# iterations). The base-learner with the largest accumulated values are drawn
# in color and labelled, all others are drawn as faint grey lines.
#
# Arguments:
#   cboost_obj  Object exposing `$model` and `$getSelectedBaselearner()`.
#   value       Numeric of length 1 or with one entry per iteration.
#   n_legend    Positive count of base-learner to highlight. Values larger than
#               the number of selected base-learner are clamped to that number.
#
# Returns:
#   The plot object created by `ggplot2::ggplot()`.
#
# Errors:
#   Stops if ggplot2 or ggrepel is not installed, if `cboost_obj$model` is
#   NULL, or if `value` / `n_legend` fail their assertions.
plotBlearnerTraces = function (cboost_obj, value = 1, n_legend = 5L)
{
  if (! requireNamespace("ggplot2", quietly = TRUE)) { stop("Please install ggplot2 to create plots.") }
  if (! requireNamespace("ggrepel", quietly = TRUE)) { stop("Please install ggrepel to create plots.") }

  if (is.null(cboost_obj$model)) stop("Model needs to be trained first.")

  bl = as.factor(cboost_obj$getSelectedBaselearner())

  # Validate the arguments BEFORE they are used to build the data frame.
  # Otherwise a 'value' of unsuitable length fails inside data.frame() with an
  # unrelated "differing number of rows" message:
  if (length(value) %in% c(1L, length(bl))) {
    checkmate::assertNumeric(value)
  } else {
    stop("Assertion on 'value' failed: Must have length 1 or ", length(bl), ".")
  }
  checkmate::assertCount(n_legend, positive = TRUE)

  # Never try to highlight more base-learner than were actually selected. This
  # avoids NA labels and keeps the legend title truthful:
  n_top = min(n_legend, nlevels(bl))

  # Creating the base dataframe which is used to calculate the traces for the selected base-learner:
  df_plot = data.frame(iters = seq_along(bl), blearner = bl, value = value)

  # Aggregate value by calculating the cumulative sum grouped by base-learner:
  df_plot = do.call(rbind, lapply(X = levels(bl), FUN = function (lab) {
    df_temp = df_plot[df_plot$blearner == lab, ]
    df_temp = df_temp[order(df_temp$iters), ]
    df_temp$value = cumsum(df_temp$value) / length(bl)

    return(df_temp)
  }))

  # Get top 'n_top' base-learner that are highlighted:
  top_values = vapply(X = levels(bl), FUN.VALUE = numeric(1L), FUN = function (lab) {
    max(df_plot$value[df_plot$blearner == lab])
  })
  top_labs = names(sort(top_values, decreasing = TRUE))[seq_len(n_top)]

  idx_top_lab = df_plot$blearner %in% top_labs

  df_plot_top = df_plot[idx_top_lab, ]
  df_plot_nottop = df_plot[! idx_top_lab, ]

  # One label per highlighted base-learner, placed at its last selected iteration:
  df_label = do.call(rbind, lapply(X = top_labs, FUN = function (lab) {
    df_temp = df_plot[df_plot$blearner == lab, ]
    df_temp[which.max(df_temp$iters), ]
  }))

  gg = ggplot2::ggplot() +
    ggplot2::geom_line(data = df_plot_top, ggplot2::aes(x = iters, y = value, color = blearner), show.legend = FALSE) +
    ggplot2::geom_line(data = df_plot_nottop, ggplot2::aes(x = iters, y = value, group = blearner), alpha = 0.2, show.legend = FALSE) +
    ggrepel::geom_label_repel(data = df_label, ggplot2::aes(x = iters, y = value, label = round(value, 4), fill = blearner),
      colour = "white", fontface = "bold", show.legend = TRUE) +
    ggplot2::xlab("Iteration") +
    ggplot2::ylab("Cumulated Value\nof Included Base-Learner") +
    ggplot2::scale_fill_discrete(name = paste0("Top ", n_top, " Base-Learner")) +
    # "none" replaces the deprecated 'FALSE' for switching a guide off:
    ggplot2::guides(color = "none")

  return(gg)
}
59 changes: 37 additions & 22 deletions man/Compboost.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 22 additions & 0 deletions tests/testthat/test_base_learner_traces.R
@@ -0,0 +1,22 @@
context("Base-learner traces works")

test_that("Visualization works", {

  # The trace plot needs ggplot2 and ggrepel, which plotBlearnerTraces() only
  # checks for at runtime via requireNamespace(). Skip instead of failing when
  # they are not installed.
  skip_if_not_installed("ggplot2")
  skip_if_not_installed("ggrepel")

  expect_silent({
    cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new())
    cboost$addBaselearner("hp", "spline", BaselearnerPSpline)
    cboost$addBaselearner(c("hp", "wt"), "quadratic", BaselearnerPolynomial)
    cboost$addBaselearner("wt", "linear", BaselearnerPolynomial)
  })

  # Plotting is not possible before the model was trained:
  expect_error(cboost$plot("hp_spline"))

  expect_output(cboost$train(2000, trace = 0))

  # Invalid arguments have to be rejected:
  expect_error(cboost$plotBlearnerTraces(n_legend = "bls"))
  expect_error(cboost$plotBlearnerTraces(value = "bls"))
  expect_error(cboost$plotBlearnerTraces(value = c(1,2)))

  # Default call works silently and returns a ggplot object:
  expect_silent({gg = cboost$plotBlearnerTraces()})
  expect_s3_class(cboost$plotBlearnerTraces(), "ggplot")
})

0 comments on commit 6aed9ef

Please sign in to comment.