From 6aed9ef20d58c3cdcb692cdc8b78527e6e816bee Mon Sep 17 00:00:00 2001
From: schalkdaniel
Date: Thu, 31 Oct 2019 13:48:33 +0100
Subject: [PATCH] Add visualization for base-learner traces

---
 DESCRIPTION                               |  3 +-
 R/compboost.R                             | 62 +++++++++++++++--------
 R/plot_cboost.R                           | 56 ++++++++++++++++++++
 man/Compboost.Rd                          | 59 +++++++++++++--------
 tests/testthat/test_base_learner_traces.R | 22 ++++++++
 5 files changed, 157 insertions(+), 45 deletions(-)
 create mode 100644 tests/testthat/test_base_learner_traces.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 5faae694..2c03d5ff 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -40,7 +40,8 @@ Suggests:
   rmarkdown,
   titanic,
   mlr,
-  gridExtra
+  gridExtra,
+  ggrepel
 RcppModules:
   baselearner_module,
   compboost_module,
diff --git a/R/compboost.R b/R/compboost.R
index 14c20f5f..d8b02a41 100644
--- a/R/compboost.R
+++ b/R/compboost.R
@@ -49,6 +49,8 @@
 #'
 #' cboost$plotInbagVsOobRisk()
 #'
+#' cboost$plotBlearnerTraces(value = 1, n_legend = 5L)
+#'
 #' }
 #' @section Arguments:
 #' \strong{For Compboost$new()}:
@@ -137,34 +139,44 @@
 #'
 #' \strong{For cboost$predict()}:
 #' \describe{
-#' \item{\code{newdata}}{[\code{data.frame()}]\cr
-#'   Data to predict on. If newdata equals \code{NULL} predictions on the training data are returned.
-#' }
+#'   \item{\code{newdata}}{[\code{data.frame()}]\cr
+#'     Data to predict on. If \code{newdata} equals \code{NULL}, predictions on the training data are returned.
+#'   }
 #' }
 #' \strong{For cboost$plot()}:
 #' \describe{
-#' \item{\code{blearner_name}}{[\code{character(1)}]\cr
-#'   Character name of the base-learner to plot the contribution to the response. Available choices for
-#'   \code{blearner_name} use \code{cboost$getBaselearnerNames()}.
-#' }
-#' \item{\code{iters}}{[\code{integer()}]\cr
-#'   Integer vector containing the iterations the user wants to visualize.
-#' }
-#' \item{\code{from}}{[\code{numeric(1)}]\cr
-#'   Lower bound for the x axis (should be smaller than \code{to}).
-#' }
-#' \item{\code{to}}{[\code{numeric(1)}]\cr
-#'   Upper bound for the x axis (should be greater than \code{from}).
-#' }
-#' \item{\code{length_out}}{[\code{integer(1)}]\cr
-#'   Number of equidistant points between \code{from} and \code{to} used for plotting.
-#' }
+#'   \item{\code{blearner_name}}{[\code{character(1)}]\cr
+#'     Name of the base-learner whose contribution to the response is plotted. To get the available
+#'     choices for \code{blearner_name}, use \code{cboost$getBaselearnerNames()}.
+#'   }
+#'   \item{\code{iters}}{[\code{integer()}]\cr
+#'     Integer vector containing the iterations the user wants to visualize.
+#'   }
+#'   \item{\code{from}}{[\code{numeric(1)}]\cr
+#'     Lower bound for the x axis (should be smaller than \code{to}).
+#'   }
+#'   \item{\code{to}}{[\code{numeric(1)}]\cr
+#'     Upper bound for the x axis (should be greater than \code{from}).
+#'   }
+#'   \item{\code{length_out}}{[\code{integer(1)}]\cr
+#'     Number of equidistant points between \code{from} and \code{to} used for plotting.
+#'   }
 #' }
 #' \strong{For cboost$calculateFeatureImportance() and cboost$plotFeatureImportance()}:
 #' \describe{
-#' \item{\code{num_feats}}{[\code{integer(1)}]\cr
-#'   Number of features for which the Importance will be returned.
+#'   \item{\code{num_feats}}{[\code{integer(1)}]\cr
+#'     Number of features for which the importance is returned.
+#'   }
 #' }
+#' \strong{For cboost$plotBlearnerTraces()}:
+#' \describe{
+#'   \item{\code{value}}{[\code{numeric()}]\cr
+#'     Numeric vector of length 1 or of the same length as the number of iterations; for each base-learner, the values are accumulated over the iterations in which it was selected.
+#'   }
+#'   \item{\code{n_legend}}{[\code{integer(1)}]\cr
+#'     Number of base-learners to highlight (the base-learners with the top \code{n_legend}
+#'     accumulated values are highlighted).
+#'   }
 #' }
 #' @section Details:
 #' \strong{Loss}\cr
@@ -333,17 +345,20 @@
 #'   \item{\code{calculateFeatureImportance}}{method to calculate feature importance.}
 #'   \item{\code{plotFeatureImportance}}{method to plot the feature importance calculated by \code{calulateFeatureImportance}.}
 #'   \item{\code{plotInbagVsOobRisk}}{method to plot the inbag vs the out of bag behavior. This is just applicable if a logger with name \code{oob_logger} was registered. This is automatically done if the \code{oob_fraction} is set.}
+#'   \item{\code{plotBlearnerTraces}}{method to plot traces of how the base-learners were selected, combined with a measure of interest, e.g. how the empirical risk was minimized throughout the selection process.}
 #' }
 #'
 #' @examples
 #' cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new(), oob_fraction = 0.3)
 #' cboost$addBaselearner("hp", "spline", BaselearnerPSpline, degree = 3,
-#'   n.knots = 10, penalty = 2, differences = 2)
+#'   n_knots = 10, penalty = 2, differences = 2)
+#' cboost$addBaselearner("wt", "spline", BaselearnerPSpline)
 #' cboost$train(1000)
 #'
 #' table(cboost$getSelectedBaselearner())
 #' cboost$plot("hp_spline")
 #' cboost$plotInbagVsOobRisk()
+#' cboost$plotBlearnerTraces()
 NULL

 #' @export
@@ -649,6 +664,9 @@ Compboost = R6::R6Class("Compboost",
         ggplot2::ylab("Risk")

       return(gg)
+    },
+    plotBlearnerTraces = function (value = 1, n_legend = 5L) {
+      plotBlearnerTraces(cboost_obj = self, value = value, n_legend = n_legend)
     }
   ),
   private = list(
diff --git a/R/plot_cboost.R b/R/plot_cboost.R
index f1c202d1..bebcbc1d 100644
--- a/R/plot_cboost.R
+++ b/R/plot_cboost.R
@@ -104,3 +104,59 @@ plotFeatEffect = function (cboost_obj, bl_list, blearner_name, iters, from, to,
 
   return(gg)
 }
+plotBlearnerTraces = function (cboost_obj, value = 1, n_legend = 5L)
+{
+  if (! requireNamespace("ggplot2", quietly = TRUE)) { stop("Please install ggplot2 to create plots.") }
+  if (! requireNamespace("ggrepel", quietly = TRUE)) { stop("Please install ggrepel to create plots.") }
+
+  if (is.null(cboost_obj$model)) stop("Model needs to be trained first.")
+
+  bl = as.factor(cboost_obj$getSelectedBaselearner())
+  # Check the arguments before the plot data is built:
+  checkmate::assertNumeric(value)
+  if (! length(value) %in% c(1L, length(bl))) {
+    stop("Assertion on 'value' failed: Must have length 1 or ", length(bl), ".")
+  }
+  checkmate::assertCount(n_legend, positive = TRUE)
+
+  # Base data frame used to calculate the traces of the selected base-learners:
+  df_plot = data.frame(iters = seq_along(bl), blearner = bl, value = value)
+
+  # Accumulate 'value' per base-learner via the cumulative sum, scaled by the total number of iterations:
+  df_plot = do.call(rbind, lapply(X = levels(bl), FUN = function (lab) {
+    df_temp = df_plot[df_plot$blearner == lab, ]
+    df_temp = df_temp[order(df_temp$iters), ]
+    df_temp$value = cumsum(df_temp$value) / length(bl)
+
+    return(df_temp)
+  }))
+
+  # Get the top 'n_legend' base-learners that are highlighted:
+  top_values = vapply(X = levels(bl), FUN.VALUE = numeric(1L), FUN = function (lab) {
+    df_temp = df_plot[df_plot$blearner == lab, ]
+    return(max(df_temp$value))
+  })
+  top_labs = names(sort(top_values, decreasing = TRUE))[seq_len(min(n_legend, nlevels(bl)))]
+
+  idx_top_lab = df_plot$blearner %in% top_labs
+
+  df_plot_top = df_plot[idx_top_lab, ]
+  df_plot_nottop = df_plot[! idx_top_lab, ]
+
+  # Place the label of each highlighted trace at its last selected iteration:
+  df_label = do.call(rbind, lapply(X = top_labs, FUN = function (lab) {
+    df_temp = df_plot[df_plot$blearner == lab, ]
+    df_temp[which.max(df_temp$iters), ]
+  }))
+
+  gg = ggplot2::ggplot() +
+    ggplot2::geom_line(data = df_plot_top, ggplot2::aes(x = iters, y = value, color = blearner), show.legend = FALSE) +
+    ggplot2::geom_line(data = df_plot_nottop, ggplot2::aes(x = iters, y = value, group = blearner), alpha = 0.2, show.legend = FALSE) +
+    ggrepel::geom_label_repel(data = df_label, ggplot2::aes(x = iters, y = value, label = round(value, 4), fill = blearner),
+      colour = "white", fontface = "bold", show.legend = TRUE) +
+    ggplot2::xlab("Iteration") +
+    ggplot2::ylab("Cumulated Value\nof Included Base-Learners") +
+    ggplot2::scale_fill_discrete(name = paste0("Top ", n_legend, " Base-Learners")) +
+    ggplot2::guides(color = FALSE)
+
+  return(gg)
+}
diff --git a/man/Compboost.Rd b/man/Compboost.Rd
index 33644f1b..da1d4dd8 100644
--- a/man/Compboost.Rd
+++ b/man/Compboost.Rd
@@ -53,6 +53,8 @@ cboost$plotFeatureImportance(num_feats = NULL)
 
 cboost$plotInbagVsOobRisk()
 
+cboost$plotBlearnerTraces(value = 1, n_legend = 5L)
+
 }
 }
 
@@ -144,34 +146,44 @@ cboost$plotInbagVsOobRisk()
 
 \strong{For cboost$predict()}:
 \describe{
-\item{\code{newdata}}{[\code{data.frame()}]\cr
-  Data to predict on. If newdata equals \code{NULL} predictions on the training data are returned.
-}
+  \item{\code{newdata}}{[\code{data.frame()}]\cr
+    Data to predict on. If \code{newdata} equals \code{NULL}, predictions on the training data are returned.
+  }
 }
 \strong{For cboost$plot()}:
 \describe{
-\item{\code{blearner_name}}{[\code{character(1)}]\cr
-  Character name of the base-learner to plot the contribution to the response. Available choices for
-  \code{blearner_name} use \code{cboost$getBaselearnerNames()}.
-}
-\item{\code{iters}}{[\code{integer()}]\cr
-  Integer vector containing the iterations the user wants to visualize.
-}
-\item{\code{from}}{[\code{numeric(1)}]\cr
-  Lower bound for the x axis (should be smaller than \code{to}).
-}
-\item{\code{to}}{[\code{numeric(1)}]\cr
-  Upper bound for the x axis (should be greater than \code{from}).
-}
-\item{\code{length_out}}{[\code{integer(1)}]\cr
-  Number of equidistant points between \code{from} and \code{to} used for plotting.
-}
+  \item{\code{blearner_name}}{[\code{character(1)}]\cr
+    Name of the base-learner whose contribution to the response is plotted. To get the available
+    choices for \code{blearner_name}, use \code{cboost$getBaselearnerNames()}.
+  }
+  \item{\code{iters}}{[\code{integer()}]\cr
+    Integer vector containing the iterations the user wants to visualize.
+  }
+  \item{\code{from}}{[\code{numeric(1)}]\cr
+    Lower bound for the x axis (should be smaller than \code{to}).
+  }
+  \item{\code{to}}{[\code{numeric(1)}]\cr
+    Upper bound for the x axis (should be greater than \code{from}).
+  }
+  \item{\code{length_out}}{[\code{integer(1)}]\cr
+    Number of equidistant points between \code{from} and \code{to} used for plotting.
+  }
 }
 \strong{For cboost$calculateFeatureImportance() and cboost$plotFeatureImportance()}:
 \describe{
-\item{\code{num_feats}}{[\code{integer(1)}]\cr
-  Number of features for which the Importance will be returned.
+  \item{\code{num_feats}}{[\code{integer(1)}]\cr
+    Number of features for which the importance is returned.
+  }
 }
+\strong{For cboost$plotBlearnerTraces()}:
+\describe{
+  \item{\code{value}}{[\code{numeric()}]\cr
+    Numeric vector of length 1 or of the same length as the number of iterations; for each base-learner, the values are accumulated over the iterations in which it was selected.
+  }
+  \item{\code{n_legend}}{[\code{integer(1)}]\cr
+    Number of base-learners to highlight (the base-learners with the top \code{n_legend}
+    accumulated values are highlighted).
+  }
 }
 }
 
@@ -241,16 +253,19 @@ cboost$plotInbagVsOobRisk()
 \item{\code{calculateFeatureImportance}}{method to calculate feature importance.}
 \item{\code{plotFeatureImportance}}{method to plot the feature importance calculated by \code{calulateFeatureImportance}.}
 \item{\code{plotInbagVsOobRisk}}{method to plot the inbag vs the out of bag behavior. This is just applicable if a logger with name \code{oob_logger} was registered. This is automatically done if the \code{oob_fraction} is set.}
+\item{\code{plotBlearnerTraces}}{method to plot traces of how the base-learners were selected, combined with a measure of interest, e.g. how the empirical risk was minimized throughout the selection process.}
 }
 }
 
 \examples{
 cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new(), oob_fraction = 0.3)
 cboost$addBaselearner("hp", "spline", BaselearnerPSpline, degree = 3,
-  n.knots = 10, penalty = 2, differences = 2)
+  n_knots = 10, penalty = 2, differences = 2)
+cboost$addBaselearner("wt", "spline", BaselearnerPSpline)
 cboost$train(1000)
 
 table(cboost$getSelectedBaselearner())
 cboost$plot("hp_spline")
 cboost$plotInbagVsOobRisk()
+cboost$plotBlearnerTraces()
 }
diff --git a/tests/testthat/test_base_learner_traces.R b/tests/testthat/test_base_learner_traces.R
new file mode 100644
index 00000000..64848fc4
--- /dev/null
+++ b/tests/testthat/test_base_learner_traces.R
@@ -0,0 +1,22 @@
+context("Base-learner traces work")
+
+test_that("Visualization works", {
+
+  expect_silent({
+    cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new())
+    cboost$addBaselearner("hp", "spline", BaselearnerPSpline)
+    cboost$addBaselearner(c("hp", "wt"), "quadratic", BaselearnerPolynomial)
+    cboost$addBaselearner("wt", "linear", BaselearnerPolynomial)
+  })
+
+  expect_error(cboost$plot("hp_spline"))
+
+  expect_output(cboost$train(2000, trace = 0))
+
+  expect_error(cboost$plotBlearnerTraces(n_legend = "bls"))
+  expect_error(cboost$plotBlearnerTraces(value = "bls"))
+  expect_error(cboost$plotBlearnerTraces(value = c(1, 2)))
+
+  expect_silent({ gg = cboost$plotBlearnerTraces() })
+  expect_s3_class(cboost$plotBlearnerTraces(), "ggplot")
+})
\ No newline at end of file
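
Reviewer note, not part of the patch: a minimal usage sketch of the new method. The first call uses the
default value = 1 and traces each base-learner's share of all selections; the second traces the risk
reduction per iteration. It assumes cboost$getInbagRisk() returns the initial risk followed by one risk
value per iteration, so that diff() yields exactly one value per selected base-learner; adjust if the
accessor behaves differently.

  library(compboost)

  cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new())
  cboost$addBaselearner("hp", "spline", BaselearnerPSpline)
  cboost$addBaselearner("wt", "linear", BaselearnerPolynomial)
  cboost$train(500, trace = 0)

  # Default (value = 1): cumulated selection frequency per base-learner.
  cboost$plotBlearnerTraces()

  # Risk reduction per iteration as 'value' (assumes getInbagRisk() includes
  # the initial risk, hence diff() aligns with the selected base-learners):
  risk_reduction = abs(diff(cboost$getInbagRisk()))
  cboost$plotBlearnerTraces(value = risk_reduction, n_legend = 2L)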