add visualization for base-learner traces

schalkdaniel · Oct 31, 2019 · 6aed9ef · 6aed9ef
1 parent 1d5f527
commit 6aed9ef
Show file tree

Hide file tree

Showing 5 changed files with 157 additions and 45 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -40,7 +40,8 @@ Suggests:
   rmarkdown,
   titanic,
   mlr,
-  gridExtra
+  gridExtra,
+  ggrepel
 RcppModules:
   baselearner_module,
   compboost_module,

diff --git a/R/compboost.R b/R/compboost.R
@@ -49,6 +49,8 @@
 #'
 #' cboost$plotInbagVsOobRisk()
 #'
+#' cboost$plotBlearnerTraces(value = 1, n_legend = 5L)
+#'
 #' }
 #' @section Arguments:
 #' \strong{For Compboost$new()}:
@@ -137,34 +139,44 @@
 #'
 #' \strong{For cboost$predict()}:
 #' \describe{
-#' \item{\code{newdata}}{[\code{data.frame()}]\cr
-#' 	 Data to predict on. If newdata equals \code{NULL} predictions on the training data are returned.
-#' }
+#'   \item{\code{newdata}}{[\code{data.frame()}]\cr
+#'   	 Data to predict on. If newdata equals \code{NULL} predictions on the training data are returned.
+#'   }
 #' }
 #' \strong{For cboost$plot()}:
 #' \describe{
-#' \item{\code{blearner_name}}{[\code{character(1)}]\cr
-#' 	 Character name of the base-learner to plot the contribution to the response. Available choices for
-#'   \code{blearner_name} use \code{cboost$getBaselearnerNames()}.
-#' }
-#' \item{\code{iters}}{[\code{integer()}]\cr
-#' 	 Integer vector containing the iterations the user wants to visualize.
-#' }
-#' \item{\code{from}}{[\code{numeric(1)}]\cr
-#' 	 Lower bound for the x axis (should be smaller than \code{to}).
-#' }
-#' \item{\code{to}}{[\code{numeric(1)}]\cr
-#' 	 Upper bound for the x axis (should be greater than \code{from}).
-#' }
-#' \item{\code{length_out}}{[\code{integer(1)}]\cr
-#' 	 Number of equidistant points between \code{from} and \code{to} used for plotting.
-#' }
+#'   \item{\code{blearner_name}}{[\code{character(1)}]\cr
+#'     Character name of the base-learner to plot the contribution to the response. For available choices of
+#'     \code{blearner_name} use \code{cboost$getBaselearnerNames()}.
+#'   }
+#'   \item{\code{iters}}{[\code{integer()}]\cr
+#'   	 Integer vector containing the iterations the user wants to visualize.
+#'   }
+#'   \item{\code{from}}{[\code{numeric(1)}]\cr
+#'   	 Lower bound for the x axis (should be smaller than \code{to}).
+#'   }
+#'   \item{\code{to}}{[\code{numeric(1)}]\cr
+#'   	 Upper bound for the x axis (should be greater than \code{from}).
+#'   }
+#'   \item{\code{length_out}}{[\code{integer(1)}]\cr
+#'   	 Number of equidistant points between \code{from} and \code{to} used for plotting.
+#'   }
 #' }
 #' \strong{For cboost$calculateFeatureImportance() and cboost$plotFeatureImportance()}:
 #' \describe{
-#' \item{\code{num_feats}}{[\code{integer(1)}]\cr
-#'   Number of features for which the Importance will be returned.
+#'   \item{\code{num_feats}}{[\code{integer(1)}]\cr
+#'     Number of features for which the Importance will be returned.
+#'   }
 #' }
+#' \strong{For cboost$plotBlearnerTraces()}:
+#' \describe{
+#'   \item{\code{value}}{[\code{numeric()}]\cr
+#'     Numeric vector of length 1 or of the same length as the number of iterations. The value is accumulated for each selected base-learner.
+#'   }
+#'   \item{\code{n_legend}}{[\code{integer(1)}]\cr
+#'     Number of base-learners to highlight (the base-learners with the top \code{n_legend}
+#'     accumulated values are highlighted).
+#'   }
 #' }
 #' @section Details:
 #'   \strong{Loss}\cr
@@ -333,17 +345,20 @@
 #'   \item{\code{calculateFeatureImportance}}{method to calculate feature importance.}
 #'   \item{\code{plotFeatureImportance}}{method to plot the feature importance calculated by \code{calculateFeatureImportance}.}
 #'   \item{\code{plotInbagVsOobRisk}}{method to plot the inbag vs the out of bag behavior. This is just applicable if a logger with name \code{oob_logger} was registered. This is automatically done if the \code{oob_fraction} is set.}
+#'   \item{\code{plotBlearnerTraces}}{method to plot traces of how the base-learners are selected in combination with a measure of interest, e.g. how the empirical risk was minimized throughout the selection process.}
 #' }
 #'
 #' @examples
 #' cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new(), oob_fraction = 0.3)
 #' cboost$addBaselearner("hp", "spline", BaselearnerPSpline, degree = 3,
-#'   n.knots = 10, penalty = 2, differences = 2)
+#'   n_knots = 10, penalty = 2, differences = 2)
+#' cboost$addBaselearner("wt", "spline", BaselearnerPSpline)
 #' cboost$train(1000)
 #'
 #' table(cboost$getSelectedBaselearner())
 #' cboost$plot("hp_spline")
 #' cboost$plotInbagVsOobRisk()
+#' cboost$plotBlearnerTraces()
 NULL
 
 #' @export
@@ -649,6 +664,9 @@ Compboost = R6::R6Class("Compboost",
         ggplot2::ylab("Risk")
 
       return(gg)
+    },
+    plotBlearnerTraces = function (value = 1L, n_legend = 5L) {  # Plot base-learner traces: forwards 'value' and 'n_legend' to plotBlearnerTraces() with this object as 'cboost_obj'.
+      plotBlearnerTraces(cboost_obj = self, value = value, n_legend = n_legend)
     }
   ),
   private = list(

diff --git a/R/plot_cboost.R b/R/plot_cboost.R
@@ -104,3 +104,59 @@ plotFeatEffect = function (cboost_obj, bl_list, blearner_name, iters, from, to,
   return(gg)
 }
 
+# Plot the traces of the selected base-learner.
+#
+# For every base-learner, 'value' is accumulated (cumulative sum divided by the total number of
+# iterations) over the iterations in which that base-learner was selected. The base-learner with the
+# highest accumulated values are drawn in color and labelled with their final accumulated value, all
+# other base-learner are drawn as transparent lines.
+#
+# Arguments:
+#   cboost_obj  Object providing '$model' and '$getSelectedBaselearner()'. '$model' must not be NULL.
+#   value       Numeric of length 1 or of the same length as the number of iterations.
+#   n_legend    Positive count, the number of base-learner to highlight. If it exceeds the number of
+#               selected base-learner, all of them are highlighted.
+#
+# Returns the ggplot object. Stops if ggplot2 or ggrepel is not installed, if the model was not
+# trained, or if 'value' or 'n_legend' are invalid.
+plotBlearnerTraces = function (cboost_obj, value = 1, n_legend = 5L)
+{
+  if (! requireNamespace("ggplot2", quietly = TRUE)) { stop("Please install ggplot2 to create plots.") }
+  if (! requireNamespace("ggrepel", quietly = TRUE)) { stop("Please install ggrepel to create plots.") }
+
+  if (is.null(cboost_obj$model)) stop("Model needs to be trained first.")
+
+  bl = as.factor(cboost_obj$getSelectedBaselearner())
+
+  # Check the arguments before they are used. Otherwise a 'value' of wrong length can already fail
+  # within 'data.frame()' with a message that does not mention 'value':
+  if (length(value) %in% c(1L, length(bl))) {
+    checkmate::assertNumeric(value)
+  } else {
+    stop("Assertion on 'value' failed: Must have length 1 or ", length(bl), ".")
+  }
+  checkmate::assertCount(n_legend, positive = TRUE)
+
+  # Creating the base dataframe which is used to calculate the traces for the selected base-learner:
+  df_plot = data.frame(iters = seq_along(bl), blearner = bl, value = value)
+
+  # Aggregate value by calculating the cumulative sum grouped by base-learner:
+  df_plot = do.call(rbind, lapply(X = levels(bl), FUN = function (lab) {
+    df_temp = df_plot[df_plot$blearner == lab, ]
+    df_temp = df_temp[order(df_temp$iters), ]
+    df_temp$value = cumsum(df_temp$value) / length(bl)
+
+    return(df_temp)
+  }))
+
+  # Get the top base-learner that are highlighted. The number is capped at the number of selected
+  # base-learner, indexing beyond that would introduce NA labels:
+  top_values = vapply(X = levels(bl), FUN.VALUE = numeric(1L), FUN = function (lab) {
+    df_temp = df_plot[df_plot$blearner == lab, ]
+    return (max(df_temp$value))
+  })
+  n_top    = min(n_legend, nlevels(bl))
+  top_labs = names(sort(top_values, decreasing = TRUE))[seq_len(n_top)]
+
+  idx_top_lab = df_plot$blearner %in% top_labs
+
+  df_plot_top    = df_plot[idx_top_lab, ]
+  df_plot_nottop = df_plot[! idx_top_lab, ]
+
+  # One row per highlighted base-learner (its last selected iteration), used to place the label:
+  df_label = do.call(rbind, lapply(X = top_labs, FUN = function (lab) {
+    df_temp = df_plot[df_plot$blearner == lab, ]
+    df_temp[which.max(df_temp$iters), ]
+  }))
+
+  # 'guides(color = "none")' replaces the deprecated 'color = FALSE', which raises a warning in
+  # current ggplot2 versions:
+  gg = ggplot2::ggplot() +
+    ggplot2::geom_line(data = df_plot_top, ggplot2::aes(x = iters, y = value, color = blearner), show.legend = FALSE) +
+    ggplot2::geom_line(data = df_plot_nottop, ggplot2::aes(x = iters, y = value, group = blearner), alpha = 0.2, show.legend = FALSE) +
+    ggrepel::geom_label_repel(data = df_label, ggplot2::aes(x = iters, y = value, label = round(value, 4), fill = blearner),
+      colour = "white", fontface = "bold", show.legend = TRUE) +
+    ggplot2::xlab("Iteration") +
+    ggplot2::ylab("Cumulated Value\nof Included Base-Learner") +
+    ggplot2::scale_fill_discrete(name = paste0("Top ", n_top, " Base-Learner")) +
+    ggplot2::guides(color = "none")
+
+  return(gg)
+}
diff --git a/man/Compboost.Rd b/man/Compboost.Rd
diff --git a/tests/testthat/test_base_learner_traces.R b/tests/testthat/test_base_learner_traces.R
@@ -0,0 +1,22 @@
+context("Base-learner traces works")
+
+test_that("Visualization works", {
+
+  expect_silent({
+    cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new())
+    cboost$addBaselearner("hp", "spline", BaselearnerPSpline)
+    cboost$addBaselearner(c("hp", "wt"), "quadratic", BaselearnerPolynomial)
+    cboost$addBaselearner("wt", "linear", BaselearnerPolynomial)
+  })
+
+  # Plotting is expected to fail as long as the model is not trained:
+  expect_error(cboost$plot("hp_spline"))
+
+  expect_output(cboost$train(2000, trace = 0))
+
+  # The trace plot needs ggplot2 and ggrepel, which are only optional dependencies. Without them
+  # every call below fails with the "Please install ..." error, hence skip instead of reporting
+  # misleading results:
+  skip_if_not_installed("ggplot2")
+  skip_if_not_installed("ggrepel")
+
+  # Invalid arguments have to be rejected:
+  expect_error(cboost$plotBlearnerTraces(n_legend = "bls"))
+  expect_error(cboost$plotBlearnerTraces(value = "bls"))
+  expect_error(cboost$plotBlearnerTraces(value = c(1,2)))
+
+  expect_silent({gg = cboost$plotBlearnerTraces()})
+  expect_s3_class(cboost$plotBlearnerTraces(), "ggplot")
+})