From 6aed9ef20d58c3cdcb692cdc8b78527e6e816bee Mon Sep 17 00:00:00 2001
From: schalkdaniel
Date: Thu, 31 Oct 2019 13:48:33 +0100
Subject: [PATCH] Add visualization for base-learner traces

---
 DESCRIPTION                               |  3 +-
 R/compboost.R                             | 62 +++++++++++++++--------
 R/plot_cboost.R                           | 56 ++++++++++++++++++++
 man/Compboost.Rd                          | 59 +++++++++++++--------
 tests/testthat/test_base_learner_traces.R | 22 ++++++++
 5 files changed, 157 insertions(+), 45 deletions(-)
 create mode 100644 tests/testthat/test_base_learner_traces.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 5faae694..2c03d5ff 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -40,7 +40,8 @@ Suggests:
   rmarkdown,
   titanic,
   mlr,
-  gridExtra
+  gridExtra,
+  ggrepel
 RcppModules:
   baselearner_module,
   compboost_module,
diff --git a/R/compboost.R b/R/compboost.R
index 14c20f5f..d8b02a41 100644
--- a/R/compboost.R
+++ b/R/compboost.R
@@ -49,6 +49,8 @@
 #'
 #' cboost$plotInbagVsOobRisk()
 #'
+#' cboost$plotBlearnerTraces(value = 1, n_legend = 5L)
+#'
 #' }
 #' @section Arguments:
 #' \strong{For Compboost$new()}:
@@ -137,34 +139,44 @@
 #'
 #' \strong{For cboost$predict()}:
 #' \describe{
-#' \item{\code{newdata}}{[\code{data.frame()}]\cr
-#'   Data to predict on. If newdata equals \code{NULL} predictions on the training data are returned.
-#' }
+#'   \item{\code{newdata}}{[\code{data.frame()}]\cr
+#'     Data to predict on. If \code{newdata} equals \code{NULL}, predictions on the training data are returned.
+#'   }
 #' }
 #' \strong{For cboost$plot()}:
 #' \describe{
-#' \item{\code{blearner_name}}{[\code{character(1)}]\cr
-#'   Character name of the base-learner to plot the contribution to the response. Available choices for
-#'   \code{blearner_name} use \code{cboost$getBaselearnerNames()}.
-#' }
-#' \item{\code{iters}}{[\code{integer()}]\cr
-#'   Integer vector containing the iterations the user wants to visualize.
-#' }
-#' \item{\code{from}}{[\code{numeric(1)}]\cr
-#'   Lower bound for the x axis (should be smaller than \code{to}).
-#' }
-#' \item{\code{to}}{[\code{numeric(1)}]\cr
-#'   Upper bound for the x axis (should be greater than \code{from}).
-#' }
-#' \item{\code{length_out}}{[\code{integer(1)}]\cr
-#'   Number of equidistant points between \code{from} and \code{to} used for plotting.
-#' }
+#'   \item{\code{blearner_name}}{[\code{character(1)}]\cr
+#'     Name of the base-learner whose contribution to the response is plotted. To get the available
+#'     choices for \code{blearner_name}, use \code{cboost$getBaselearnerNames()}.
+#'   }
+#'   \item{\code{iters}}{[\code{integer()}]\cr
+#'     Integer vector containing the iterations the user wants to visualize.
+#'   }
+#'   \item{\code{from}}{[\code{numeric(1)}]\cr
+#'     Lower bound for the x axis (should be smaller than \code{to}).
+#'   }
+#'   \item{\code{to}}{[\code{numeric(1)}]\cr
+#'     Upper bound for the x axis (should be greater than \code{from}).
+#'   }
+#'   \item{\code{length_out}}{[\code{integer(1)}]\cr
+#'     Number of equidistant points between \code{from} and \code{to} used for plotting.
+#'   }
 #' }
 #' \strong{For cboost$calculateFeatureImportance() and cboost$plotFeatureImportance()}:
 #' \describe{
-#' \item{\code{num_feats}}{[\code{integer(1)}]\cr
-#'   Number of features for which the Importance will be returned.
+#'   \item{\code{num_feats}}{[\code{integer(1)}]\cr
+#'     Number of features for which the importance is returned.
+#'   }
 #' }
+#' \strong{For cboost$plotBlearnerTraces()}:
+#' \describe{
+#'   \item{\code{value}}{[\code{numeric()}]\cr
+#'     Numeric vector of length 1 or of the same length as the number of iterations; for each base-learner, the values are accumulated over the iterations in which it was selected.
+#'   }
+#'   \item{\code{n_legend}}{[\code{integer(1)}]\cr
+#'     Number of base-learners to highlight (the base-learners with the top \code{n_legend}
+#'     accumulated values are highlighted).
+#'   }
 #' }
 #' @section Details:
 #' \strong{Loss}\cr
@@ -333,17 +345,20 @@
 #'   \item{\code{calculateFeatureImportance}}{method to calculate feature importance.}
 #'   \item{\code{plotFeatureImportance}}{method to plot the feature importance calculated by \code{calulateFeatureImportance}.}
 #'   \item{\code{plotInbagVsOobRisk}}{method to plot the inbag vs the out of bag behavior. This is just applicable if a logger with name \code{oob_logger} was registered. This is automatically done if the \code{oob_fraction} is set.}
+#'   \item{\code{plotBlearnerTraces}}{method to plot traces of how the base-learners were selected, combined with a measure of interest, e.g. how the empirical risk was minimized throughout the selection process.}
 #' }
 #'
 #' @examples
 #' cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new(), oob_fraction = 0.3)
 #' cboost$addBaselearner("hp", "spline", BaselearnerPSpline, degree = 3,
-#'   n.knots = 10, penalty = 2, differences = 2)
+#'   n_knots = 10, penalty = 2, differences = 2)
+#' cboost$addBaselearner("wt", "spline", BaselearnerPSpline)
 #' cboost$train(1000)
 #'
 #' table(cboost$getSelectedBaselearner())
 #' cboost$plot("hp_spline")
 #' cboost$plotInbagVsOobRisk()
+#' cboost$plotBlearnerTraces()
 NULL

 #' @export
@@ -649,6 +664,9 @@ Compboost = R6::R6Class("Compboost",
         ggplot2::ylab("Risk")

       return(gg)
+    },
+    plotBlearnerTraces = function (value = 1, n_legend = 5L) {
+      plotBlearnerTraces(cboost_obj = self, value = value, n_legend = n_legend)
     }
   ),
   private = list(
diff --git a/R/plot_cboost.R b/R/plot_cboost.R
index f1c202d1..bebcbc1d 100644
--- a/R/plot_cboost.R
+++ b/R/plot_cboost.R
@@ -104,3 +104,59 @@ plotFeatEffect = function (cboost_obj, bl_list, blearner_name, iters, from, to,
 
   return(gg)
 }
+plotBlearnerTraces = function (cboost_obj, value = 1, n_legend = 5L)
+{
+  if (! requireNamespace("ggplot2", quietly = TRUE)) { stop("Please install ggplot2 to create plots.") }
+  if (! requireNamespace("ggrepel", quietly = TRUE)) { stop("Please install ggrepel to create plots.") }
+
+  if (is.null(cboost_obj$model)) stop("Model needs to be trained first.")
+
+  bl = as.factor(cboost_obj$getSelectedBaselearner())
+  # Check the arguments before the plot data is built:
+  checkmate::assertNumeric(value)
+  if (! length(value) %in% c(1L, length(bl))) {
+    stop("Assertion on 'value' failed: Must have length 1 or ", length(bl), ".")
+  }
+  checkmate::assertCount(n_legend, positive = TRUE)
+
+  # Base data frame used to calculate the traces of the selected base-learners:
+  df_plot = data.frame(iters = seq_along(bl), blearner = bl, value = value)
+
+  # Accumulate 'value' per base-learner via the cumulative sum, scaled by the total number of iterations:
+  df_plot = do.call(rbind, lapply(X = levels(bl), FUN = function (lab) {
+    df_temp = df_plot[df_plot$blearner == lab, ]
+    df_temp = df_temp[order(df_temp$iters), ]
+    df_temp$value = cumsum(df_temp$value) / length(bl)
+
+    return(df_temp)
+  }))
+
+  # Get the top 'n_legend' base-learners that are highlighted:
+  top_values = vapply(X = levels(bl), FUN.VALUE = numeric(1L), FUN = function (lab) {
+    df_temp = df_plot[df_plot$blearner == lab, ]
+    return(max(df_temp$value))
+  })
+  top_labs = names(sort(top_values, decreasing = TRUE))[seq_len(min(n_legend, nlevels(bl)))]
+
+  idx_top_lab = df_plot$blearner %in% top_labs
+
+  df_plot_top = df_plot[idx_top_lab, ]
+  df_plot_nottop = df_plot[! idx_top_lab, ]
+
+  # Place the label of each highlighted trace at its last selected iteration:
+  df_label = do.call(rbind, lapply(X = top_labs, FUN = function (lab) {
+    df_temp = df_plot[df_plot$blearner == lab, ]
+    df_temp[which.max(df_temp$iters), ]
+  }))
+
+  gg = ggplot2::ggplot() +
+    ggplot2::geom_line(data = df_plot_top, ggplot2::aes(x = iters, y = value, color = blearner), show.legend = FALSE) +
+    ggplot2::geom_line(data = df_plot_nottop, ggplot2::aes(x = iters, y = value, group = blearner), alpha = 0.2, show.legend = FALSE) +
+    ggrepel::geom_label_repel(data = df_label, ggplot2::aes(x = iters, y = value, label = round(value, 4), fill = blearner),
+      colour = "white", fontface = "bold", show.legend = TRUE) +
+    ggplot2::xlab("Iteration") +
+    ggplot2::ylab("Cumulated Value\nof Included Base-Learners") +
+    ggplot2::scale_fill_discrete(name = paste0("Top ", n_legend, " Base-Learners")) +
+    ggplot2::guides(color = FALSE)
+
+  return(gg)
+}
diff --git a/man/Compboost.Rd b/man/Compboost.Rd
index 33644f1b..da1d4dd8 100644
--- a/man/Compboost.Rd
+++ b/man/Compboost.Rd
@@ -53,6 +53,8 @@ cboost$plotFeatureImportance(num_feats = NULL)
 
 cboost$plotInbagVsOobRisk()
 
+cboost$plotBlearnerTraces(value = 1, n_legend = 5L)
+
 }
 }
 
@@ -144,34 +146,44 @@ cboost$plotInbagVsOobRisk()
 
 \strong{For cboost$predict()}:
 \describe{
-\item{\code{newdata}}{[\code{data.frame()}]\cr
-  Data to predict on. If newdata equals \code{NULL} predictions on the training data are returned.
-}
+  \item{\code{newdata}}{[\code{data.frame()}]\cr
+    Data to predict on. If \code{newdata} equals \code{NULL}, predictions on the training data are returned.
+  }
 }
 \strong{For cboost$plot()}:
 \describe{
-\item{\code{blearner_name}}{[\code{character(1)}]\cr
-  Character name of the base-learner to plot the contribution to the response. Available choices for
-  \code{blearner_name} use \code{cboost$getBaselearnerNames()}.
-}
-\item{\code{iters}}{[\code{integer()}]\cr
-  Integer vector containing the iterations the user wants to visualize.
-}
-\item{\code{from}}{[\code{numeric(1)}]\cr
-  Lower bound for the x axis (should be smaller than \code{to}).
-}
-\item{\code{to}}{[\code{numeric(1)}]\cr
-  Upper bound for the x axis (should be greater than \code{from}).
-}
-\item{\code{length_out}}{[\code{integer(1)}]\cr
-  Number of equidistant points between \code{from} and \code{to} used for plotting.
-}
+  \item{\code{blearner_name}}{[\code{character(1)}]\cr
+    Name of the base-learner whose contribution to the response is plotted. To get the available
+    choices for \code{blearner_name}, use \code{cboost$getBaselearnerNames()}.
+  }
+  \item{\code{iters}}{[\code{integer()}]\cr
+    Integer vector containing the iterations the user wants to visualize.
+  }
+  \item{\code{from}}{[\code{numeric(1)}]\cr
+    Lower bound for the x axis (should be smaller than \code{to}).
+  }
+  \item{\code{to}}{[\code{numeric(1)}]\cr
+    Upper bound for the x axis (should be greater than \code{from}).
+  }
+  \item{\code{length_out}}{[\code{integer(1)}]\cr
+    Number of equidistant points between \code{from} and \code{to} used for plotting.
+  }
 }
 \strong{For cboost$calculateFeatureImportance() and cboost$plotFeatureImportance()}:
 \describe{
-\item{\code{num_feats}}{[\code{integer(1)}]\cr
-  Number of features for which the Importance will be returned.
+  \item{\code{num_feats}}{[\code{integer(1)}]\cr
+    Number of features for which the importance is returned.
+  }
 }
+\strong{For cboost$plotBlearnerTraces()}:
+\describe{
+  \item{\code{value}}{[\code{numeric()}]\cr
+    Numeric vector of length 1 or of the same length as the number of iterations; for each base-learner, the values are accumulated over the iterations in which it was selected.
+  }
+  \item{\code{n_legend}}{[\code{integer(1)}]\cr
+    Number of base-learners to highlight (the base-learners with the top \code{n_legend}
+    accumulated values are highlighted).
+  }
 }
 }
 
@@ -241,16 +253,19 @@ cboost$plotInbagVsOobRisk()
 \item{\code{calculateFeatureImportance}}{method to calculate feature importance.}
 \item{\code{plotFeatureImportance}}{method to plot the feature importance calculated by \code{calulateFeatureImportance}.}
 \item{\code{plotInbagVsOobRisk}}{method to plot the inbag vs the out of bag behavior. This is just applicable if a logger with name \code{oob_logger} was registered. This is automatically done if the \code{oob_fraction} is set.}
+\item{\code{plotBlearnerTraces}}{method to plot traces of how the base-learners were selected, combined with a measure of interest, e.g. how the empirical risk was minimized throughout the selection process.}
 }
 }
 
 \examples{
 cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new(), oob_fraction = 0.3)
 cboost$addBaselearner("hp", "spline", BaselearnerPSpline, degree = 3,
-  n.knots = 10, penalty = 2, differences = 2)
+  n_knots = 10, penalty = 2, differences = 2)
+cboost$addBaselearner("wt", "spline", BaselearnerPSpline)
 cboost$train(1000)
 
 table(cboost$getSelectedBaselearner())
 cboost$plot("hp_spline")
 cboost$plotInbagVsOobRisk()
+cboost$plotBlearnerTraces()
 }
diff --git a/tests/testthat/test_base_learner_traces.R b/tests/testthat/test_base_learner_traces.R
new file mode 100644
index 00000000..64848fc4
--- /dev/null
+++ b/tests/testthat/test_base_learner_traces.R
@@ -0,0 +1,22 @@
+context("Base-learner traces work")
+
+test_that("Visualization works", {
+
+  expect_silent({
+    cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new())
+    cboost$addBaselearner("hp", "spline", BaselearnerPSpline)
+    cboost$addBaselearner(c("hp", "wt"), "quadratic", BaselearnerPolynomial)
+    cboost$addBaselearner("wt", "linear", BaselearnerPolynomial)
+  })
+
+  expect_error(cboost$plot("hp_spline"))
+
+  expect_output(cboost$train(2000, trace = 0))
+
+  expect_error(cboost$plotBlearnerTraces(n_legend = "bls"))
+  expect_error(cboost$plotBlearnerTraces(value = "bls"))
+  expect_error(cboost$plotBlearnerTraces(value = c(1, 2)))
+
+  expect_silent({ gg = cboost$plotBlearnerTraces() })
+  expect_s3_class(cboost$plotBlearnerTraces(), "ggplot")
+})
\ No newline at end of file
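
Reviewer note, not part of the patch: a minimal usage sketch of the new method. The first call uses the
default value = 1 and traces each base-learner's share of all selections; the second traces the risk
reduction per iteration. It assumes cboost$getInbagRisk() returns the initial risk followed by one risk
value per iteration, so that diff() yields exactly one value per selected base-learner; adjust if the
accessor behaves differently.

  library(compboost)

  cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new())
  cboost$addBaselearner("hp", "spline", BaselearnerPSpline)
  cboost$addBaselearner("wt", "linear", BaselearnerPolynomial)
  cboost$train(500, trace = 0)

  # Default (value = 1): cumulated selection frequency per base-learner.
  cboost$plotBlearnerTraces()

  # Risk reduction per iteration as 'value' (assumes getInbagRisk() includes
  # the initial risk, hence diff() aligns with the selected base-learners):
  risk_reduction = abs(diff(cboost$getInbagRisk()))
  cboost$plotBlearnerTraces(value = risk_reduction, n_legend = 2L)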