diff --git a/NAMESPACE b/NAMESPACE
index d739e3fa..a36609b2 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -20,6 +20,8 @@ export(LossCustomCpp)
 export(LossQuadratic)
 export(OptimizerCoordinateDescent)
 export(OptimizerCoordinateDescentLineSearch)
+export(ResponseBinaryClassif)
+export(ResponseRegr)
 export(boostLinear)
 export(boostSplines)
 export(getCustomCppExample)
diff --git a/R/RcppExports.R b/R/RcppExports.R
index ad8cf4e9..89f2f1c8 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -135,9 +135,9 @@ NULL
 #' data.target2 = InMemoryData$new()
 #'
 #' # Create new linear base-learner factory:
-#' lin.factory = BaselearnerPolynomial$new(data.source, data.target1, 
+#' lin.factory = BaselearnerPolynomial$new(data.source, data.target1,
 #'   list(degree = 2, intercept = FALSE))
-#' lin.factory.int = BaselearnerPolynomial$new(data.source, data.target2, 
+#' lin.factory.int = BaselearnerPolynomial$new(data.source, data.target2,
 #'   list(degree = 2, intercept = TRUE))
 #'
 #' # Get the transformed data:
@@ -250,7 +250,7 @@ NULL
 #'
 #' @section Usage:
 #' \preformatted{
-#' BaselearnerCustom$new(data_source, data_target, list(instantiate.fun, 
+#' BaselearnerCustom$new(data_source, data_target, list(instantiate.fun,
 #'   train.fun, predict.fun, param.fun))
 #' }
 #'
@@ -342,7 +342,7 @@ NULL
 #'
 #' # Create new custom linear base-learner factory:
 #' custom.lin.factory = BaselearnerCustom$new(data.source, data.target,
-#'   list(instantiate.fun = instantiateDataFun, train.fun = trainFun, 
+#'   list(instantiate.fun = instantiateDataFun, train.fun = trainFun,
 #'     predict.fun = predictFun, param.fun = extractParameter))
 #'
 #' # Get the transformed data:
@@ -487,9 +487,9 @@ NULL
 #' data.target1 = InMemoryData$new()
 #' data.target2 = InMemoryData$new()
 #'
-#' lin.factory = BaselearnerPolynomial$new(data.source, data.target1, 
+#' lin.factory = BaselearnerPolynomial$new(data.source, data.target1,
 #'   list(degree = 1, intercept = TRUE))
-#' poly.factory = BaselearnerPolynomial$new(data.source, data.target2, 
+#' poly.factory = BaselearnerPolynomial$new(data.source, data.target2,
 #'   list(degree = 2, intercept = TRUE))
 #'
 #' # Create new base-learner list:
@@ -661,7 +661,7 @@ NULL
 #' \url{https://schalkdaniel.github.io/compboost/cpp_man/html/classloss_1_1_binomial_loss.html}.
 #'
 #' @examples
-#' 
+#'
 #' # Create new loss object:
 #' bin.loss = LossBinomial$new()
 #' bin.loss
@@ -788,6 +788,40 @@ NULL
 #' @export LossCustomCpp
 NULL
 
+#' Create response object for regression.
+#'
+#' \code{ResponseRegr} creates a response object that is used as the target during the
+#' fitting process.
+#'
+#' @format \code{\link{S4}} object.
+#' @name ResponseRegr
+#'
+#' @section Usage:
+#' \preformatted{
+#' ResponseRegr$new(target_name, response)
+#' ResponseRegr$new(target_name, response, weights)
+#' }
+#'
+#' @export ResponseRegr
+NULL
+
+#' Create response object for binary classification.
+#'
+#' \code{ResponseBinaryClassif} creates a response object that is used as the target during the
+#' fitting process.
+#'
+#' @format \code{\link{S4}} object.
+#' @name ResponseBinaryClassif
+#'
+#' @section Usage:
+#' \preformatted{
+#' ResponseBinaryClassif$new(target_name, response)
+#' ResponseBinaryClassif$new(target_name, response, weights)
+#' }
+#'
+#' @export ResponseBinaryClassif
+NULL
+
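Usage sketch for the two response classes above (the -1/+1 label encoding for ResponseBinaryClassif mirrors what the new .vectorToResponse() helper in R/helper.R produces; the weights argument is optional):

    # Regression target from a numeric vector:
    y = rnorm(20)
    response.regr = ResponseRegr$new("y", as.matrix(y))

    # Binary classification target, labels encoded as -1 and 1:
    y.bin = sample(c(-1, 1), 20, replace = TRUE)
    response.bin = ResponseBinaryClassif$new("y.bin", as.matrix(y.bin))
    response.bin$getTargetName()
    #> [1] "y.bin"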
 #' Logger class to log the current iteration
 #'
 #' @format \code{\link{S4}} object.
@@ -936,7 +970,7 @@ NULL
 #'
 #' @section Usage:
 #' \preformatted{
-#' LoggerOobRisk$new(logger_id, use_as_stopper, used_loss, eps_for_break, 
+#' LoggerOobRisk$new(logger_id, use_as_stopper, used_loss, eps_for_break,
 #'   oob_data, oob_response)
 #' }
 #'
@@ -1032,8 +1066,11 @@ NULL
 #' # Used loss:
 #' log.bin = LossBinomial$new()
 #'
+#' # Define response object for the oob data:
+#' oob.response = ResponseRegr$new("oob_response", as.matrix(y.oob))
+#'
 #' # Define logger:
-#' log.oob.risk = LoggerOobRisk$new("oob", FALSE, log.bin, 0.05, oob.list, y.oob)
+#' log.oob.risk = LoggerOobRisk$new("oob", FALSE, log.bin, 0.05, oob.list, oob.response)
 #'
 #' # Summarize logger:
 #' log.oob.risk$summarizeLogger()
@@ -1190,7 +1227,7 @@ NULL
 #' Coordinate Descent with line search
 #'
-#' This class defines a new object which is used to conduct Coordinate Descent with line search. 
+#' This class defines a new object which is used to conduct Coordinate Descent with line search.
 #' The optimizer just calculates for each base-learner the sum of squared error and returns
 #' the base-learner with the smallest SSE. In addition, this optimizer computes
 #' a line search to find the optimal step size in each iteration.
@@ -1319,6 +1356,7 @@ NULL
 #'
 #' # Target variable:
 #' y = df[["mpg.cat"]]
+#' response = ResponseBinaryClassif$new("mpg.cat", as.matrix(y))
 #'
 #' data.source.hp = InMemoryData$new(X.hp, "hp")
 #' data.source.wt = InMemoryData$new(X.wt, "wt")
@@ -1335,13 +1373,13 @@ NULL
 #' test.data = oob.data
 #'
 #' # Factories:
-#' linear.factory.hp = BaselearnerPolynomial$new(data.source.hp, data.target.hp1, 
+#' linear.factory.hp = BaselearnerPolynomial$new(data.source.hp, data.target.hp1,
 #'   list(degree = 1, intercept = TRUE))
-#' linear.factory.wt = BaselearnerPolynomial$new(data.source.wt, data.target.wt1, 
+#' linear.factory.wt = BaselearnerPolynomial$new(data.source.wt, data.target.wt1,
 #'   list(degree = 1, intercept = TRUE))
-#' quadratic.factory.hp = BaselearnerPolynomial$new(data.source.hp, data.target.hp2, 
+#' quadratic.factory.hp = BaselearnerPolynomial$new(data.source.hp, data.target.hp2,
 #'   list(degree = 2, intercept = TRUE))
-#' spline.factory.wt = BaselearnerPSpline$new(data.source.wt, data.target.wt2, 
+#' spline.factory.wt = BaselearnerPSpline$new(data.source.wt, data.target.wt2,
 #'   list(degree = 3, n.knots = 10, penalty = 2, differences = 2))
 #'
 #' # Create new factory list:
@@ -1365,8 +1403,6 @@ NULL
 #' # time, inbag risk and oob risk:
 #' log.iterations = LoggerIteration$new(" iteration.logger", TRUE, 500)
 #' log.time = LoggerTime$new("time.logger", FALSE, 500, "microseconds")
-#' log.inbag = LoggerInbagRisk$new("inbag.binomial", FALSE, loss.bin, 0.05)
-#' log.oob = LoggerOobRisk$new("oob.binomial", FALSE, loss.bin, 0.05, oob.data, y)
 #'
 #' # Define new logger list:
 #' logger.list = LoggerList$new()
@@ -1374,15 +1410,13 @@ NULL
 #' # Register the logger:
 #' logger.list$registerLogger(log.iterations)
 #' logger.list$registerLogger(log.time)
-#' logger.list$registerLogger(log.inbag)
-#' logger.list$registerLogger(log.oob)
 #'
 #' # Run compboost:
 #' # --------------
 #'
 #' # Initialize object:
 #' cboost = Compboost_internal$new(
-#'   response = y,
+#'   response = response,
 #'   learning_rate = 0.05,
 #'   stop_if_all_stopper_fulfilled = FALSE,
 #'   factory_list = factory.list,
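Under the new interface the inbag/oob loggers dropped from this example would be re-registered with a response object instead of the raw target vector; a sketch mirroring the removed lines above:

    log.inbag = LoggerInbagRisk$new("inbag.binomial", FALSE, loss.bin, 0.05)
    oob.response = ResponseBinaryClassif$new("mpg.cat", as.matrix(y))
    log.oob = LoggerOobRisk$new("oob.binomial", FALSE, loss.bin, 0.05, oob.data, oob.response)
    logger.list$registerLogger(log.inbag)
    logger.list$registerLogger(log.oob)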
diff --git a/R/compboost.R b/R/compboost.R
index aa1c5d5b..047dc09f 100644
--- a/R/compboost.R
+++ b/R/compboost.R
@@ -26,21 +26,21 @@
 #' cboost$getInbagRisk()
 #'
 #' cboost$getSelectedBaselearner()
-#' 
+#'
 #' cboost$getEstimatedCoef()
-#' 
+#'
 #' cboost$plot(blearner.type = NULL, iters = NULL, from = NULL, to = NULL, length.out = 1000)
-#' 
+#'
 #' cboost$getBaselearnerNames()
-#' 
+#'
 #' cboost$prepareData(newdata)
-#' 
+#'
 #' cboost$getLoggerData()
-#' 
+#'
 #' cboost$calculateFeatureImportance(num.feats = NULL)
-#' 
+#'
 #' cboost$plotFeatureImportance(num.feats = NULL)
-#' 
+#'
 #' cboost$plotInbagVsOobRisk()
 #'
 #' }
@@ -51,7 +51,7 @@
 #'     A data frame containing the data.
 #'   }
 #'   \item{\code{target}}{[\code{character(1)}]\cr
-#'     Character value containing the target variable. Note that the loss must match the 
+#'     Character value containing the target variable. Note that the loss must match the
 #'     data type of the target.
 #'   }
 #'   \item{\code{optimizer}}{[\code{S4 Optimizer}]\cr
@@ -59,7 +59,7 @@
 #'     to select features at each iteration.
 #'   }
 #'   \item{\code{loss}}{[\code{S4 Loss}]\cr
-#'     Initialized \code{S4 Loss} object exposed by Rcpp that is used to calculate the risk and pseudo 
+#'     Initialized \code{S4 Loss} object exposed by Rcpp that is used to calculate the risk and pseudo
 #'     residuals (e.g. \code{LossQuadratic$new()}).
 #'   }
 #'   \item{\code{learning.rate}}{[\code{numeric(1)}]\cr
@@ -73,11 +73,11 @@
 #' \strong{For cboost$addLogger()}:
 #' \describe{
 #'   \item{\code{logger}}{[\code{S4 Logger}]\cr
-#'     Uninitialized \code{S4 Logger} class object that is registered in the model. 
+#'     Uninitialized \code{S4 Logger} class object that is registered in the model.
 #'     See the details for possible choices.
 #'   }
 #'   \item{\code{use.as.stopper}}{[\code{logical(1)}]\cr
-#'     Logical value indicating whether the new logger should also be used as stopper 
+#'     Logical value indicating whether the new logger should also be used as stopper
 #'     (early stopping). Default value is \code{FALSE}.
 #'   }
 #'   \item{\code{logger.id}}{[\code{character(1)}]\cr
@@ -91,31 +91,28 @@
 #'
 #' \strong{For cboost$addBaselearner()}:
 #' \describe{
-#'   \item{\code{features}}{[\code{character()}]\cr
-#'     Vector of column names that are used as input data matrix for a single base-learner. Note that not
-#'     every base-learner supports the use of multiple features (e.g. the spline base-learner does not).
-#'   }
-#'   \item{\code{id}}{[\code{character(1)}]\cr
-#'     Identifier of the base-learners. This is necessary since it is possible to define multiple
-#'     base-learner with the same underlying data.
-#'   }
-#'   \item{\code{bl.factory}}{[\code{S4 Factory}]\cr
-#'     Uninitialized base-learner factory given as \code{S4 Factory} class. See the details
-#'     for possible choices.
-#'   }
-#'   \item{\code{data.source}}{[\code{S4 Data}]\cr
-#'     Uninitialized \code{S4 Data} object which is used to store the data. At the moment
-#'     just in memory training is supported.
-#'   }
-#'   \item{\code{data.target}}{[\code{S4 Data}]\cr
-#'     Uninitialized \code{S4 Data} object which is used to store the data. At the moment
-#'     just in memory training is supported.
-#'   }
-#'   \item{}{\code{...}\cr
-#'     Further arguments passed to the constructor of the \code{S4 Factory} class specified in
-#'     \code{bl.factory}. For possible arguments see the help pages (e.g. \code{?BaselearnerPSplineFactory})
-#'     of the \code{S4} classes.
-#'   }
+#'   \item{\code{features}}{[\code{character()}]\cr
+#'     Vector of column names which are used as input data matrix for a single base-learner. Note that not
+#'     every base-learner supports the use of multiple features (e.g. the spline base-learner does not).
+#'   }
+#'   \item{\code{id}}{[\code{character(1)}]\cr
+#'     Id of the base-learners. This is necessary since it is possible to define multiple learners with the same underlying data.
+#'   }
+#'   \item{\code{bl.factory}}{[\code{S4 Factory}]\cr
+#'     Uninitialized base-learner factory given as \code{S4 Factory} class. See the details
+#'     for possible choices.
+#'   }
+#'   \item{\code{data.source}}{[\code{S4 Data}]\cr
+#'     Data source object. At the moment only in-memory data is supported.
+#'   }
+#'   \item{\code{data.target}}{[\code{S4 Data}]\cr
+#'     Data target object. At the moment only in-memory data is supported.
+#'   }
+#'   \item{}{\code{...}\cr
+#'     Further arguments passed to the constructor of the \code{S4 Factory} class specified in
+#'     \code{bl.factory}. For possible arguments see the help pages (e.g. \code{?BaselearnerPSplineFactory})
+#'     of the \code{S4} classes.
+#'   }
 #' }
 #'
 #' \strong{For cboost$train()}:
@@ -140,21 +137,21 @@
 #' }
 #' \strong{For cboost$plot()}:
 #' \describe{
-#'   \item{\code{blearner.type}}{[\code{character(1)}]\cr
-#'     Character value containing the name of the base-learner which should be visualized.
-#'   }
-#'   \item{\code{iters}}{[\code{integer()}]\cr
-#'     Integer vector containing the iterations at stages for that the effect should be visualized.
-#'   }
-#'   \item{\code{from}}{[\code{numeric(1)}]\cr
-#'     Lower bound for plotting (should be smaller than \code{to}).
-#'   }
-#'   \item{\code{to}}{[\code{numeric(1)}]\cr
-#'     Upper bound for plotting (should be greater than \code{from}).
-#'   }
-#'   \item{\code{length.out}}{[\code{integer(1)}]\cr
-#'     Number of equidistant points between \code{from} and \code{to} used for plotting.
-#'   }
+#'   \item{\code{blearner.type}}{[\code{character(1)}]\cr
+#'     Character name of the base-learner to plot the additive contribution to the response.
+#'   }
+#'   \item{\code{iters}}{[\code{integer()}]\cr
+#'     Integer vector containing the iterations the user wants to illustrate.
+#'   }
+#'   \item{\code{from}}{[\code{numeric(1)}]\cr
+#'     Lower bound for plotting (should be smaller than \code{to}).
+#'   }
+#'   \item{\code{to}}{[\code{numeric(1)}]\cr
+#'     Upper bound for plotting (should be greater than \code{from}).
+#'   }
+#'   \item{\code{length.out}}{[\code{integer(1)}]\cr
+#'     Number of equidistant points between \code{from} and \code{to} used for plotting.
+#'   }
 #' }
 #' @section Details:
 #'   \strong{Loss}\cr
 #'   \itemize{
 #'   \item
 #'     \code{LossQuadratic} (Regression)
-#' 
+#'
 #'   \item
 #'     \code{LossAbsolute} (Regression)
-#' 
+#'
 #'   \item
 #'     \code{LossBinomial} (Binary Classification)
-#' 
+#'
 #'   \item
 #'     \code{LossCustom} (Custom)
-#' 
+#'
 #   \item
 #     \code{LossCustomCpp} (Custom)
 #'   }
@@ -234,7 +231,7 @@
 #'     }
 #'   }
 #' }
-#' 
+#'
 #' \strong{Note}:
 #' \itemize{
 #'   \item
@@ -263,7 +260,7 @@
 #'     Name of the target variable
 #'   }
 #'   \item{\code{id} [\code{character(1)}]}{
-#'     Name of the given dataset. 
+#'     Name of the given dataset.
 #'   }
 #'   \item{\code{optimizer} [\code{S4 Optimizer}]}{
 #'     Optimizer used within the fitting process.
@@ -312,7 +309,7 @@
 #' cboost$addBaselearner("hp", "spline", BaselearnerPSpline, degree = 3,
 #'   n.knots = 10, penalty = 2, differences = 2)
 #' cboost$train(1000)
-#' 
+#'
 #' table(cboost$getSelectedBaselearner())
 #' cboost$plot("hp_spline")
 #' cboost$plotInbagVsOobRisk()
@@ -337,7 +334,6 @@ Compboost = R6::R6Class("Compboost",
   stop.if.all.stoppers.fulfilled = FALSE,
   initialize = function(data, target, optimizer = OptimizerCoordinateDescent$new(), loss, learning.rate = 0.05, oob.fraction = NULL) {
     checkmate::assertDataFrame(data, any.missing = FALSE, min.rows = 1)
-    checkmate::assertCharacter(target)
     checkmate::assertNumeric(learning.rate, lower = 0, upper = 1, any.missing = FALSE, len = 1)
     checkmate::assertNumeric(oob.fraction, lower = 0, upper = 1, any.missing = FALSE, len = 1, null.ok = TRUE)
@@ -347,46 +343,46 @@ Compboost = R6::R6Class("Compboost",
     if (inherits(loss, "C++Class")) {
       stop ("Loss should be an initialized loss object by calling the constructor: ", deparse(substitute(loss)), "$new()")
     }
-    
+
     self$id = deparse(substitute(data))
-    data = droplevels(as.data.frame(data))
-    response = data[[target]]
-
-    # Transform factor or character labels to -1 and 1
-    if (! is.numeric(response)) {
-      response = as.factor(response)
-
-      if (length(levels(response)) > 2) {
-        stop("Multiclass classification is not supported.")
-      }
-      self$positive.category = levels(response)[1]
-
-      # Transform to vector with -1 and 1:
-      response = as.integer(response) * (1 - as.integer(response)) + 1
-    }
 
     if (! is.null(oob.fraction)) {
       private$oob.idx = sample(x = seq_len(nrow(data)), size = floor(oob.fraction * nrow(data)), replace = FALSE)
     }
     private$train.idx = setdiff(seq_len(nrow(data)), private$oob.idx)
 
+    if (is.character(target)) {
+      checkmate::assertCharacter(target)
+      if (! target %in% names(data))
+        stop ("The target ", target, " is not present within the data")
+
+      # With .vectorToResponse we are very restricted to the task types. We can just guess for regression or classification. For every
+      # other task one should use the Response interface!
+      self$response = .vectorToResponse(data[[target]], target)
+    } else {
+      .assertRcppClass(target, "Response")
+      if (nrow(target$getResponse()) != nrow(data))
+        stop("Response must have same number of observations as the given dataset.")
+      self$response = target
+    }
+
     self$oob.fraction = oob.fraction
-    self$target = target
-    self$response = response[private$train.idx]
+    self$target = self$response$getTargetName()
     self$data = data[private$train.idx, !colnames(data) %in% target, drop = FALSE]
     self$optimizer = optimizer
     self$loss = loss
     self$learning.rate = learning.rate
 
-    if (! is.null(self$oob.fraction)) { 
+    if (! is.null(self$oob.fraction)) {
       self$data.oob = data[private$oob.idx, !colnames(data) %in% target, drop = FALSE]
-      self$response.oob = response[private$oob.idx]
+      self$response.oob = .vectorToResponse(self$response$getResponse()[private$oob.idx, , drop = FALSE], "oob_response")
+      self$response$filter(private$train.idx)
     }
 
     # Initialize new base-learner factory list. All factories which are defined in
     # `addBaselearners` are registered here:
     self$bl.factory.list = BlearnerFactoryList$new()
-    
+
   },
   addLogger = function(logger, use.as.stopper = FALSE, logger.id, ...) {
     private$l.list[[logger.id]] = logger$new(logger.id, use.as.stopper = use.as.stopper, ...)
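The reworked constructor accepts either a column name or a prebuilt response object; a minimal sketch of both call styles (assuming the quadratic loss and the built-in mtcars data):

    # (a) Character target -- the task type is guessed via .vectorToResponse():
    cboost = Compboost$new(data = mtcars, target = "mpg", loss = LossQuadratic$new())

    # (b) Explicit response object -- required for tasks the guess cannot cover:
    response = ResponseRegr$new("mpg", as.matrix(mtcars$mpg))
    cboost = Compboost$new(data = mtcars, target = response, loss = LossQuadratic$new())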
@@ -402,7 +398,7 @@ Compboost = R6::R6Class("Compboost",
     if (!is.null(self$model)) {
       stop("No base-learners can be added after training is started")
     }
-    
+
     # Clear base-learners which are within the bl.list but not registered:
     idx.remove = ! names(private$bl.list) %in% self$bl.factory.list$getRegisteredFactoryNames()
     if (any(idx.remove)) {
@@ -410,10 +406,10 @@ Compboost = R6::R6Class("Compboost",
         private$bl.list[[i]] = NULL
       }
     }
-    
+
     data.columns = self$data[, feature, drop = FALSE]
     id.fac = paste(paste(feature, collapse = "_"), id, sep = "_") #USE stringi
-    
+
     if (ncol(data.columns) == 1 && !is.numeric(data.columns[, 1])) {
       private$addSingleCatBl(data.columns, feature, id, id.fac, bl.factory, data.source, data.target, ...)
     } else {
@@ -421,19 +417,19 @@ Compboost = R6::R6Class("Compboost",
     }
   },
   train = function(iteration = 100, trace = -1) {
-    
+
     if (self$bl.factory.list$getNumberOfRegisteredFactories() == 0) {
       stop("Could not train without any registered base-learner.")
     }
-    
+
     checkmate::assertIntegerish(iteration, lower = 1, len = 1, null.ok = TRUE)
     # checkmate::assertFlag(trace)
     checkmate::assertIntegerish(trace, lower = -1, upper = iteration, len = 1, null.ok = FALSE)
-    
+
     if (trace == -1) {
       trace = round(iteration / 40)
     }
-    
+
     # Check if it is necessary to add an initial iteration logger. This is not the case
     # when the user has already added one by calling `addLogger`:
     if (is.null(self$model)) {
@@ -463,20 +459,20 @@ Compboost = R6::R6Class("Compboost",
   },
   prepareData = function (newdata) {
     new.source.features = unique(lapply(private$bl.list, function (x) x$feature))
-    
+
     new.sources = list()
     data.names = character()
-    
+
     # Remove lapply due to categorical feature handling which needs to return multiple data objects
     # at once.
     for (ns in new.source.features) {
-      
+
       data.columns = newdata[, ns, drop = FALSE]
-      
+
       if (ncol(data.columns) == 1 && !is.numeric(data.columns[, 1])) {
-        
+
         lvls = unlist(unique(data.columns))
-        
+
         # Create dummy variable for each category and use that vector as data matrix. Hence,
         # if a categorical feature has 3 groups, then these 3 groups are added as 3 different
         # base-learners (unbiased feature selection).
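The comment above describes the categorical expansion used here and in addSingleCatBl() further down: every level becomes its own 0/1 dummy base-learner. A quick illustration of the encoding:

    x = factor(c("a", "b", "a", "c"))
    sapply(levels(x), function (lvl) as.integer(x == lvl))
    #>      a b c
    #> [1,] 1 0 0
    #> [2,] 0 1 0
    #> [3,] 1 0 0
    #> [4,] 0 0 1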
@@ -520,13 +516,13 @@ Compboost = R6::R6Class("Compboost",
       Learning rate: {self$learning.rate}
       Iterations: {self$getCurrentIteration()}
     ")
-    
-    if (! is.null(self$positive.category)) 
+
+    if (! is.null(self$positive.category))
       p = glue::glue(p, "\nPositive class: {self$positive.category}")
-    
-    if(!is.null(self$model)) 
+
+    if(! is.null(self$model))
       p = glue::glue(p, "\nOffset: {round(self$model$getOffset(), 4)}")
-    
+
     print(p)
     print(self$loss)
   },
@@ -537,15 +533,15 @@ Compboost = R6::R6Class("Compboost",
     return(NULL)
   },
   plot = function (blearner.type = NULL, iters = NULL, from = NULL, to = NULL, length.out = 1000) {
-    
+
     if (requireNamespace("ggplot2", quietly = TRUE)) {
-      
+
       if (is.null(self$model)) {
         stop("Model needs to be trained first.")
       }
       checkmate::assertIntegerish(iters, min.len = 1, any.missing = FALSE, null.ok = TRUE)
       checkmate::assertCharacter(blearner.type, len = 1, null.ok = TRUE)
-      
+
       if (is.null(blearner.type)) {
         stop("Please specify a valid base-learner plus feature.")
       }
@@ -565,12 +561,12 @@ Compboost = R6::R6Class("Compboost",
         }
       }
       feat.name = private$bl.list[[blearner.type]]$target$getIdentifier()
-      
+
       checkmate::assertNumeric(x = self$data[[feat.name]], min.len = 2, null.ok = FALSE)
       checkmate::assertNumeric(from, lower = min(self$data[[feat.name]]), upper = max(self$data[[feat.name]]), len = 1, null.ok = TRUE)
       checkmate::assertNumeric(to, lower = min(self$data[[feat.name]]), upper = max(self$data[[feat.name]]), len = 1, null.ok = TRUE)
-      
-      if (is.null(from)) { 
+
+      if (is.null(from)) {
         from = min(self$data[[feat.name]])
       }
       if (is.null(to)) {
@@ -582,10 +578,10 @@ Compboost = R6::R6Class("Compboost",
         from = to
         to = temp
       }
-      
+
       plot.data = as.matrix(seq(from = from, to = to, length.out = length.out))
       feat.map = private$bl.list[[blearner.type]]$factory$transformData(plot.data)
-      
+
       # Create data.frame for plotting depending on whether iters is specified:
       if (!is.null(iters[1])) {
         preds = lapply(iters, function (x) {
@@ -596,41 +592,41 @@ Compboost = R6::R6Class("Compboost",
         }
         })
         names(preds) = iters
-        
+
         df.plot = data.frame(
           effect = unlist(preds),
           iteration = as.factor(rep(iters, each = length.out)),
           feature = plot.data
         )
-        
+
         gg = ggplot2::ggplot(df.plot, ggplot2::aes(feature, effect, color = iteration))
-        
+
       } else {
         df.plot = data.frame(
           effect = feat.map %*% self$getEstimatedCoef()[[blearner.type]],
           feature = plot.data
         )
-        
+
         gg = ggplot2::ggplot(df.plot, ggplot2::aes(feature, effect))
       }
-      
+
       # If there are too many rows we need to take just a sample or completely remove rugs:
       if (nrow(self$data) > 1000) {
         idx.rugs = sample(seq_len(nrow(self$data)), 1000, FALSE)
       } else {
         idx.rugs = seq_len(nrow(self$data))
       }
-      
-      gg = gg + 
-        ggplot2::geom_line() + 
-        ggplot2::geom_rug(data = self$data[idx.rugs,], ggplot2::aes_string(x = feat.name), inherit.aes = FALSE, 
-          alpha = 0.8) + 
-        ggplot2::xlab(feat.name) + 
+
+      gg = gg +
+        ggplot2::geom_line() +
+        ggplot2::geom_rug(data = self$data[idx.rugs,], ggplot2::aes_string(x = feat.name), inherit.aes = FALSE,
+          alpha = 0.8) +
+        ggplot2::xlab(feat.name) +
         ggplot2::xlim(from, to) +
-        ggplot2::ylab("Additive Contribution") + 
-        ggplot2::labs(title = paste0("Effect of ", blearner.type), 
+        ggplot2::ylab("Additive Contribution") +
+        ggplot2::labs(title = paste0("Effect of ", blearner.type),
           subtitle = "Additive contribution of predictor")
-      
+
       return(gg)
     } else {
       message("Please install ggplot2 to create plots.")
@@ -645,8 +641,8 @@ Compboost = R6::R6Class("Compboost",
     if (! is.null(self$model)) {
       out.list = self$model$getLoggerData()
       out.mat = out.list[[2]]
-      colnames(out.mat) = out.list[[1]] 
-      
+      colnames(out.mat) = out.list[[1]]
+
       return(as.data.frame(out.mat[seq_len(self$getCurrentIteration()),]))
     } else {
       warning("Train the model to get logger data.")
@@ -657,7 +653,7 @@ Compboost = R6::R6Class("Compboost",
     max.feats = length(unique(self$getSelectedBaselearner()))
     checkmate::assert_integerish(x = num.feats, lower = 1, upper = max.feats,
       any.missing = FALSE, len = 1L, null.ok = TRUE)
-    
+
     if (is.null(num.feats)) {
       num.feats = max.feats
       if (num.feats > 15L) { num.feats = 15L }
@@ -682,7 +678,7 @@ Compboost = R6::R6Class("Compboost",
     data.vip = self$calculateFeatureImportance(num.feats)
 
-    gg = ggplot2::ggplot(data.vip, ggplot2::aes(x = reorder(baselearner, relative.risk.reduction), y = relative.risk.reduction)) + 
+    gg = ggplot2::ggplot(data.vip, ggplot2::aes(x = reorder(baselearner, relative.risk.reduction), y = relative.risk.reduction)) +
       ggplot2::geom_bar(stat = "identity") + ggplot2::coord_flip() +
       ggplot2::ylab("Importance") + ggplot2::xlab("")
     return (gg)
@@ -699,19 +695,19 @@ Compboost = R6::R6Class("Compboost",
     if (! is.null(self$model)) {
       if (requireNamespace("ggplot2", quietly = TRUE)) {
         inbag.trace = self$getInbagRisk()
-        oob.data = self$getLoggerData() 
-        if ("oob_risk" %in% names(oob.data)) { 
+        oob.data = self$getLoggerData()
+        if ("oob_risk" %in% names(oob.data)) {
           oob.trace = oob.data[["oob_risk"]]
-          
+
           risk.data = data.frame(
             risk = c(inbag.trace, oob.trace),
             type = rep(c("inbag", "oob"), times = c(length(inbag.trace), length(oob.trace))),
             iter = c(seq_along(inbag.trace), seq_along(oob.trace))
           )
 
-          gg = ggplot2::ggplot(risk.data, ggplot2::aes(x = iter, y = risk, color = type)) + 
-            ggplot2::geom_line(size = 1.1) + 
-            ggplot2::xlab("Iteration") + 
+          gg = ggplot2::ggplot(risk.data, ggplot2::aes(x = iter, y = risk, color = type)) +
+            ggplot2::geom_line(size = 1.1) +
+            ggplot2::xlab("Iteration") +
             ggplot2::ylab("Risk")# + labs(color = "")
 
           return(gg)
@@ -735,9 +731,8 @@ Compboost = R6::R6Class("Compboost",
     logger.list = list(),
     oob.idx = NULL,
     train.idx = NULL,
-
     initializeModel = function() {
-      
+
       private$logger.list = LoggerList$new()
       lapply(private$l.list, function (logger) { private$logger.list$registerLogger(logger) })
       # for (i in seq_along(private$l.list)) {
@@ -750,43 +745,43 @@ Compboost = R6::R6Class("Compboost",
       if (! is.null(self$oob.fraction)) {
         self$addLogger(logger = LoggerOobRisk, logger.id = "oob_risk",
-          used.loss = self$loss, eps.for.break = 0, oob.data = self$prepareData(self$data.oob), 
+          used.loss = self$loss, eps.for.break = 0, oob.data = self$prepareData(self$data.oob),
           oob.response = self$response.oob)
       }
     },
     addSingleNumericBl = function(data.columns, feature, id.fac, id, bl.factory, data.source, data.target, ...)
     {
-      
+
       private$bl.list[[id]] = list()
       private$bl.list[[id]]$source = data.source$new(as.matrix(data.columns), paste(feature, collapse = "_"))
       private$bl.list[[id]]$feature = feature
       private$bl.list[[id]]$target = data.target$new()
-      
+
       # Call handler for default arguments and argument handling:
       # handler.name = paste0(".handle", bl.factory@.Data)
       # par.set = c(source = private$bl.list[[id]]$source, target = private$bl.list[[id]]$target, id = id.fac, do.call(handler.name, list(...)))
       # private$bl.list[[id]]$factory = do.call(bl.factory$new, par.set)
       private$bl.list[[id]]$factory = bl.factory$new(private$bl.list[[id]]$source, private$bl.list[[id]]$target, id.fac, list(...))
-      
+
       self$bl.factory.list$registerFactory(private$bl.list[[id]]$factory)
       private$bl.list[[id]]$source = NULL
-      
-    }, 
+
+    },
     addSingleCatBl = function(data.column, feature, id.fac, id, bl.factory, data.source, data.target, ...) {
-      
+
       lvls = unlist(unique(data.column))
-      
+
      # Create dummy variable for each category and use that vector as data matrix. Hence,
      # if a categorical feature has 3 groups, then these 3 groups are added as 3 different
      # base-learners (unbiased feature selection).
      for (lvl in lvls) {
-        
+
        list.id = paste(feature, lvl, id.fac, sep = "_")
-        
+
        private$addSingleNumericBl(data.columns = as.matrix(as.integer(data.column == lvl)),
-          feature = paste(feature, lvl, sep = "_"), id.fac = id.fac, 
+          feature = paste(feature, lvl, sep = "_"), id.fac = id.fac,
          id = list.id, bl.factory, data.source, data.target, ...)
-        
+
        # This is important because of:
        # 1. feature in addSingleNumericBl needs to be something like cat_feature_Group1 to define the
        #    data objects correctly in a unique way.
diff --git a/R/helper.R b/R/helper.R
new file mode 100644
index 00000000..9c711e58
--- /dev/null
+++ b/R/helper.R
@@ -0,0 +1,42 @@
+.assertRcppClass = function (x, x.class, stop.when.error = TRUE)
+{
+  cls = class(x)
+  if (! any(grepl("Rcpp", cls))) {
+    stop("Object was not exposed by Rcpp.")
+  }
+  if (! any(grepl(x.class, cls))) {
+    stop("Object does not belong to class ", x.class, ".")
+  }
+}
+
+.vectorToResponse = function (vec, target)
+{
+  # Transform factor or character labels to -1 and 1
+  if (! is.numeric(vec)) {
+    vec = as.factor(vec)
+
+    if (length(levels(vec)) > 2) {
+      stop("Multiclass classification is not supported.")
+    }
+    # self$positive.category = levels(vec)[1]
+    # Transform to vector with -1 and 1:
+    vec = as.integer(vec) * (1 - as.integer(vec)) + 1
+    return (ResponseBinaryClassif$new(target, as.matrix(vec)))
+  } else {
+    return (ResponseRegr$new(target, as.matrix(vec)))
+  }
+}
+
+
+# .vectorToResponse = function (vec, target)
+# {
+#   # Classification:
+#   if (is.character(vec)) {
+#     vec = as.factor(vec)
+#
+#     if (length(levels(vec)) == 2) return (ResponseBinaryClassif$new(target, as.matrix(vec)))
+#     if (length(levels(vec)) > 2) stop("Multiclass classification is not supported.")
+#   }
+#   # Regression:
+#   if (is.numeric(vec)) return (ResponseRegr$new(target, as.matrix(vec)))
+# }
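Behavior sketch of the new .vectorToResponse() helper (internal and not exported, hence the ::: access for illustration):

    compboost:::.vectorToResponse(rnorm(5), "y")
    # -> ResponseRegr object with target name "y"

    compboost:::.vectorToResponse(factor(c("a", "b", "a")), "y")
    # -> ResponseBinaryClassif object; the first level "a" maps to 1, "b" to -1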
diff --git a/R/load_modules.R b/R/load_modules.R
index 7906ae0d..f2c960d4 100644
--- a/R/load_modules.R
+++ b/R/load_modules.R
@@ -44,3 +44,4 @@ Rcpp::loadModule(module = "baselearner_list_module", what = TRUE)
 Rcpp::loadModule(module = "logger_module", what = TRUE)
 Rcpp::loadModule(module = "optimizer_module", what = TRUE)
 Rcpp::loadModule(module = "data_module", what = TRUE)
+Rcpp::loadModule(module = "response_module", what = TRUE)
diff --git a/man/BaselearnerCustom.Rd b/man/BaselearnerCustom.Rd
index 6bb50960..d6f1c4e7 100644
--- a/man/BaselearnerCustom.Rd
+++ b/man/BaselearnerCustom.Rd
@@ -12,7 +12,7 @@
 \section{Usage}{
 \preformatted{
-BaselearnerCustom$new(data_source, data_target, list(instantiate.fun, 
+BaselearnerCustom$new(data_source, data_target, list(instantiate.fun,
   train.fun, predict.fun, param.fun))
 }
 }
@@ -114,7 +114,7 @@ extractParameter = function (model) {
 # Create new custom linear base-learner factory:
 custom.lin.factory = BaselearnerCustom$new(data.source, data.target,
-  list(instantiate.fun = instantiateDataFun, train.fun = trainFun, 
+  list(instantiate.fun = instantiateDataFun, train.fun = trainFun,
     predict.fun = predictFun, param.fun = extractParameter))
 
 # Get the transformed data:
diff --git a/man/BaselearnerPolynomial.Rd b/man/BaselearnerPolynomial.Rd
index f2aba52a..592ade5f 100644
--- a/man/BaselearnerPolynomial.Rd
+++ b/man/BaselearnerPolynomial.Rd
@@ -74,9 +74,9 @@ data.target1 = InMemoryData$new()
 data.target2 = InMemoryData$new()
 
 # Create new linear base-learner factory:
-lin.factory = BaselearnerPolynomial$new(data.source, data.target1, 
+lin.factory = BaselearnerPolynomial$new(data.source, data.target1,
   list(degree = 2, intercept = FALSE))
-lin.factory.int = BaselearnerPolynomial$new(data.source, data.target2, 
+lin.factory.int = BaselearnerPolynomial$new(data.source, data.target2,
   list(degree = 2, intercept = TRUE))
 
 # Get the transformed data:
diff --git a/man/BlearnerFactoryList.Rd b/man/BlearnerFactoryList.Rd
index 17356d47..5ed655ea 100644
--- a/man/BlearnerFactoryList.Rd
+++ b/man/BlearnerFactoryList.Rd
@@ -55,9 +55,9 @@ data.source = InMemoryData$new(data.mat, "my.data.name")
 data.target1 = InMemoryData$new()
 data.target2 = InMemoryData$new()
 
-lin.factory = BaselearnerPolynomial$new(data.source, data.target1, 
+lin.factory = BaselearnerPolynomial$new(data.source, data.target1,
   list(degree = 1, intercept = TRUE))
-poly.factory = BaselearnerPolynomial$new(data.source, data.target2, 
+poly.factory = BaselearnerPolynomial$new(data.source, data.target2,
   list(degree = 2, intercept = TRUE))
 
 # Create new base-learner list:
diff --git a/man/Compboost.Rd b/man/Compboost.Rd
index 5c947ab0..994ce821 100644
--- a/man/Compboost.Rd
+++ b/man/Compboost.Rd
@@ -58,7 +58,7 @@ cboost$plotInbagVsOobRisk()
   A data frame containing the data.
 }
 \item{\code{target}}{[\code{character(1)}]\cr
-  Character value containing the target variable. Note that the loss must match the 
+  Character value containing the target variable. Note that the loss must match the
   data type of the target.
 }
 \item{\code{optimizer}}{[\code{S4 Optimizer}]\cr
@@ -66,7 +66,7 @@ cboost$plotInbagVsOobRisk()
   to select features at each iteration.
 }
 \item{\code{loss}}{[\code{S4 Loss}]\cr
-  Initialized \code{S4 Loss} object exposed by Rcpp that is used to calculate the risk and pseudo 
+  Initialized \code{S4 Loss} object exposed by Rcpp that is used to calculate the risk and pseudo
   residuals (e.g. \code{LossQuadratic$new()}).
 }
 \item{\code{learning.rate}}{[\code{numeric(1)}]\cr
@@ -80,11 +80,11 @@ cboost$plotInbagVsOobRisk()
 \strong{For cboost$addLogger()}:
 \describe{
 \item{\code{logger}}{[\code{S4 Logger}]\cr
-  Uninitialized \code{S4 Logger} class object that is registered in the model. 
+  Uninitialized \code{S4 Logger} class object that is registered in the model.
   See the details for possible choices.
 }
 \item{\code{use.as.stopper}}{[\code{logical(1)}]\cr
-  Logical value indicating whether the new logger should also be used as stopper 
+  Logical value indicating whether the new logger should also be used as stopper
   (early stopping). Default value is \code{FALSE}.
 }
 \item{\code{logger.id}}{[\code{character(1)}]\cr
@@ -98,31 +98,28 @@ cboost$plotInbagVsOobRisk()
 
 \strong{For cboost$addBaselearner()}:
 \describe{
-  \item{\code{features}}{[\code{character()}]\cr
-    Vector of column names that are used as input data matrix for a single base-learner. Note that not
-    every base-learner supports the use of multiple features (e.g. the spline base-learner does not).
-  }
-  \item{\code{id}}{[\code{character(1)}]\cr
-    Identifier of the base-learners. This is necessary since it is possible to define multiple
-    base-learner with the same underlying data.
-  }
-  \item{\code{bl.factory}}{[\code{S4 Factory}]\cr
-    Uninitialized base-learner factory given as \code{S4 Factory} class. See the details
-    for possible choices.
-  }
-  \item{\code{data.source}}{[\code{S4 Data}]\cr
-    Uninitialized \code{S4 Data} object which is used to store the data. At the moment
-    just in memory training is supported.
-  }
-  \item{\code{data.target}}{[\code{S4 Data}]\cr
-    Uninitialized \code{S4 Data} object which is used to store the data. At the moment
-    just in memory training is supported.
-  }
-  \item{}{\code{...}\cr
-    Further arguments passed to the constructor of the \code{S4 Factory} class specified in
-    \code{bl.factory}. For possible arguments see the help pages (e.g. \code{?BaselearnerPSplineFactory})
-    of the \code{S4} classes.
-  }
+\item{\code{features}}{[\code{character()}]\cr
+  Vector of column names which are used as input data matrix for a single base-learner. Note that not
+  every base-learner supports the use of multiple features (e.g. the spline base-learner does not).
+}
+\item{\code{id}}{[\code{character(1)}]\cr
+  Id of the base-learners. This is necessary since it is possible to define multiple learners with the same underlying data.
+}
+\item{\code{bl.factory}}{[\code{S4 Factory}]\cr
+  Uninitialized base-learner factory given as \code{S4 Factory} class. See the details
+  for possible choices.
+}
+\item{\code{data.source}}{[\code{S4 Data}]\cr
+  Data source object. At the moment only in-memory data is supported.
+}
+\item{\code{data.target}}{[\code{S4 Data}]\cr
+  Data target object. At the moment only in-memory data is supported.
+}
+\item{}{\code{...}\cr
+  Further arguments passed to the constructor of the \code{S4 Factory} class specified in
+  \code{bl.factory}. For possible arguments see the help pages (e.g. \code{?BaselearnerPSplineFactory})
+  of the \code{S4} classes.
+}
 }
 
 \strong{For cboost$train()}:
@@ -147,21 +144,21 @@ cboost$plotInbagVsOobRisk()
 }
 \strong{For cboost$plot()}:
 \describe{
-  \item{\code{blearner.type}}{[\code{character(1)}]\cr
-    Character value containing the name of the base-learner which should be visualized.
-  }
-  \item{\code{iters}}{[\code{integer()}]\cr
-    Integer vector containing the iterations at stages for that the effect should be visualized.
-  }
-  \item{\code{from}}{[\code{numeric(1)}]\cr
-    Lower bound for plotting (should be smaller than \code{to}).
-  }
-  \item{\code{to}}{[\code{numeric(1)}]\cr
-    Upper bound for plotting (should be greater than \code{from}).
-  }
-  \item{\code{length.out}}{[\code{integer(1)}]\cr
-    Number of equidistant points between \code{from} and \code{to} used for plotting.
-  }
+\item{\code{blearner.type}}{[\code{character(1)}]\cr
+  Character name of the base-learner to plot the additive contribution to the response.
+}
+\item{\code{iters}}{[\code{integer()}]\cr
+  Integer vector containing the iterations the user wants to illustrate.
+}
+\item{\code{from}}{[\code{numeric(1)}]\cr
+  Lower bound for plotting (should be smaller than \code{to}).
+}
+\item{\code{to}}{[\code{numeric(1)}]\cr
+  Upper bound for plotting (should be greater than \code{from}).
+}
+\item{\code{length.out}}{[\code{integer(1)}]\cr
+  Number of equidistant points between \code{from} and \code{to} used for plotting.
+}
 }
 }
 
@@ -172,16 +169,16 @@ cboost$plotInbagVsOobRisk()
 \itemize{
 \item
   \code{LossQuadratic} (Regression)
-  
+
 \item
   \code{LossAbsolute} (Regression)
-  
+
 \item
   \code{LossBinomial} (Binary Classification)
-  
+
 \item
   \code{LossCustom} (Custom)
-  
+
 }
 (For each loss also take a look at the help pages (e.g. \code{?LossBinomial})
 and the \code{C++} documentation for details)
@@ -242,7 +239,7 @@ cboost$plotInbagVsOobRisk()
 }
 }
 }
-  
+
 \strong{Note}:
 \itemize{
 \item
@@ -273,7 +270,7 @@ cboost$plotInbagVsOobRisk()
   Name of the target variable
 }
 \item{\code{id} [\code{character(1)}]}{
-  Name of the given dataset. 
+  Name of the given dataset.
 }
 \item{\code{optimizer} [\code{S4 Optimizer}]}{
   Optimizer used within the fitting process.
diff --git a/man/Compboost_internal.Rd b/man/Compboost_internal.Rd
index 13b7782c..24f26649 100644
--- a/man/Compboost_internal.Rd
+++ b/man/Compboost_internal.Rd
@@ -113,6 +113,7 @@ X.wt = as.matrix(df[["wt"]])
 
 # Target variable:
 y = df[["mpg.cat"]]
+response = ResponseBinaryClassif$new("mpg.cat", as.matrix(y))
 
 data.source.hp = InMemoryData$new(X.hp, "hp")
 data.source.wt = InMemoryData$new(X.wt, "wt")
@@ -129,13 +130,13 @@ oob.data = list(data.source.hp, data.source.wt)
 test.data = oob.data
 
 # Factories:
-linear.factory.hp = BaselearnerPolynomial$new(data.source.hp, data.target.hp1, 
+linear.factory.hp = BaselearnerPolynomial$new(data.source.hp, data.target.hp1,
   list(degree = 1, intercept = TRUE))
-linear.factory.wt = BaselearnerPolynomial$new(data.source.wt, data.target.wt1, 
+linear.factory.wt = BaselearnerPolynomial$new(data.source.wt, data.target.wt1,
   list(degree = 1, intercept = TRUE))
-quadratic.factory.hp = BaselearnerPolynomial$new(data.source.hp, data.target.hp2, 
+quadratic.factory.hp = BaselearnerPolynomial$new(data.source.hp, data.target.hp2,
   list(degree = 2, intercept = TRUE))
-spline.factory.wt = BaselearnerPSpline$new(data.source.wt, data.target.wt2, 
+spline.factory.wt = BaselearnerPSpline$new(data.source.wt, data.target.wt2,
   list(degree = 3, n.knots = 10, penalty = 2, differences = 2))
 
 # Create new factory list:
@@ -159,8 +160,6 @@ optimizer = OptimizerCoordinateDescent$new()
 # time, inbag risk and oob risk:
 log.iterations = LoggerIteration$new(" iteration.logger", TRUE, 500)
 log.time = LoggerTime$new("time.logger", FALSE, 500, "microseconds")
-log.inbag = LoggerInbagRisk$new("inbag.binomial", FALSE, loss.bin, 0.05)
-log.oob = LoggerOobRisk$new("oob.binomial", FALSE, loss.bin, 0.05, oob.data, y)
 
 # Define new logger list:
 logger.list = LoggerList$new()
@@ -168,15 +167,13 @@ logger.list = LoggerList$new()
 # Register the logger:
 logger.list$registerLogger(log.iterations)
 logger.list$registerLogger(log.time)
-logger.list$registerLogger(log.inbag)
-logger.list$registerLogger(log.oob)
 
 # Run compboost:
 # --------------
 
 # Initialize object:
 cboost = Compboost_internal$new(
-  response = y,
+  response = response,
   learning_rate = 0.05,
   stop_if_all_stopper_fulfilled = FALSE,
   factory_list = factory.list,
diff --git a/man/LoggerOobRisk.Rd b/man/LoggerOobRisk.Rd
index 60e41a76..f7d869ed 100644
--- a/man/LoggerOobRisk.Rd
+++ b/man/LoggerOobRisk.Rd
@@ -12,7 +12,7 @@ see the use case or extending compboost vignette.
 \section{Usage}{
 \preformatted{
-LoggerOobRisk$new(logger_id, use_as_stopper, used_loss, eps_for_break, 
+LoggerOobRisk$new(logger_id, use_as_stopper, used_loss, eps_for_break,
   oob_data, oob_response)
 }
 }
@@ -118,8 +118,11 @@ y.oob = rnorm(10)
 # Used loss:
 log.bin = LossBinomial$new()
 
+# Define response object for the oob data:
+oob.response = ResponseRegr$new("oob_response", as.matrix(y.oob))
+
 # Define logger:
-log.oob.risk = LoggerOobRisk$new("oob", FALSE, log.bin, 0.05, oob.list, y.oob)
+log.oob.risk = LoggerOobRisk$new("oob", FALSE, log.bin, 0.05, oob.list, oob.response)
 
 # Summarize logger:
 log.oob.risk$summarizeLogger()
diff --git a/man/OptimizerCoordinateDescentLineSearch.Rd b/man/OptimizerCoordinateDescentLineSearch.Rd
index 7058844c..768e339e 100644
--- a/man/OptimizerCoordinateDescentLineSearch.Rd
+++ b/man/OptimizerCoordinateDescentLineSearch.Rd
@@ -5,7 +5,7 @@
 \title{Coordinate Descent with line search}
 \format{\code{\link{S4}} object.}
 \description{
-This class defines a new object which is used to conduct Coordinate Descent with line search. 
+This class defines a new object which is used to conduct Coordinate Descent with line search.
 The optimizer just calculates for each base-learner the sum of squared error and returns
 the base-learner with the smallest SSE. In addition, this optimizer computes
 a line search to find the optimal step size in each iteration.
diff --git a/man/ResponseBinaryClassif.Rd b/man/ResponseBinaryClassif.Rd
new file mode 100644
index 00000000..8a443530
--- /dev/null
+++ b/man/ResponseBinaryClassif.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/RcppExports.R
+\name{ResponseBinaryClassif}
+\alias{ResponseBinaryClassif}
+\title{Create response object for binary classification.}
+\format{\code{\link{S4}} object.}
+\description{
+\code{ResponseBinaryClassif} creates a response object that is used as the target during the
+fitting process.
+}
+\section{Usage}{
+
+\preformatted{
+ResponseBinaryClassif$new(target_name, response)
+ResponseBinaryClassif$new(target_name, response, weights)
+}
+}
+
diff --git a/man/ResponseRegr.Rd b/man/ResponseRegr.Rd
new file mode 100644
index 00000000..c29f4978
--- /dev/null
+++ b/man/ResponseRegr.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/RcppExports.R
+\name{ResponseRegr}
+\alias{ResponseRegr}
+\title{Create response object for regression.}
+\format{\code{\link{S4}} object.}
+\description{
+\code{ResponseRegr} creates a response object that is used as the target during the
+fitting process.
+}
+\section{Usage}{
+
+\preformatted{
+ResponseRegr$new(target_name, response)
+ResponseRegr$new(target_name, response, weights)
+}
+}
+
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 5462f8ed..abf335dc 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -11,6 +11,7 @@ RcppExport SEXP _rcpp_module_boot_data_module();
 RcppExport SEXP _rcpp_module_boot_baselearner_factory_module();
 RcppExport SEXP _rcpp_module_boot_baselearner_list_module();
 RcppExport SEXP _rcpp_module_boot_loss_module();
+RcppExport SEXP _rcpp_module_boot_response_module();
 RcppExport SEXP _rcpp_module_boot_logger_module();
 RcppExport SEXP _rcpp_module_boot_optimizer_module();
 RcppExport SEXP _rcpp_module_boot_compboost_module();
@@ -20,6 +21,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"_rcpp_module_boot_baselearner_factory_module", (DL_FUNC) &_rcpp_module_boot_baselearner_factory_module, 0},
     {"_rcpp_module_boot_baselearner_list_module", (DL_FUNC) &_rcpp_module_boot_baselearner_list_module, 0},
     {"_rcpp_module_boot_loss_module", (DL_FUNC) &_rcpp_module_boot_loss_module, 0},
+    {"_rcpp_module_boot_response_module", (DL_FUNC) &_rcpp_module_boot_response_module, 0},
     {"_rcpp_module_boot_logger_module", (DL_FUNC) &_rcpp_module_boot_logger_module, 0},
     {"_rcpp_module_boot_optimizer_module", (DL_FUNC) &_rcpp_module_boot_optimizer_module, 0},
     {"_rcpp_module_boot_compboost_module", (DL_FUNC) &_rcpp_module_boot_compboost_module, 0},
diff --git a/src/baselearner.cpp b/src/baselearner.cpp
index f0bc0718..f6554dd6 100644
--- a/src/baselearner.cpp
+++ b/src/baselearner.cpp
@@ -13,23 +13,8 @@
 // Compboost is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// MIT License for more details. You should have received a copy of 
-// the MIT License along with compboost. 
-//
-// Written by:
-// -----------
-//
-// Daniel Schalk
-// Department of Statistics
-// Ludwig-Maximilians-University Munich
-// Ludwigstrasse 33
-// D-80539 München
-//
-// https://www.compstat.statistik.uni-muenchen.de
-//
-// Contact
-// e: contact@danielschalk.com
-// w: danielschalk.com
+// MIT License for more details. You should have received a copy of
+// the MIT License along with compboost.
 //
 // =========================================================================== #
 
@@ -42,7 +27,7 @@ namespace blearner {
 // -------------------------------------------------------------------------- //
 
 // Copy (or initialize) the members in new copied class:
-void Baselearner::copyMembers (const arma::mat& parameter0, 
+void Baselearner::copyMembers (const arma::mat& parameter0,
   const std::string& blearner_identifier0, data::Data* data0)
 {
   parameter = parameter0;
@@ -108,7 +93,7 @@ std::string Baselearner::getBaselearnerType () const
 Baselearner::~Baselearner ()
 {
   // Rcpp::Rcout << "Call Baselearner Destructor" << std::endl;
-  
+
   // delete blearner_type;
   // delete data_ptr;
   // delete data_identifier_ptr;
@@ -121,8 +106,8 @@
 // BaselearnerPolynomial:
 // -----------------------
 
-BaselearnerPolynomial::BaselearnerPolynomial (data::Data* data, const std::string& identifier, 
-  const unsigned int& degree, const bool& intercept) 
+BaselearnerPolynomial::BaselearnerPolynomial (data::Data* data, const std::string& identifier,
+  const unsigned int& degree, const bool& intercept)
   : degree ( degree ),
     intercept ( intercept )
 {
@@ -136,20 +121,20 @@
 Baselearner* BaselearnerPolynomial::clone ()
 {
   Baselearner* newbl = new BaselearnerPolynomial(*this);
   newbl->copyMembers(this->parameter, this->blearner_identifier, this->data_ptr);
-  
+
   return newbl;
 }
 
 // // Transform data:
 // arma::mat BaselearnerPolynomial::instantiateData ()
 // {
-//   
+//
 //   return arma::pow(*data_ptr, degree);
 // }
-// 
+//
 // Transform data. This is done twice since it makes the prediction
 // of the whole compboost object so much easier:
-arma::mat BaselearnerPolynomial::instantiateData (const arma::mat& newdata) 
+arma::mat BaselearnerPolynomial::instantiateData (const arma::mat& newdata) const
 {
   arma::mat temp = arma::pow(newdata, degree);
   if (intercept) {
@@ -160,20 +145,20 @@
 }
 
 // Train the learner:
-void BaselearnerPolynomial::train (const arma::vec& response)
+void BaselearnerPolynomial::train (const arma::mat& response)
 {
   if (data_ptr->getData().n_cols == 1) {
     double y_mean = 0;
     if (intercept) {
-      y_mean = arma::as_scalar(arma::mean(response));
-    } 
+      y_mean = arma::as_scalar(arma::accu(response) / response.size());
+    }
     double slope = arma::as_scalar(arma::sum((data_ptr->getData() - data_ptr->XtX_inv(0,0)) % (response - y_mean)) / arma::as_scalar(data_ptr->XtX_inv(0,1)));
     double intercept = y_mean - slope * data_ptr->XtX_inv(0,0);
-    
+
     if (intercept) {
       arma::mat out(2,1);
-      
+
       out(0,0) = intercept;
       out(1,0) = slope;
@@ -188,7 +173,7 @@
 }
 
 // Predict the learner:
-arma::mat BaselearnerPolynomial::predict ()
+arma::mat BaselearnerPolynomial::predict () const
 {
   if (data_ptr->getData().n_cols == 1) {
     if (intercept) {
@@ -200,7 +185,7 @@
     return data_ptr->getData() * parameter;
   }
 }
-arma::mat BaselearnerPolynomial::predict (data::Data* newdata)
+arma::mat BaselearnerPolynomial::predict (data::Data* newdata) const
 {
   return instantiateData(newdata->getData()) * parameter;
 }
@@ -213,14 +198,14 @@ BaselearnerPolynomial::~BaselearnerPolynomial () {}
 
 /**
  * \brief Constructor of `BaselearnerPSpline` class
- * 
+ *
  * This constructor sets the members such as n_knots etc. The more computationally
  * complex data are stored within the data object which should be initialized
- * first (e.g. in the factory or otherwise). 
- * 
+ * first (e.g. in the factory or otherwise).
+ *
  * One note about the used knots. The number of inner knots is specified
 * by `n_knots`. These inner knots are then wrapped by the minimal and maximal
- * value of the given data. For instance we have a feature 
+ * value of the given data. For instance we have a feature
 * \f[
 *   x = (1, 2, \dots, 2.5, 6)
 * \f]
@@ -228,35 +213,35 @@ BaselearnerPolynomial::~BaselearnerPolynomial () {}
 * \f[
 *   U = (1.00, 2.25, 3.50, 4.75, 6.00)
 * \f]
- * To get a full base these knots are wrapped by `degree` (\f$p\f$) numbers 
- * on either side. If we choose `degree = 2` then we have 
+ * To get a full base these knots are wrapped by `degree` (\f$p\f$) numbers
+ * on either side. If we choose `degree = 2` then we have
 *   \f$n_\mathrm{knots} + 2(p + 1) = 3 + 2(2 + 1) = 9\f$ final knots:
 * \f[
 *   U = (-1.50, -0.25, 1.00, 2.25, 3.50, 4.75, 6.00, 7.25, 8.50)
 * \f]
 * Finally we get \f$9 - (p + 1)\f$ splines for which we can calculate the
- * base. 
- * 
+ * base.
+ *
 * \param data `data::Data*` Target data used for training etc.
 * \param identifier `std::string` Identifier for one specific baselearner
 * \param degree `unsigned int` Polynomial degree of the splines
 * \param n_knots `unsigned int` Number of inner knots used
+ * \param penalty `double` Regularization parameter `penalty = 0` yields
 *   B-splines while a bigger penalty forces the splines into a global
 *   polynomial form.
- * \param differences `unsigned int` Number of differences used for the 
+ * \param differences `unsigned int` Number of differences used for the
 *   penalty matrix.
 */
BaselearnerPSpline::BaselearnerPSpline (data::Data* data, const std::string& identifier,
-  const unsigned int& degree, const unsigned int& n_knots, const double& penalty, 
+  const unsigned int& degree, const unsigned int& n_knots, const double& penalty,
  const unsigned int& differences, const bool& use_sparse_matrices)
  : degree ( degree ),
    n_knots ( n_knots ),
    penalty ( penalty ),
    differences ( differences ),
    use_sparse_matrices ( use_sparse_matrices )
-{ 
+{
  // Called from parent class 'Baselearner':
  Baselearner::setData(data);
  Baselearner::setIdentifier(identifier);
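A small R sketch of the knot expansion described in the constructor comment above (knotExpansion is a hypothetical helper, not part of the package API; it reproduces the numbers from the example with n.knots = 3 and degree = 2):

    knotExpansion = function (x, n.knots, degree) {
      inner = seq(min(x), max(x), length.out = n.knots + 2)   # inner knots wrapped by min/max
      h = diff(inner)[1]                                      # equidistant knot spacing
      c(min(x) - (degree:1) * h, inner, max(x) + seq_len(degree) * h)
    }
    knotExpansion(c(1, 2, 2.5, 6), n.knots = 3, degree = 2)
    #> [1] -1.50 -0.25  1.00  2.25  3.50  4.75  6.00  7.25  8.50
    # 3 + 2 * (2 + 1) = 9 final knots, as stated above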
@@ -264,31 +249,31 @@ BaselearnerPSpline::BaselearnerPSpline (data::Data* data, const std::string& ide
 
 /**
  * \brief Clean copy of baselearner
- * 
+ *
  * \returns `Baselearner*` An exact copy of the actual baselearner.
 */
Baselearner* BaselearnerPSpline::clone ()
 {
  Baselearner* newbl = new BaselearnerPSpline (*this);
  newbl->copyMembers(this->parameter, this->blearner_identifier, this->data_ptr);
-  
+
  return newbl;
 }
 
 /**
  * \brief Instantiate data matrix (design matrix)
- * 
+ *
 * This function is meant to create the design matrix which is then stored
 * within the data object. This should be done just once and then reused all
 * the time.
- * 
+ *
 * Note that this function sets the `data_mat` object of the data object!
- * 
+ *
 * \param newdata `arma::mat` Input data which is transformed to the design matrix
- * 
+ *
 * \returns `arma::mat` of transformed data
 */
-arma::mat BaselearnerPSpline::instantiateData (const arma::mat& newdata)
+arma::mat BaselearnerPSpline::instantiateData (const arma::mat& newdata) const
 {
  arma::vec knots = data_ptr->knots;
@@ -297,21 +282,21 @@
  double range_min = knots[degree];               // minimal value from original data
  double range_max = knots[n_knots + degree + 1]; // maximal value from original data
 
-  arma::mat temp = filterKnotRange(newdata, range_min, range_max, data_ptr->getDataIdentifier());
+  arma::mat temp = splines::filterKnotRange(newdata, range_min, range_max, data_ptr->getDataIdentifier());
 
  // Data object has to be created prior! That means that data_ptr must have
  // initialized knots, and penalty matrix!
-  return createSplineBasis (temp, degree, knots);
+  return splines::createSplineBasis (temp, degree, data_ptr->knots);
 }
 
 /**
  * \brief Training of a baselearner
- * 
+ *
 * This function sets the `parameter` member of the parent class `Baselearner`.
- * 
+ *
 * \param response `arma::mat` Response variable of the training.
 */
-void BaselearnerPSpline::train (const arma::vec& response)
+void BaselearnerPSpline::train (const arma::mat& response)
 {
  if (use_sparse_matrices) {
    parameter = data_ptr->XtX_inv * (data_ptr->sparse_data_mat * response);
@@ -322,10 +307,10 @@
 
 /**
  * \brief Predict on training data
- * 
+ *
 * \returns `arma::mat` of predicted values
 */
-arma::mat BaselearnerPSpline::predict ()
+arma::mat BaselearnerPSpline::predict () const
 {
  // arma::mat out;
  if (use_sparse_matrices) {
@@ -341,12 +326,12 @@
 
 /**
  * \brief Predict on newdata
- * 
+ *
 * \param newdata `data::Data*` new source data object
- * 
+ *
 * \returns `arma::mat` of predicted values
 */
-arma::mat BaselearnerPSpline::predict (data::Data* newdata)
+arma::mat BaselearnerPSpline::predict (data::Data* newdata) const
 {
  return instantiateData(newdata->getData()) * parameter;
 }
@@ -359,10 +344,10 @@ BaselearnerPSpline::~BaselearnerPSpline () {}
 
 // BaselearnerCustom:
 // -----------------------
 
-BaselearnerCustom::BaselearnerCustom (data::Data* data, const std::string& identifier, 
-  Rcpp::Function instantiateDataFun, Rcpp::Function trainFun, Rcpp::Function predictFun, 
-  Rcpp::Function extractParameter) 
-  : instantiateDataFun ( instantiateDataFun ), 
+BaselearnerCustom::BaselearnerCustom (data::Data* data, const std::string& identifier,
+  Rcpp::Function instantiateDataFun, Rcpp::Function trainFun, Rcpp::Function predictFun,
+  Rcpp::Function extractParameter)
+  : instantiateDataFun ( instantiateDataFun ),
    trainFun ( trainFun ),
    predictFun ( predictFun ),
    extractParameter ( extractParameter )
@@ -377,13 +362,13 @@
Baselearner* BaselearnerCustom::clone ()
 {
  Baselearner* newbl = new BaselearnerCustom (*this);
  newbl->copyMembers(this->parameter, this->blearner_identifier, this->data_ptr);
-  
+
  return newbl;
 }
 
 // Transform data. This is done twice since it makes the prediction
 // of the whole compboost object so much easier:
-arma::mat BaselearnerCustom::instantiateData (const arma::mat& newdata)
+arma::mat BaselearnerCustom::instantiateData (const arma::mat& newdata) const
 {
  Rcpp::NumericMatrix out = instantiateDataFun(newdata);
  return Rcpp::as<arma::mat>(out);
 }
@@ -394,19 +379,20 @@ arma::mat BaselearnerCustom::instantiateData (const arma::mat& newdata)
 
 // NOTE: It is highly recommended to specify an explicit extractParameter
 // function!
 // Otherwise, it is not possible to estimate the parameter
 // during the whole process:
-void BaselearnerCustom::train (const arma::vec& response)
+void BaselearnerCustom::train (const arma::mat& response)
 {
  model = trainFun(response, data_ptr->getData());
  parameter = Rcpp::as<arma::mat>(extractParameter(model));
 }
 
 // Predict by using the R function 'predictFun':
-arma::mat BaselearnerCustom::predict ()
+arma::mat BaselearnerCustom::predict () const
 {
  Rcpp::NumericMatrix out = predictFun(model, data_ptr->getData());
  return Rcpp::as<arma::mat>(out);
 }
-arma::mat BaselearnerCustom::predict (data::Data* newdata)
+
+arma::mat BaselearnerCustom::predict (data::Data* newdata) const
 {
  Rcpp::NumericMatrix out = predictFun(model, instantiateData(newdata->getData()));
  return Rcpp::as<arma::mat>(out);
@@ -419,20 +405,20 @@ BaselearnerCustom::~BaselearnerCustom () {}
 
 // BaselearnerCustomCpp:
 // -----------------------
 
-BaselearnerCustomCpp::BaselearnerCustomCpp (data::Data* data, const std::string& identifier, 
+BaselearnerCustomCpp::BaselearnerCustomCpp (data::Data* data, const std::string& identifier,
  SEXP instantiateDataFun0, SEXP trainFun0, SEXP predictFun0)
 {
  // Called from parent class 'Baselearner':
  Baselearner::setData (data);
  Baselearner::setIdentifier (identifier);
-  
+
  // Set functions:
  Rcpp::XPtr myTempInstantiation (instantiateDataFun0);
  instantiateDataFun = *myTempInstantiation;
-  
+
  Rcpp::XPtr myTempTrain (trainFun0);
  trainFun = *myTempTrain;
-  
+
  Rcpp::XPtr myTempPredict (predictFun0);
  predictFun = *myTempPredict;
 }
@@ -442,13 +428,13 @@
Baselearner* BaselearnerCustomCpp::clone ()
 {
  Baselearner* newbl = new BaselearnerCustomCpp (*this);
  newbl->copyMembers(this->parameter, this->blearner_identifier, this->data_ptr);
-  
+
  return newbl;
 }
 
 // Transform data. This is done twice since it makes the prediction
 // of the whole compboost object so much easier:
-arma::mat BaselearnerCustomCpp::instantiateData (const arma::mat& newdata)
+arma::mat BaselearnerCustomCpp::instantiateData (const arma::mat& newdata) const
 {
  return instantiateDataFun(newdata);
 }
@@ -460,18 +446,18 @@
 // NOTE: It is highly recommended to specify an explicit extractParameter
 // function! Otherwise, it is not possible to estimate the parameter
 // during the whole process:
-void BaselearnerCustomCpp::train (const arma::vec& response)
+void BaselearnerCustomCpp::train (const arma::mat& response)
 {
  parameter = trainFun(response, data_ptr->getData());
 }
 
 // Predict by using the external pointer to the function 'predictFun':
-
-arma::mat BaselearnerCustomCpp::predict ()
+arma::mat BaselearnerCustomCpp::predict () const
 {
  return predictFun (data_ptr->getData(), parameter);
 }
-arma::mat BaselearnerCustomCpp::predict (data::Data* newdata)
+
+arma::mat BaselearnerCustomCpp::predict (data::Data* newdata) const
 {
  arma::mat temp_mat = instantiateData(newdata->getData());
  return predictFun (temp_mat, parameter);
diff --git a/src/baselearner.h b/src/baselearner.h
index 7689a2e7..9508ea61 100644
--- a/src/baselearner.h
+++ b/src/baselearner.h
@@ -13,23 +13,8 @@
 // Compboost is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// MIT License for more details. You should have received a copy of 
-// the MIT License along with compboost. 
-// -// Written by: -// ----------- -// -// Daniel Schalk -// Department of Statistics -// Ludwig-Maximilians-University Munich -// Ludwigstrasse 33 -// D-80539 München -// -// https://www.compstat.statistik.uni-muenchen.de -// -// Contact -// e: contact@danielschalk.com -// w: danielschalk.com +// MIT License for more details. You should have received a copy of +// the MIT License along with compboost. // // =========================================================================== # @@ -37,11 +22,14 @@ #define BASELEARNER_H_ #include +#include #include #include "data.h" +#include "response.h" #include "splines.h" + namespace blearner { // -------------------------------------------------------------------------- // @@ -52,55 +40,55 @@ class Baselearner { public: - virtual void train (const arma::vec&) = 0; + virtual void train (const arma::mat&) = 0; arma::mat getParameter () const; - - virtual arma::mat predict () = 0; - virtual arma::mat predict (data::Data*) = 0; - + + virtual arma::mat predict () const = 0; + virtual arma::mat predict (data::Data*) const = 0; + // Specify how the data has to be transformed. E. g. for splines a mapping // to the higher dimension space. The overloading function with the // arma mat as parameter is used for newdata: - virtual arma::mat instantiateData (const arma::mat&) = 0; - + virtual arma::mat instantiateData (const arma::mat&) const = 0; + // Clone function (in some places needed e.g. "optimizer.cpp") and a copy - // function which is called by clone to avoid copy and pasting of the + // function which is called by clone to avoid copy and pasting of the // protected members: void copyMembers (const arma::mat&, const std::string&, data::Data*); virtual Baselearner* clone () = 0; - + // Within 'setData' the pointer will be setted, while 'instantiateData' - // overwrite the object on which 'data_ptr' points. This guarantees that + // overwrite the object on which 'data_ptr' points. This guarantees that // the data is just stored once in the factory and then called by reference // within the baselearner: void setData (data::Data*); // arma::mat getData () const; - + // Get data identifier stored within the data object: std::string getDataIdentifier () const; - + // Set and get identifier of a specific baselearner (this is unique): void setIdentifier (const std::string&); std::string getIdentifier () const; - - // Set and get baselearner type (this can be the same for multiple + + // Set and get baselearner type (this can be the same for multiple // baselearner e.g. linear baselearner for variable x1 and x2). 
// This one is set by the factory which later creates the objects: void setBaselearnerType (const std::string&); std::string getBaselearnerType () const; - + // Destructor: virtual ~Baselearner (); - + protected: - + // Members which should be directly accessible through the child classes: arma::mat parameter; std::string blearner_identifier; std::string blearner_type; data::Data* data_ptr; // std::string data_identifier; - + }; // -------------------------------------------------------------------------- // @@ -117,26 +105,26 @@ class Baselearner class BaselearnerPolynomial : public Baselearner { private: - + unsigned int degree; bool intercept; - + public: - - // (data pointer, data identifier, baselearner identifier, degree) + + // (data pointer, data identifier, baselearner identifier, degree) BaselearnerPolynomial (data::Data*, const std::string&, const unsigned int&, const bool&); - + Baselearner* clone (); - + // arma::mat instantiateData (); - arma::mat instantiateData (const arma::mat&); - - void train (const arma::vec&); - arma::mat predict (); - arma::mat predict (data::Data*); + arma::mat instantiateData (const arma::mat&) const; + + void train (const arma::mat&); + arma::mat predict () const; + arma::mat predict (data::Data*) const; ~BaselearnerPolynomial (); - + }; // BaselearnerPSpline: @@ -144,20 +132,20 @@ class BaselearnerPolynomial : public Baselearner /** * \class BaselearnerPSpline - * + * * \brief P-Spline Baselearner - * + * * This class implements the P-Spline baselearners. We have used de Boor's * algorithm (from the NURBS book) to create the basis. The penalty parameter * can be specified directly or by using the degrees of freedom. If you are * using the degrees of freedom instead of the penalty parameter, then this is - * transformed to a penalty parameter using the Demmler-Reinsch + * transformed to a penalty parameter using the Demmler-Reinsch * Orthogonalization. - * - * Please note that this baselearner is just a dummy object. Most of the + * + * Please note that this baselearner is just a dummy object. Most of the * functionality is provided while creating the data target, which contains * most of the objects used here.
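 * As a short sketch of that mapping (standard penalized regression theory,
 * nothing specific to this package): for basis matrix X and difference
 * penalty matrix K, the smoother matrix is
 * S(lambda) = X (X^T X + lambda * K)^{-1} X^T with degrees of freedom
 * df(lambda) = trace(S(lambda)). The Demmler-Reinsch orthogonalization
 * diagonalizes X^T X and K simultaneously, which turns this into
 * df(lambda) = sum_j 1 / (1 + lambda * d_j) for eigenvalues d_j, so lambda
 * can be recovered from a requested df by a one-dimensional search.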
- * + */ class BaselearnerPSpline : public Baselearner @@ -183,23 +171,23 @@ class BaselearnerPSpline : public Baselearner /// Default constructor of `BaselearnerPSpline` class BaselearnerPSpline (data::Data*, const std::string&, const unsigned int&, const unsigned int&, const double&, const unsigned int&, const bool&); - + /// Clean copy of baselearner Baselearner* clone (); - + /// Instantiate data matrix (design matrix) - arma::mat instantiateData (const arma::mat&); - + arma::mat instantiateData (const arma::mat&) const; + /// Training of a baselearner - void train (const arma::vec&); - + void train (const arma::mat&); + /// Predict on training data - arma::mat predict (); - + arma::mat predict () const; + /// Predict on newdata - arma::mat predict (data::Data*); - - + arma::mat predict (data::Data*) const; + + /// Destructor ~BaselearnerPSpline (); @@ -214,80 +202,79 @@ class BaselearnerPSpline : public Baselearner class BaselearnerCustom : public Baselearner { private: - + SEXP model; - + // R functions for a custom baselearner: Rcpp::Function instantiateDataFun; Rcpp::Function trainFun; Rcpp::Function predictFun; Rcpp::Function extractParameter; - + public: - + // (data pointer, data identifier, baselearner identifier, R function for // data instantiation, R function for training, R function for prediction, // R function to extract parameter): - BaselearnerCustom (data::Data*, const std::string&, Rcpp::Function, + BaselearnerCustom (data::Data*, const std::string&, Rcpp::Function, Rcpp::Function, Rcpp::Function, Rcpp::Function); - + // Copy constructor: Baselearner* clone (); - + // arma::mat instantiateData (); - arma::mat instantiateData (const arma::mat&); - - void train (const arma::vec&); - arma::mat predict (); - arma::mat predict (data::Data*); - + arma::mat instantiateData (const arma::mat&) const; + + void train (const arma::mat&); + arma::mat predict () const; + arma::mat predict (data::Data*) const; + ~BaselearnerCustom (); - + }; // BaselearnerCustomCpp: // ----------------------- -// This is a bit tricky. The key is that we store the cpp functions as +// This is a bit tricky. The key is that we store the cpp functions as // pointers. Therefore we can, from R, use the XPtr class of Rcpp to -// pass the pointer as SEXP. To try a working example see +// pass the pointer as SEXP. To try a working example see // "tutorial/stages_of_custom_learner.html". // Please note that the result of the train function should be a matrix // containing the estimated parameter.
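// As a minimal sketch of such functions (illustrative only, not part of this
// patch; all names below are made up, see getCustomCppExample() for the
// example shipped with the package), matching the new arma::mat response
// signature of the typedefs below:
//
//   // [[Rcpp::depends(RcppArmadillo)]]
//   #include <RcppArmadillo.h>
//
//   arma::mat myInstantiateData (const arma::mat& X) {
//     return X;  // identity: use the raw feature as design matrix
//   }
//
//   arma::mat myTrain (const arma::mat& y, const arma::mat& X) {
//     return arma::solve(X, y);  // least-squares fit; the result is the parameter
//   }
//
//   arma::mat myPredict (const arma::mat& newdata, const arma::mat& parameter) {
//     return newdata * parameter;
//   }
//
//   // Hand the function pointers to R as external pointers (SEXP); this is
//   // what gets unwrapped again via Rcpp::XPtr in baselearner.cpp:
//   // [[Rcpp::export]]
//   SEXP getMyTrainFun () {
//     return Rcpp::XPtr<trainFunPtr> (new trainFunPtr (&myTrain));
//   }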
typedef arma::mat (*instantiateDataFunPtr) (const arma::mat& X); -typedef arma::mat (*trainFunPtr) (const arma::vec& y, const arma::mat& X); +typedef arma::mat (*trainFunPtr) (const arma::mat& y, const arma::mat& X); typedef arma::mat (*predictFunPtr) (const arma::mat& newdata, const arma::mat& parameter); class BaselearnerCustomCpp : public Baselearner { private: - + // Cpp functions for a custom baselearner: instantiateDataFunPtr instantiateDataFun; trainFunPtr trainFun; predictFunPtr predictFun; - + public: - + // (data pointer, data identifier, baselearner identifier, R function for // data instantiation, R function for training, R function for prediction, // R function to extract parameter): BaselearnerCustomCpp (data::Data*, const std::string&, SEXP, SEXP, SEXP); - + // Copy constructor: Baselearner* clone (); - + // arma::mat instantiateData (); - arma::mat instantiateData (const arma::mat&); - - void train (const arma::vec&); - arma::mat predict (); - arma::mat predict (data::Data*); - + arma::mat instantiateData (const arma::mat&) const; + + void train (const arma::mat&); + arma::mat predict () const; + arma::mat predict (data::Data*) const; + ~BaselearnerCustomCpp (); - }; } // namespace blearner diff --git a/src/baselearner_factory.cpp b/src/baselearner_factory.cpp index 50d4036e..1183b060 100644 --- a/src/baselearner_factory.cpp +++ b/src/baselearner_factory.cpp @@ -13,8 +13,8 @@ // Compboost is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// MIT License for more details. You should have received a copy of -// the MIT License along with compboost. +// MIT License for more details. You should have received a copy of +// the MIT License along with compboost. 
// // Written by: // ----------- @@ -61,10 +61,10 @@ void BaselearnerFactory::initializeDataObjects (data::Data* data_source0, { data_source = data_source0; data_target = data_target0; - + // Make sure that the data identifier is setted correctly: data_target->setDataIdentifier(data_source->getDataIdentifier()); - + // Get the data of the source, transform it and write it into the target: data_target->setData(instantiateData(data_source->getData())); } @@ -78,20 +78,20 @@ BaselearnerFactory::~BaselearnerFactory () {} // BaselearnerPolynomial: // ----------------------- -BaselearnerPolynomialFactory::BaselearnerPolynomialFactory (const std::string& blearner_type0, - data::Data* data_source0, data::Data* data_target0, const unsigned int& degree, +BaselearnerPolynomialFactory::BaselearnerPolynomialFactory (const std::string& blearner_type0, + data::Data* data_source0, data::Data* data_target0, const unsigned int& degree, const bool& intercept) : degree ( degree ), intercept ( intercept ) { blearner_type = blearner_type0; - + data_source = data_source0; data_target = data_target0; - + // Make sure that the data identifier is setted correctly: data_target->setDataIdentifier(data_source->getDataIdentifier()); - + // Prepare computation of intercept and slope of an ordinary linear regression: if (data_source->getData().n_cols == 1) { // Store centered x values for faster computation: @@ -111,26 +111,26 @@ BaselearnerPolynomialFactory::BaselearnerPolynomialFactory (const std::string& b data_target->setData(instantiateData(data_source->getData())); data_target->XtX_inv = arma::inv(data_target->getData().t() * data_target->getData()); } - + // blearner_type = blearner_type + " with degree " + std::to_string(degree); } blearner::Baselearner* BaselearnerPolynomialFactory::createBaselearner (const std::string& identifier) { blearner::Baselearner* blearner_obj; - - // Create new polynomial baselearner. This one will be returned by the + + // Create new polynomial baselearner. This one will be returned by the // factory: blearner_obj = new blearner::BaselearnerPolynomial(data_target, identifier, degree, intercept); blearner_obj->setBaselearnerType(blearner_type); - + // // Check if the data is already set. If not, run 'instantiateData' from the // // baselearner: // if (! is_data_instantiated) { // data = blearner_obj->instantiateData(); - // + // // is_data_instantiated = true; - // + // // // update baselearner type: // blearner_type = blearner_type + " with degree " + std::to_string(degree); // } @@ -139,15 +139,15 @@ blearner::Baselearner* BaselearnerPolynomialFactory::createBaselearner (const st /** * \brief Data getter which always returns an arma::mat - * + * * This function is important to have a unified interface to access the data * matrices. Especially for predicting we have to get the data of each factory * as dense matrix. This is a huge drawback in terms of memory usage. Therefore, * this function should only be used to get temporary matrices which are deleted * when they run out of scope to reduce memory load. Also note that there is a - * dispatch with the getData() function of the Data objects which are mostly + * dispatch with the getData() function of the Data objects which are mostly * called internally. 
- * + * * \returns `arma::mat` of data used for modelling a single base-learner */ arma::mat BaselearnerPolynomialFactory::getData () const @@ -163,7 +163,7 @@ arma::mat BaselearnerPolynomialFactory::getData () const } } else { return data_target->getData(); - } + } } // Transform data. This is done twice since it makes the prediction @@ -185,27 +185,27 @@ arma::mat BaselearnerPolynomialFactory::instantiateData (const arma::mat& newdat /** * \brief Default constructor of class `PSplineBleanrerFactory` - * + * * The PSpline constructor has some important tasks which are: * - Set the knots * - Initialize the data (knots must be setted prior) - * - Compute and store penalty matrix - * + * - Compute and store penalty matrix + * * \param blearner_type0 `std::string` Name of the baselearner type (setted by * the Rcpp Wrapper classes in `compboost_modules.cpp`) * \param data_source `data::Data*` Source of the data * \param data_target `data::Data*` Object to store the transformed data source * \param degree `unsigned int` Polynomial degree of the splines - * \param n_knots `unsigned int` Number of inner knots used + * \param n_knots `unsigned int` Number of inner knots used * \param penalty `double` Regularization parameter `penalty = 0` yields * b splines while a bigger penalty forces the splines into a global * polynomial form. - * \param differences `unsigned int` Number of differences used for the + * \param differences `unsigned int` Number of differences used for the * penalty matrix. */ -BaselearnerPSplineFactory::BaselearnerPSplineFactory (const std::string& blearner_type0, - data::Data* data_source0, data::Data* data_target0, const unsigned int& degree, +BaselearnerPSplineFactory::BaselearnerPSplineFactory (const std::string& blearner_type0, + data::Data* data_source0, data::Data* data_target0, const unsigned int& degree, const unsigned int& n_knots, const double& penalty, const unsigned int& differences, const bool& use_sparse_matrices) : degree ( degree ), @@ -227,54 +227,54 @@ BaselearnerPSplineFactory::BaselearnerPSplineFactory (const std::string& blearne } } catch ( std::exception &ex ) { forward_exception_to_r( ex ); - } catch (...) { - ::Rf_error( "c++ exception (unknown reason)" ); + } catch (...) { + ::Rf_error( "c++ exception (unknown reason)" ); } // Initialize knots: - data_target->knots = createKnots(data_source->getData(), n_knots, degree); - + data_target->knots = splines::createKnots(data_source->getData(), n_knots, degree); + // Additionally set the penalty matrix: - data_target->penalty_mat = penaltyMat(n_knots + (degree + 1), differences); - + data_target->penalty_mat = splines::penaltyMat(n_knots + (degree + 1), differences); + // Make sure that the data identifier is setted correctly: data_target->setDataIdentifier(data_source->getDataIdentifier()); - + // Get the data of the source, transform it and write it into the target. This needs some explanations: // - If we use sparse matrices we want to store the sparse matrix into the sparse data matrix member of // the data object. This also requires to adopt getData() for that purpose. - // - To get some (very) nice speed ups we store the transposed matrix not the standard one. This also + // - To get some (very) nice speed ups we store the transposed matrix not the standard one. This also // affects how the training in baselearner.cpp is done. Nevertheless, this speed up things dramatically. 
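// In formulas (standard penalized least squares, nothing new in this patch):
// with basis matrix B, penalty matrix K and the transposed sparse basis
// Z = B^T stored below, training a base-learner in baselearner.cpp reduces to
//
//   parameter = (B^T B + penalty * K)^{-1} B^T y  =  XtX_inv * (Z * y)
//
// which is why XtX_inv is computed once here in the factory and reused in
// every boosting iteration.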
if (use_sparse_matrices) { - data_target->sparse_data_mat = createSparseSplineBasis (data_source->getData(), degree, data_target->knots).t(); + data_target->sparse_data_mat = splines::createSparseSplineBasis (data_source->getData(), degree, data_target->knots).t(); data_target->XtX_inv = arma::inv(data_target->sparse_data_mat * data_target->sparse_data_mat.t() + penalty * data_target->penalty_mat); } else { data_target->setData(instantiateData(data_source->getData())); data_target->XtX_inv = arma::inv(data_target->getData().t() * data_target->getData() + penalty * data_target->penalty_mat); - } + } } /** * \brief Create new `BaselearnerPSpline` object - * + * * \param identifier `std::string` identifier of that specific baselearner object */ blearner::Baselearner* BaselearnerPSplineFactory::createBaselearner (const std::string& identifier) { blearner::Baselearner* blearner_obj; - - // Create new P-spline baselearner. This one will be returned by the + + // Create new P-spline baselearner. This one will be returned by the // factory: blearner_obj = new blearner::BaselearnerPSpline(data_target, identifier, degree, n_knots, penalty, differences, use_sparse_matrices); blearner_obj->setBaselearnerType(blearner_type); - + // // Check if the data is already set. If not, run 'instantiateData' from the // // baselearner: // if (! is_data_instantiated) { // data = blearner_obj->instantiateData(); - // + // // is_data_instantiated = true; - // + // // // update baselearner type: // blearner_type = blearner_type + " with degree " + std::to_string(degree); // } @@ -283,15 +283,15 @@ blearner::Baselearner* BaselearnerPSplineFactory::createBaselearner (const std:: /** * \brief Data getter which always returns an arma::mat - * + * * This function is important to have a unified interface to access the data * matrices. Especially for predicting we have to get the data of each factory * as a dense matrix. This is a huge drawback in terms of memory usage. Therefore, * this function should only be used to get temporary matrices which are deleted * when they run out of scope to reduce memory load. Also note that there is a - * dispatch with the getData() function of the Data objects which are mostly + * dispatch with the getData() function of the Data objects which are mostly * called internally. - * + * * \returns `arma::mat` of data used for modelling a single base-learner */ arma::mat BaselearnerPSplineFactory::getData () const @@ -308,15 +308,15 @@ arma::mat BaselearnerPSplineFactory::getData () const /** * \brief Instantiate data matrix (design matrix) - * + * * This function is meant to create the design matrix which is then stored * within the data object. This should be done just once and then reused all * the time. - * + * * Note that this function sets the `data_mat` object of the data object!
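 * New data is first passed through `splines::filterKnotRange()` (see the
 * function body below), so the basis is evaluated only inside the range
 * spanned by the training knots.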
- * + * * \param newdata `arma::mat` Input data which is transformed to the design matrix - * + * * \returns `arma::mat` of transformed data */ arma::mat BaselearnerPSplineFactory::instantiateData (const arma::mat& newdata) const @@ -327,18 +327,19 @@ arma::mat BaselearnerPSplineFactory::instantiateData (const arma::mat& newdata) double range_min = knots[degree]; // minimal value from original data double range_max = knots[n_knots + degree + 1]; // maximal value from original data - arma::mat temp = filterKnotRange(newdata, range_min, range_max, data_target->getDataIdentifier()); + arma::mat temp = splines::filterKnotRange(newdata, range_min, range_max, data_target->getDataIdentifier()); // Data object has to be created prior! That means that data_ptr must have // initialized knots, and penalty matrix! - return createSplineBasis (temp, degree, data_target->knots); + arma::mat out = splines::createSplineBasis (temp, degree, data_target->knots); + return out; } // BaselearnerCustom: // ----------------------- -BaselearnerCustomFactory::BaselearnerCustomFactory (const std::string& blearner_type0, - data::Data* data_source, data::Data* data_target, Rcpp::Function instantiateDataFun, +BaselearnerCustomFactory::BaselearnerCustomFactory (const std::string& blearner_type0, + data::Data* data_source, data::Data* data_target, Rcpp::Function instantiateDataFun, Rcpp::Function trainFun, Rcpp::Function predictFun, Rcpp::Function extractParameter) : instantiateDataFun ( instantiateDataFun ), trainFun ( trainFun ), @@ -349,19 +350,19 @@ BaselearnerCustomFactory::BaselearnerCustomFactory (const std::string& blearner_ initializeDataObjects(data_source, data_target); } -blearner::Baselearner *BaselearnerCustomFactory::createBaselearner (const std::string &identifier) +blearner::Baselearner* BaselearnerCustomFactory::createBaselearner (const std::string &identifier) { blearner::Baselearner *blearner_obj; - - blearner_obj = new blearner::BaselearnerCustom(data_target, identifier, + + blearner_obj = new blearner::BaselearnerCustom(data_target, identifier, instantiateDataFun, trainFun, predictFun, extractParameter); blearner_obj->setBaselearnerType(blearner_type); - + // // Check if the data is already set. If not, run 'instantiateData' from the // // baselearner: // if (! is_data_instantiated) { // data = blearner_obj->instantiateData(); - // + // // is_data_instantiated = true; // } return blearner_obj; @@ -369,15 +370,15 @@ blearner::Baselearner *BaselearnerCustomFactory::createBaselearner (const std::s /** * \brief Data getter which always returns an arma::mat - * + * * This function is important to have a unified interface to access the data * matrices. Especially for predicting we have to get the data of each factory * as dense matrix. This is a huge drawback in terms of memory usage. Therefore, * this function should only be used to get temporary matrices which are deleted * when they run out of scope to reduce memory load. Also note that there is a - * dispatch with the getData() function of the Data objects which are mostly + * dispatch with the getData() function of the Data objects which are mostly * called internally. 
- * + * * \returns `arma::mat` of data used for modelling a single base-learner */ arma::mat BaselearnerCustomFactory::getData () const @@ -396,8 +397,8 @@ arma::mat BaselearnerCustomFactory::instantiateData (const arma::mat& newdata) c // BaselearnerCustomCpp: // ----------------------- -BaselearnerCustomCppFactory::BaselearnerCustomCppFactory (const std::string& blearner_type0, - data::Data* data_source, data::Data* data_target, SEXP instantiateDataFun, +BaselearnerCustomCppFactory::BaselearnerCustomCppFactory (const std::string& blearner_type0, + data::Data* data_source, data::Data* data_target, SEXP instantiateDataFun, SEXP trainFun, SEXP predictFun) : instantiateDataFun ( instantiateDataFun ), trainFun ( trainFun ), @@ -410,16 +411,16 @@ BaselearnerCustomCppFactory::BaselearnerCustomCppFactory (const std::string& ble blearner::Baselearner* BaselearnerCustomCppFactory::createBaselearner (const std::string& identifier) { blearner::Baselearner* blearner_obj; - - blearner_obj = new blearner::BaselearnerCustomCpp(data_target, identifier, + + blearner_obj = new blearner::BaselearnerCustomCpp(data_target, identifier, instantiateDataFun, trainFun, predictFun); blearner_obj->setBaselearnerType(blearner_type); - + // // Check if the data is already set. If not, run 'instantiateData' from the // // baselearner: // if (! is_data_instantiated) { // data = blearner_obj->instantiateData(); - // + // // is_data_instantiated = true; // } return blearner_obj; @@ -427,15 +428,15 @@ blearner::Baselearner* BaselearnerCustomCppFactory::createBaselearner (const std /** * \brief Data getter which always returns an arma::mat - * + * * This function is important to have a unified interface to access the data * matrices. Especially for predicting we have to get the data of each factory * as dense matrix. This is a huge drawback in terms of memory usage. Therefore, * this function should only be used to get temporary matrices which are deleted * when they run out of scope to reduce memory load. Also note that there is a - * dispatch with the getData() function of the Data objects which are mostly + * dispatch with the getData() function of the Data objects which are mostly * called internally. - * + * * \returns `arma::mat` of data used for modelling a single base-learner */ arma::mat BaselearnerCustomCppFactory::getData () const @@ -449,7 +450,7 @@ arma::mat BaselearnerCustomCppFactory::instantiateData (const arma::mat& newdata { Rcpp::XPtr myTempInstantiation (instantiateDataFun); instantiateDataFunPtr instantiateDataFun0 = *myTempInstantiation; - + return instantiateDataFun0(newdata); } diff --git a/src/baselearner_factory.h b/src/baselearner_factory.h index a29d8ed6..fc6ac320 100644 --- a/src/baselearner_factory.h +++ b/src/baselearner_factory.h @@ -13,38 +13,23 @@ // Compboost is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// MIT License for more details. You should have received a copy of -// the MIT License along with compboost. -// -// Written by: -// ----------- -// -// Daniel Schalk -// Department of Statistics -// Ludwig-Maximilians-University Munich -// Ludwigstrasse 33 -// D-80539 München -// -// https://www.compstat.statistik.uni-muenchen.de -// -// Contact -// e: contact@danielschalk.com -// w: danielschalk.com +// MIT License for more details. You should have received a copy of +// the MIT License along with compboost. 
// // =========================================================================== # -/** +/** * @file baselearner_factory.h * @author Daniel Schalk (github: schalkdaniel) - * + * * @brief Definition of baselearner factory classes * * @section DESCRIPTION - * + * * This file defines the baselearner factory classes. Every baselearner should - * have a corresponding factory class. This factory class just exists to - * crate baselearner objects. - * + * have a corresponding factory class. This factory class just exists to + * crate baselearner objects. + * * the factories are also there to instantiate the data at the moment the * factory is instantiated. This is done by taking a data source and transform * it baselearner dependent into a data target object. This data object is then @@ -73,30 +58,30 @@ namespace blearnerfactory { class BaselearnerFactory { public: - + // Create new baselearner with id: virtual blearner::Baselearner* createBaselearner (const std::string&) = 0; - + // Getter for data, data identifier and the baselearner type: // arma::mat getData () const; std::string getDataIdentifier () const; std::string getBaselearnerType () const; - + virtual arma::mat instantiateData (const arma::mat&) const = 0; virtual arma::mat getData() const = 0; - + void initializeDataObjects (data::Data*, data::Data*); - + // Destructor: virtual ~BaselearnerFactory (); - + protected: - + // Minimal functionality every baselearner should have: std::string blearner_type; data::Data* data_source; data::Data* data_target; - + }; // -------------------------------------------------------------------------- // @@ -109,20 +94,20 @@ class BaselearnerFactory class BaselearnerPolynomialFactory : public BaselearnerFactory { private: - + const unsigned int degree; bool intercept; - + public: - + BaselearnerPolynomialFactory (const std::string&, data::Data*, data::Data*, const unsigned int&, const bool&); - + blearner::Baselearner* createBaselearner (const std::string&); - + /// Get data used for modeling arma::mat getData() const; - + arma::mat instantiateData (const arma::mat&) const; }; @@ -131,39 +116,39 @@ class BaselearnerPolynomialFactory : public BaselearnerFactory /** * \class BaselearnerPSplineFactory - * + * * \brief Factory to create `PSplineBlearner` objects - * + * */ class BaselearnerPSplineFactory : public BaselearnerFactory { private: - + /// Degree of splines const unsigned int degree; - + /// Number of inner knots const unsigned int n_knots; - + /// Regularization parameter const double penalty; - + /// Order of differences used for penalty matrix const unsigned int differences; /// Flag if sparse matrices should be used: const bool use_sparse_matrices; - + public: /// Default constructor of class `PSplineBleanrerFactory` - BaselearnerPSplineFactory (const std::string&, data::Data*, data::Data*, - const unsigned int&, const unsigned int&, const double&, + BaselearnerPSplineFactory (const std::string&, data::Data*, data::Data*, + const unsigned int&, const unsigned int&, const double&, const unsigned int&, const bool&); - + /// Create new `BaselearnerPSpline` object blearner::Baselearner* createBaselearner (const std::string&); - + /// Get data used for modelling arma::mat getData() const; @@ -179,24 +164,24 @@ class BaselearnerPSplineFactory : public BaselearnerFactory class BaselearnerCustomFactory : public BaselearnerFactory { private: - + Rcpp::Function instantiateDataFun; Rcpp::Function trainFun; Rcpp::Function predictFun; Rcpp::Function extractParameter; - + public: - + 
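// (base-learner type, data source, data target, R function for data
// instantiation, R function for training, R function for prediction,
// R function to extract the parameter):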
BaselearnerCustomFactory (const std::string&, data::Data*, data::Data*, Rcpp::Function, Rcpp::Function, Rcpp::Function, Rcpp::Function); - + blearner::Baselearner* createBaselearner (const std::string&); - + /// Get data used for modelling arma::mat getData() const; - + arma::mat instantiateData (const arma::mat&) const; - + }; // BaselearnerCustomCppFactory: @@ -207,24 +192,24 @@ typedef arma::mat (*instantiateDataFunPtr) (const arma::mat& X); class BaselearnerCustomCppFactory : public BaselearnerFactory { private: - + // Cpp functions for a custom baselearner: SEXP instantiateDataFun; SEXP trainFun; SEXP predictFun; - + public: - - BaselearnerCustomCppFactory (const std::string&, data::Data*, data::Data*, + + BaselearnerCustomCppFactory (const std::string&, data::Data*, data::Data*, SEXP, SEXP, SEXP); - + blearner::Baselearner* createBaselearner (const std::string&); - + /// Get data used for modelling arma::mat getData() const; arma::mat instantiateData (const arma::mat&) const; - + }; } // namespace blearnerfactory diff --git a/src/baselearner_factory_list.cpp b/src/baselearner_factory_list.cpp index e85f0131..4eabf8bc 100644 --- a/src/baselearner_factory_list.cpp +++ b/src/baselearner_factory_list.cpp @@ -13,41 +13,26 @@ // Compboost is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// MIT License for more details. You should have received a copy of -// the MIT License along with compboost. -// -// Written by: -// ----------- -// -// Daniel Schalk -// Department of Statistics -// Ludwig-Maximilians-University Munich -// Ludwigstrasse 33 -// D-80539 München -// -// https://www.compstat.statistik.uni-muenchen.de -// -// Contact -// e: contact@danielschalk.com -// w: danielschalk.com +// MIT License for more details. You should have received a copy of +// the MIT License along with compboost. 
// // =========================================================================== # #include "baselearner_factory_list.h" -namespace blearnerlist +namespace blearnerlist { // Just an empty constructor: BaselearnerFactoryList::BaselearnerFactoryList () {} // Register a factory: -void BaselearnerFactoryList::registerBaselearnerFactory (const std::string& factory_id, +void BaselearnerFactoryList::registerBaselearnerFactory (const std::string& factory_id, blearnerfactory::BaselearnerFactory *blearner_factory) { // Create iterator and check if learner is already registered: std::map::iterator it = my_factory_map.find(factory_id); - + if (it == my_factory_map.end()) { my_factory_map.insert(std::pair(factory_id, blearner_factory)); } else { @@ -64,7 +49,7 @@ void BaselearnerFactoryList::printRegisteredFactories () const } else { Rcpp::Rcout << "No registered base-learner."; } - + // Iterate over all registered factories and print the factory identifier: for (auto& it : my_factory_map) { Rcpp::Rcout << "\t- " << it.first << std::endl; @@ -89,12 +74,12 @@ std::pair, arma::mat> BaselearnerFactoryList::getModelF { arma::mat out_matrix; std::vector rownames; - + for (auto& it : my_factory_map) { arma::mat data_temp(it.second->getData()); out_matrix = arma::join_rows(out_matrix, data_temp); - + if (data_temp.n_cols > 1) { for (unsigned int i = 0; i < data_temp.n_cols; i++) { rownames.push_back(it.first + "x1" + std::to_string(i + 1)); diff --git a/src/baselearner_factory_list.h b/src/baselearner_factory_list.h index 29493543..76c955db 100644 --- a/src/baselearner_factory_list.h +++ b/src/baselearner_factory_list.h @@ -13,23 +13,8 @@ // Compboost is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// MIT License for more details. You should have received a copy of -// the MIT License along with compboost. -// -// Written by: -// ----------- -// -// Daniel Schalk -// Department of Statistics -// Ludwig-Maximilians-University Munich -// Ludwigstrasse 33 -// D-80539 München -// -// https://www.compstat.statistik.uni-muenchen.de -// -// Contact -// e: contact@danielschalk.com -// w: danielschalk.com +// MIT License for more details. You should have received a copy of +// the MIT License along with compboost. // // =========================================================================== # @@ -49,34 +34,34 @@ namespace blearnerlist // Later we will create one static object of this class. This is a workaround // to register new factorys from R. -class BaselearnerFactoryList +class BaselearnerFactoryList { private: - + // Main list object: blearner_factory_map my_factory_map; - + public: - + BaselearnerFactoryList (); - + // Functions to register a baselearner factory and print all registered // factories: void registerBaselearnerFactory (const std::string&, blearnerfactory::BaselearnerFactory*); void printRegisteredFactories () const; - + // Get the actual map: blearner_factory_map getMap () const; - + // Clear all elements wich are registered: void clearMap(); - + // Get the data used for modelling: std::pair, arma::mat> getModelFrame () const; // Get names of registered factories: std::vector getRegisteredFactoryNames () const; - + // ~BaselearnerFactoryList () {Rcpp::Rcout << "Destroy BaselearnerFactoryList!" 
<< std::endl; } }; diff --git a/src/baselearner_track.cpp b/src/baselearner_track.cpp index aab77a41..02d47f52 100644 --- a/src/baselearner_track.cpp +++ b/src/baselearner_track.cpp @@ -13,23 +13,8 @@ // Compboost is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// MIT License for more details. You should have received a copy of -// the MIT License along with compboost. -// -// Written by: -// ----------- -// -// Daniel Schalk -// Department of Statistics -// Ludwig-Maximilians-University Munich -// Ludwigstrasse 33 -// D-80539 München -// -// https://www.compstat.statistik.uni-muenchen.de -// -// Contact -// e: contact@danielschalk.com -// w: danielschalk.com +// MIT License for more details. You should have received a copy of +// the MIT License along with compboost. // // =========================================================================== # @@ -49,32 +34,32 @@ void BaselearnerTrack::insertBaselearner (blearner::Baselearner* blearner, const { // Insert new baselearner: blearner_vector.push_back(blearner); - + step_sizes.push_back(step_size); + std::string insert_id = blearner->getDataIdentifier() + "_" + blearner->getBaselearnerType(); - // std::cout << "Insert base-learner with base-learner id: " << blearner->getBaselearnerType() << std::endl;; // Check if the baselearner is the first one. If so, the parameter // has to be instantiated with a zero matrix: std::map::iterator it = my_parameter_map.find(insert_id); - + // Prune parameter by multiplying it with the learning rate: arma::mat parameter_temp = learning_rate * step_size * blearner->getParameter(); - + // Check if this is the first parameter entry: if (it == my_parameter_map.end()) { - + // If this is the first entry, initialize it with zeros: arma::mat init_parameter(parameter_temp.n_rows, parameter_temp.n_cols, arma::fill::zeros); my_parameter_map.insert(std::pair(insert_id, init_parameter)); } - - // Accumulating parameter. If there is a nan, then this will be ignored and + + // Accumulating parameter. If there is a nan, then this will be ignored and // the non nan entries are added up: // arma::mat parameter_insert = parameter_temp + my_parameter_map.find(blearner->getBaselearnerType())->second; // my_parameter_map.insert(std::pair(blearner->getBaselearnerType(), parameter_insert)); my_parameter_map[ insert_id ] = parameter_temp + my_parameter_map.find(insert_id)->second; - + } // Get the vector of baselearner: @@ -95,7 +80,7 @@ void BaselearnerTrack::clearBaselearnerVector () for (unsigned int i = 0; i < blearner_vector.size(); i++) { delete blearner_vector[i]; - } + } blearner_vector.clear(); } @@ -105,32 +90,32 @@ std::map BaselearnerTrack::getEstimatedParameterOfIterat if (k > blearner_vector.size()) { Rcpp::stop ("You can't get parameter of a state higher then the maximal iterations."); } - + // Create new parameter map: std::map my_new_parameter_map; - + if (k <= blearner_vector.size()) { - + for (unsigned int i = 0; i < k; i++) { std::string insert_id = blearner_vector[i]->getDataIdentifier() + "_" + blearner_vector[i]->getBaselearnerType(); - + // Check if the baselearner is the first one. 
If so, the parameter // has to be instantiated with a zero matrix: std::map::iterator it = my_new_parameter_map.find(insert_id); - + // Prune parameter by multiplying it with the learning rate: - arma::mat parameter_temp = learning_rate * blearner_vector[i]->getParameter(); - + arma::mat parameter_temp = learning_rate * step_sizes[i] * blearner_vector[i]->getParameter(); + // Check if this is the first parameter entry: if (it == my_new_parameter_map.end()) { - + // If this is the first entry, initialize it with zeros: arma::mat init_parameter(parameter_temp.n_rows, parameter_temp.n_cols, arma::fill::zeros); my_new_parameter_map.insert(std::pair(insert_id, init_parameter)); - + } - - // Accumulating parameter. If there is a nan, then this will be ignored and + + // Accumulating parameter. If there is a nan, then this will be ignored and // the non nan entries are added up: my_new_parameter_map[ insert_id ] = parameter_temp + my_new_parameter_map.find(insert_id)->second; } @@ -143,35 +128,35 @@ std::pair, arma::mat> BaselearnerTrack::getParameterMat { // Instantiate list to iterate: std::map my_new_parameter_map = my_parameter_map; - + unsigned int cols = 0; - + // Set all parameter to zero in new map: for (auto& it : my_new_parameter_map) { arma::mat init_parameter (it.second.n_rows, it.second.n_cols, arma::fill::zeros); my_new_parameter_map[ it.first ] = init_parameter; - + // Note that parameter are stored as col vectors but in the matrix we want // them as row vectors. Therefore we have to use rows to count the columns - // of the paraemter matrix. + // of the paraemter matrix. cols += it.second.n_rows; } // Initialize matrix: arma::mat parameters (blearner_vector.size(), cols, arma::fill::zeros); - + for (unsigned int i = 0; i < blearner_vector.size(); i++) { std::string insert_id = blearner_vector[i]->getDataIdentifier() + "_" + blearner_vector[i]->getBaselearnerType(); // Prune parameter by multiplying it with the learning rate: arma::mat parameter_temp = learning_rate * blearner_vector[i]->getParameter(); - - // Accumulating parameter. If there is a nan, then this will be ignored and + + // Accumulating parameter. 
If there is a nan, then this will be ignored and // the non nan entries are added up: my_new_parameter_map[ insert_id ] = parameter_temp + my_new_parameter_map.find(insert_id)->second; - + arma::mat param_insert; - + // Join columns to one huge column vector: for (auto& it : my_new_parameter_map) { param_insert = arma::join_cols(param_insert, it.second); @@ -180,9 +165,9 @@ std::pair, arma::mat> BaselearnerTrack::getParameterMat parameters.row(i) = param_insert.t(); } std::pair, arma::mat> out_pair; - + // If a baselearner have more than one parameter, than we rename the parameter - // with a corresponding number (Note: In my_new_parameter_map is a list + // with a corresponding number (Note: In my_new_parameter_map is a list // containing the last state of the parameter, that means a map with an // identifier string and parameter matrix): for (auto& it : my_new_parameter_map) { @@ -195,7 +180,7 @@ std::pair, arma::mat> BaselearnerTrack::getParameterMat } } out_pair.second = parameters; - + return out_pair; } @@ -204,7 +189,7 @@ void BaselearnerTrack::setToIteration (const unsigned int& k) if (k > blearner_vector.size()) { Rcpp::stop ("You can't set the actual state to a higher state then the maximal iterations."); } - + my_parameter_map = getEstimatedParameterOfIteration(k); } diff --git a/src/baselearner_track.h b/src/baselearner_track.h index b908b590..cb096bc0 100644 --- a/src/baselearner_track.h +++ b/src/baselearner_track.h @@ -13,23 +13,8 @@ // Compboost is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// MIT License for more details. You should have received a copy of -// the MIT License along with compboost. -// -// Written by: -// ----------- -// -// Daniel Schalk -// Department of Statistics -// Ludwig-Maximilians-University Munich -// Ludwigstrasse 33 -// D-80539 München -// -// https://www.compstat.statistik.uni-muenchen.de -// -// Contact -// e: contact@danielschalk.com -// w: danielschalk.com +// MIT License for more details. You should have received a copy of +// the MIT License along with compboost. // // =========================================================================== # @@ -45,43 +30,44 @@ namespace blearnertrack class BaselearnerTrack { private: - + // Vector of selected baselearner: std::vector blearner_vector; - + // Parameter map. The first element contains the baselearner type and the // second element the parameter. 
This one will be updated in every // iteration: std::map my_parameter_map; - + double learning_rate; - - public: - + std::vector step_sizes; + + public: + BaselearnerTrack (); BaselearnerTrack (double); - + // Insert a baselearner into vector and update parameter: void insertBaselearner (blearner::Baselearner*, const double& step_size); - + // Return the vector of baselearner: std::vector getBaselearnerVector () const; - + // Return so far estimated parameter map: std::map getParameterMap () const; - + // Clear the vector of baselearner: void clearBaselearnerVector (); - + // Estimate parameter for specific iteration: std::map getEstimatedParameterOfIteration (const unsigned int&) const; - + // Returns a matrix of parameters for every iteration: std::pair, arma::mat> getParameterMatrix () const; - + // Set parameter map to a given iteration: void setToIteration (const unsigned int&); - + // Destructor: ~BaselearnerTrack (); }; diff --git a/src/compboost.cpp b/src/compboost.cpp index a591c7a9..2b25666b 100644 --- a/src/compboost.cpp +++ b/src/compboost.cpp @@ -13,23 +13,8 @@ // Compboost is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// MIT License for more details. You should have received a copy of -// the MIT License along with compboost. -// -// Written by: -// ----------- -// -// Daniel Schalk -// Department of Statistics -// Ludwig-Maximilians-University Munich -// Ludwigstrasse 33 -// D-80539 München -// -// https://www.compstat.statistik.uni-muenchen.de -// -// Contact -// e: contact@danielschalk.com -// w: danielschalk.com +// MIT License for more details. You should have received a copy of +// the MIT License along with compboost. 
// // =========================================================================== # @@ -45,129 +30,114 @@ namespace cboost { Compboost::Compboost () {} -Compboost::Compboost (const arma::vec& response, const double& learning_rate, - const bool& stop_if_all_stopper_fulfilled, optimizer::Optimizer* used_optimizer, - loss::Loss* used_loss, loggerlist::LoggerList* used_logger0, - blearnerlist::BaselearnerFactoryList used_baselearner_list) - : response ( response ), +Compboost::Compboost (std::shared_ptr sh_ptr_response, const double& learning_rate, + const bool& stop_if_all_stopper_fulfilled, optimizer::Optimizer* used_optimizer, loss::Loss* used_loss, + loggerlist::LoggerList* logger_map0, blearnerlist::BaselearnerFactoryList used_baselearner_list) + : sh_ptr_response ( sh_ptr_response ), learning_rate ( learning_rate ), stop_if_all_stopper_fulfilled ( stop_if_all_stopper_fulfilled ), used_optimizer ( used_optimizer ), used_loss ( used_loss ), used_baselearner_list ( used_baselearner_list ) { + sh_ptr_response->constantInitialization(used_loss); + sh_ptr_response->initializePrediction(); blearner_track = blearnertrack::BaselearnerTrack(learning_rate); - used_logger["initial.training"] = used_logger0; + logger_map["initial.training"] = logger_map0; } // --------------------------------------------------------------------------- # // Member functions: // --------------------------------------------------------------------------- # -void Compboost::train (const unsigned int& trace, const arma::vec& prediction, loggerlist::LoggerList* logger) +void Compboost::train (const unsigned int& trace, loggerlist::LoggerList* logger_list) { if (used_baselearner_list.getMap().size() == 0) { Rcpp::stop("Could not train without any registered base-learner."); } - - arma::vec pred_temp = prediction; - arma::vec blearner_pred_temp; - + + arma::mat blearner_pred_temp; + bool stop_the_algorithm = false; unsigned int k = 1; - - // Main Algorithm. While the stop criteria isn't fulfilled, run the + + // Main Algorithm. While the stop criteria isn't fulfilled, run the // algorithm: while (! stop_the_algorithm) { actual_iteration = blearner_track.getBaselearnerVector().size() + 1; - - // Define pseudo residuals as negative gradient: - pseudo_residuals = -used_loss->definedGradient(response, pred_temp); - + + sh_ptr_response->updatePseudoResiduals(used_loss); + sh_ptr_response->setActualIteration(k); + // Cast integer k to string for baselearner identifier: std::string temp_string = std::to_string(k); - blearner::Baselearner* selected_blearner = used_optimizer->findBestBaselearner(temp_string, pseudo_residuals, used_baselearner_list.getMap()); + blearner::Baselearner* selected_blearner = used_optimizer->findBestBaselearner(temp_string, sh_ptr_response, used_baselearner_list.getMap()); // Prediction is needed more often, use a temp vector to avoid multiple computations: blearner_pred_temp = selected_blearner->predict(); - used_optimizer->calculateStepSize(used_loss, response, pred_temp, blearner_pred_temp); - - // Insert new baselearner to vector of selected baselearner. The parameter are estimated here, hence + used_optimizer->calculateStepSize(used_loss, sh_ptr_response, blearner_pred_temp); + + // Insert new base-learner to vector of selected base-learner. The parameter are estimated here, hence // the contribution to the old parameter is the estimated parameter times the learning rate times - // the step size. Therefore we have to pass the step size which changes in each iteration: + // the step size. 
Therefore we have to pass the step size which changes in each iteration: blearner_track.insertBaselearner(selected_blearner, used_optimizer->getStepSize(actual_iteration)); + sh_ptr_response->updatePrediction(learning_rate, used_optimizer->getStepSize(actual_iteration), blearner_pred_temp); - // Update model (prediction) and shrink by learning rate: - pred_temp += learning_rate * used_optimizer->getStepSize(actual_iteration) * blearner_pred_temp; - - // Log the current step: + // Log the current step: // The last term has to be the prediction or anything like that. This is - // important to track the risk (inbag or oob)!!!! - logger->logCurrent(k, response, pred_temp, selected_blearner, initialization, learning_rate, used_optimizer->getStepSize(actual_iteration)); - + // important to track the risk (inbag or oob)!!!! + logger_list->logCurrent(k, sh_ptr_response, selected_blearner, learning_rate, used_optimizer->getStepSize(actual_iteration)); + // Calculate and log risk: - risk.push_back(arma::mean(used_loss->definedLoss(response, pred_temp))); + risk.push_back(sh_ptr_response->calculateEmpiricalRisk(used_loss)); // Get status of the algorithm (is the stopping criteria reached?). The negation here // seems a bit weird, but it makes the while loop easier to read: - stop_the_algorithm = ! logger->getStopperStatus(stop_if_all_stopper_fulfilled); - + stop_the_algorithm = ! logger_list->getStopperStatus(stop_if_all_stopper_fulfilled); + if (trace > 0) { if ((k == 1) || ((k % trace) == 0)) { - logger->printLoggerStatus(risk.back()); + logger_list->printLoggerStatus(risk.back()); } - } + } k += 1; } if (trace) { Rcpp::Rcout << std::endl; - Rcpp::Rcout << std::endl; + Rcpp::Rcout << std::endl; } - - model_prediction = pred_temp; } void Compboost::trainCompboost (const unsigned int& trace) { // Make sure, that the selected baselearner and logger data is empty: blearner_track.clearBaselearnerVector(); - for (auto& it : used_logger) { + for (auto& it : logger_map) { it.second->clearLoggerData(); } - - // Initialize zero model and pseudo residuals: - initialization = used_loss->constantInitializer(response); - arma::vec pseudo_residuals_init (response.size()); - // Rcpp::Rcout << "<> Initialize zero model and pseudo residuals" << std::endl; - - // Initialize prediction and fill with zero model: - arma::vec prediction(response.size()); - prediction.fill(initialization); - // Rcpp::Rcout << "<> Initialize prediction and fill with zero model" << std::endl; - // Calculate risk for initial model: - risk.push_back(arma::mean(used_loss->definedLoss(response, prediction))); + risk.push_back(sh_ptr_response->calculateEmpiricalRisk(used_loss)); // track time: auto t1 = std::chrono::high_resolution_clock::now(); - + // Initial training: - train(trace, prediction, used_logger["initial.training"]); - + train(trace, logger_map["initial.training"]); + // track time: auto t2 = std::chrono::high_resolution_clock::now(); - + // After training call printer for a status: - Rcpp::Rcout << "Train " << std::to_string(actual_iteration) << " iterations in " - << std::chrono::duration_cast(t2 - t1).count() + Rcpp::Rcout << "Train " << std::to_string(actual_iteration) << " iterations in " + << std::chrono::duration_cast(t2 - t1).count() << " Seconds." 
<< std::endl; - Rcpp::Rcout << "Final risk based on the train set: " << std::setprecision(2) + Rcpp::Rcout << "Final risk based on the train set: " << std::setprecision(2) << risk.back() << std::endl << std::endl; - + // Set flag if model is trained: model_is_trained = true; } @@ -179,22 +149,22 @@ void Compboost::continueTraining (loggerlist::LoggerList* logger, const unsigned } // Set state to maximal possible iteration to cleanly continue training: if (actual_iteration != blearner_track.getBaselearnerVector().size()) { - + unsigned int max_iteration = blearner_track.getBaselearnerVector().size(); // Rcpp::Rcout << "Set iteration to maximal possible value: " << std::to_string(max_iteration) << std::endl; - + setToIteration(max_iteration); - + } - + // Continue training: - train(trace, model_prediction, logger); - + train(trace, logger); + // Register logger in hash map to store logging data: - std::string logger_id = "retraining" + std::to_string(used_logger.size()); - used_logger[logger_id] = logger; - + std::string logger_id = "retraining" + std::to_string(logger_map.size()); + logger_map[logger_id] = logger; + // Update actual state: actual_iteration = blearner_track.getBaselearnerVector().size(); } @@ -203,11 +173,10 @@ arma::vec Compboost::getPrediction (const bool& as_response) const { arma::vec pred; if (as_response) { - pred = used_loss->responseTransformation(model_prediction); + return sh_ptr_response->getPredictionTransform(); } else { - pred = model_prediction; + return sh_ptr_response->getPredictionScores(); } - return pred; } std::map Compboost::getParameter () const @@ -218,7 +187,7 @@ std::map Compboost::getParameter () const std::vector Compboost::getSelectedBaselearner () const { std::vector selected_blearner; - + for (unsigned int i = 0; i < actual_iteration; i++) { selected_blearner.push_back(blearner_track.getBaselearnerVector()[i]->getDataIdentifier() + "_" + blearner_track.getBaselearnerVector()[i]->getBaselearnerType()); } @@ -227,12 +196,12 @@ std::vector Compboost::getSelectedBaselearner () const std::map Compboost::getLoggerList () const { - return used_logger; + return logger_map; } -std::map Compboost::getParameterOfIteration (const unsigned int& k) const +std::map Compboost::getParameterOfIteration (const unsigned int& k) const { - // Check is done in function GetEstimatedParameterOfIteration in baselearner_track.cpp + // Check is done in function GetEstimatedParameterOfIteration in baselearner_track.cpp return blearner_track.getEstimatedParameterOfIteration(k); } @@ -244,16 +213,13 @@ std::pair, arma::mat> Compboost::getParameterMatrix () arma::vec Compboost::predict () const { std::map parameter_map = blearner_track.getParameterMap(); - // std::map train_data_map = used_baselearner_list.getDataMap(); - - arma::vec pred(model_prediction.n_elem); - pred.fill(initialization); - + arma::mat pred = sh_ptr_response->calculateInitialPrediction(sh_ptr_response->getResponse()); + // Calculate vector - matrix product for each selected base-learner: - for (auto& it : parameter_map) { + for (auto& it : parameter_map) { std::string sel_factory = it.first; pred += used_baselearner_list.getMap().find(sel_factory)->second->getData() * it.second; - // pred += train_data_map.find(sel_factory)->second * it.second; + // pred += train_data_map.find(sel_factory)->second * it.second; } return pred; } @@ -264,27 +230,25 @@ arma::vec Compboost::predict () const // corresponding parameter. 
arma::vec Compboost::predict (std::map data_map, const bool& as_response) const { - // IMPROVE THIS FUNCTION!!! See: + // IMPROVE THIS FUNCTION!!! See: // https://github.com/schalkdaniel/compboost/issues/206 - + std::map parameter_map = blearner_track.getParameterMap(); - arma::vec pred(data_map.begin()->second->getData().n_rows); - pred.fill(initialization); - - // Rcpp::Rcout << "initialize pred vec" << std::endl; - - // Idea is simply to calculate the vector matrix product of parameter and + arma::mat pred(data_map.begin()->second->getData().n_rows, sh_ptr_response->getResponse().n_cols, arma::fill::zeros); + pred = sh_ptr_response->calculateInitialPrediction(pred); + + // Idea is simply to calculate the vector matrix product of parameter and // newdata. The problem here is that the newdata comes as raw data and has // to be transformed first: for (auto& it : parameter_map) { - + // Name of current feature: std::string sel_factory = it.first; - + // Find the element with key 'hat' blearnerfactory::BaselearnerFactory* sel_factory_obj = used_baselearner_list.getMap().find(sel_factory)->second; - + // Select newdata corresponding to selected facotry object: std::map::iterator it_newdata; it_newdata = data_map.find(sel_factory_obj->getDataIdentifier()); @@ -296,78 +260,41 @@ arma::vec Compboost::predict (std::map data_map, const } } if (as_response) { - pred = used_loss->responseTransformation(pred); - } - return pred; -} - -arma::vec Compboost::predictionOfIteration (std::map data_map, const unsigned int& k, const bool& as_response) const -{ - // Rcpp::Rcout << "Get into Compboost::predict" << std::endl; - - // Check is done in function GetEstimatedParameterOfIteration in baselearner_track.cpp - std::map parameter_map = blearner_track.getEstimatedParameterOfIteration(k); - - arma::vec pred(data_map.begin()->second->getData().n_rows); - pred.fill(initialization); - - // Rcpp::Rcout << "initialize pred vec" << std::endl; - - for (auto& it : parameter_map) { - - std::string sel_factory = it.first; - - // Rcpp::Rcout << "Fatory id of parameter map: " << sel_factory << std::endl; - - blearnerfactory::BaselearnerFactory* sel_factory_obj = used_baselearner_list.getMap().find(sel_factory)->second; - - // Rcpp::Rcout << "Data of selected factory: " << sel_factory_obj->GetDataIdentifier() << std::endl; - - arma::mat data_trafo = sel_factory_obj->instantiateData((data_map.find(sel_factory_obj->getDataIdentifier())->second->getData())); - pred += data_trafo * it.second; - - } - if (as_response) { - pred = used_loss->responseTransformation(pred); + pred = sh_ptr_response->getPredictionTransform(pred); } return pred; } // Set model to an given iteration. 
The predictions and everything is then done at this iteration: -void Compboost::setToIteration (const unsigned int& k) +void Compboost::setToIteration (const unsigned int& k) { unsigned int max_iteration = blearner_track.getBaselearnerVector().size(); - + // Set parameter: if (k > max_iteration) { // Define new iteration logger for missing iterations: - unsigned int iteration_diff = k - max_iteration; + unsigned int iteration_diff = k - max_iteration; logger::Logger* temp_logger = new logger::LoggerIteration("_iteration", true, iteration_diff); loggerlist::LoggerList* temp_loggerlist = new loggerlist::LoggerList(); - - // Register that logger: - std::string logger_id = "setToIteration.retraining" + std::to_string(used_logger.size()); + + std::string logger_id = "setToIteration.retraining" + std::to_string(logger_map.size()); temp_loggerlist->registerLogger(temp_logger); - - Rcpp::Rcout << "\nYou have already trained " << std::to_string(max_iteration) << " iterations.\n" + + Rcpp::Rcout << "\nYou have already trained " << std::to_string(max_iteration) << " iterations.\n" <<"Train " << std::to_string(iteration_diff) << " additional iterations." << std::endl << std::endl; - + continueTraining(temp_loggerlist, false); - } - + } + blearner_track.setToIteration(k); - - // Set prediction: - model_prediction = predict(); - - // Set actual state: + sh_ptr_response->setActualPredictionScores(predict(), k); actual_iteration = k; } -double Compboost::getOffset() const +arma::mat Compboost::getOffset() const { - return initialization; + return sh_ptr_response->getInitialization(); } std::vector Compboost::getRiskVector () const @@ -380,24 +307,23 @@ void Compboost::summarizeCompboost () const Rcpp::Rcout << "Compboost object with:" << std::endl; Rcpp::Rcout << "\t- Learning Rate: " << learning_rate << std::endl; Rcpp::Rcout << "\t- Are all logger used as stopper: " << stop_if_all_stopper_fulfilled << std::endl; - + if (model_is_trained) { Rcpp::Rcout << "\t- Model is already trained with " << blearner_track.getBaselearnerVector().size() << " iterations/fitted baselearner" << std::endl; Rcpp::Rcout << "\t- Actual state is at iteration " << actual_iteration << std::endl; - Rcpp::Rcout << "\t- Loss optimal initialization: " << std::fixed << std::setprecision(2) << initialization << std::endl; + // Rcpp::Rcout << "\t- Loss optimal initialization: " << std::fixed << std::setprecision(2) << initialization << std::endl; } Rcpp::Rcout << std::endl; - Rcpp::Rcout << "To get more information check the other objects!" << std::endl; } // Destructor: Compboost::~Compboost () { // blearner_track will be deleted automatically (allocated on the stack) - - // used_logger will be deleted automatically (allocated on the stack). BUT we + + // logger_map will be deleted automatically (allocated on the stack). BUT we // have to care about self registered logger by setToIteration: - for (auto& it : used_logger) { + for (auto& it : logger_map) { if (it.first.find("setToIteration") != std::string::npos) { // Delets the loggerlist: delete it.second; diff --git a/src/compboost.h b/src/compboost.h index 7fdcd96e..e058ce19 100644 --- a/src/compboost.h +++ b/src/compboost.h @@ -13,23 +13,8 @@ // Compboost is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// MIT License for more details. You should have received a copy of -// the MIT License along with compboost. 
-// -// Written by: -// ----------- -// -// Daniel Schalk -// Department of Statistics -// Ludwig-Maximilians-University Munich -// Ludwigstrasse 33 -// D-80539 München -// -// https://www.compstat.statistik.uni-muenchen.de -// -// Contact -// e: contact@danielschalk.com -// w: danielschalk.com +// MIT License for more details. You should have received a copy of +// the MIT License along with compboost. // // =========================================================================== # @@ -42,42 +27,43 @@ * This manual should give an overview about the structure and functionality * of the `compboost` `C++` classes. To get an insight into the underlying * theory check out the `compboost` vignettes: - * + * * * \section install_sec Installation * * Basically, the `C++` code can be exported and be used within any language. - * The only restriction is to exclude the - * `Rcpp` - * specific parts which includes some `Rcpp::Rcout` printer and the custom - * classes which requires `Rcpp::Function` or external pointer of `R` as well - * as the `RcppArmadillo` + * The only restriction is to exclude the + * `Rcpp` + * specific parts which includes some `Rcpp::Rcout` printer and the custom + * classes which requires `Rcpp::Function` or external pointer of `R` as well + * as the `RcppArmadillo` * package. To get `Armadillo` run * independent of `Rcpp` one has to link the library manually. - * + * * As it can already be suspected, the main intend is to use this package * within `R`. This is achived by wrapping the pure `C++` classes by another - * `C++` wrapper which are then exported as `S4` class using the - * Rcpp + * `C++` wrapper which are then exported as `S4` class using the + * Rcpp * modules. So the easiest way of using `compboost` is to install the * `R` package: - * + * * ``` * devtools::install_github("schalkdaniel/compboost") * ``` - * - * + * + * */ - - #ifndef COMPBOOST_H_ #define COMPBOOST_H_ +#include + #include "baselearner_track.h" #include "optimizer.h" #include "loss.h" #include "loggerlist.h" +#include "response.h" namespace cboost { @@ -85,73 +71,66 @@ namespace cboost { class Compboost { - + private: - - arma::vec response; - arma::vec pseudo_residuals; - arma::vec model_prediction; std::vector risk; - + // Expand learning_rate to vector: double learning_rate; - double initialization; - + bool stop_if_all_stopper_fulfilled; bool model_is_trained = false; - + unsigned int actual_iteration; - - // Pieces to run the algorithm: + + std::shared_ptr sh_ptr_response; blearnertrack::BaselearnerTrack blearner_track; optimizer::Optimizer* used_optimizer; loss::Loss* used_loss; blearnerlist::BaselearnerFactoryList used_baselearner_list; - + // Vector of loggerlists, needed if one want to continue training: - std::map used_logger; - + std::map logger_map; + public: - + Compboost (); - - Compboost (const arma::vec&, const double&, const bool&, optimizer::Optimizer*, loss::Loss*, + + Compboost (std::shared_ptr, const double&, const bool&, optimizer::Optimizer*, loss::Loss*, loggerlist::LoggerList*, blearnerlist::BaselearnerFactoryList); - + // Basic train function used by trainCompbost and continueTraining: - void train (const unsigned int&, const arma::vec&, loggerlist::LoggerList*); - + void train (const unsigned int&, loggerlist::LoggerList*); + // Initial training: void trainCompboost (const unsigned int&); - + // Retraining after initial training: void continueTraining (loggerlist::LoggerList*, const unsigned int&); - + arma::vec getPrediction (const bool&) const; - + std::map 
getParameter () const; std::vector getSelectedBaselearner () const; - + std::map getLoggerList () const; std::map getParameterOfIteration (const unsigned int&) const; - + std::pair, arma::mat> getParameterMatrix () const; - + arma::vec predict () const; arma::vec predict (std::map, const bool&) const; - arma::vec predictionOfIteration (std::map, const unsigned int&, const bool&) const; - + void setToIteration (const unsigned int&); - double getOffset () const; + arma::mat getOffset () const; std::vector getRiskVector () const; - + void summarizeCompboost () const; - + // Destructor: ~Compboost (); - }; } // namespace cboost diff --git a/src/compboost_modules.cpp b/src/compboost_modules.cpp index ca27f74c..1bf6cf9f 100644 --- a/src/compboost_modules.cpp +++ b/src/compboost_modules.cpp @@ -13,25 +13,10 @@ // Compboost is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// MIT License for more details. You should have received a copy of -// the MIT License along with compboost. +// MIT License for more details. You should have received a copy of +// the MIT License along with compboost. // -// Written by: -// ----------- -// -// Daniel Schalk -// Department of Statistics -// Ludwig-Maximilians-University Munich -// Ludwigstrasse 33 -// D-80539 München -// -// https://www.compstat.statistik.uni-muenchen.de -// -// Contact -// e: contact@danielschalk.com -// w: danielschalk.com -// -// =========================================================================== # +// ========================================================================== // #ifndef COMPBOOST_MODULES_CPP_ #define COMPBOOST_MODULES_CPP_ @@ -43,6 +28,7 @@ #include "data.h" #include "helper.h" #include "optimizer.h" +#include "response.h" // -------------------------------------------------------------------------- // // DATA // @@ -136,7 +122,7 @@ class InMemoryDataWrapper : public DataWrapper // Solve this copying issue: // https://github.com/schalkdaniel/compboost/issues/123 private: - arma::vec data_vec = arma::vec (1, arma::fill::zeros); + // arma::vec data_vec = arma::vec (1, arma::fill::zeros); arma::mat data_mat = arma::mat (1, 1, arma::fill::zeros); public: @@ -201,7 +187,7 @@ RCPP_MODULE (data_module) // -------------------------------------------------------------------------- // -// BASELEARNER FACTORIES // +// BASELEARNER FACTORIES // // -------------------------------------------------------------------------- // // Abstract class. This one is given to the factory list. 
The factory list then @@ -285,9 +271,9 @@ class BaselearnerFactoryWrapper //' data.target2 = InMemoryData$new() //' //' # Create new linear base-learner factory: -//' lin.factory = BaselearnerPolynomial$new(data.source, data.target1, +//' lin.factory = BaselearnerPolynomial$new(data.source, data.target1, //' list(degree = 2, intercept = FALSE)) -//' lin.factory.int = BaselearnerPolynomial$new(data.source, data.target2, +//' lin.factory.int = BaselearnerPolynomial$new(data.source, data.target2, //' list(degree = 2, intercept = TRUE)) //' //' # Get the transformed data: @@ -306,7 +292,7 @@ class BaselearnerPolynomialFactoryWrapper : public BaselearnerFactoryWrapper { private: Rcpp::List internal_arg_list = Rcpp::List::create( - Rcpp::Named("degree") = 1, + Rcpp::Named("degree") = 1, Rcpp::Named("intercept") = true ); @@ -316,10 +302,10 @@ class BaselearnerPolynomialFactoryWrapper : public BaselearnerFactoryWrapper Rcpp::List arg_list) { // Match defaults with custom arguments: - internal_arg_list = argHandler(internal_arg_list, arg_list, TRUE); + internal_arg_list = helper::argHandler(internal_arg_list, arg_list, TRUE); // We need to converse the SEXP from the element to an integer: - int degree = internal_arg_list["degree"]; + int degree = internal_arg_list["degree"]; std::string blearner_type_temp = "polynomial_degree_" + std::to_string(degree); @@ -330,7 +316,7 @@ class BaselearnerPolynomialFactoryWrapper : public BaselearnerFactoryWrapper BaselearnerPolynomialFactoryWrapper (DataWrapper& data_source, DataWrapper& data_target, const std::string& blearner_type, Rcpp::List arg_list) { - internal_arg_list = argHandler(internal_arg_list, arg_list, TRUE); + internal_arg_list = helper::argHandler(internal_arg_list, arg_list, TRUE); obj = new blearnerfactory::BaselearnerPolynomialFactory(blearner_type, data_source.getDataObj(), data_target.getDataObj(), internal_arg_list["degree"], internal_arg_list["intercept"]); @@ -348,8 +334,8 @@ class BaselearnerPolynomialFactoryWrapper : public BaselearnerFactoryWrapper void summarizeFactory () { // We need to converse the SEXP from the element to an integer: - int degree = internal_arg_list["degree"]; - + int degree = internal_arg_list["degree"]; + if (degree == 1) { Rcpp::Rcout << "Linear base-learner factory:" << std::endl; } @@ -454,7 +440,7 @@ class BaselearnerPSplineFactoryWrapper : public BaselearnerFactoryWrapper { private: Rcpp::List internal_arg_list = Rcpp::List::create( - Rcpp::Named("degree") = 3, + Rcpp::Named("degree") = 3, Rcpp::Named("n.knots") = 20, Rcpp::Named("penalty") = 2, Rcpp::Named("differences") = 2 @@ -465,26 +451,26 @@ class BaselearnerPSplineFactoryWrapper : public BaselearnerFactoryWrapper BaselearnerPSplineFactoryWrapper (DataWrapper& data_source, DataWrapper& data_target, Rcpp::List arg_list) { - internal_arg_list = argHandler(internal_arg_list, arg_list, TRUE); + internal_arg_list = helper::argHandler(internal_arg_list, arg_list, TRUE); // We need to converse the SEXP from the element to an integer: - int degree = internal_arg_list["degree"]; + int degree = internal_arg_list["degree"]; std::string blearner_type_temp = "spline_degree_" + std::to_string(degree); - + obj = new blearnerfactory::BaselearnerPSplineFactory(blearner_type_temp, data_source.getDataObj(), - data_target.getDataObj(), internal_arg_list["degree"], internal_arg_list["n.knots"], - internal_arg_list["penalty"], internal_arg_list["differences"], TRUE); + data_target.getDataObj(), internal_arg_list["degree"], internal_arg_list["n.knots"], + 
internal_arg_list["penalty"], internal_arg_list["differences"], TRUE); } BaselearnerPSplineFactoryWrapper (DataWrapper& data_source, DataWrapper& data_target, const std::string& blearner_type, Rcpp::List arg_list) { - internal_arg_list = argHandler(internal_arg_list, arg_list, TRUE); + internal_arg_list = helper::argHandler(internal_arg_list, arg_list, TRUE); obj = new blearnerfactory::BaselearnerPSplineFactory(blearner_type, data_source.getDataObj(), - data_target.getDataObj(), internal_arg_list["degree"], internal_arg_list["n.knots"], + data_target.getDataObj(), internal_arg_list["degree"], internal_arg_list["n.knots"], internal_arg_list["penalty"], internal_arg_list["differences"], TRUE); } @@ -500,7 +486,7 @@ class BaselearnerPSplineFactoryWrapper : public BaselearnerFactoryWrapper void summarizeFactory () { // We need to converse the SEXP from the element to an integer: - int degree = internal_arg_list["degree"]; + int degree = internal_arg_list["degree"]; Rcpp::Rcout << "Spline factory of degree" << " " << std::to_string(degree) << std::endl; Rcpp::Rcout << "\t- Name of the used data: " << obj->getDataIdentifier() << std::endl; @@ -519,7 +505,7 @@ class BaselearnerPSplineFactoryWrapper : public BaselearnerFactoryWrapper //' //' @section Usage: //' \preformatted{ -//' BaselearnerCustom$new(data_source, data_target, list(instantiate.fun, +//' BaselearnerCustom$new(data_source, data_target, list(instantiate.fun, //' train.fun, predict.fun, param.fun)) //' } //' @@ -611,7 +597,7 @@ class BaselearnerPSplineFactoryWrapper : public BaselearnerFactoryWrapper //' //' # Create new custom linear base-learner factory: //' custom.lin.factory = BaselearnerCustom$new(data.source, data.target, -//' list(instantiate.fun = instantiateDataFun, train.fun = trainFun, +//' list(instantiate.fun = instantiateDataFun, train.fun = trainFun, //' predict.fun = predictFun, param.fun = extractParameter)) //' //' # Get the transformed data: @@ -628,7 +614,7 @@ class BaselearnerCustomFactoryWrapper : public BaselearnerFactoryWrapper { private: Rcpp::List internal_arg_list = Rcpp::List::create( - Rcpp::Named("instantiate.fun") = 0, + Rcpp::Named("instantiate.fun") = 0, Rcpp::Named("train.fun") = 0, Rcpp::Named("predict.fun") = 0, Rcpp::Named("param.fun") = 0 @@ -640,10 +626,10 @@ class BaselearnerCustomFactoryWrapper : public BaselearnerFactoryWrapper Rcpp::List arg_list) { // Don't check argument types since we don't have a Function placeholder for the default list: - internal_arg_list = argHandler(internal_arg_list, arg_list, FALSE); + internal_arg_list = helper::argHandler(internal_arg_list, arg_list, FALSE); obj = new blearnerfactory::BaselearnerCustomFactory("custom", data_source.getDataObj(), - data_target.getDataObj(), internal_arg_list["instantiate.fun"], internal_arg_list["train.fun"], + data_target.getDataObj(), internal_arg_list["instantiate.fun"], internal_arg_list["train.fun"], internal_arg_list["predict.fun"], internal_arg_list["param.fun"]); } @@ -651,10 +637,10 @@ class BaselearnerCustomFactoryWrapper : public BaselearnerFactoryWrapper const std::string& blearner_type, Rcpp::List arg_list) { // Don't check argument types since we don't have a Function placeholder for the default list: - internal_arg_list = argHandler(internal_arg_list, arg_list, FALSE); + internal_arg_list = helper::argHandler(internal_arg_list, arg_list, FALSE); obj = new blearnerfactory::BaselearnerCustomFactory(blearner_type, data_source.getDataObj(), - data_target.getDataObj(), internal_arg_list["instantiate.fun"], 
internal_arg_list["train.fun"], + data_target.getDataObj(), internal_arg_list["instantiate.fun"], internal_arg_list["train.fun"], internal_arg_list["predict.fun"], internal_arg_list["param.fun"]); } @@ -762,7 +748,7 @@ class BaselearnerCustomCppFactoryWrapper : public BaselearnerFactoryWrapper { private: Rcpp::List internal_arg_list = Rcpp::List::create( - Rcpp::Named("instantiate.ptr") = 0, + Rcpp::Named("instantiate.ptr") = 0, Rcpp::Named("train.ptr") = 0, Rcpp::Named("predict.ptr") = 0 ); @@ -773,10 +759,10 @@ class BaselearnerCustomCppFactoryWrapper : public BaselearnerFactoryWrapper Rcpp::List arg_list) { // Don't check argument types since we don't have a Function placeholder for the default list: - internal_arg_list = argHandler(internal_arg_list, arg_list, FALSE); + internal_arg_list = helper::argHandler(internal_arg_list, arg_list, FALSE); obj = new blearnerfactory::BaselearnerCustomCppFactory("custom_cpp", data_source.getDataObj(), - data_target.getDataObj(), internal_arg_list["instantiate.ptr"], internal_arg_list["train.ptr"], + data_target.getDataObj(), internal_arg_list["instantiate.ptr"], internal_arg_list["train.ptr"], internal_arg_list["predict.ptr"]); } @@ -784,10 +770,10 @@ class BaselearnerCustomCppFactoryWrapper : public BaselearnerFactoryWrapper const std::string& blearner_type, Rcpp::List arg_list) { // Don't check argument types since we don't have a Function placeholder for the default list: - internal_arg_list = argHandler(internal_arg_list, arg_list, FALSE); + internal_arg_list = helper::argHandler(internal_arg_list, arg_list, FALSE); obj = new blearnerfactory::BaselearnerCustomCppFactory(blearner_type, data_source.getDataObj(), - data_target.getDataObj(), internal_arg_list["instantiate.ptr"], internal_arg_list["train.ptr"], + data_target.getDataObj(), internal_arg_list["instantiate.ptr"], internal_arg_list["train.ptr"], internal_arg_list["predict.ptr"]); } @@ -908,9 +894,9 @@ RCPP_MODULE (baselearner_factory_module) //' data.target1 = InMemoryData$new() //' data.target2 = InMemoryData$new() //' -//' lin.factory = BaselearnerPolynomial$new(data.source, data.target1, +//' lin.factory = BaselearnerPolynomial$new(data.source, data.target1, //' list(degree = 1, intercept = TRUE)) -//' poly.factory = BaselearnerPolynomial$new(data.source, data.target2, +//' poly.factory = BaselearnerPolynomial$new(data.source, data.target2, //' list(degree = 2, intercept = TRUE)) //' //' # Create new base-learner list: @@ -1176,7 +1162,7 @@ class LossAbsoluteWrapper : public LossWrapper //' \url{https://schalkdaniel.github.io/compboost/cpp_man/html/classloss_1_1_binomial_loss.html}. //' //' @examples -//' +//' //' # Create new loss object: //' bin.loss = LossBinomial$new() //' bin.loss @@ -1362,6 +1348,219 @@ RCPP_MODULE (loss_module) ; } + +// -------------------------------------------------------------------------- // +// RESPONSE CLASSES // +// -------------------------------------------------------------------------- // + + +class ResponseWrapper +{ +public: + ResponseWrapper () {} + + std::shared_ptr getResponseObj () { return sh_ptr_response; } + +protected: + std::shared_ptr sh_ptr_response; +}; + +//' Create response object for regression. +//' +//' \code{ResponseRegr} creates a response object that are used as target during the +//' fitting process. +//' +//' @format \code{\link{S4}} object. 
+//' @name ResponseRegr
+//'
+//' @section Usage:
+//' \preformatted{
+//' ResponseRegr$new(target_name, response)
+//' ResponseRegr$new(target_name, response, weights)
+//' }
+//'
+//' @export ResponseRegr
+class ResponseRegrWrapper : public ResponseWrapper
+{
+public:
+  ResponseRegrWrapper (std::string target_name, arma::mat response)
+  {
+    sh_ptr_response = std::make_shared(target_name, response);
+  }
+  ResponseRegrWrapper (std::string target_name, arma::mat response, arma::mat weights)
+  {
+    sh_ptr_response = std::make_shared(target_name, response, weights);
+  }
+
+  std::string getTargetName () const
+  {
+    return sh_ptr_response->getTargetName();
+  }
+
+  arma::mat getResponse () const
+  {
+    return sh_ptr_response->getResponse();
+  }
+
+  arma::mat getWeights () const
+  {
+    return sh_ptr_response->getWeights();
+  }
+
+  arma::mat getPrediction () const
+  {
+    return sh_ptr_response->getPredictionScores();
+  }
+
+  arma::mat getPredictionTransform () const
+  {
+    return sh_ptr_response->getPredictionTransform();
+  }
+
+  arma::mat getPredictionResponse () const
+  {
+    return sh_ptr_response->getPredictionResponse();
+  }
+
+  void filter (const arma::uvec& idx) const
+  {
+    // Shift by 1 to transform R index to C++ index:
+    sh_ptr_response->filter(idx - 1);
+  }
+
+  double calculateEmpiricalRisk (LossWrapper& loss) const
+  {
+    return sh_ptr_response->calculateEmpiricalRisk(loss.getLoss());
+  }
+};
+
+//' Create response object for binary classification.
+//'
+//' \code{ResponseBinaryClassif} creates a response object that is used as the target during the
+//' fitting process.
+//'
+//' @format \code{\link{S4}} object.
+//' @name ResponseBinaryClassif
+//'
+//' @section Usage:
+//' \preformatted{
+//' ResponseBinaryClassif$new(target_name, response)
+//' ResponseBinaryClassif$new(target_name, response, weights)
+//' }
+//'
+//' @export ResponseBinaryClassif
+class ResponseBinaryClassifWrapper : public ResponseWrapper
+{
+public:
+
+  ResponseBinaryClassifWrapper (std::string target_name, arma::mat response)
+  {
+    sh_ptr_response = std::make_shared(target_name, response);
+  }
+  ResponseBinaryClassifWrapper (std::string target_name, arma::mat response, arma::mat weights)
+  {
+    sh_ptr_response = std::make_shared(target_name, response, weights);
+  }
+
+  std::string getTargetName () const
+  {
+    return sh_ptr_response->getTargetName();
+  }
+
+  arma::mat getResponse () const
+  {
+    return sh_ptr_response->getResponse();
+  }
+
+  arma::mat getWeights () const
+  {
+    return sh_ptr_response->getWeights();
+  }
+
+  arma::mat getPrediction () const
+  {
+    return sh_ptr_response->getPredictionScores();
+  }
+
+  arma::mat getPredictionTransform () const
+  {
+    return sh_ptr_response->getPredictionTransform();
+  }
+
+  arma::mat getPredictionResponse () const
+  {
+    return sh_ptr_response->getPredictionResponse();
+  }
+
+  void filter (const arma::uvec& idx) const
+  {
+    // Shift by 1 to transform R index to C++ index:
+    sh_ptr_response->filter(idx - 1);
+  }
+
+  double calculateEmpiricalRisk (LossWrapper& loss) const
+  {
+    return sh_ptr_response->calculateEmpiricalRisk(loss.getLoss());
+  }
+
+  double getThreshold () const
+  {
+    return std::static_pointer_cast(sh_ptr_response)->threshold;
+  }
+  void setThreshold (double thresh)
+  {
+    std::static_pointer_cast(sh_ptr_response)->setThreshold(thresh);
+  }
+};
+
+RCPP_EXPOSED_CLASS(ResponseWrapper)
+RCPP_MODULE (response_module)
+{
+  using namespace Rcpp;
+
+  class_ ("Response")
+    .constructor ("Create Response class")
+  ;
+
+  class_ ("ResponseRegr")
("ResponseRegr") + .derives ("Response") + + .constructor () + .constructor () + + .method("getTargetName", &ResponseRegrWrapper::getTargetName, "Get the name of the target variable") + .method("getResponse", &ResponseRegrWrapper::getResponse, "Get the original response") + .method("getWeights", &ResponseRegrWrapper::getWeights, "Get the weights") + .method("getPrediction", &ResponseRegrWrapper::getPrediction, "Get prediction scores") + .method("getPredictionTransform", &ResponseRegrWrapper::getPredictionTransform, "Get transformed prediction scores") + .method("getPredictionResponse", &ResponseRegrWrapper::getPredictionResponse, "Get transformed prediction as response") + .method("filter", &ResponseRegrWrapper::filter, "Filter response elements") + .method("calculateEmpiricalRisk", &ResponseRegrWrapper::calculateEmpiricalRisk, "Calculates the empirical list given a specific loss") + ; + + class_ ("ResponseBinaryClassif") + .derives ("Response") + + .constructor () + .constructor () + + .method("getTargetName", &ResponseBinaryClassifWrapper::getTargetName, "Get the name of the target variable") + .method("getResponse", &ResponseBinaryClassifWrapper::getResponse, "Get the original response") + .method("getWeights", &ResponseBinaryClassifWrapper::getWeights, "Get the weights") + .method("getPrediction", &ResponseBinaryClassifWrapper::getPrediction, "Get prediction scores") + .method("getPredictionTransform", &ResponseBinaryClassifWrapper::getPredictionTransform, "Get transformed prediction scores") + .method("getPredictionResponse", &ResponseBinaryClassifWrapper::getPredictionResponse, "Get transformed prediction as response") + .method("filter", &ResponseBinaryClassifWrapper::filter, "Filter response elements") + .method("calculateEmpiricalRisk", &ResponseBinaryClassifWrapper::calculateEmpiricalRisk, "Calculates the empirical list given a specific loss") + .method("getThreshold", &ResponseBinaryClassifWrapper::getThreshold, "Get threshold used to transform scores to labels") + .method("setThreshold", &ResponseBinaryClassifWrapper::setThreshold, "Set threshold used to transform scores to labels") + ; +} + + + // -------------------------------------------------------------------------- // // LOGGER // // -------------------------------------------------------------------------- // @@ -1587,7 +1786,7 @@ class LoggerInbagRiskWrapper : public LoggerWrapper //' //' @section Usage: //' \preformatted{ -//' LoggerOobRisk$new(logger_id, use_as_stopper, used_loss, eps_for_break, +//' LoggerOobRisk$new(logger_id, use_as_stopper, used_loss, eps_for_break, //' oob_data, oob_response) //' } //' @@ -1683,8 +1882,11 @@ class LoggerInbagRiskWrapper : public LoggerWrapper //' # Used loss: //' log.bin = LossBinomial$new() //' +//' # Define response object of oob data: +//' oob.response = ResponseRegr$new("oob_response", as.matrix(y.oob)) +//' //' # Define logger: -//' log.oob.risk = LoggerOobRisk$new("oob", FALSE, log.bin, 0.05, oob.list, y.oob) +//' log.oob.risk = LoggerOobRisk$new("oob", FALSE, log.bin, 0.05, oob.list, oob.response) //' //' # Summarize logger: //' log.oob.risk$summarizeLogger() @@ -1699,7 +1901,7 @@ class LoggerOobRiskWrapper : public LoggerWrapper public: LoggerOobRiskWrapper (std::string logger_id0, bool use_as_stopper, LossWrapper& used_loss, double eps_for_break, - Rcpp::List oob_data, arma::vec oob_response) + Rcpp::List oob_data, ResponseWrapper& oob_response) { std::map oob_data_map; @@ -1724,7 +1926,7 @@ class LoggerOobRiskWrapper : public LoggerWrapper logger_id = logger_id0; obj = 
new logger::LoggerOobRisk (logger_id, use_as_stopper, used_loss.getLoss(), eps_for_break, - oob_data_map, oob_response); + oob_data_map, oob_response.getResponseObj()); } void summarizeLogger () @@ -1960,7 +2162,7 @@ RCPP_MODULE(logger_module) class_ ("LoggerOobRisk") .derives ("Logger") - .constructor () + .constructor () .method("summarizeLogger", &LoggerOobRiskWrapper::summarizeLogger, "Summarize logger") ; @@ -2031,7 +2233,7 @@ class OptimizerCoordinateDescent : public OptimizerWrapper //' Coordinate Descent with line search //' -//' This class defines a new object which is used to conduct Coordinate Descent with line search. +//' This class defines a new object which is used to conduct Coordinate Descent with line search. //' The optimizer just calculates for each base-learner the sum of squared error and returns //' the base-learner with the smallest SSE. In addition, this optimizer computes //' a line search to find the optimal step size in each iteration. @@ -2076,7 +2278,7 @@ RCPP_MODULE(optimizer_module) class_ ("OptimizerCoordinateDescent") .derives ("Optimizer") .constructor () - ; + ; class_ ("OptimizerCoordinateDescentLineSearch") .derives ("Optimizer") @@ -2192,6 +2394,7 @@ RCPP_MODULE(optimizer_module) //' //' # Target variable: //' y = df[["mpg.cat"]] +//' response = ResponseBinaryClassif$new("mpg.cat", as.matrix(y)) //' //' data.source.hp = InMemoryData$new(X.hp, "hp") //' data.source.wt = InMemoryData$new(X.wt, "wt") @@ -2208,13 +2411,13 @@ RCPP_MODULE(optimizer_module) //' test.data = oob.data //' //' # Factories: -//' linear.factory.hp = BaselearnerPolynomial$new(data.source.hp, data.target.hp1, +//' linear.factory.hp = BaselearnerPolynomial$new(data.source.hp, data.target.hp1, //' list(degree = 1, intercept = TRUE)) -//' linear.factory.wt = BaselearnerPolynomial$new(data.source.wt, data.target.wt1, +//' linear.factory.wt = BaselearnerPolynomial$new(data.source.wt, data.target.wt1, //' list(degree = 1, intercept = TRUE)) -//' quadratic.factory.hp = BaselearnerPolynomial$new(data.source.hp, data.target.hp2, +//' quadratic.factory.hp = BaselearnerPolynomial$new(data.source.hp, data.target.hp2, //' list(degree = 2, intercept = TRUE)) -//' spline.factory.wt = BaselearnerPSpline$new(data.source.wt, data.target.wt2, +//' spline.factory.wt = BaselearnerPSpline$new(data.source.wt, data.target.wt2, //' list(degree = 3, n.knots = 10, penalty = 2, differences = 2)) //' //' # Create new factory list: @@ -2238,8 +2441,6 @@ RCPP_MODULE(optimizer_module) //' # time, inbag risk and oob risk: //' log.iterations = LoggerIteration$new(" iteration.logger", TRUE, 500) //' log.time = LoggerTime$new("time.logger", FALSE, 500, "microseconds") -//' log.inbag = LoggerInbagRisk$new("inbag.binomial", FALSE, loss.bin, 0.05) -//' log.oob = LoggerOobRisk$new("oob.binomial", FALSE, loss.bin, 0.05, oob.data, y) //' //' # Define new logger list: //' logger.list = LoggerList$new() @@ -2247,15 +2448,13 @@ RCPP_MODULE(optimizer_module) //' # Register the logger: //' logger.list$registerLogger(log.iterations) //' logger.list$registerLogger(log.time) -//' logger.list$registerLogger(log.inbag) -//' logger.list$registerLogger(log.oob) //' //' # Run compboost: //' # -------------- //' //' # Initialize object: //' cboost = Compboost_internal$new( -//' response = y, +//' response = response, //' learning_rate = 0.05, //' stop_if_all_stopper_fulfilled = FALSE, //' factory_list = factory.list, @@ -2290,7 +2489,7 @@ class CompboostWrapper // - double -> const double & // - bool -> const bool& // crashes the 
compilation? - CompboostWrapper (arma::vec response, double learning_rate, + CompboostWrapper (ResponseWrapper& response, double learning_rate, bool stop_if_all_stopper_fulfilled, BlearnerFactoryListWrapper& factory_list, LossWrapper& loss, LoggerListWrapper& logger_list, OptimizerWrapper& optimizer) { @@ -2300,7 +2499,7 @@ class CompboostWrapper used_optimizer = optimizer.getOptimizer(); blearner_list_ptr = factory_list.getFactoryList(); - obj = new cboost::Compboost(response, learning_rate0, stop_if_all_stopper_fulfilled, + obj = new cboost::Compboost(response.getResponseObj(), learning_rate0, stop_if_all_stopper_fulfilled, used_optimizer, loss.getLoss(), used_logger, *blearner_list_ptr); } @@ -2394,22 +2593,22 @@ class CompboostWrapper return obj->predict(data_map, as_response); } - arma::vec predictAtIteration (Rcpp::List& newdata, unsigned int k, bool as_response) - { - std::map data_map; + // arma::vec predictAtIteration (Rcpp::List& newdata, unsigned int k, bool as_response) + // { + // std::map data_map; - // Create data map (see line 780, same applies here): - for (unsigned int i = 0; i < newdata.size(); i++) { + // // Create data map (see line 780, same applies here): + // for (unsigned int i = 0; i < newdata.size(); i++) { - // Get data wrapper: - DataWrapper* temp = newdata[i]; + // // Get data wrapper: + // DataWrapper* temp = newdata[i]; - // Get the real data pointer: - data_map[ temp->getDataObj()->getDataIdentifier() ] = temp->getDataObj(); + // // Get the real data pointer: + // data_map[ temp->getDataObj()->getDataIdentifier() ] = temp->getDataObj(); - } - return obj->predictionOfIteration(data_map, k, as_response); - } + // } + // return obj->predictionOfIteration(data_map, k, as_response); + // } void summarizeCompboost () { @@ -2421,7 +2620,7 @@ class CompboostWrapper return is_trained; } - double getOffset () + arma::mat getOffset () { return obj->getOffset(); } @@ -2463,7 +2662,7 @@ RCPP_MODULE (compboost_module) using namespace Rcpp; class_ ("Compboost_internal") - .constructor () + .constructor () .method("train", &CompboostWrapper::train, "Run componentwise boosting") .method("continueTraining", &CompboostWrapper::continueTraining, "Continue Training") .method("getPrediction", &CompboostWrapper::getPrediction, "Get prediction") @@ -2473,7 +2672,7 @@ RCPP_MODULE (compboost_module) .method("getParameterAtIteration", &CompboostWrapper::getParameterAtIteration, "Get the estimated parameter for iteration k < iter.max") .method("getParameterMatrix", &CompboostWrapper::getParameterMatrix, "Get matrix of all estimated parameter in each iteration") .method("predict", &CompboostWrapper::predict, "Predict newdata") - .method("predictAtIteration", &CompboostWrapper::predictAtIteration, "Predict newdata for iteration k < iter.max") + // .method("predictAtIteration", &CompboostWrapper::predictAtIteration, "Predict newdata for iteration k < iter.max") .method("summarizeCompboost", &CompboostWrapper::summarizeCompboost, "Sumamrize compboost object.") .method("isTrained", &CompboostWrapper::isTrained, "Status of algorithm if it is already trained.") .method("setToIteration", &CompboostWrapper::setToIteration, "Set state of the model to a given iteration") diff --git a/src/data.h b/src/data.h index 9affda11..c0372066 100644 --- a/src/data.h +++ b/src/data.h @@ -16,21 +16,6 @@ // MIT License for more details. You should have received a copy of // the MIT License along with compboost. 
//
-// Written by:
-// -----------
-//
-// Daniel Schalk
-// Department of Statistics
-// Ludwig-Maximilians-University Munich
-// Ludwigstrasse 33
-// D-80539 München
-//
-// https://www.compstat.statistik.uni-muenchen.de
-//
-// Contact
-// e: contact@danielschalk.com
-// w: danielschalk.com
-//
// =========================================================================== #

#ifndef DATA_H_
diff --git a/src/helper.cpp b/src/helper.cpp
index 5f0c0f21..147f114e 100644
--- a/src/helper.cpp
+++ b/src/helper.cpp
@@ -13,40 +13,28 @@
 // Compboost is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// MIT License for more details. You should have received a copy of
-// the MIT License along with compboost.
-//
-// Written by:
-// -----------
-//
-// Daniel Schalk
-// Department of Statistics
-// Ludwig-Maximilians-University Munich
-// Ludwigstrasse 33
-// D-80539 München
-//
-// https://www.compstat.statistik.uni-muenchen.de
-//
-// Contact
-// e: contact@danielschalk.com
-// w: danielschalk.com
+// MIT License for more details. You should have received a copy of
+// the MIT License along with compboost.
 //
 // =========================================================================== #

 #include "helper.h"
+
+namespace helper
+{

 /**
  * \brief Check if a string occurs within a string vector
- *
+ *
  * This function just takes a string, iterates over a given vector of strings,
- * and returns true if the string occurs within the vector. This function is
+ * and returns true if the string occurs within the vector. This function is
  * used to check the argument name match-up.
- *
- * \param str `str::string` String for the lookup.
- *
- * \param differences `std::vector` Vector of strings which we
- * want to check if str occurs..
- *
- * \returns `bool` boolean if the string occurs within the vector.
+ *
+ * \param str `std::string` String for the lookup.
+ *
+ * \param names `std::vector` Vector of strings which we
+ * want to check if str occurs in.
+ *
+ * \returns `bool` boolean telling if the string occurs within the vector.
  */
 bool stringInNames (std::string str, std::vector names)
 {
@@ -63,18 +51,18 @@ bool stringInNames (std::string str, std::vector names)

 /**
  * \brief Check and set list arguments
- *
+ *
  * This function matches two lists to update an internal list with a new
- * match-up list. This function checks which elements are available,
+ * match-up list. This function checks which elements are available,
  * if they occur in the internal list, and if the underlying data types
  * match. If so, this function replaces
- * the values of the internal list with the new values. If the new list
+ * the values of the internal list with the new values. If the new list
  * contains unused elements, this function also throws a warning and prints
  * the unused ones.
- *
+ *
  * \param internal_list `Rcpp::List` Internal list with default values.
  * \param matching_list `Rcpp::List` New list to replace default values.
- * \returns `Rcpp::List` of updated default values.
+ * \returns `Rcpp::List` of updated default values.
  */
 Rcpp::List argHandler (Rcpp::List internal_list, Rcpp::List matching_list, bool type_check = TRUE)
 {
@@ -93,8 +81,8 @@ Rcpp::List argHandler (Rcpp::List internal_list, Rcpp::List matching_list, bool
       Rcpp::stop("Be sure to specify names within your argument list.");
     } catch ( std::exception &ex ) {
       forward_exception_to_r( ex );
-    } catch (...) {
-      ::Rf_error( "c++ exception (unknown reason)" );
+    } catch (...) {
+      ::Rf_error( "c++ exception (unknown reason)" );
     }
   }
@@ -127,8 +115,8 @@
         Rcpp::stop("Argument types for \"" + matching_list_names[i] + "\" does not match. Maybe you should take a look at the documentation.");
       } catch ( std::exception &ex ) {
         forward_exception_to_r( ex );
-      } catch (...) {
-        ::Rf_error( "c++ exception (unknown reason)" );
+      } catch (...) {
+        ::Rf_error( "c++ exception (unknown reason)" );
       }
     }
   } else {
@@ -152,4 +140,61 @@
     Rcpp::warning(message.str());
   }
   return internal_list;
-}
\ No newline at end of file
+}
+
+// Sum of squared errors between response and prediction:
+double calculateSumOfSquaredError (const arma::mat& response, const arma::mat& prediction)
+{
+  return arma::accu(arma::pow(response - prediction, 2));
+}
+
+// Element-wise logistic function mapping scores to (0, 1):
+arma::mat sigmoid (const arma::mat& scores)
+{
+  return 1 / (1 + arma::exp(-scores));
+}
+
+// Threshold a score matrix and fill it with the positive/negative label:
+arma::mat transformToBinaryResponse (const arma::mat& score_mat, const double& threshold, const double& pos, const double& neg)
+{
+  arma::mat out = score_mat;
+
+  arma::umat ids_pos = find(score_mat >= threshold);
+  arma::umat ids_neg = find(score_mat < threshold);
+
+  out.elem(ids_pos).fill(pos);
+  out.elem(ids_neg).fill(neg);
+
+  return out;
+}
+
+void checkForBinaryClassif (const arma::mat& response, const int& pos, const int& neg)
+{
+  arma::vec unique_values = arma::unique(response);
+  try {
+    if (unique_values.size() != 2) {
+      Rcpp::stop("Multiple classes detected.");
+    }
+    if (! arma::all((unique_values == neg) || (unique_values == pos))) {
+      std::string msg_stop = "Labels must be coded as " + std::to_string(neg) + " and " + std::to_string(pos) + ".";
+      Rcpp::stop(msg_stop);
+    }
+  } catch ( std::exception &ex ) {
+    forward_exception_to_r( ex );
+  } catch (...) {
+    ::Rf_error( "c++ exception (unknown reason)" );
+  }
+}
+
+void checkMatrixDim (const arma::mat& X, const arma::mat& Y)
+{
+  try {
+    if (X.n_rows != Y.n_rows || X.n_cols != Y.n_cols) {
+      std::string error_msg = "Dimension does not match " + std::to_string(X.n_rows) + "x" + std::to_string(X.n_cols) + " and " + std::to_string(Y.n_rows) + "x" + std::to_string(Y.n_cols) + ".";
+      Rcpp::stop(error_msg);
+    }
+  } catch ( std::exception &ex ) {
+    forward_exception_to_r( ex );
+  } catch (...) {
+    ::Rf_error( "c++ exception (unknown reason)" );
+  }
+}
+
+} // namespace helper
\ No newline at end of file
diff --git a/src/helper.h b/src/helper.h
index dc5267fc..6fed20f8 100644
--- a/src/helper.h
+++ b/src/helper.h
@@ -13,34 +13,29 @@
 // Compboost is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// MIT License for more details. You should have received a copy of
-// the MIT License along with compboost.
-//
-// Written by:
-// -----------
-//
-// Daniel Schalk
-// Department of Statistics
-// Ludwig-Maximilians-University Munich
-// Ludwigstrasse 33
-// D-80539 München
-//
-// https://www.compstat.statistik.uni-muenchen.de
-//
-// Contact
-// e: contact@danielschalk.com
-// w: danielschalk.com
+// MIT License for more details. You should have received a copy of
+// the MIT License along with compboost.
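Together, `sigmoid()` and `transformToBinaryResponse()` form the score-to-probability-to-label pipeline that the binary-classification response presumably builds its `getPredictionTransform()` and `getPredictionResponse()` on. A minimal standalone usage sketch (hypothetical example, not part of this patch; it only relies on the helper signatures defined above):

```cpp
// Hypothetical usage of the new helper functions (not part of the patch):
#include <RcppArmadillo.h>
#include "helper.h"

arma::mat scoresToLabels (const arma::mat& scores)
{
  // Map real-valued scores elementwise to probabilities in (0, 1):
  arma::mat probs = helper::sigmoid(scores);

  // Threshold at 0.5 and code the two classes as 1 and -1, matching the
  // coding enforced by helper::checkForBinaryClassif():
  return helper::transformToBinaryResponse(probs, 0.5, 1, -1);
}
```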
// // =========================================================================== # #ifndef HELPER_H_ #define HELPER_H_ -#include +#include #include #include +namespace helper +{ + bool stringInNames (std::string, std::vector); Rcpp::List argHandler (Rcpp::List, Rcpp::List, bool); +double calculateSumOfSquaredError (const arma::mat&, const arma::mat&); +arma::mat sigmoid (const arma::mat&); +arma::mat transformToBinaryResponse (const arma::mat&, const double&, const double&, const double&); +void checkForBinaryClassif (const arma::mat&, const int&, const int&); +void checkMatrixDim (const arma::mat&, const arma::mat&); + +} // namespace helper # endif // HELPER_H_ \ No newline at end of file diff --git a/src/line_search.cpp b/src/line_search.cpp index 81e5a91f..3778b227 100644 --- a/src/line_search.cpp +++ b/src/line_search.cpp @@ -13,24 +13,9 @@ // Compboost is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// MIT License for more details. You should have received a copy of +// MIT License for more details. You should have received a copy of // the MIT License along with compboost. // -// Written by: -// ----------- -// -// Daniel Schalk -// Department of Statistics -// Ludwig-Maximilians-University Munich -// Ludwigstrasse 33 -// D-80539 München -// -// https://www.compstat.statistik.uni-muenchen.de -// -// Contact -// e: contact@danielschalk.com -// w: danielschalk.com -// // =========================================================================== # #include "line_search.h" @@ -38,53 +23,53 @@ namespace linesearch { /** - * \brief Calculate risk for a given step size - * - * This function calculates risk obtained by a given step size (and all the other components). - * Hence, it defines the objective function we want to minimize to get the optimal step size. - * - * \param step_size `double` + * \brief Calculate risk for a given step size + * + * This function calculates risk obtained by a given step size (and all the other components). + * Hence, it defines the objective function we want to minimize to get the optimal step size. + * + * \param step_size `double` * * \param used_loss `loss::Loss*` - * + * * \param target `arma::vec` * * \param model_prediction `arma::vec` * * \param baselearner_prediction `arma::vec` - * + * * \returns `double` Risk evaluated at the given step size */ -double calculateRisk (const double& step_size, loss::Loss* used_loss, const arma::vec& target, const arma::vec& model_prediction, +double calculateRisk (const double& step_size, loss::Loss* used_loss, const arma::vec& target, const arma::vec& model_prediction, const arma::vec& baselearner_prediction) { - return arma::mean(used_loss->definedLoss(target, model_prediction + step_size * baselearner_prediction)); + return arma::accu(used_loss->definedLoss(target, model_prediction + step_size * baselearner_prediction)) / model_prediction.size(); } /** - * \brief Conduct line search - * + * \brief Conduct line search + * * This function calculates the step sized used in boosting to shrink the parameter. - * It uses the Brent methods from boost to find the minimum. Included from the boost library: + * It uses the Brent methods from boost to find the minimum. 
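 *
 * For reference, the objective evaluated by `calculateRisk()` above and
 * minimized here over the step size \f$c\f$ (between `lower_bound` and
 * `upper_bound`) is
 * \f[
 * q(c) = \frac{1}{n} \sum_{i=1}^n L\left(y^{(i)},\ \hat{f}(x^{(i)}) + c \cdot \hat{b}(x^{(i)})\right),
 * \f]
 * where \f$\hat{f}\f$ is the current model prediction and \f$\hat{b}\f$ the
 * prediction of the new base-learner.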
Included from the boost library: * https://www.boost.org/doc/libs/1_61_0/libs/math/doc/html/math_toolkit/roots/brent_minima.html - * + * * \param used_loss `loss::Loss*` - * + * * \param target `arma::vec` * * \param model_prediction `arma::vec` * * \param baselearner_prediction `arma::vec` - * + * * \returns `double` Optimal step size. */ -double findOptimalStepSize (loss::Loss* used_loss, const arma::vec& target, const arma::vec& model_prediction, - const arma::vec& baselearner_prediction, const double& lower_bound, const double& upper_bound) -{ +double findOptimalStepSize (loss::Loss* used_loss, const arma::vec& target, const arma::vec& model_prediction, + const arma::vec& baselearner_prediction, const double& lower_bound, const double& upper_bound) +{ boost::uintmax_t max_iter = 500; // boost::math::tools::eps_tolerance tol(30); int bits = std::numeric_limits::digits; - + // Conduct the root finding: std::pair r = boost::math::tools::brent_find_minima(std::bind(calculateRisk, std::placeholders::_1, used_loss, target, model_prediction, baselearner_prediction), lower_bound, upper_bound, bits, max_iter); diff --git a/src/logger.cpp b/src/logger.cpp index 9664e7e6..d23e52f2 100644 --- a/src/logger.cpp +++ b/src/logger.cpp @@ -13,23 +13,8 @@ // Compboost is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// MIT License for more details. You should have received a copy of -// the MIT License along with compboost. -// -// Written by: -// ----------- -// -// Daniel Schalk -// Department of Statistics -// Ludwig-Maximilians-University Munich -// Ludwigstrasse 33 -// D-80539 München -// -// https://www.compstat.statistik.uni-muenchen.de -// -// Contact -// e: contact@danielschalk.com -// w: danielschalk.com +// MIT License for more details. You should have received a copy of +// the MIT License along with compboost. // // =========================================================================== # @@ -44,7 +29,7 @@ namespace logger /** * \brief Getter of logger identifier - * + * * \returns `std::string` of logger id */ std::string Logger::getLoggerId () const @@ -54,7 +39,7 @@ std::string Logger::getLoggerId () const /** * \brief Getter if the logger is used as stopper - * + * * \returns `bool` which says `true` if it is a logger, otherwise `false` */ bool Logger::getIfLoggerIsStopper () const @@ -78,19 +63,19 @@ Logger::~Logger () { } /** * \brief Default constructor of class `LoggerIteration` - * + * * Sets the private member `max_iteration` and the tag if the logger should be * used as stopper. - * + * * \param logger_id0 `std::string` unique identifier of the logger * \param is_a_stopper `bool` specify if the logger should be used as stopper * \param max_iterations `unsigned int` sets value of the stopping criteria - * + * */ -LoggerIteration::LoggerIteration (const std::string& logger_id0, const bool& is_a_stopper0, - const unsigned int& max_iterations) - : max_iterations ( max_iterations ) +LoggerIteration::LoggerIteration (const std::string& logger_id0, const bool& is_a_stopper0, + const unsigned int& max_iterations) + : max_iterations ( max_iterations ) { is_a_stopper = is_a_stopper0; logger_id = logger_id0; @@ -98,40 +83,39 @@ LoggerIteration::LoggerIteration (const std::string& logger_id0, const bool& is_ /** * \brief Log current step of compboost iteration of class `LoggerIteration` - * - * This function loggs the current iteration. 
- *
- * \param current_iteration `unsigned int` of current iteration
+ *
+ * This function logs the current iteration.
+ *
+ * \param current_iteration `unsigned int` of current iteration
- * \param response `arma::vec` of the given response used for training
- * \param prediction `arma::vec` actual prediction of the boosting model at
- *   iteration `current_iteration`
- * \param used_blearner `Baselearner*` pointer to the selected baselearner in
- *   iteration `current_iteration`
- * \param offset `double` of the overall offset of the training
- * \param learning_rate `double` lerning rate of the `current_iteration`
- *
+ * \param sh_ptr_response `std::shared_ptr` response object used for training
+ * \param used_blearner `Baselearner*` pointer to the selected baselearner in
+ *   iteration `current_iteration`
+ * \param learning_rate `double` learning rate of the `current_iteration`
+ * \param step_size `double` step size used in the `current_iteration`
+ *
 */
-void LoggerIteration::logStep (const unsigned int& current_iteration, const arma::vec& response,
-  const arma::vec& prediction, blearner::Baselearner* used_blearner, const double& offset,
-  const double& learning_rate, const double& step_size)
+void LoggerIteration::logStep (const unsigned int& current_iteration, std::shared_ptr sh_ptr_response,
+  blearner::Baselearner* used_blearner, const double& learning_rate, const double& step_size)
 {
   iterations.push_back(current_iteration);
 }

 /**
  * \brief Stop criteria is fulfilled if the current iteration exceeds `max_iteration`
- *
- *
- *
- * \returns `bool` which tells if the stopping criteria is reached or not
+ *
+ * \returns `bool` which tells if the stopping criteria is reached or not
  * (if the logger isn't a stopper then this is always false)
  */
 bool LoggerIteration::reachedStopCriteria () const
 {
   bool stop_criteria_is_reached = false;
-
+
   if (is_a_stopper) {
     if (max_iterations <= iterations.back()) {
       stop_criteria_is_reached = true;
@@ -142,13 +126,13 @@ bool LoggerIteration::reachedStopCriteria () const
     }
   }

 /**
  * \brief Return the data stored within the iteration logger
- *
+ *
  * This function returns the logged integer. An issue here is that the later
  * transformation of all logged data to an `arma::mat` requires `arma::vec` as
- * return value. Therefore the std integer vector is transforemd to an
- * `arma::vec`. We know that this isn't very memory friendly, but the
+ * return value. Therefore the std integer vector is transformed to an
+ * `arma::vec`. We know that this isn't very memory friendly, but the
  * `arma::mat` we use later can just have one type.
- *
+ *
  * \return `arma::vec` of iterations.
  */

 arma::vec LoggerIteration::getLoggedData () const
 {
   // Cast integer vector to double:
   std::vector iterations_double (iterations.begin(), iterations.end());
-
+
   arma::vec out (iterations_double);
   return out;
 }

 /**
  * \brief Clear the logger data
- *
- * This is an important thing which is called every time in front of retraining
+ *
+ * This is an important step which is called every time before retraining
  * the model. If we don't clear the data, the new iterations are just pasted at
- * the end of the existing vectors which couses some troubles.
+ * the end of the existing vectors which causes some trouble.
  */

 void LoggerIteration::clearLoggerData ()
@@ -175,12 +159,12 @@
 {
   iterations.clear();
 }

 /**
- * \brief Print status of current iteration into the console
- *
+ * \brief Print status of current iteration into the console
+ *
  * The string which is created in this function must have exactly the same
  * length as the string from `initializeLoggerPrinter()`. Those strings are
  * printed line by line.
- * + * * \returns `std::string` which includes the log of the current iteration */ @@ -189,7 +173,7 @@ std::string LoggerIteration::printLoggerStatus () const std::string max_iters = std::to_string(max_iterations); std::stringstream ss; ss << std::setw(2 * max_iters.size() + 1) << std::to_string(iterations.back()) + "/" + max_iters; - + return ss.str(); } @@ -202,15 +186,15 @@ std::string LoggerIteration::printLoggerStatus () const /** * \brief Default constructor of class `LoggerInbagRisk` - * + * * \param logger_id0 `std::string` unique identifier of the logger * \param is_a_stopper0 `bool` specify if the logger should be used as stopper - * \param used_loss `Loss*` used loss to calculate the empirical risk (this + * \param used_loss `Loss*` used loss to calculate the empirical risk (this * can differ from the one used while training the model) * \param eps_for_break `double` sets value of the stopping criteria` */ -LoggerInbagRisk::LoggerInbagRisk (const std::string& logger_id0, const bool& is_a_stopper0, loss::Loss* used_loss, +LoggerInbagRisk::LoggerInbagRisk (const std::string& logger_id0, const bool& is_a_stopper0, loss::Loss* used_loss, const double& eps_for_break) : used_loss ( used_loss ), eps_for_break ( eps_for_break ) @@ -221,74 +205,73 @@ LoggerInbagRisk::LoggerInbagRisk (const std::string& logger_id0, const bool& is_ /** * \brief Log current step of compboost iteration for class `LoggerInbagRisk` - * + * * This logger computes the risk for the given training data * \f$\mathcal{D}_\mathrm{train} = \{(x_i,\ y_i)\ |\ i \in \{1, \dots, n\}\}\f$ - * and stores it into a vector. The empirical risk \f$\mathcal{R}\f$ for + * and stores it into a vector. The empirical risk \f$\mathcal{R}\f$ for * iteration \f$m\f$ is calculated by: * \f[ * \mathcal{R}_\mathrm{emp}^{[m]} = \frac{1}{|\mathcal{D}_\mathrm{train}|}\sum\limits_{(x,y) \in \mathcal{D}_\mathrm{train}} L(y, \hat{f}^{[m]}(x)) * \f] - * - * **Note:** + * + * **Note:** * - If \f$m=0\f$ than \f$\hat{f}\f$ is just the offset. * - The implementation to calculate \f$\mathcal{R}_\mathrm{emp}^{[m]}\f$ is - * done in two steps: - * 1. Calculate vector `risk_temp` of losses for every observation for + * done in two steps: + * 1. Calculate vector `risk_temp` of losses for every observation for * given response \f$y^{(i)}\f$ and prediction \f$\hat{f}^{[m]}(x^{(i)})\f$. * 2. Average over `risk_temp`. - * + * * This procedure ensures, that it is possible to e.g. use the AUC or any - * arbitrary performance measure for risk logging. This gives just one - * value for `risk_temp` and therefore the average equals the loss - * function. If this is just a value (like for the AUC) then the value is + * arbitrary performance measure for risk logging. This gives just one + * value for `risk_temp` and therefore the average equals the loss + * function. If this is just a value (like for the AUC) then the value is * returned. 
- * - * \param current_iteration `unsigned int` of current iteration + * + * \param current_iteration `unsigned int` of current iteration * \param response `arma::vec` of the given response used for training - * \param prediction `arma::vec` actual prediction of the boosting model at + * \param prediction `arma::vec` actual prediction of the boosting model at * iteration `current_iteration` - * \param used_blearner `Baselearner*` pointer to the selected baselearner in + * \param used_blearner `Baselearner*` pointer to the selected baselearner in * iteration `current_iteration` * \param offset `double` of the overall offset of the training * \param learning_rate `double` lerning rate of the `current_iteration` - * + * */ -void LoggerInbagRisk::logStep (const unsigned int& current_iteration, const arma::vec& response, - const arma::vec& prediction, blearner::Baselearner* used_blearner, const double& offset, - const double& learning_rate, const double& step_size) +void LoggerInbagRisk::logStep (const unsigned int& current_iteration, std::shared_ptr sh_ptr_response, + blearner::Baselearner* used_blearner, const double& learning_rate, const double& step_size) { // Calculate empirical risk. Calculateion of the temporary vector ensures // // that stuff like auc logging is possible: // arma::vec loss_vec_temp = used_loss->definedLoss(response, prediction); // double temp_risk = arma::accu(loss_vec_temp) / loss_vec_temp.size(); - double temp_risk = arma::mean(used_loss->definedLoss(response, prediction)); - + double temp_risk = sh_ptr_response->calculateEmpiricalRisk(used_loss); + tracked_inbag_risk.push_back(temp_risk); } /** * \brief Stop criteria is fulfilled if the relative improvement falls below `eps_for_break` - * - * The stopping criteria is fulfilled, if the relative improvement at the - * current iteration \f$m\f$ \f$\varepsilon^{[m]}\f$ falls under a fixed boundary + * + * The stopping criteria is fulfilled, if the relative improvement at the + * current iteration \f$m\f$ \f$\varepsilon^{[m]}\f$ falls under a fixed boundary * \f$\varepsilon\f$. Where the relative improvement is defined by * \f[ * \varepsilon^{[m]} = \frac{\mathcal{R}_\mathrm{emp}^{[m-1]} - \mathcal{R}_\mathrm{emp}^{[m]}}{\mathcal{R}_\mathrm{emp}^{[m-1]}}. * \f] - * + * * The logger stops the algorithm if \f$\varepsilon^{[m]} \leq \varepsilon\f$. - * - * \returns `bool` which tells if the stopping criteria is reached or not + * + * \returns `bool` which tells if the stopping criteria is reached or not * (if the logger isn't a stopper then this is always false) */ bool LoggerInbagRisk::reachedStopCriteria () const { bool stop_criteria_is_reached = false; - + if (is_a_stopper) { if (tracked_inbag_risk.size() > 1) { // We need to subtract -2 and -1 since c++ start counting by 0 while size @@ -296,7 +279,7 @@ bool LoggerInbagRisk::reachedStopCriteria () const // size returns 1 but we want to access 0: double inbag_eps = tracked_inbag_risk[tracked_inbag_risk.size() - 2] - tracked_inbag_risk[tracked_inbag_risk.size() - 1]; inbag_eps = inbag_eps / tracked_inbag_risk[tracked_inbag_risk.size() - 2]; - + if (inbag_eps <= eps_for_break) { stop_criteria_is_reached = true; } @@ -307,9 +290,9 @@ bool LoggerInbagRisk::reachedStopCriteria () const /** * \brief Return the data stored within the OOB risk logger - * + * * This function returns the logged OOB risk. 
@@ -307,9 +290,9 @@ bool LoggerInbagRisk::reachedStopCriteria () const

 /**
  * \brief Return the data stored within the inbag risk logger
- * 
+ *
  * This function returns the logged inbag risk.
- * 
+ *
  * \return `arma::vec` of the tracked inbag risk
  */

@@ -321,10 +304,10 @@ arma::vec LoggerInbagRisk::getLoggedData () const

 /**
  * \brief Clear the logger data
- * 
- * This is an important thing which is called every time in front of retraining 
+ *
+ * This method is called every time before retraining
  * the model. If we don't clear the data, the new iterations are just pasted at
- * the end of the existing vectors which couses some troubles. 
+ * the end of the existing vectors, which causes some trouble.
  */

 void LoggerInbagRisk::clearLoggerData ()
@@ -333,12 +316,12 @@ void LoggerInbagRisk::clearLoggerData ()
 }

 /**
- * \brief Print status of current iteration into the console 
- * 
+ * \brief Print status of the current iteration into the console
+ *
  * The string which is created in this function must have exactly the same
  * length as the string from `initializeLoggerPrinter()`. Those strings are
  * printed line by line.
- * 
+ *
  * \returns `std::string` which includes the log of the current iteration
  */

@@ -346,7 +329,7 @@ std::string LoggerInbagRisk::printLoggerStatus () const
 {
   std::stringstream ss;
   ss << logger_id << " = " << std::setprecision(2) << tracked_inbag_risk.back();
-  
+
   return ss.str();
 }

@@ -360,85 +343,82 @@ std::string LoggerInbagRisk::printLoggerStatus () const

 /**
  * \brief Default constructor of `LoggerOobRisk`
- * 
+ *
  * \param logger_id0 `std::string` unique identifier of the logger
  * \param is_a_stopper0 `bool` to set if the logger should be used as stopper
- * \param used_loss `Loss*` which is used to calculate the empirical risk (this 
+ * \param used_loss `Loss*` which is used to calculate the empirical risk (this
  *   can differ from the loss used while training the model)
  * \param eps_for_break `double` sets the value of the stopping criterion
- * \param oob_data `std::map<std::string, data::Data*>` the new data
- * \param oob_response `arma::vec` response of the new data
+ * \param oob_data `std::map<std::string, data::Data*>` the new data
+ * \param oob_response `std::shared_ptr<response::Response>` response of the new data
  */
-LoggerOobRisk::LoggerOobRisk (const std::string& logger_id0, const bool& is_a_stopper0, loss::Loss* used_loss, 
-  const double& eps_for_break, std::map<std::string, data::Data*> oob_data, 
-  const arma::vec& oob_response)
+LoggerOobRisk::LoggerOobRisk (const std::string& logger_id0, const bool& is_a_stopper0, loss::Loss* used_loss,
+  const double& eps_for_break, std::map<std::string, data::Data*> oob_data, std::shared_ptr<response::Response> oob_response)
   : used_loss ( used_loss ),
     eps_for_break ( eps_for_break ),
     oob_data ( oob_data ),
-    oob_response ( oob_response )
+    sh_ptr_oob_response ( oob_response )
 {
   is_a_stopper = is_a_stopper0;
-  
-  arma::vec temp (oob_response.size());
-  oob_prediction = temp;
   logger_id = logger_id0;
 }

 /**
  * \brief Log current step of compboost iteration for class `LoggerOobRisk`
- * 
+ *
  * This logger computes the risk for a given new dataset
  * \f$\mathcal{D}_\mathrm{oob} = \{(x_i,\ y_i)\ |\ i \in I_\mathrm{oob}\}\f$
- * and stores it into a vector. The OOB risk \f$\mathcal{R}_\mathrm{oob}\f$ for 
+ * and stores it into a vector. The OOB risk \f$\mathcal{R}_\mathrm{oob}\f$ for
  * iteration \f$m\f$ is calculated by:
  * \f[
  *   \mathcal{R}_\mathrm{oob}^{[m]} = \frac{1}{|\mathcal{D}_\mathrm{oob}|}\sum\limits_{(x,y) \in \mathcal{D}_\mathrm{oob}}
  *   L(y, \hat{f}^{[m]}(x))
  * \f]
- * 
- * **Note:** 
+ *
+ * **Note:**
 * - If \f$m=0\f$, then \f$\hat{f}\f$ is just the offset.
 * - The implementation to calculate \f$\mathcal{R}_\mathrm{oob}^{[m]}\f$ is
- *   done in two steps: 
+ *   done in two steps:
 *     1. Calculate the vector `risk_temp` of losses for every observation for
 *        the given response \f$y^{(i)}\f$ and prediction \f$\hat{f}^{[m]}(x^{(i)})\f$.
 *     2. Average over `risk_temp`.
- * 
- * This procedure ensures, that it is possible to e.g. use the AUC or any 
- * arbitrary performance measure for risk logging. This gives just one 
- * value for `risk_temp` and therefore the average equals the loss 
- * function. If this is just a value (like for the AUC) then the value is 
+ *
+ * This procedure ensures that it is possible to use, e.g., the AUC or any
+ * arbitrary performance measure for risk logging. Such a measure gives just
+ * one value for `risk_temp`, so the average equals the measure itself and
+ * that single value (like for the AUC) is what gets
 * returned.
- * 
- * \param current_iteration `unsigned int` of current iteration 
- * \param response `arma::vec` of the given response used for training
- * \param prediction `arma::vec` actual prediction of the boosting model at 
- *   iteration `current_iteration`
- * \param used_blearner `Baselearner*` pointer to the selected baselearner in 
- *   iteration `current_iteration`
- * \param offset `double` of the overall offset of the training
- * \param learning_rate `double` lerning rate of the `current_iteration`
- * 
+ *
+ * \param current_iteration `unsigned int` of the current iteration
+ * \param sh_ptr_response `std::shared_ptr<response::Response>` response object
+ *   holding the training response and the current model prediction
+ * \param used_blearner `Baselearner*` pointer to the selected baselearner in
+ *   iteration `current_iteration`
+ * \param learning_rate `double` learning rate of the `current_iteration`
+ * \param step_size `double` step size of the `current_iteration`
+ *
 */
-void LoggerOobRisk::logStep (const unsigned int& current_iteration, const arma::vec& response, 
-  const arma::vec& prediction, blearner::Baselearner* used_blearner, const double& offset, 
-  const double& learning_rate, const double& step_size)
+void LoggerOobRisk::logStep (const unsigned int& current_iteration, std::shared_ptr<response::Response> sh_ptr_response,
+  blearner::Baselearner* used_blearner, const double& learning_rate, const double& step_size)
 {
   if (current_iteration == 1) {
-    oob_prediction.fill(offset);
+    sh_ptr_oob_response->constantInitialization(sh_ptr_response->getInitialization());
+    sh_ptr_oob_response->initializePrediction();
   }
   std::string blearner_id = used_blearner->getDataIdentifier();

-  // Get data of corresponding selected baselearner. E.g. iteration 100 linear 
+  // Get data of the corresponding selected baselearner. E.g., if iteration 100 selects a linear
   // baselearner of feature x_7, then get the data of feature x_7:
   data::Data* oob_blearner_data = oob_data.find(blearner_id)->second;
-  
+
   // Predict this data using the selected baselearner:
-  arma::vec temp_oob_prediction = used_blearner->predict(oob_blearner_data);
+  arma::mat temp_oob_prediction = used_blearner->predict(oob_blearner_data);
+  sh_ptr_oob_response->updatePrediction(learning_rate, step_size, temp_oob_prediction);

-  oob_prediction += learning_rate * step_size * temp_oob_prediction;
 /* *****************************************************************************************************************************
  *
@@ -451,9 +431,9 @@ void LoggerOobRisk::logStep (const unsigned int& current_iteration, const arma::
  *
  *   mat_temp = used_blearner->instantiateData(oob_data.find(blearner_id)->second->getData());
  *   oob_data_transformed.insert(std::pair<std::string, arma::mat>(blearner_id, mat_temp));
  * }
  *
  * /////// Get data of corresponding selected baselearner. E.g. iteration 100 linear
 * /////// baselearner of feature x_7, then get the data of feature x_7:
 * /////// data::Data* oob_blearner_data = oob_data.find(used_blearner->getDataIdentifier())->second;
 * /////
@@ -464,36 +444,31 @@ void LoggerOobRisk::logStep (const unsigned int& current_iteration, const arma::
 *   oob_prediction += learning_rate * step_size * oob_data_transformed.find(blearner_id)->second * used_blearner->getParameter();
 ****************************************************************************************************************************** */
-  // Calculate empirical risk. Calculation of the temporary vector ensures
-  // that stuff like auc logging is possible:
-  arma::vec loss_vec_temp = used_loss->definedLoss(oob_response, oob_prediction);
-  double temp_risk = arma::mean(loss_vec_temp);
-
-  // Track empirical risk:
+  double temp_risk = sh_ptr_oob_response->calculateEmpiricalRisk(used_loss);
   tracked_oob_risk.push_back(temp_risk);
 }
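In effect (illustration only, mirroring the member names above), the OOB
prediction still accumulates the damped base-learner predictions at each
iteration, now handled inside the response object:

```
// What updatePrediction() does conceptually in every iteration:
oob_prediction += learning_rate * step_size * temp_oob_prediction;
```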
 /**
- * \brief Stop criteria is fulfilled if the relative improvement falls below 
+ * \brief Stop criterion is fulfilled if the relative improvement falls below
  *   `eps_for_break`
- * 
- * The stopping criteria is fulfilled, if the relative improvement at the 
- * current iteration \f$m\f$ \f$\varepsilon^{[m]}\f$ falls under a fixed boundary 
- * \f$\varepsilon\f$. Where the relative improvement is defined by
+ *
+ * The stopping criterion is fulfilled if the relative improvement
+ * \f$\varepsilon^{[m]}\f$ at the current iteration \f$m\f$ falls below a fixed
+ * boundary \f$\varepsilon\f$, where the relative improvement is defined by
  * \f[
  *   \varepsilon^{[m]} = \frac{\mathcal{R}_\mathrm{oob}^{[m-1]} - \mathcal{R}_\mathrm{oob}^{[m]}}{\mathcal{R}_\mathrm{oob}^{[m-1]}}.
  * \f]
- * 
+ *
  * The logger stops the algorithm if \f$\varepsilon^{[m]} \leq \varepsilon\f$
- * 
- * \returns `bool` which tells if the stopping criteria is reached or not 
+ *
+ * \returns `bool` which tells if the stopping criterion is reached or not
  *   (if the logger isn't a stopper then this is always false)
  */
 bool LoggerOobRisk::reachedStopCriteria () const
 {
   bool stop_criteria_is_reached = false;
-  
+
   if (is_a_stopper) {
     if (tracked_oob_risk.size() > 1) {
       // We need to subtract -2 and -1 since c++ starts counting at 0 while size
@@ -501,7 +476,7 @@ bool LoggerOobRisk::reachedStopCriteria () const
       // size returns 1 but we want to access 0:
       double oob_eps = tracked_oob_risk[tracked_oob_risk.size() - 2] - tracked_oob_risk[tracked_oob_risk.size() - 1];
       oob_eps = oob_eps / tracked_oob_risk[tracked_oob_risk.size() - 2];
-      
+
       if (oob_eps <= eps_for_break) {
         stop_criteria_is_reached = true;
       }
@@ -512,9 +487,9 @@ bool LoggerOobRisk::reachedStopCriteria () const

 /**
  * \brief Return the data stored within the OOB risk logger
- * 
+ *
  * This function returns the logged OOB risk.
- * 
+ *
  * \return `arma::vec` of the tracked out-of-bag risk
  */

@@ -526,10 +501,10 @@ arma::vec LoggerOobRisk::getLoggedData () const

 /**
  * \brief Clear the logger data
- * 
- * This is an important thing which is called every time in front of retraining 
+ *
+ * This method is called every time before retraining
  * the model. If we don't clear the data, the new iterations are just pasted at
- * the end of the existing vectors which couses some troubles. 
+ * the end of the existing vectors, which causes some trouble.
 */

void LoggerOobRisk::clearLoggerData ()
{
@@ -537,12 +512,12 @@ void LoggerOobRisk::clearLoggerData ()
 }

 /**
- * \brief Print status of current iteration into the console 
- * 
+ * \brief Print status of the current iteration into the console
+ *
  * The string which is created in this function must have exactly the same
  * length as the string from `initializeLoggerPrinter()`. Those strings are
  * printed line by line.
- * 
+ *
  * \returns `std::string` which includes the log of the current iteration
  */

@@ -550,7 +525,7 @@ std::string LoggerOobRisk::printLoggerStatus () const
 {
   std::stringstream ss;
   ss << logger_id << " = " << std::setprecision(2) << tracked_oob_risk.back();
-  
+
   return ss.str();
 }

@@ -563,16 +538,16 @@ std::string LoggerOobRisk::printLoggerStatus () const

 /**
  * \brief Default constructor of class `LoggerTime`
- * 
+ *
  * \param logger_id0 `std::string` unique identifier of the logger
  * \param is_a_stopper0 `bool` which specifies if the logger is used as stopper
- * \param max_time `unsigned int` maximal time for training (just used if logger 
+ * \param max_time `unsigned int` maximal time for training (just used if the logger
  *   is a stopper)
- * \param time_unit `std::string` of the unit used for measuring, allowed are 
+ * \param time_unit `std::string` of the unit used for measuring, allowed are
  *   `minutes`, `seconds` and `microseconds`
  */
-LoggerTime::LoggerTime (const std::string& logger_id0, const bool& is_a_stopper0, const unsigned int& max_time, 
+LoggerTime::LoggerTime (const std::string& logger_id0, const bool& is_a_stopper0, const unsigned int& max_time,
   const std::string& time_unit)
   : max_time ( max_time ),
     time_unit ( time_unit )
@@ -589,8 +564,8 @@ LoggerTime::LoggerTime (const std::string& logger_id0, const bool& is_a_stopper0
     }
   } catch ( std::exception &ex ) {
     forward_exception_to_r( ex );
-  } catch (...) { 
-    ::Rf_error( "c++ exception (unknown reason)" ); 
+  } catch (...) {
+    ::Rf_error( "c++ exception (unknown reason)" );
   }
   is_a_stopper = is_a_stopper0;
   logger_id = logger_id0;
@@ -598,34 +573,33 @@ LoggerTime::LoggerTime (const std::string& logger_id0, const bool& is_a_stopper0

 /**
  * \brief Log current step of compboost iteration for class `LoggerTime`
- * 
+ *
 * This function logs, depending on `time_unit`, the elapsed time at the
 * current iteration.
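 *
 * For example (illustration only), with `time_unit = "seconds"` the logged
 * value is computed as
 * ```
 * std::chrono::duration_cast<std::chrono::seconds>(
 *   std::chrono::steady_clock::now() - init_time).count()
 * ```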
- * 
- * \param current_iteration `unsigned int` of current iteration 
- * \param response `arma::vec` of the given response used for training
- * \param prediction `arma::vec` actual prediction of the boosting model at 
- *   iteration `current_iteration`
- * \param used_blearner `Baselearner*` pointer to the selected baselearner in 
- *   iteration `current_iteration`
- * \param offset `double` of the overall offset of the training
- * \param learning_rate `double` lerning rate of the `current_iteration`
- * 
+ *
+ * \param current_iteration `unsigned int` of the current iteration
+ * \param sh_ptr_response `std::shared_ptr<response::Response>` response object
+ *   holding the training response and the current model prediction
+ * \param used_blearner `Baselearner*` pointer to the selected baselearner in
+ *   iteration `current_iteration`
+ * \param learning_rate `double` learning rate of the `current_iteration`
+ * \param step_size `double` step size of the `current_iteration`
+ *
  */
-void LoggerTime::logStep (const unsigned int& current_iteration, const arma::vec& response, 
-  const arma::vec& prediction, blearner::Baselearner* used_blearner, const double& offset, 
-  const double& learning_rate, const double& step_size)
+void LoggerTime::logStep (const unsigned int& current_iteration, std::shared_ptr<response::Response> sh_ptr_response,
+  blearner::Baselearner* used_blearner, const double& learning_rate, const double& step_size)
 {
   if (current_time.size() == 0) {
     init_time = std::chrono::steady_clock::now();
   }
   if (time_unit == "minutes") {
     current_time.push_back(std::chrono::duration_cast<std::chrono::minutes>(std::chrono::steady_clock::now() - init_time).count());
-  } 
+  }
   if (time_unit == "seconds") {
     current_time.push_back(std::chrono::duration_cast<std::chrono::seconds>(std::chrono::steady_clock::now() - init_time).count());
-  } 
+  }
   if (time_unit == "microseconds") {
     current_time.push_back(std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - init_time).count());
   }
@@ -633,21 +607,21 @@ void LoggerTime::logStep (const unsigned int& current_iteration, const arma::vec

 /**
  * \brief Stop criterion is fulfilled if the passed time exceeds `max_time`
- * 
- * The stop criteria here is quite simple. For the current iteration \f$m\f$ it 
- * is triggered if 
+ *
+ * The stop criterion here is quite simple. For the current iteration \f$m\f$ it
+ * is triggered if
  * \f[
  *   \mathrm{current\_time}_m > \mathrm{max\_time}
  * \f]
- * 
- * \returns `bool` which tells if the stopping criteria is reached or not 
+ *
+ * \returns `bool` which tells if the stopping criterion is reached or not
  *   (if the logger isn't a stopper then this is always false)
  */
 bool LoggerTime::reachedStopCriteria () const
 {
   bool stop_criteria_is_reached = false;
-  
+
   if (is_a_stopper) {
     if (current_time.back() >= max_time) {
       stop_criteria_is_reached = true;
@@ -658,13 +632,13 @@ bool LoggerTime::reachedStopCriteria () const

 /**
  * \brief Return the data stored within the time logger
- * 
- * This function returns the logged elapsed time. An issue here is, that the 
- * later transformation of all logged data to an `arma::mat` requires 
- * `arma::vec` as return value. Therefore the std integer vector is transforemd 
- * to an `arma::vec`. We know that this isn't very memory friendly, but the 
+ *
+ * This function returns the logged elapsed time. An issue here is that the
+ * later transformation of all logged data to an `arma::mat` requires an
+ * `arma::vec` as return value. Therefore the std integer vector is transformed
+ * to an `arma::vec`. We know that this isn't very memory friendly, but the
 * `arma::mat` we use later can just have one type.
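 *
 * The conversion thus boils down to (illustration only):
 * ```
 * std::vector<double> seconds_double (current_time.begin(), current_time.end());
 * arma::vec out (seconds_double);
 * ```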
- * 
+ *
 * \return `arma::vec` of elapsed time
 */

@@ -672,17 +646,17 @@ arma::vec LoggerTime::getLoggedData () const
 {
   // Cast integer vector to double:
   std::vector<double> seconds_double (current_time.begin(), current_time.end());
-  
+
   arma::vec out (seconds_double);
   return out;
 }

 /**
  * \brief Clear the logger data
- * 
- * This is an important thing which is called every time in front of retraining 
+ *
+ * This method is called every time before retraining
  * the model. If we don't clear the data, the new iterations are just pasted at
- * the end of the existing vectors which couses some troubles. 
+ * the end of the existing vectors, which causes some trouble.
  */

 void LoggerTime::clearLoggerData ()
@@ -691,12 +665,12 @@ void LoggerTime::clearLoggerData ()
 }

 /**
- * \brief Print status of current iteration into the console 
- * 
+ * \brief Print status of the current iteration into the console
+ *
  * The string which is created in this function must have exactly the same
  * length as the string from `initializeLoggerPrinter()`. Those strings are
  * printed line by line.
- * 
+ *
  * \returns `std::string` which includes the log of the current iteration
  */

@@ -704,7 +678,7 @@ std::string LoggerTime::printLoggerStatus () const
 {
   std::stringstream ss;
   ss << logger_id << " = " << std::setprecision(2) << current_time.back();
-  
+
   return ss.str();
 }

diff --git a/src/logger.h b/src/logger.h
index deb3ad84..463559fd 100644
--- a/src/logger.h
+++ b/src/logger.h
@@ -13,42 +13,27 @@
 // Compboost is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// MIT License for more details. You should have received a copy of 
-// the MIT License along with compboost. 
-//
-// Written by:
-// -----------
-//
-//   Daniel Schalk
-//   Department of Statistics
-//   Ludwig-Maximilians-University Munich
-//   Ludwigstrasse 33
-//   D-80539 München
-//
-//   https://www.compstat.statistik.uni-muenchen.de
-//
-//   Contact
-//   e: contact@danielschalk.com
-//   w: danielschalk.com
+// MIT License for more details. You should have received a copy of
+// the MIT License along with compboost.
 //
 // =========================================================================== #

/**
 * @file logger.h
 * @author Daniel Schalk (github: schalkdaniel)
- * 
+ *
 * @brief Logger class definition
 *
 * @section DESCRIPTION
- * 
+ *
 * This file contains all the available loggers, which can also be used for
- * early stopping. The idea is not just about to use multiple stopping 
- * criteria (e.g. maximal number of iterations + a given amount of time), 
+ * early stopping. The idea is not just to use multiple stopping
+ * criteria (e.g. a maximal number of iterations plus a given amount of time),
 * but also to log while training (e.g. the inbag or oob risk).
- * 
+ *
 * The loggers are collected by the `LoggerList` class. Basically, that class
 * takes as many loggers as you like and logs every step. This can be used
- * to log different risks for different loss functions. Just create two 
+ * to log different risks for different loss functions. Just create two
 * inbag or oob risk loggers with different losses.
 *
 */
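For instance (sketch only, not part of this change; the constructor signature
is the one declared below), two inbag risk loggers tracking two different
losses during one training run:

```
loss::LossQuadratic loss_l2;
loss::LossAbsolute loss_l1;

logger::LoggerInbagRisk logger_l2 ("inbag.l2", false, &loss_l2, 0.0);
logger::LoggerInbagRisk logger_l1 ("inbag.l1", false, &loss_l1, 0.0);
```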
@@ -58,11 +43,13 @@

 #include <vector>
 #include <chrono>
+#include <memory>
 #include <iomanip>   // ::setw
 #include <sstream>   // ::stringstream

 #include "loss.h"
 #include "baselearner.h"
+#include "response.h"

 namespace logger
 {

 /**
  * \class Logger
- * 
+ *
  * \brief Abstract logger class with minimal requirements for all loggers
- * 
- * This class is meant to define some minimal functionality any logger must 
+ *
+ * This class is meant to define some minimal functionality any logger must
  * have! The key of the logger is not only the logging of the process, but also
  * to be able to define a logger as stopper to force an early stopping if one
  * or all of the used loggers have reached a stopping criterion. This is
  * explained in more detail within the child classes.
- * 
+ *
  * **Note** that this minimal functionality mentioned above differs for every
  * class and is explained within the specific class documentation.
- * 
+ *
  */
 class Logger
 {
 public:
-  
+
   /// Log current step of compboost iteration dependent on the child class
-  virtual void logStep (const unsigned int&, const arma::vec&, const arma::vec&, 
-    blearner::Baselearner*, const double&, const double&, const double&) = 0;
-  
+  virtual void logStep (const unsigned int&, std::shared_ptr<response::Response>,
+    blearner::Baselearner*, const double&, const double&) = 0;
+
   /// Class dependent check if the stopping criterion is fulfilled
   virtual bool reachedStopCriteria () const = 0;
-  
+
   /// Return the data stored within the logger
   virtual arma::vec getLoggedData () const = 0;
-  
+
   /// Clear the logger data
   virtual void clearLoggerData () = 0;
-  
-  /// Print status of current iteration into the console 
+
+  /// Print status of the current iteration into the console
   virtual std::string printLoggerStatus () const = 0;

   /// Get logger identifier:
   std::string getLoggerId () const;
-  
+
   /// Just a getter if the logger is also used as stopper
   bool getIfLoggerIsStopper () const;
-  
-  virtual 
+
+  virtual
   ~Logger ();
-  
+
 protected:

   std::string logger_id;
-  
+
   /// Tag if the logger is used as stopper
   bool is_a_stopper;
 };
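A minimal custom logger (hypothetical sketch, not part of this change) then
just has to implement the pure virtual interface above:

```
class MyIterationTracker : public logger::Logger
{
public:
  /// Track the raw iteration numbers:
  void logStep (const unsigned int& current_iteration, std::shared_ptr<response::Response> sh_ptr_response,
    blearner::Baselearner* used_blearner, const double& learning_rate, const double& step_size)
  {
    logged_values.push_back(current_iteration);
  }
  bool reachedStopCriteria () const { return false; } // never used as stopper
  arma::vec getLoggedData () const { return arma::conv_to<arma::vec>::from(logged_values); }
  void clearLoggerData () { logged_values.clear(); }
  std::string printLoggerStatus () const { return logger_id; }

private:
  std::vector<double> logged_values;
};
```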
@@ -130,46 +117,46 @@ class Logger

 /**
  * \class LoggerIteration
- * 
+ *
  * \brief Logger class to log the current iteration
- * 
- * This class seems to be useless, but it gives more control about the algorithm 
- * and doesn't violate the idea of object programming here. Additionally, it is 
- * quite convenient to have this class instead of tracking the iteration at any 
+ *
+ * This class seems to be useless, but it gives more control over the algorithm
+ * and doesn't violate the idea of object-oriented programming here. Additionally,
+ * it is quite convenient to have this class instead of tracking the iteration at any
  * stage of the fitting within the compboost object as another vector.
- * 
+ *
  */
-class LoggerIteration : public Logger 
+class LoggerIteration : public Logger
 {
 private:
-  
+
   /// Maximal number of iterations (only interesting if used as stopper)
   unsigned int max_iterations;
-  
-  /// Vector to log the iterations 
+
+  /// Vector to log the iterations
   std::vector<unsigned int> iterations;
-  
-  
+
+
 public:
-  
+
   /// Default constructor of class `LoggerIteration`
   LoggerIteration (const std::string&, const bool&, const unsigned int&);
-  
+
   /// Log current step of compboost iteration of class `LoggerIteration`
-  void logStep (const unsigned int&, const arma::vec&, const arma::vec&, 
-    blearner::Baselearner*, const double&, const double&, const double&);
-  
+  void logStep (const unsigned int&, std::shared_ptr<response::Response>,
+    blearner::Baselearner*, const double&, const double&);
+
   /// Stop criterion is fulfilled if the current iteration exceeds `max_iterations`
   bool reachedStopCriteria () const;
-  
+
   /// Return the data stored within the iteration logger
   arma::vec getLoggedData () const;
-  
+
   /// Clear the logger data
   void clearLoggerData ();
-  
-  /// Print status of current iteration into the console 
+
+  /// Print status of the current iteration into the console
   std::string printLoggerStatus () const;
 };

@@ -178,51 +165,51 @@ class LoggerIteration : public Logger

 /**
  * \class LoggerInbagRisk
- * 
+ *
  * \brief Logger class to log the inbag risk
- * 
+ *
  * This class logs the inbag risk for a specific loss function. It is possible
  * to define more than one inbag risk logger (e.g. for 2 different loss
  * functions). For details about logging and stopping see the description of the
  * `logStep()` function.
- * 
+ *
  */
 class LoggerInbagRisk : public Logger
 {
 private:
-  
+
   /// Used loss. **Note** that you can specify a different loss than the loss used for training
   loss::Loss* used_loss;
-  
+
   /// Vector of inbag risk for every iteration
   std::vector<double> tracked_inbag_risk;
-  
+
   /// Stopping criterion, stop if \f$(\mathrm{risk}_{i-1} - \mathrm{risk}_i) / \mathrm{risk}_{i-1} < \mathrm{eps\_for\_break}\f$
   double eps_for_break;
-  
-  
+
+
 public:
-  
+
   /// Default constructor
   LoggerInbagRisk (const std::string&, const bool&, loss::Loss*, const double&);
-  
+
   /// Log current step of compboost iteration for class `LoggerInbagRisk`
-  void logStep (const unsigned int&, const arma::vec&, const arma::vec&, 
-    blearner::Baselearner*, const double&, const double&, const double&);
-  
+  void logStep (const unsigned int&, std::shared_ptr<response::Response>,
+    blearner::Baselearner*, const double&, const double&);
+
   /// Stop criterion is fulfilled if the relative improvement falls below `eps_for_break`
   bool reachedStopCriteria () const;
-  
+
   /// Return the data stored within the logger
   arma::vec getLoggedData () const;
-  
+
   /// Clear the logger data
   void clearLoggerData ();
-  
-  /// Print status of current iteration into the console 
+
+  /// Print status of the current iteration into the console
   std::string printLoggerStatus () const;
-  
 };

// OobRisk:

@@ -230,32 +217,32 @@ class LoggerInbagRisk : public Logger

 /**
  * \class LoggerOobRisk
- * 
+ *
  * \brief Logger class to log the out of bag risk
- * 
+ *
  * This class logs the out of bag risk for a specific loss function and a map
- * of new data. It is possible to define more than one inbag risk logger 
- * (e.g. for 2 different loss functions). For details about logging and 
+ * of new data. It is possible to define more than one OOB risk logger
+ * (e.g. for 2 different loss functions). For details about logging and
 * stopping see the description of the `logStep()` function.
- * 
+ *
  */
 class LoggerOobRisk : public Logger
 {
 private:
-  
+
   /// Used loss. **Note** that you can specify a different loss than the loss used for training
   loss::Loss* used_loss;
-  
+
   /// Vector of OOB risk for every iteration
   std::vector<double> tracked_oob_risk;
-  
+
   /// Stopping criterion, stop if \f$(\mathrm{risk}_{i-1} - \mathrm{risk}_i) / \mathrm{risk}_{i-1} < \mathrm{eps\_for\_break}\f$
   double eps_for_break;
-  
+
   /// OOB prediction which is internally done in every iteration
-  arma::vec oob_prediction;
-  
+  arma::mat oob_prediction;
+
   /// The OOB data provided by the user
   std::map<std::string, data::Data*> oob_data;

@@ -263,33 +250,33 @@ class LoggerOobRisk : public Logger
  * /// Transformed oob data for predicting on the oob set:
  * std::map<std::string, arma::mat> oob_data_transformed;
  */
-  
+
   /// The response variable which corresponds to the given OOB data
-  arma::vec oob_response;
-  
-  
+  std::shared_ptr<response::Response> sh_ptr_oob_response;
+
+
 public:
-  
+
   /// Default constructor
-  LoggerOobRisk (const std::string&, const bool&, loss::Loss*, const double&, 
-    std::map<std::string, data::Data*>, const arma::vec&);
-  
+  LoggerOobRisk (const std::string&, const bool&, loss::Loss*, const double&,
+    std::map<std::string, data::Data*>, std::shared_ptr<response::Response>);
+
   /// Log current step of compboost iteration for class `LoggerOobRisk`
-  void logStep (const unsigned int&, const arma::vec&, const arma::vec&, 
-    blearner::Baselearner*, const double&, const double&, const double&);
-  
+  void logStep (const unsigned int&, std::shared_ptr<response::Response>,
+    blearner::Baselearner*, const double&, const double&);
+
   /// Stop criterion is fulfilled if the relative improvement falls below `eps_for_break`
   bool reachedStopCriteria () const;
-  
+
   /// Return the data stored within the logger
   arma::vec getLoggedData () const;
-  
+
   /// Clear the logger data
   void clearLoggerData ();
-  
-  /// Print status of current iteration into the console 
+
+  /// Print status of the current iteration into the console
   std::string printLoggerStatus () const;
-  
 };

// LoggerTime:

@@ -297,56 +284,56 @@ class LoggerOobRisk : public Logger

 /**
  * \class LoggerTime
- * 
+ *
  * \brief Logger class to log the elapsed time
- * 
+ *
 * This class just logs the elapsed time. This should be very handy if one
 * wants to run the algorithm for, say, two hours and see how far it gets within
 * that time.
 There are three time units available for logging:
  * - minutes
  * - seconds
  * - microseconds
- * 
+ *
  */
 class LoggerTime : public Logger
 {
 private:
-  
+
   /// Initial time, important to get the actual elapsed time
   std::chrono::steady_clock::time_point init_time;
-  
+
   /// Vector of elapsed time at each iteration
   std::vector<unsigned int> current_time;
-  
+
   /// Stopping criterion, stop if \f$\mathrm{current\_time} > \mathrm{max\_time}\f$
   unsigned int max_time;
-  
-  /// The unit for time measuring, allowed are `minutes`, `seconds` and `microseconds` 
+
+  /// The unit for time measuring, allowed are `minutes`, `seconds` and `microseconds`
   std::string time_unit;
-  
-  
+
+
 public:
-  
+
   /// Default constructor of class `LoggerTime`
   LoggerTime (const std::string&, const bool&, const unsigned int&, const std::string&);
-  
+
   /// Log current step of compboost iteration for class `LoggerTime`
-  void logStep (const unsigned int&, const arma::vec&, const arma::vec&, 
-    blearner::Baselearner*, const double&, const double&, const double&);
-  
+  void logStep (const unsigned int&, std::shared_ptr<response::Response>,
+    blearner::Baselearner*, const double&, const double&);
+
   /// Stop criterion is fulfilled if the passed time exceeds `max_time`
   bool reachedStopCriteria () const;
-  
+
   /// Return the data stored within the logger
   arma::vec getLoggedData () const;
-  
+
   /// Clear the logger data
   void clearLoggerData();
-  
-  /// Print status of current iteration into the console 
+
+  /// Print status of the current iteration into the console
   std::string printLoggerStatus () const;
-  
 };

} // namespace logger

diff --git a/src/loggerlist.cpp b/src/loggerlist.cpp
index 3fd00f51..866ff8d4 100644
--- a/src/loggerlist.cpp
+++ b/src/loggerlist.cpp
@@ -110,9 +110,8 @@ std::pair<std::vector<std::string>, arma::mat> LoggerList::getLoggerData () cons
   return std::pair<std::vector<std::string>, arma::mat>(logger_names, out_matrix);
 }

-void LoggerList::logCurrent (const unsigned int& current_iteration, const arma::vec& response, 
-  const arma::vec& prediction, blearner::Baselearner* used_blearner, const double& offset, 
-  const double& learning_rate, const double& step_size)
+void LoggerList::logCurrent (const unsigned int& current_iteration, std::shared_ptr<response::Response> sh_ptr_response,
+  blearner::Baselearner* used_blearner, const double& learning_rate, const double& step_size)
 {
   // Think about how to implement this the best way. I think the computations
   // e.g. for the risk should be done within the logger object. If so, the
@@ -126,8 +125,8 @@ void LoggerList::logCurrent (const unsigned int& current_iteration, const arma::
   // This can be easily extended to an oob risk by just using the evaluation
   // data specified by initializing the logger list.
   for (logger_map::iterator it = log_list.begin(); it != log_list.end(); ++it) {
-    it->second->logStep(current_iteration, response, prediction, used_blearner, 
-      offset, learning_rate, step_size);
+    it->second->logStep(current_iteration, sh_ptr_response, used_blearner,
+      learning_rate, step_size);
   }
 }
 // Print logger:
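A usage sketch (illustration only; `log_iterations` and `log_time` stand for
any configured logger objects): register the loggers, then query the
aggregated stopper status.

```
loggerlist::LoggerList logger_list;
logger_list.registerLogger(&log_iterations);
logger_list.registerLogger(&log_time);

// 'true' means that all registered stoppers have to be fulfilled to stop:
bool stop_algorithm = logger_list.getStopperStatus(true);
```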
diff --git a/src/loggerlist.h b/src/loggerlist.h
index 432514aa..58fa8039 100644
--- a/src/loggerlist.h
+++ b/src/loggerlist.h
@@ -13,23 +13,8 @@
 // Compboost is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// MIT License for more details. You should have received a copy of 
-// the MIT License along with compboost. 
-//
-// Written by:
-// -----------
-//
-//   Daniel Schalk
-//   Department of Statistics
-//   Ludwig-Maximilians-University Munich
-//   Ludwigstrasse 33
-//   D-80539 München
-//
-//   https://www.compstat.statistik.uni-muenchen.de
-//
-//   Contact
-//   e: contact@danielschalk.com
-//   w: danielschalk.com
+// MIT License for more details. You should have received a copy of
+// the MIT License along with compboost.
 //
 // =========================================================================== #

@@ -40,6 +25,7 @@

 #include <map>

 #include "logger.h"
+#include "response.h"

 typedef std::map<std::string, logger::Logger*> logger_map;

 namespace loggerlist
 {

 class LoggerList
 {
 private:
-  
+
   logger_map log_list;
   unsigned int sum_of_stopper = 0;
-  
+
 public:
-  
+
   LoggerList ();
   // LoggerList (arma::mat&, std::chrono::system_clock::time_point, double);
-  
+
   // String for logger and the logger itself:
   void registerLogger (logger::Logger*);
   void printRegisteredLogger () const;
-  
+
   logger_map getMap () const;
   void clearMap ();
-  
+
   // This function should iterate over all registered loggers, check if each is
   // a stopper, and return just one bool, aggregated over a vector of bools
-  // from the single logger. This could be e.g. one is fullfilled or an all 
-  // check (all stopper has to be fullfilled). The priority comes with the 
+  // from the single loggers. This could be, e.g., an any check (one is fulfilled)
+  // or an all check (all stoppers have to be fulfilled). The priority comes with the
   // map identifier since it sorts the entries by name.
-  
+
   // If the argument is 'true', then all stoppers have to be fulfilled.
   bool getStopperStatus (const bool&) const;
-  
-  // Get a matrix of tracked logger (iterator over all logger and paste 
+
+  // Get a matrix of tracked logger data (iterate over all loggers and paste
   // all columns of the private member). The return is a pair with a
   // string vector containing the logger type and a matrix with corresponding
   // columns for each logger type:
   std::pair<std::vector<std::string>, arma::mat> getLoggerData () const;
-  
+
   // Log the current step (structure ).
   // This is given to the instantiated logger:
-  void logCurrent (const unsigned int&, const arma::vec&, const arma::vec&, 
-    blearner::Baselearner*, const double&, const double&, const double&);
-  
+  void logCurrent (const unsigned int&, std::shared_ptr<response::Response>,
+    blearner::Baselearner*, const double&, const double&);
+
   // Print the logger status:
   void printLoggerStatus (const double&) const;
-  
+
   // Clear the logger data (should be used in front of every compboost training):
   void clearLoggerData ();
-  
+
   // Destructor:
   ~LoggerList ();
 };

diff --git a/src/loss.cpp b/src/loss.cpp
index 59b91208..a9af981a 100644
--- a/src/loss.cpp
+++ b/src/loss.cpp
@@ -13,23 +13,8 @@
 // Compboost is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// MIT License for more details. You should have received a copy of 
-// the MIT License along with compboost. 
-//
-// Written by:
-// -----------
-//
-//   Daniel Schalk
-//   Department of Statistics
-//   Ludwig-Maximilians-University Munich
-//   Ludwigstrasse 33
-//   D-80539 München
-//
-//   https://www.compstat.statistik.uni-muenchen.de
-//
-//   Contact
-//   e: contact@danielschalk.com
-//   w: danielschalk.com
+// MIT License for more details. You should have received a copy of
+// the MIT License along with compboost.
// // =========================================================================== # @@ -41,6 +26,38 @@ namespace loss // Parent class: // ----------------------- +std::string Loss::getTaskId () const +{ + return task_id; +} + +arma::mat Loss::weightedLoss (const arma::mat& true_value, const arma::mat& prediction, const arma::mat& weights) const +{ + return weights % definedLoss(true_value, prediction); +} +arma::mat Loss::weightedGradient (const arma::mat& true_value, const arma::mat& prediction, const arma::mat& weights) const +{ + return weights % definedGradient(true_value, prediction); +} + +double Loss::calculateEmpiricalRisk (const arma::mat& true_value, const arma::mat& prediction) const +{ + return arma::accu(definedLoss(true_value, prediction)) / true_value.size(); +} +double Loss::calculateWeightedEmpiricalRisk (const arma::mat& true_value, const arma::mat& prediction, const arma::mat& weights) const +{ + return arma::accu(weightedLoss(true_value, prediction, weights)) / true_value.size(); +} + +arma::mat Loss::calculatePseudoResiduals (const arma::mat& true_value, const arma::mat& prediction) const +{ + return -definedGradient(true_value, prediction); +} +arma::mat Loss::calculateWeightedPseudoResiduals (const arma::mat& true_value, const arma::mat& prediction, const arma::mat& weights) const +{ + return -weightedGradient(true_value, prediction, weights); +} + Loss::~Loss () { // Rcpp::Rcout << "Call Loss Destructor" << std::endl; } @@ -54,21 +71,23 @@ Loss::~Loss () { /** * \brief Default constructor of `LossQuadratic` - * + * */ - -LossQuadratic::LossQuadratic () { } +LossQuadratic::LossQuadratic () +{ + task_id = "regression"; +} /** * \brief Constructor to initialize custom offset of `LossQuadratic` - * - * \param custom_offset0 `double` Offset which is used instead of the pre + * + * \param custom_offset0 `double` Offset which is used instead of the pre * defined initialization - * + * */ - LossQuadratic::LossQuadratic (const double& custom_offset0) -{ +{ + task_id = "regression"; custom_offset = custom_offset0; use_custom_offset = true; } @@ -76,14 +95,13 @@ LossQuadratic::LossQuadratic (const double& custom_offset0) /** * \brief Definition of the loss function (see description of the class) - * - * \param true_value `arma::vec` True value of the response - * \param prediction `arma::vec` Prediction of the true value - * - * \returns `arma::vec` vector of elementwise application of the loss function + * + * \param true_value `arma::mat` True value of the response + * \param prediction `arma::mat` Prediction of the true value + * + * \returns `arma::mat` vector of elementwise application of the loss function */ - -arma::vec LossQuadratic::definedLoss (const arma::vec& true_value, const arma::vec& prediction) const +arma::mat LossQuadratic::definedLoss (const arma::mat& true_value, const arma::mat& prediction) const { // for debugging: // Rcpp::Rcout << "Calculate loss of child class Quadratic!" 
<< std::endl; @@ -92,14 +110,13 @@ arma::vec LossQuadratic::definedLoss (const arma::vec& true_value, const arma::v /** * \brief Definition of the gradient of the loss function (see description of the class) - * - * \param true_value `arma::vec` True value of the response - * \param prediction `arma::vec` Prediction of the true value - * - * \returns `arma::vec` vector of elementwise application of the gradient + * + * \param true_value `arma::mat` True value of the response + * \param prediction `arma::mat` Prediction of the true value + * + * \returns `arma::mat` vector of elementwise application of the gradient */ - -arma::vec LossQuadratic::definedGradient (const arma::vec& true_value, const arma::vec& prediction) const +arma::mat LossQuadratic::definedGradient (const arma::mat& true_value, const arma::mat& prediction) const { // for debugging: // Rcpp::Rcout << "Calculate gradient of child class Quadratic!" << std::endl; @@ -108,28 +125,24 @@ arma::vec LossQuadratic::definedGradient (const arma::vec& true_value, const arm /** * \brief Definition of the constant risk initialization (see description of the class) - * - * \param true_value `arma::vec` True value of the response - * + * + * \param true_value `arma::mat` True value of the response + * * \returns `double` constant which minimizes the empirical risk for the given true value */ - -double LossQuadratic::constantInitializer (const arma::vec& true_value) const +arma::mat LossQuadratic::constantInitializer (const arma::mat& true_value) const { if (use_custom_offset) { return custom_offset; } - return arma::mean(true_value); + arma::mat out(1, 1); + out.fill(arma::accu(true_value) / true_value.size()); + return out; } - -/** - * \brief Definition of the response function - * - * \param score `arma::vec` The score trained during the fitting process - * - * \returns `arma::vec` The transforemd score. 
- */ -arma::vec LossQuadratic::responseTransformation (const arma::vec& score) const +arma::mat LossQuadratic::weightedConstantInitializer (const arma::mat& true_value, const arma::mat& weights) const { - return score; + if (use_custom_offset) { return custom_offset; } + arma::mat out(1, 1); + out.fill(arma::accu(weights % true_value) / true_value.size()); + return out; } @@ -138,35 +151,36 @@ arma::vec LossQuadratic::responseTransformation (const arma::vec& score) const /** * \brief Default constructor of `LossAbsolute` - * + * */ - -LossAbsolute::LossAbsolute () { } +LossAbsolute::LossAbsolute () +{ + task_id = "regression"; // set parent +} /** * \brief Constructor to initialize custom offset of `LossAbsolute` - * - * \param custom_offset0 `double` Offset which is used instead of the pre + * + * \param custom_offset0 `double` Offset which is used instead of the pre * defined initialization - * + * */ - LossAbsolute::LossAbsolute (const double& custom_offset0) -{ +{ + task_id = "regression"; // set parent custom_offset = custom_offset0; use_custom_offset = true; } /** * \brief Definition of the loss function (see description of the class) - * - * \param true_value `arma::vec` True value of the response - * \param prediction `arma::vec` Prediction of the true value - * - * \returns `arma::vec` vector of elementwise application of the loss function + * + * \param true_value `arma::mat` True value of the response + * \param prediction `arma::mat` Prediction of the true value + * + * \returns `arma::mat` vector of elementwise application of the loss function */ - -arma::vec LossAbsolute::definedLoss (const arma::vec& true_value, const arma::vec& prediction) const +arma::mat LossAbsolute::definedLoss (const arma::mat& true_value, const arma::mat& prediction) const { // for debugging: // Rcpp::Rcout << "Calculate loss of child class Absolute!" << std::endl; @@ -175,14 +189,13 @@ arma::vec LossAbsolute::definedLoss (const arma::vec& true_value, const arma::ve /** * \brief Definition of the gradient of the loss function (see description of the class) - * - * \param true_value `arma::vec` True value of the response - * \param prediction `arma::vec` Prediction of the true value - * - * \returns `arma::vec` vector of elementwise application of the gradient + * + * \param true_value `arma::mat` True value of the response + * \param prediction `arma::mat` Prediction of the true value + * + * \returns `arma::mat` vector of elementwise application of the gradient */ - -arma::vec LossAbsolute::definedGradient (const arma::vec& true_value, const arma::vec& prediction) const +arma::mat LossAbsolute::definedGradient (const arma::mat& true_value, const arma::mat& prediction) const { // for debugging: // Rcpp::Rcout << "Calculate gradient of child class Absolute!" 
<< std::endl; @@ -191,133 +204,105 @@ arma::vec LossAbsolute::definedGradient (const arma::vec& true_value, const arma /** * \brief Definition of the constant risk initialization (see description of the class) - * - * \param true_value `arma::vec` True value of the response - * + * + * \param true_value `arma::mat` True value of the response + * * \returns `double` constant which minimizes the empirical risk for the given true value */ - -double LossAbsolute::constantInitializer (const arma::vec& true_value) const +arma::mat LossAbsolute::constantInitializer (const arma::mat& true_value) const { if (use_custom_offset) { return custom_offset; } - return arma::median(true_value); + arma::vec temp = true_value; + arma::mat out(1, 1); + out.fill(arma::median(temp)); + return out; } - -/** - * \brief Definition of the response function - * - * \param score `arma::vec` The score trained during the fitting process - * - * \returns `arma::vec` The transforemd score. - */ -arma::vec LossAbsolute::responseTransformation (const arma::vec& score) const +arma::mat LossAbsolute::weightedConstantInitializer (const arma::mat& true_value, const arma::mat& weights) const { - return score; + return constantInitializer(true_value); } - // Binomial loss: // ----------------------- /** * \brief Default constructor of `LossBinomial` - * + * */ - -LossBinomial::LossBinomial () { } +LossBinomial::LossBinomial () +{ + task_id = "binary_classif"; // set parent +} /** * \brief Constructor to initialize custom offset of `LossAbsolute` -* -* \param custom_offset0 `double` Offset which is used instead of the pre +* +* \param custom_offset0 `double` Offset which is used instead of the pre * defined initialization -* +* */ - LossBinomial::LossBinomial (const double& custom_offset0) -{ +{ + task_id = "binary_classif"; // set parent if (custom_offset0 > 1 || custom_offset0 < -1) { - + Rcpp::warning("LossBinomial allows just values between -1 and 1 as offset. 
Continuing with default offset."); - + } else { - + custom_offset = custom_offset0; use_custom_offset = true; - + } } /** * \brief Definition of the loss function (see description of the class) -* -* \param true_value `arma::vec` True value of the response -* \param prediction `arma::vec` Prediction of the true value -* -* \returns `arma::vec` vector of elementwise application of the loss function +* +* \param true_value `arma::mat` True value of the response +* \param prediction `arma::mat` Prediction of the true value +* +* \returns `arma::mat` vector of elementwise application of the loss function */ - -arma::vec LossBinomial::definedLoss (const arma::vec& true_value, const arma::vec& prediction) const +arma::mat LossBinomial::definedLoss (const arma::mat& true_value, const arma::mat& prediction) const { return arma::log(1 + arma::exp(-2 * true_value % prediction)); } /** * \brief Definition of the gradient of the loss function (see description of the class) -* -* \param true_value `arma::vec` True value of the response -* \param prediction `arma::vec` Prediction of the true value -* -* \returns `arma::vec` vector of elementwise application of the gradient +* +* \param true_value `arma::mat` True value of the response +* \param prediction `arma::mat` Prediction of the true value +* +* \returns `arma::mat` vector of elementwise application of the gradient */ - -arma::vec LossBinomial::definedGradient (const arma::vec& true_value, const arma::vec& prediction) const +arma::mat LossBinomial::definedGradient (const arma::mat& true_value, const arma::mat& prediction) const { return -2 * true_value / (1 + arma::exp(true_value % prediction)); } /** * \brief Definition of the constant risk initialization (see description of the class) -* -* \param true_value `arma::vec` True value of the response -* +* +* \param true_value `arma::mat` True value of the response +* * \returns `double` constant which minimizes the empirical risk for the given true value */ - -double LossBinomial::constantInitializer (const arma::vec& true_value) const -{ - arma::vec unique_values = arma::unique(true_value); - // This is necessary to prevent the program from segfolds... whyever??? - // Copied from: http://lists.r-forge.r-project.org/pipermail/rcpp-devel/2012-November/004796.html - try { - if (unique_values.size() != 2) { - Rcpp::stop("Binomial loss does not support multiclass classification."); - } - if (! arma::all((true_value == -1) || (true_value == 1))) { - Rcpp::stop("Labels must be coded as -1 and 1."); - } - } catch ( std::exception &ex ) { - forward_exception_to_r( ex ); - } catch (...) { - ::Rf_error( "c++ exception (unknown reason)" ); - } +arma::mat LossBinomial::constantInitializer (const arma::mat& true_value) const +{ + helper::checkForBinaryClassif(true_value, 1, -1); if (use_custom_offset) { return custom_offset; } - + double p = arma::accu(true_value + 1) / (2 * true_value.size()); - return 0.5 * std::log(p / (1 - p)); + arma::mat out(1, 1); + out.fill(0.5 * std::log(p / (1 - p))); + return out; } - -/** - * \brief Definition of the response function - * - * \param score `arma::vec` The score trained during the fitting process - * - * \returns `arma::vec` The transforemd score. 
- */ -arma::vec LossBinomial::responseTransformation (const arma::vec& score) const +arma::mat LossBinomial::weightedConstantInitializer (const arma::mat& true_value, const arma::mat& weights) const { - return 1 / (1 + arma::exp(-score)); + return constantInitializer(true_value); } // Custom loss: @@ -325,21 +310,21 @@ arma::vec LossBinomial::responseTransformation (const arma::vec& score) const /** * \brief Default constructor of custom loss class - * + * * \param lossFun `Rcpp::Function` `R` function to calculate the loss - * \param gradientFun `Rcpp::Function` `R` function to calculate the gradient + * \param gradientFun `Rcpp::Function` `R` function to calculate the gradient * of the loss function * \param initFun `Rcpp::Function` `R` function to initialize a constant (here * it is not neccessary to initialize in a loss/risk optimal manner) - * + * * \returns `double` constant which minimizes the empirical risk for the given true value */ - -LossCustom::LossCustom (Rcpp::Function lossFun, Rcpp::Function gradientFun, Rcpp::Function initFun) - : lossFun( lossFun ), - gradientFun( gradientFun ), +LossCustom::LossCustom (Rcpp::Function lossFun, Rcpp::Function gradientFun, Rcpp::Function initFun) + : lossFun( lossFun ), + gradientFun( gradientFun ), initFun( initFun ) { + task_id = "custom"; // Rcpp::Rcout << "Be careful! You are using a custom loss out of R!" // << "This will slow down everything!" // << std::endl; @@ -347,14 +332,13 @@ LossCustom::LossCustom (Rcpp::Function lossFun, Rcpp::Function gradientFun, Rcpp /** * \brief Definition of the loss function (see description of the class) - * - * \param true_value `arma::vec` True value of the response - * \param prediction `arma::vec` Prediction of the true value - * - * \returns `arma::vec` vector of elementwise application of the loss function + * + * \param true_value `arma::mat` True value of the response + * \param prediction `arma::mat` Prediction of the true value + * + * \returns `arma::mat` vector of elementwise application of the loss function */ - -arma::vec LossCustom::definedLoss (const arma::vec& true_value, const arma::vec& prediction) const +arma::mat LossCustom::definedLoss (const arma::mat& true_value, const arma::mat& prediction) const { // for debugging: // Rcpp::Rcout << "Calculate loss for a custom loss!" << std::endl; @@ -364,14 +348,13 @@ arma::vec LossCustom::definedLoss (const arma::vec& true_value, const arma::vec& /** * \brief Definition of the gradient of the loss function (see description of the class) - * - * \param true_value `arma::vec` True value of the response - * \param prediction `arma::vec` Prediction of the true value - * - * \returns `arma::vec` vector of elementwise application of the gradient + * + * \param true_value `arma::mat` True value of the response + * \param prediction `arma::mat` Prediction of the true value + * + * \returns `arma::mat` vector of elementwise application of the gradient */ - -arma::vec LossCustom::definedGradient (const arma::vec& true_value, const arma::vec& prediction) const +arma::mat LossCustom::definedGradient (const arma::mat& true_value, const arma::mat& prediction) const { // for debugging: // Rcpp::Rcout << "Calculate gradient for a custom loss!" 
<< std::endl;
@@ -381,116 +364,96 @@ arma::vec LossCustom::definedGradient (const arma::vec& true_value, const arma::

 /**
  * \brief Definition of the constant risk initialization (see description of the class)
- * 
- * \param true_value `arma::vec` True value of the response
- * 
+ *
+ * \param true_value `arma::mat` True value of the response
+ *
 * \returns `double` constant which minimizes the empirical risk for the given true value
 */
-
-double LossCustom::constantInitializer (const arma::vec& true_value) const
+arma::mat LossCustom::constantInitializer (const arma::mat& true_value) const
{
  // for debugging:
  // Rcpp::Rcout << "Initialize custom loss!" << std::endl;
-  
+
  Rcpp::NumericVector out = initFun(true_value);
-  
-  return out[0];
+  arma::mat out_single(1, 1);
+  out_single.fill(out[0]);
+  return out_single;
}
-
-/**
- * \brief Definition of the response function
- * 
- * \param score `arma::vec` The score trained during the fitting process
- * 
- * \returns `arma::vec` The transforemd score.
- */
-arma::vec LossCustom::responseTransformation (const arma::vec& score) const
+arma::mat LossCustom::weightedConstantInitializer (const arma::mat& true_value, const arma::mat& weights) const
{
-  return score;
+  return constantInitializer(true_value);
}
-

// Custom cpp loss:
// -----------------------

/**
 * \brief Default constructor of custom cpp loss class
-* 
+*
* \param lossFun `Rcpp::Function` `R` function to calculate the loss
-* \param gradientFun `Rcpp::Function` `R` function to calculate the gradient 
+* \param gradientFun `Rcpp::Function` `R` function to calculate the gradient
*   of the loss function
* \param initFun `Rcpp::Function` `R` function to initialize a constant (here
*   it is not necessary to initialize in a loss/risk optimal manner)
-* 
+*
* \returns `double` constant which minimizes the empirical risk for the given true value
*/
-
LossCustomCpp::LossCustomCpp (SEXP lossFun0, SEXP gradFun0, SEXP constInitFun0)
{
+  task_id = "custom"; // set parent
  // Set functions:
  Rcpp::XPtr<lossFunPtr> myTempLoss (lossFun0);
  lossFun = *myTempLoss;
-  
+
  Rcpp::XPtr<gradFunPtr> myTempGrad (gradFun0);
  gradFun = *myTempGrad;
-  
+
  Rcpp::XPtr<constInitFunPtr> myTempConstInit (constInitFun0);
  constInitFun = *myTempConstInit;
}

/**
 * \brief Definition of the loss function (see description of the class)
- * 
- * \param true_value `arma::vec` True value of the response
- * \param prediction `arma::vec` Prediction of the true value
- * 
- * \returns `arma::vec` vector of elementwise application of the loss function
+ *
+ * \param true_value `arma::mat` True value of the response
+ * \param prediction `arma::mat` Prediction of the true value
+ *
+ * \returns `arma::mat` vector of elementwise application of the loss function
 */
-
-arma::vec LossCustomCpp::definedLoss (const arma::vec& true_value, const arma::vec& prediction) const
+arma::mat LossCustomCpp::definedLoss (const arma::mat& true_value, const arma::mat& prediction) const
{
  return lossFun(true_value, prediction);
}

/**
 * \brief Definition of the gradient of the loss function (see description of the class)
-* 
-* \param true_value `arma::vec` True value of the response
-* \param prediction `arma::vec` Prediction of the true value
-* 
-* \returns `arma::vec` vector of elementwise application of the gradient
+*
+* \param true_value `arma::mat` True value of the response
+* \param prediction `arma::mat` Prediction of the true value
+*
+* \returns `arma::mat` vector of elementwise application of the gradient
*/
-
-arma::vec LossCustomCpp::definedGradient (const arma::vec& true_value, const arma::vec&
prediction) const +arma::mat LossCustomCpp::definedGradient (const arma::mat& true_value, const arma::mat& prediction) const { return gradFun(true_value, prediction); } /** * \brief Definition of the constant risk initialization (see description of the class) -* -* \param true_value `arma::vec` True value of the response -* +* +* \param true_value `arma::mat` True value of the response +* * \returns `double` constant which minimizes the empirical risk for the given true value */ - -double LossCustomCpp::constantInitializer (const arma::vec& true_value) const +arma::mat LossCustomCpp::constantInitializer (const arma::mat& true_value) const { - return constInitFun(true_value); + arma::mat out(1, 1); + out.fill(constInitFun(true_value)); + return out; } - -/** - * \brief Definition of the response function - * - * \param score `arma::vec` The score trained during the fitting process - * - * \returns `arma::vec` The transforemd score. - */ -arma::vec LossCustomCpp::responseTransformation (const arma::vec& score) const +arma::mat LossCustomCpp::weightedConstantInitializer (const arma::mat& true_value, const arma::mat& weights) const { - return score; + return constantInitializer(true_value); } - - } // namespace loss diff --git a/src/loss.h b/src/loss.h index 07e679c2..b6a3db4a 100644 --- a/src/loss.h +++ b/src/loss.h @@ -13,44 +13,20 @@ // Compboost is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// MIT License for more details. You should have received a copy of -// the MIT License along with compboost. +// MIT License for more details. You should have received a copy of +// the MIT License along with compboost. // -// Written by: -// ----------- -// -// Daniel Schalk -// Department of Statistics -// Ludwig-Maximilians-University Munich -// Ludwigstrasse 33 -// D-80539 München -// -// https://www.compstat.statistik.uni-muenchen.de -// -// Contact -// e: contact@danielschalk.com -// w: danielschalk.com -// -// =========================================================================== # +// ========================================================================== // -/** +/** * @file loss.h * @author Daniel Schalk (github: schalkdaniel) - * + * * @brief Loss class definition * * @section DESCRIPTION - * - * This file contains the different loss implementations. The structure is: - * - * ``` - * class SpecificLoss: public Loss - * { - * arma::vec definedLoss { IMPLEMENTATION }; - * arma::vec definedGradient { IMPLEMENTATION }; - * double constantInitializer { IMPLEMENTATION }; - * } - * ``` + * + * This file contains the different loss implementations. * */ @@ -62,6 +38,8 @@ #include #include +#include "helper.h" + namespace loss { @@ -70,42 +48,49 @@ namespace loss /** * \class Loss - * + * * \brief Abstract loss class - * - * This class defines the minimal requirements of every loss class. Note that + * + * This class defines the minimal requirements of every loss class. Note that * the custom offset uses two members. The initial idea of assigning `NAN` to * the `custom_offset` fails. 
- * + * */ class Loss { public: - + /// Get the task id + std::string getTaskId () const; + /// Specific loss function - virtual arma::vec definedLoss (const arma::vec&, const arma::vec&) const = 0; - + virtual arma::mat definedLoss (const arma::mat&, const arma::mat&) const = 0; + arma::mat weightedLoss (const arma::mat&, const arma::mat&, const arma::mat&) const; + /// Gradient of loss functions for pseudo residuals - virtual arma::vec definedGradient (const arma::vec&, const arma::vec&) const = 0; - + virtual arma::mat definedGradient (const arma::mat&, const arma::mat&) const = 0; + arma::mat weightedGradient (const arma::mat&, const arma::mat&, const arma::mat&) const; + /// Constant initialization of the empirical risk - virtual double constantInitializer (const arma::vec&) const = 0; + virtual arma::mat constantInitializer (const arma::mat&) const = 0; + virtual arma::mat weightedConstantInitializer (const arma::mat&, const arma::mat&) const = 0; + + double calculateEmpiricalRisk (const arma::mat&, const arma::mat&) const; + double calculateWeightedEmpiricalRisk (const arma::mat&, const arma::mat&, const arma::mat&) const; + + arma::mat calculatePseudoResiduals (const arma::mat&, const arma::mat&) const; + arma::mat calculateWeightedPseudoResiduals (const arma::mat&, const arma::mat&, const arma::mat&) const; - /// Response function to map score to output space: - virtual arma::vec responseTransformation (const arma::vec&) const = 0; - virtual ~Loss (); - + protected: - + /// The id of the task, e.g. regression or binary classification + std::string task_id; + /// Custom offset: - double custom_offset; - + arma::mat custom_offset; + /// Tag if a custom offset is used bool use_custom_offset = false; - - /// Weights: - arma::vec weights; }; // -------------------------------------------------------------------------- // @@ -117,11 +102,11 @@ class Loss /** * \class LossQuadratic - * + * * \brief Quadratic loss for regression tasks. - * - * This loss can be used for regression with \f$y \in \mathbb{R}\f$. - * + * + * This loss can be used for regression with \f$y \in \mathbb{R}\f$. + * * **Loss Function:** * \f[ * L(y, f(x)) = \frac{1}{2}\left( y - f(x) \right)^2 @@ -135,29 +120,27 @@ class Loss * \hat{f}^{[0]}(x) = \underset{c\in\mathbb{R}}{\mathrm{arg~min}}\ \frac{1}{n}\sum\limits_{i=1}^n * L\left(y^{(i)}, c\right) = \bar{y} * \f] - * + * */ class LossQuadratic : public Loss { public: - + /// Default Constructor LossQuadratic (); - + /// Constructor to initialize custom offset LossQuadratic (const double&); - + /// Specific loss function - arma::vec definedLoss (const arma::vec&, const arma::vec&) const; - + arma::mat definedLoss (const arma::mat&, const arma::mat&) const; + /// Gradient of loss functions for pseudo residuals - arma::vec definedGradient (const arma::vec&, const arma::vec&) const; - - /// Constant initialization of the empirical risk - double constantInitializer (const arma::vec&) const; + arma::mat definedGradient (const arma::mat&, const arma::mat&) const; - /// Definition of the response function - arma::vec responseTransformation (const arma::vec&) const; + /// Constant initialization of the empirical risk + arma::mat constantInitializer (const arma::mat&) const; + arma::mat weightedConstantInitializer (const arma::mat&, const arma::mat&) const; }; // LossAbsolute loss: @@ -165,9 +148,9 @@ class LossQuadratic : public Loss /** * \class LossAbsolute - * + * * \brief Absolute loss for regression tasks. 
- * + * * **Loss Function:** * \f[ * L(y, f(x)) = \left| y - f(x) \right| * \f] @@ -181,29 +164,27 @@ class LossQuadratic : public Loss * \hat{f}^{[0]}(x) = \underset{c\in\mathbb{R}}{\mathrm{arg~min}}\ \frac{1}{n}\sum\limits_{i=1}^n * L\left(y^{(i)}, c\right) = \mathrm{median}(y) * \f] - * + * */ class LossAbsolute : public Loss { public: - + /// Default Constructor LossAbsolute (); - + /// Constructor to initialize custom offset LossAbsolute (const double&); - + /// Specific loss function - arma::vec definedLoss (const arma::vec&, const arma::vec&) const; - + arma::mat definedLoss (const arma::mat&, const arma::mat&) const; + /// Gradient of loss functions for pseudo residuals - arma::vec definedGradient (const arma::vec&, const arma::vec&) const; - - /// Constant initialization of the empirical risk - double constantInitializer (const arma::vec&) const; + arma::mat definedGradient (const arma::mat&, const arma::mat&) const; - /// Definition of the response function - arma::vec responseTransformation (const arma::vec&) const; + /// Constant initialization of the empirical risk + arma::mat constantInitializer (const arma::mat&) const; + arma::mat weightedConstantInitializer (const arma::mat&, const arma::mat&) const; }; // Binomial loss: @@ -211,15 +192,15 @@ class LossAbsolute : public Loss /** * \class LossBinomial - * + * * \brief 0-1 Loss for binary classification derived from the binomial distribution - * + * * This loss can be used for binary classification. The coding we have chosen - * here acts on + * here acts on * \f[ * y \in \{-1, 1\}. * \f] - * + * * **Loss Function:** * \f[ * L(y, f(x)) = \log\left\{1 + \exp\left(-2yf(x)\right)\right\} @@ -236,30 +217,29 @@ class LossAbsolute : public Loss * \f[ * p = \frac{1}{n}\sum\limits_{i=1}^n\mathbb{1}_{\{y_i > 0\}} * \f] - * + * */ class LossBinomial : public Loss { public: - + + /// Default Constructor LossBinomial (); - + /// Constructor to initialize custom offset LossBinomial (const double&); - + /// Specific loss function - arma::vec definedLoss (const arma::vec&, const arma::vec&) const; - + arma::mat definedLoss (const arma::mat&, const arma::mat&) const; + /// Gradient of loss functions for pseudo residuals - arma::vec definedGradient (const arma::vec&, const arma::vec&) const; - - /// Constant initialization of the empirical risk - double constantInitializer (const arma::vec&) const; + arma::mat definedGradient (const arma::mat&, const arma::mat&) const; - /// Definition of the response function - arma::vec responseTransformation (const arma::vec&) const; + /// Constant initialization of the empirical risk + arma::mat constantInitializer (const arma::mat&) const; + arma::mat weightedConstantInitializer (const arma::mat&, const arma::mat&) const; }; // Custom loss: @@ -267,53 +247,51 @@ class LossBinomial : public Loss /** * \class LossCustom - * + * * \brief With this loss it is possible to define custom functions out of `R` - * + * * This one is a special one. It allows to use a custom loss predefined in R. * The convenience here comes from the 'Rcpp::Function' class and the use of * a special constructor which defines the three needed functions. * * **Note** that there is one conversion step. There is no predefined conversion - * from `Rcpp::Function` (which acts as SEXP) to a `arma::vec`. But it is - * possible by using `Rcpp::NumericVector`. Therefore the custom functions + * from `Rcpp::Function` (which acts as SEXP) to a `arma::mat`. But it is + * possible by using `Rcpp::NumericVector`. 
Therefore the custom functions * returns a `Rcpp::NumericVector` which then is able to be converted to a - * `arma::vec`. - * + * `arma::mat`. + * * **Also Note:** This class doesn't have a constructor to initialize a * custom offset. Because this is not necessary here since the user can * define a custom offset within the `initFun` function. - * + * */ class LossCustom : public Loss { private: - + /// `R` loss function Rcpp::Function lossFun; - + /// `R` gradient of loss function Rcpp::Function gradientFun; - + /// `R` constant initializer of empirical risk Rcpp::Function initFun; - + public: /// Default constructor LossCustom (Rcpp::Function, Rcpp::Function, Rcpp::Function); /// Specific loss function - arma::vec definedLoss (const arma::vec&, const arma::vec&) const; - + arma::mat definedLoss (const arma::mat&, const arma::mat&) const; + /// Gradient of loss functions for pseudo residuals - arma::vec definedGradient (const arma::vec&, const arma::vec&) const; - - /// Constant initialization of the empirical risk - double constantInitializer (const arma::vec&) const; + arma::mat definedGradient (const arma::mat&, const arma::mat&) const; - /// Definition of the response function - arma::vec responseTransformation (const arma::vec&) const; + /// Constant initialization of the empirical risk + arma::mat constantInitializer (const arma::mat&) const; + arma::mat weightedConstantInitializer (const arma::mat&, const arma::mat&) const; }; // Custom loss: @@ -321,55 +299,52 @@ class LossCustom : public Loss /** * \class LossCustomCpp -* +* * \brief With this loss it is possible to define custom functions in `C++` -* +* * This one is a special one. It allows to use a custom loss programmed in `C++`. * The key is to use external pointer to set the corresponding functions. The * big advantage of this is to provide a (not too complicated) method to define * custom `C++` losses without recompiling compboost. -* +* * **Note:** This class doesn't have a constructor to initialize a * custom offset. Because this is not necessary here since the user can * define a custom offset within the `initFun` function. 
-* +* */ -typedef arma::vec (*lossFunPtr) (const arma::vec& true_value, const arma::vec& prediction); -typedef arma::vec (*gradFunPtr) (const arma::vec& true_value, const arma::vec& prediction); -typedef double (*constInitFunPtr) (const arma::vec& true_value); +typedef arma::mat (*lossFunPtr) (const arma::mat& true_value, const arma::mat& prediction); +typedef arma::mat (*gradFunPtr) (const arma::mat& true_value, const arma::mat& prediction); +typedef double (*constInitFunPtr) (const arma::mat& true_value); class LossCustomCpp : public Loss { private: - + /// Pointer to `C++` function to define the loss lossFunPtr lossFun; - + /// Pointer to `C++` function to define the gradient of the loss function gradFunPtr gradFun; - + /// Pointer to `C++` function to initialize the model constInitFunPtr constInitFun; - + public: - - /// Default constructor to set pointer (`Rcpp`s `XPtr` class) out of + + /// Default constructor to set pointer (`Rcpp`s `XPtr` class) out of /// external pointer wrapped by SEXP LossCustomCpp (SEXP, SEXP, SEXP); - + /// Specific loss function - arma::vec definedLoss (const arma::vec&, const arma::vec&) const; - + arma::mat definedLoss (const arma::mat&, const arma::mat&) const; + /// Gradient of loss functions for pseudo residuals - arma::vec definedGradient (const arma::vec&, const arma::vec&) const; - + arma::mat definedGradient (const arma::mat&, const arma::mat&) const; + /// Constant initialization of the empirical risk - double constantInitializer (const arma::vec&) const; - - /// Definition of the response function - arma::vec responseTransformation (const arma::vec&) const; - + arma::mat constantInitializer (const arma::mat&) const; + arma::mat weightedConstantInitializer (const arma::mat&, const arma::mat&) const; }; } // namespace loss diff --git a/src/optimizer.cpp b/src/optimizer.cpp index 34931110..aa8da811 100644 --- a/src/optimizer.cpp +++ b/src/optimizer.cpp @@ -13,8 +13,8 @@ // Compboost is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// MIT License for more details. You should have received a copy of -// the MIT License along with compboost. +// MIT License for more details. You should have received a copy of +// the MIT License along with compboost. 
// // Written by: // ----------- @@ -58,58 +58,54 @@ OptimizerCoordinateDescent::OptimizerCoordinateDescent () { step_sizes.assign(1, 1.0); } -blearner::Baselearner* OptimizerCoordinateDescent::findBestBaselearner (const std::string& iteration_id, - const arma::vec& pseudo_residuals, const blearner_factory_map& my_blearner_factory_map) const +blearner::Baselearner* OptimizerCoordinateDescent::findBestBaselearner (const std::string& iteration_id, + std::shared_ptr<response::Response> sh_ptr_response, const blearner_factory_map& my_blearner_factory_map) const { double ssq_temp; double ssq_best = std::numeric_limits<double>::infinity(); - + blearner::Baselearner* blearner_temp; blearner::Baselearner* blearner_best; - + for (auto& it : my_blearner_factory_map) { // Paste string identifier for new base-learner: std::string id = "(" + iteration_id + ") " + it.second->getBaselearnerType(); - - // Create new base-learner out of the actual factory (just the + + // Create new base-learner out of the actual factory (just the // pointer is overwritten): blearner_temp = it.second->createBaselearner(id); - - // Train that base learner on the pseudo residuals: - blearner_temp->train(pseudo_residuals); - - // Calculate SSE: - ssq_temp = arma::mean(arma::pow(pseudo_residuals - blearner_temp->predict(), 2)); - + blearner_temp->train(sh_ptr_response->getPseudoResiduals()); + ssq_temp = helper::calculateSumOfSquaredError(sh_ptr_response->getPseudoResiduals(), blearner_temp->predict()); + // Check if SSE of new temporary base-learner is smaller than SSE of the best - // base-learner. If so, assign the temporary base-learner with the best + // base-learner. If so, assign the temporary base-learner with the best // base-learner (This is always triggered within the first iteration since // ssq_best is declared as infinity): if (ssq_temp < ssq_best) { - ssq_best = ssq_temp; + ssq_best = ssq_temp; // Deep copy since the temporary base-learner is deleted every time which // will also delete the data for the best base-learner if we don't copy // the whole data of the object: blearner_best = blearner_temp->clone(); } - + // Completely remove the temporary base-learner. This one isn't needed anymore: delete blearner_temp; } // Remove pointer of the temporary base-learner. blearner_temp = NULL; - + return blearner_best; } -void OptimizerCoordinateDescent::calculateStepSize (loss::Loss* used_loss, const arma::vec& target, - const arma::vec& model_prediction, const arma::vec& baselearner_prediction) -{ +void OptimizerCoordinateDescent::calculateStepSize (loss::Loss* used_loss, std::shared_ptr<response::Response> sh_ptr_response, + const arma::vec& baselearner_prediction) +{ // This function does literally nothing! } -std::vector<double> OptimizerCoordinateDescent::getStepSize () const +std::vector<double> OptimizerCoordinateDescent::getStepSize () const { return step_sizes; } @@ -126,58 +122,54 @@ OptimizerCoordinateDescentLineSearch::OptimizerCoordinateDescentLineSearch () { } -blearner::Baselearner* OptimizerCoordinateDescentLineSearch::findBestBaselearner (const std::string& iteration_id, - const arma::vec& pseudo_residuals, const blearner_factory_map& my_blearner_factory_map) const +blearner::Baselearner* OptimizerCoordinateDescentLineSearch::findBestBaselearner (const std::string& iteration_id, + std::shared_ptr<response::Response> sh_ptr_response, const blearner_factory_map& my_blearner_factory_map) const { double ssq_temp; double ssq_best = std::numeric_limits<double>::infinity(); - + blearner::Baselearner* blearner_temp; blearner::Baselearner* blearner_best; - + for (auto& it : my_blearner_factory_map) { // Paste string identifier for new base-learner: std::string id = "(" + iteration_id + ") " + it.second->getBaselearnerType(); - - // Create new base-learner out of the actual factory (just the + + // Create new base-learner out of the actual factory (just the // pointer is overwritten): blearner_temp = it.second->createBaselearner(id); - - // Train that base learner on the pseudo residuals: - blearner_temp->train(pseudo_residuals); - - // Calculate SSE: - ssq_temp = arma::mean(arma::pow(pseudo_residuals - blearner_temp->predict(), 2)); - + blearner_temp->train(sh_ptr_response->getPseudoResiduals()); + ssq_temp = helper::calculateSumOfSquaredError(sh_ptr_response->getPseudoResiduals(), blearner_temp->predict()); + // Check if SSE of new temporary base-learner is smaller than SSE of the best - // base-learner. If so, assign the temporary base-learner with the best + // base-learner. If so, assign the temporary base-learner with the best // base-learner (This is always triggered within the first iteration since // ssq_best is declared as infinity): if (ssq_temp < ssq_best) { - ssq_best = ssq_temp; + ssq_best = ssq_temp; // Deep copy since the temporary base-learner is deleted every time which // will also delete the data for the best base-learner if we don't copy // the whole data of the object: blearner_best = blearner_temp->clone(); } - + // Completely remove the temporary base-learner. This one isn't needed anymore: delete blearner_temp; } // Remove pointer of the temporary base-learner. blearner_temp = NULL; - + return blearner_best; } -void OptimizerCoordinateDescentLineSearch::calculateStepSize (loss::Loss* used_loss, const arma::vec& target, - const arma::vec& model_prediction, const arma::vec& baselearner_prediction) -{ - step_sizes.push_back(linesearch::findOptimalStepSize(used_loss, target, model_prediction, baselearner_prediction)); +void OptimizerCoordinateDescentLineSearch::calculateStepSize (loss::Loss* used_loss, std::shared_ptr<response::Response> sh_ptr_response, + const arma::vec& baselearner_prediction) +{ + step_sizes.push_back(linesearch::findOptimalStepSize(used_loss, sh_ptr_response->getResponse(), sh_ptr_response->getPredictionScores(), baselearner_prediction)); } -std::vector<double> OptimizerCoordinateDescentLineSearch::getStepSize () const +std::vector<double> OptimizerCoordinateDescentLineSearch::getStepSize () const { return step_sizes; } @@ -187,7 +179,7 @@ double OptimizerCoordinateDescentLineSearch::getStepSize (const unsigned int& ac if (step_sizes.size() < actual_iteration) { Rcpp::stop("You cannot select a step size which is not trained!"); } - // Subtract 1 since the actual iteration starts counting with 1 and the step sizes with 0: + // Subtract 1 since the actual iteration starts counting with 1 and the step sizes with 0: return step_sizes[actual_iteration - 1]; }
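[Editor's note — not part of the patch] helper::calculateSumOfSquaredError is not shown in this excerpt; judging from its use above, it replaces the former inline arma::mean(arma::pow(...)) criterion. A plausible minimal version (an assumption, not the shipped helper), written against plain Armadillo for illustration — note that summing instead of averaging the squared residuals selects the same base-learner, since n is identical for all candidates:

#include <armadillo>

// Assumed minimal stand-in for the helper used by findBestBaselearner above:
double calculateSumOfSquaredError (const arma::mat& response, const arma::mat& prediction)
{
  // accu sums the squared residuals over all elements:
  return arma::accu(arma::pow(response - prediction, 2));
}

diff --git a/src/optimizer.h b/src/optimizer.h index 8041689e..83860f1f 100644 --- a/src/optimizer.h +++ b/src/optimizer.h @@ -13,8 +13,8 @@ // Compboost is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// MIT License for more details. You should have received a copy of -// the MIT License along with compboost. +// MIT License for more details. You should have received a copy of +// the MIT License along with compboost. 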
// // Written by: // ----------- @@ -37,6 +37,7 @@ #define OPTIMIZER_H_ #include +#include <memory> #include #include @@ -46,6 +47,7 @@ #include "baselearner_factory_list.h" #include "loss.h" #include "line_search.h" +#include "helper.h" namespace optimizer { @@ -56,19 +58,19 @@ namespace optimizer { class Optimizer { public: - - virtual blearner::Baselearner* findBestBaselearner (const std::string&, - const arma::vec&, const blearner_factory_map&) const = 0; - + + virtual blearner::Baselearner* findBestBaselearner (const std::string&, + std::shared_ptr<response::Response>, const blearner_factory_map&) const = 0; + // loss, response object, base_learner_prediction (prediction of the newly selected base-learner) - virtual void calculateStepSize (loss::Loss*, const arma::vec&, const arma::vec&, const arma::vec&) = 0; + virtual void calculateStepSize (loss::Loss*, std::shared_ptr<response::Response>, const arma::vec&) = 0; virtual std::vector<double> getStepSize () const = 0; virtual double getStepSize (const unsigned int&) const = 0; virtual ~Optimizer (); protected: - + blearner_factory_map my_blearner_factory_map; std::vector<double> step_sizes; @@ -83,14 +85,14 @@ class Optimizer class OptimizerCoordinateDescent : public Optimizer { public: - + // No special initialization necessary: OptimizerCoordinateDescent (); - blearner::Baselearner* findBestBaselearner (const std::string&, - const arma::vec&, const blearner_factory_map&) const; + blearner::Baselearner* findBestBaselearner (const std::string&, std::shared_ptr<response::Response>, + const blearner_factory_map&) const; - void calculateStepSize (loss::Loss*, const arma::vec&, const arma::vec&, const arma::vec&); + void calculateStepSize (loss::Loss*, std::shared_ptr<response::Response>, const arma::vec&); std::vector<double> getStepSize () const; double getStepSize (const unsigned int&) const; }; @@ -104,12 +106,12 @@ class OptimizerCoordinateDescentLineSearch : public Optimizer // No special initialization necessary: OptimizerCoordinateDescentLineSearch (); - blearner::Baselearner* findBestBaselearner (const std::string&, - const arma::vec&, const blearner_factory_map&) const; + blearner::Baselearner* findBestBaselearner (const std::string&, + std::shared_ptr<response::Response>, const blearner_factory_map&) const; - void calculateStepSize (loss::Loss*, const arma::vec&, const arma::vec&, const arma::vec&); + void calculateStepSize (loss::Loss*, std::shared_ptr<response::Response>, const arma::vec&); std::vector<double> getStepSize () const; - double getStepSize (const unsigned int&) const; + double getStepSize (const unsigned int&) const; };
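[Editor's note — not part of the patch] The Response class introduced below owns the prediction scores that the boosting loop advances each iteration; its updatePrediction() does exactly prediction_scores += learning_rate * step_size * update, i.e. the usual additive boosting update in the notation of the loss documentation above:

\f[ \hat{f}^{[m]}(x) = \hat{f}^{[m-1]}(x) + \nu \cdot \beta^{[m]} \cdot \hat{b}^{[m]}(x) \f]

with \nu the learning rate, \beta^{[m]} the step size (constant 1 for OptimizerCoordinateDescent, the line-search result for OptimizerCoordinateDescentLineSearch) and \hat{b}^{[m]} the prediction of the selected base-learner.

diff --git a/src/response.cpp b/src/response.cpp new file mode 100644 index 00000000..1b712548 --- /dev/null +++ b/src/response.cpp @@ -0,0 +1,365 @@ +// ========================================================================== // +// ___. __ // +// ____ ____ _____ ______\_ |__ ____ ____ _______/ |_ // +// _/ ___\/ _ \ / \\____ \| __ \ / _ \ / _ \/ ___/\ __\ // +// \ \__( <_> ) Y Y \ |_> > \_\ ( <_> | <_> )___ \ | | // +// \___ >____/|__|_| / __/|___ /\____/ \____/____ > |__| // +// \/ \/|__| \/ \/ // +// // +// ========================================================================== // +// +// Compboost is free software: you can redistribute it and/or modify +// it under the terms of the MIT License. +// Compboost is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// MIT License for more details. You should have received a copy of +// the MIT License along with compboost. 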
+// +// ========================================================================== // + +#include "response.h" + +namespace response +{ + +// -------------------------------------------------------------------------- // +// Abstract 'Response' class: +// -------------------------------------------------------------------------- // + +Response::Response () {} + + +void Response::setActualIteration (const unsigned int& actual_iter) { actual_iteration = actual_iter; } +void Response::setActualPredictionScores (const arma::mat& new_prediction_scores, const unsigned int& actual_iter) +{ + prediction_scores = new_prediction_scores; + actual_iteration = actual_iter; +} + +std::string Response::getTargetName () const { return target_name; } +std::string Response::getTaskIdentifier () const { return task_id; } +arma::mat Response::getResponse () const { return response; } +arma::mat Response::getWeights () const { return weights; } +arma::mat Response::getInitialization () const { return initialization; } +arma::mat Response::getPseudoResiduals () const { return pseudo_residuals; } +arma::mat Response::getPredictionScores () const { return prediction_scores; } + + +void Response::checkLossCompatibility (loss::Loss* used_loss) const +{ + if ((task_id != used_loss->getTaskId()) && (used_loss->getTaskId() != "custom")) { + std::string error_msg = "Loss task '" + used_loss->getTaskId() + "' is not compatible with the response class task '" + task_id + "'."; + Rcpp::stop(error_msg); + } +} + + +void Response::updatePseudoResiduals (loss::Loss* used_loss) +{ + checkLossCompatibility(used_loss); + if (use_weights) { + pseudo_residuals = used_loss->calculateWeightedPseudoResiduals(response, prediction_scores, weights); + } else { + pseudo_residuals = used_loss->calculatePseudoResiduals(response, prediction_scores); + } +} + +void Response::updatePrediction (const double& learning_rate, const double& step_size, const arma::mat& update) +{ + prediction_scores += learning_rate * step_size * update; +} + + +void Response::constantInitialization (loss::Loss* used_loss) +{ + checkLossCompatibility(used_loss); + + if (! is_initialization_initialized) { + if (use_weights) { + initialization = used_loss->weightedConstantInitializer(response, weights); + } else { + initialization = used_loss->constantInitializer(response); + } + is_initialization_initialized = true; + } else { + Rcpp::stop("Constant initialization is already initialized."); + } +} + +void Response::constantInitialization (const arma::mat& init_mat) +{ + if (! 
is_initialization_initialized) { + initialization = init_mat; + is_initialization_initialized = true; + } else { + Rcpp::stop("Constant initialization is already initialized."); + } +} + + +double Response::calculateEmpiricalRisk (loss::Loss* used_loss) const +{ + checkLossCompatibility(used_loss); + if (use_weights) { + return used_loss->calculateWeightedEmpiricalRisk(response, getPredictionTransform(), weights); + } else { + return used_loss->calculateEmpiricalRisk(response, getPredictionTransform()); + } +} + +arma::mat Response::getPredictionTransform () const +{ + return getPredictionTransform(prediction_scores); +} + +arma::mat Response::getPredictionResponse () const +{ + return getPredictionResponse(prediction_scores); +} + +// -------------------------------------------------------------------------- // +// Response implementations: +// -------------------------------------------------------------------------- // + +// Regression + +ResponseRegr::ResponseRegr (const std::string& target_name0, const arma::mat& response0) +{ + target_name = target_name0; + response = response0; + task_id = "regression"; // set parent + arma::mat temp_mat(response.n_rows, response.n_cols, arma::fill::zeros); + prediction_scores = temp_mat; // set parent + pseudo_residuals = temp_mat; // set parent +} + +ResponseRegr::ResponseRegr (const std::string& target_name0, const arma::mat& response0, const arma::mat& weights0) +{ + helper::checkMatrixDim(response0, weights0); + target_name = target_name0; + response = response0; + weights = weights0; + use_weights = true; + task_id = "regression"; // set parent + arma::mat temp_mat(response.n_rows, response.n_cols, arma::fill::zeros); + prediction_scores = temp_mat; // set parent + pseudo_residuals = temp_mat; // set parent +} + +arma::mat ResponseRegr::calculateInitialPrediction (const arma::mat& response) const +{ + arma::mat init(response.n_rows, response.n_cols, arma::fill::zeros); + + if (! is_initialization_initialized) { + Rcpp::stop("Response is not initialized, call 'constantInitialization()' first."); + } + // Use just first element to correctly use .fill: + init.fill(initialization[0]); + return init; +} + +void ResponseRegr::initializePrediction () +{ + if (is_initialization_initialized) { + if (! 
is_model_initialized) { + prediction_scores = calculateInitialPrediction(response); + is_model_initialized = true; + } else { + Rcpp::stop("Prediction is already initialized."); + } + } else { + Rcpp::stop("Initialize constant initialization first by calling 'constantInitialization()'."); + } +} + +arma::mat ResponseRegr::getPredictionTransform (const arma::mat& pred_scores) const +{ + // No transformation is done in regression + return pred_scores; +} + +arma::mat ResponseRegr::getPredictionResponse (const arma::mat& pred_scores) const +{ + return pred_scores; +} + +void ResponseRegr::filter (const arma::uvec& idx) +{ + response = response.elem(idx); + if (use_weights) { + weights = weights.elem(idx); + } + pseudo_residuals = pseudo_residuals.elem(idx); + prediction_scores = prediction_scores.elem(idx); +} + +// Binary Classification + +ResponseBinaryClassif::ResponseBinaryClassif (const std::string& target_name0, const arma::mat& response0) +{ + helper::checkForBinaryClassif(response0, -1, 1); + target_name = target_name0; + response = response0; + task_id = "binary_classif"; // set parent + arma::mat temp_mat(response.n_rows, response.n_cols, arma::fill::zeros); + prediction_scores = temp_mat; // set parent + pseudo_residuals = temp_mat; // set parent +} + +ResponseBinaryClassif::ResponseBinaryClassif (const std::string& target_name0, const arma::mat& response0, const arma::mat& weights0) +{ + helper::checkForBinaryClassif(response0, -1, 1); + helper::checkMatrixDim(response0, weights0); + target_name = target_name0; + response = response0; + weights = weights0; + use_weights = true; + task_id = "binary_classif"; // set parent + arma::mat temp_mat(response.n_rows, response.n_cols, arma::fill::zeros); + prediction_scores = temp_mat; // set parent + pseudo_residuals = temp_mat; // set parent +} + +arma::mat ResponseBinaryClassif::calculateInitialPrediction (const arma::mat& response) const +{ + arma::mat init(response.n_rows, response.n_cols, arma::fill::zeros); + + if (! is_initialization_initialized) { + Rcpp::stop("Response is not initialized, call 'constantInitialization()' first."); + } + // Use just first element to correctly use .fill: + init.fill(initialization[0]); + return init; +} + +void ResponseBinaryClassif::initializePrediction () +{ + if (is_initialization_initialized) { + if (! 
is_model_initialized) { + prediction_scores = calculateInitialPrediction(response); + is_model_initialized = true; + } else { + Rcpp::stop("Prediction is already initialized."); + } + } else { + Rcpp::stop("Initialize constant initialization first by calling 'constantInitialization()'."); + } +} + +arma::mat ResponseBinaryClassif::getPredictionTransform (const arma::mat& pred_scores) const +{ + return helper::sigmoid(pred_scores); +} + +arma::mat ResponseBinaryClassif::getPredictionResponse (const arma::mat& pred_scores) const +{ + return helper::transformToBinaryResponse(getPredictionTransform(pred_scores), threshold, 1, -1); +} + +void ResponseBinaryClassif::filter (const arma::uvec& idx) +{ + response = response.elem(idx); + if (use_weights) { + weights = weights.elem(idx); + } + pseudo_residuals = pseudo_residuals.elem(idx); + prediction_scores = prediction_scores.elem(idx); +} + +void ResponseBinaryClassif::setThreshold (const double& new_thresh) +{ + if ((new_thresh < 0) || (new_thresh > 1)) { + Rcpp::stop("Threshold must be element of [0,1]"); + } + threshold = new_thresh; +} + +// Functional Data Response + +ResponseFDA::ResponseFDA (const std::string& target_name0, const arma::mat& response0, const arma::mat& grid0) +{ + target_name = target_name0; + response = response0; + task_id = "regression"; // set parent + arma::mat temp_mat(response.n_rows, response.n_cols, arma::fill::zeros); + prediction_scores = temp_mat; // set parent + pseudo_residuals = temp_mat; // set parent + // FDA specifics + grid = grid0; + arma::mat temp_mat_1(response.n_rows, response.n_cols, arma::fill::ones); + weights = temp_mat_1; + trapez_weights = tensors::trapezWeights(grid0); +} + +ResponseFDA::ResponseFDA (const std::string& target_name0, const arma::mat& response0, const arma::mat& weights0, const arma::mat& grid0) +{ + helper::checkMatrixDim(response0, weights0); + target_name = target_name0; + response = response0; + weights = weights0; + task_id = "regression"; // set parent + arma::mat temp_mat(response.n_rows, response.n_cols, arma::fill::zeros); + prediction_scores = temp_mat; // set parent + pseudo_residuals = temp_mat; // set parent + // FDA specifics + grid = grid0; + trapez_weights = tensors::trapezWeights(grid0); +} + +arma::mat ResponseFDA::calculateInitialPrediction (const arma::mat& response) const +{ + arma::mat init(response.n_rows, response.n_cols, arma::fill::zeros); + + if (! is_initialization_initialized) { + Rcpp::stop("Response is not initialized, call 'constantInitialization()' first."); + } + // Use just first element to correctly use .fill: + init.fill(initialization[0]); + return init; +} + +void ResponseFDA::initializePrediction () +{ + if (is_initialization_initialized) { + if (! 
is_model_initialized) { + prediction_scores = calculateInitialPrediction(response); + is_model_initialized = true; + } else { + Rcpp::stop("Prediction is already initialized."); + } + } else { + Rcpp::stop("Initialize constant initialization first by calling 'constantInitialization()'."); + } +} + +void ResponseFDA::updatePseudoResiduals (loss::Loss* used_loss) +{ + checkLossCompatibility(used_loss); + // Note: this rescales the stored weights by the trapezoidal weights on + // every call, i.e. the scaling accumulates across iterations: + weights = weights.each_row() % trapez_weights.t(); + pseudo_residuals = used_loss->calculateWeightedPseudoResiduals(response, prediction_scores, weights); +} + + +arma::mat ResponseFDA::getPredictionTransform (const arma::mat& pred_scores) const +{ + // No transformation is done in regression + return pred_scores; +} + +arma::mat ResponseFDA::getPredictionResponse (const arma::mat& pred_scores) const +{ + return pred_scores; +} + +void ResponseFDA::filter (const arma::uvec& idx) +{ + response = response.elem(idx); + if (use_weights) { + weights = weights.elem(idx); + } + pseudo_residuals = pseudo_residuals.elem(idx); + prediction_scores = prediction_scores.elem(idx); +} + +} // namespace response \ No newline at end of file
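[Editor's note — not part of the patch] helper.h is not part of this excerpt; from their use in ResponseBinaryClassif::getPredictionTransform and ::getPredictionResponse above, plausible minimal versions of the two helpers look as follows (signatures inferred from the call sites, internals assumed):

#include <RcppArmadillo.h>
#include <cmath>

namespace helper
{
// Maps raw scores f(x) to probabilities via the logistic function:
arma::mat sigmoid (const arma::mat& scores)
{
  arma::mat out = scores;
  out.transform([] (double s) { return 1.0 / (1.0 + std::exp(-s)); });
  return out;
}

// Thresholds probabilities into the two class labels (here 1 vs. -1):
arma::mat transformToBinaryResponse (const arma::mat& probs, const double& thresh,
  const double& pos, const double& neg)
{
  arma::mat out = probs;
  out.transform([=] (double p) { return p > thresh ? pos : neg; });
  return out;
}
} // namespace helper

diff --git a/src/response.h b/src/response.h new file mode 100644 index 00000000..5644d4a1 --- /dev/null +++ b/src/response.h @@ -0,0 +1,148 @@ +// ========================================================================== // +// ___. __ // +// ____ ____ _____ ______\_ |__ ____ ____ _______/ |_ // +// _/ ___\/ _ \ / \\____ \| __ \ / _ \ / _ \/ ___/\ __\ // +// \ \__( <_> ) Y Y \ |_> > \_\ ( <_> | <_> )___ \ | | // +// \___ >____/|__|_| / __/|___ /\____/ \____/____ > |__| // +// \/ \/|__| \/ \/ // +// // +// ========================================================================== // +// +// Compboost is free software: you can redistribute it and/or modify +// it under the terms of the MIT License. +// Compboost is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// MIT License for more details. You should have received a copy of +// the MIT License along with compboost. 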
+// +// ========================================================================== // + +#ifndef RESPONSE_H_ +#define RESPONSE_H_ + +#include "RcppArmadillo.h" +#include "loss.h" +#include "helper.h" +#include "tensors.h" + +namespace response +{ + +// -------------------------------------------------------------------------- // +// Abstract 'Response' class: +// -------------------------------------------------------------------------- // + +class Response +{ +protected: + std::string target_name; + std::string task_id; + arma::mat response; + arma::mat weights; + arma::mat initialization; + arma::mat pseudo_residuals; + arma::mat prediction_scores; + + unsigned int actual_iteration = 0; + bool is_initialization_initialized = false; + bool is_model_initialized = false; + +public: + + Response (); + + void setActualIteration (const unsigned int&); + void setActualPredictionScores (const arma::mat&, const unsigned int&); + + std::string getTargetName () const; + std::string getTaskIdentifier () const; + arma::mat getResponse () const; + arma::mat getWeights () const; + arma::mat getInitialization () const; + arma::mat getPseudoResiduals () const; + arma::mat getPredictionScores () const; + + bool use_weights = false; + + void checkLossCompatibility (loss::Loss*) const; + + void updatePseudoResiduals (loss::Loss*); + void updatePrediction (const double&, const double&, const arma::mat&); + + void constantInitialization (loss::Loss*); + void constantInitialization (const arma::mat&); + virtual arma::mat calculateInitialPrediction (const arma::mat&) const = 0; + virtual void initializePrediction () = 0; + arma::mat getPredictionTransform () const; + virtual arma::mat getPredictionTransform (const arma::mat&) const = 0; + arma::mat getPredictionResponse () const; + virtual arma::mat getPredictionResponse (const arma::mat&) const = 0; + + double calculateEmpiricalRisk (loss::Loss*) const; + + virtual void filter (const arma::uvec&) = 0; + + virtual ~Response () { }; +}; + + +// -------------------------------------------------------------------------- // +// Response implementations: +// -------------------------------------------------------------------------- // + +class ResponseRegr : public Response +{ + +public: + ResponseRegr (const std::string&, const arma::mat&); + ResponseRegr (const std::string&, const arma::mat&, const arma::mat&); + + arma::mat calculateInitialPrediction (const arma::mat&) const; + void initializePrediction (); + arma::mat getPredictionTransform (const arma::mat&) const; + arma::mat getPredictionResponse (const arma::mat&) const; + void filter (const arma::uvec&); +}; + +class ResponseBinaryClassif : public Response +{ +public: + double threshold = 0.5; + + ResponseBinaryClassif (const std::string&, const arma::mat&); + ResponseBinaryClassif (const std::string&, const arma::mat&, const arma::mat&); + + arma::mat calculateInitialPrediction (const arma::mat&) const; + void initializePrediction (); + arma::mat getPredictionTransform (const arma::mat&) const; + arma::mat getPredictionResponse (const arma::mat&) const; + + void filter (const arma::uvec&); + + void setThreshold (const double&); +}; + +// ----------------------------------------------------------------------------------------------------------------- +class ResponseFDA : public Response +{ + +public: + arma::mat grid; + arma::mat trapez_weights; + + ResponseFDA (const std::string&, const arma::mat&, const arma::mat&); + ResponseFDA (const std::string&, const arma::mat&, const arma::mat&, const arma::mat&); 
+ + arma::mat calculateInitialPrediction (const arma::mat&) const; + void initializePrediction (); + void updatePseudoResiduals (loss::Loss*); + arma::mat getPredictionTransform (const arma::mat&) const; + arma::mat getPredictionResponse (const arma::mat&) const; + void filter (const arma::uvec&); +}; + + +} // namespace response + +#endif // RESPONSE_H_ \ No newline at end of file diff --git a/src/splines.cpp b/src/splines.cpp index ac7e6d24..bccd5a81 100644 --- a/src/splines.cpp +++ b/src/splines.cpp @@ -13,7 +13,7 @@ // Compboost is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// MIT License for more details. You should have received a copy of +// MIT License for more details. You should have received a copy of // the MIT License along with compboost. // // Written by: @@ -35,6 +35,9 @@ #include "splines.h" +namespace splines +{ + /** * \brief Calculating penalty matrix * @@ -335,3 +338,5 @@ arma::mat filterKnotRange (const arma::mat& newdata, const double& range_min, co return temp; } + +} // namespace splines diff --git a/src/splines.h b/src/splines.h index 97dfa727..4cef6760 100644 --- a/src/splines.h +++ b/src/splines.h @@ -13,8 +13,8 @@ // Compboost is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// MIT License for more details. You should have received a copy of -// the MIT License along with compboost. +// MIT License for more details. You should have received a copy of +// the MIT License along with compboost. // // Written by: // ----------- @@ -38,6 +38,8 @@ #include +namespace splines { + arma::mat penaltyMat (const unsigned int&, const unsigned int&); unsigned int findSpan (const double&, const arma::vec&); arma::vec createKnots (const arma::vec&, const unsigned int&,const unsigned int&); @@ -45,4 +47,6 @@ arma::mat createSplineBasis (const arma::vec&, const unsigned int&, const arma:: arma::sp_mat createSparseSplineBasis (const arma::vec&, const unsigned int&, const arma::vec&); arma::mat filterKnotRange (const arma::mat&, const double&, const double&, const std::string&); +} // namespace splines + # endif // SPLINE_H_ \ No newline at end of file diff --git a/src/tensors.cpp b/src/tensors.cpp new file mode 100644 index 00000000..d673b650 --- /dev/null +++ b/src/tensors.cpp @@ -0,0 +1,126 @@ +// ========================================================================== // +// ___. __ // +// ____ ____ _____ ______\_ |__ ____ ____ _______/ |_ // +// _/ ___\/ _ \ / \\____ \| __ \ / _ \ / _ \/ ___/\ __\ // +// \ \__( <_> ) Y Y \ |_> > \_\ ( <_> | <_> )___ \ | | // +// \___ >____/|__|_| / __/|___ /\____/ \____/____ > |__| // +// \/ \/|__| \/ \/ // +// // +// ========================================================================== // +// +// Compboost is free software: you can redistribute it and/or modify +// it under the terms of the MIT License. +// Compboost is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// MIT License for more details. You should have received a copy of +// the MIT License along with compboost. 
+// +// Written by: +// ----------- +// +// Daniel Schalk +// Department of Statistics +// Ludwig-Maximilians-University Munich +// Ludwigstrasse 33 +// D-80539 München +// +// https://www.compstat.statistik.uni-muenchen.de +// +// Contact +// e: contact@danielschalk.com +// w: danielschalk.com +// +// =========================================================================== # + +#include "tensors.h" + +namespace tensors +{ + +arma::mat rowWiseKronecker (const arma::mat& A, const arma::mat& B) +{ + // Variables + arma::mat out; + arma::rowvec vecA = arma::rowvec(A.n_cols, arma::fill::ones); + arma::rowvec vecB = arma::rowvec(B.n_cols, arma::fill::ones); + + // Multiply both Kronecker products element-wise; row i of the result is + // kron(A.row(i), B.row(i)): + out = arma::kron(A, vecB) % arma::kron(vecA, B); + + return out; +} + +arma::mat penaltySumKronecker (const arma::mat& Pa, const arma::mat& Pb) +{ + // Variables + arma::mat out; + // Create Diagonal matrices + arma::mat eyePa = arma::diagmat( arma::vec(Pa.n_cols, arma::fill::ones) ); + arma::mat eyePb = arma::diagmat( arma::vec(Pb.n_cols, arma::fill::ones) ); + + + // sum of Kroneckers with diagonal matrices + out = arma::kron(Pa, eyePb) + arma::kron(eyePa, Pb); + + return out; +} + + +std::map<std::string, arma::mat> centerDesignMatrix (const arma::mat& X1, const arma::mat& P1, const arma::mat& X2) +{ + + // Cross Product X1 and X2 + arma::mat cross = X1.t() * X2 ; + + // QR decomp + // We require an orthogonal matrix Q + arma::mat R; + arma::mat Q; + arma::qr(Q,R,cross); + + // get rank of R + int rankR = arma::rank(R); + + // construct Z from rows 0 to last row and columns rankR to last column + arma::mat Z = Q( arma::span(0, Q.n_rows-1), arma::span(rankR, Q.n_cols-1) ); + + // Construct the rotated X1 + arma::mat X1_out = X1 * Z; + + // Construct the rotated Penalty Matrix + arma::mat P1_out = Z.t() * P1 * Z; + + // Construct out + std::map<std::string, arma::mat> out; + out["X1"] = X1_out; + out["P1"] = P1_out; + + return out; + + /// return X1_out; +} + +arma::vec trapezWeights (const arma::vec& time_points) +{ + + /// Get Differences of the current function + arma::vec t_diffs = arma::diff( time_points ) ; + arma::vec weights = time_points; + + /// change the border values + weights(arma::span(0)) = t_diffs(arma::span(0)); + weights(arma::span(weights.size()-1)) = t_diffs(arma::span(t_diffs.size()-1)); + + /// divide all by half except for beginning and end, add to get mean per value + arma::vec t_diff_halfs = t_diffs / 2; + + weights(arma::span(1,weights.size()-2)) = t_diff_halfs(arma::span(1,t_diff_halfs.size()-1)) + + t_diff_halfs(arma::span(0,t_diff_halfs.size()-2)); + + return weights; +} + +} // namespace tensors + + 
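[Editor's note — not part of the patch] A quick numeric check of the row-wise Kronecker as given above, built against plain Armadillo for illustration (inside the package the function comes from tensors.h): for A = (1, 2) and B = (3, 4), the single output row should be (1·3, 1·4, 2·3, 2·4) = (3, 4, 6, 8).

#include <armadillo>

// Copy of rowWiseKronecker from tensors.cpp above, for a standalone check:
arma::mat rowWiseKronecker (const arma::mat& A, const arma::mat& B)
{
  arma::rowvec vecA(A.n_cols, arma::fill::ones);
  arma::rowvec vecB(B.n_cols, arma::fill::ones);
  return arma::kron(A, vecB) % arma::kron(vecA, B);
}

int main ()
{
  arma::mat A = {{1.0, 2.0}};
  arma::mat B = {{3.0, 4.0}};
  rowWiseKronecker(A, B).print("K"); // expected: 3 4 6 8
  return 0;
}

diff --git a/src/tensors.h b/src/tensors.h new file mode 100644 index 00000000..fd39f3de --- /dev/null +++ b/src/tensors.h @@ -0,0 +1,34 @@ +// ========================================================================== // +// ___. __ // +// ____ ____ _____ ______\_ |__ ____ ____ _______/ |_ // +// _/ ___\/ _ \ / \\____ \| __ \ / _ \ / _ \/ ___/\ __\ // +// \ \__( <_> ) Y Y \ |_> > \_\ ( <_> | <_> )___ \ | | // +// \___ >____/|__|_| / __/|___ /\____/ \____/____ > |__| // +// \/ \/|__| \/ \/ // +// // +// ========================================================================== // +// +// Compboost is free software: you can redistribute it and/or modify +// it under the terms of the MIT License. +// Compboost is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 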
See the +// MIT License for more details. You should have received a copy of +// the MIT License along with compboost. +// +// +// =========================================================================== # +#ifndef TENSORS_H_ +#define TENSORS_H_ + +#include "RcppArmadillo.h" + +namespace tensors +{ +arma::mat rowWiseKronecker (const arma::mat&, const arma::mat&); +arma::mat penaltySumKronecker (const arma::mat&, const arma::mat&); +arma::vec trapezWeights (const arma::vec&); +std::map<std::string, arma::mat> centerDesignMatrix (const arma::mat&, const arma::mat&, const arma::mat&); +} // namespace tensors + +#endif // TENSORS_H_ diff --git a/tests/testthat/test_api.R b/tests/testthat/test_api.R index 68337116..c3877b57 100644 --- a/tests/testthat/test_api.R +++ b/tests/testthat/test_api.R @@ -2,15 +2,15 @@ context("API works correctly") test_that("train works", { - mtcars$mpg_cat = ifelse(mtcars$mpg > 15, "A", "B") - + mtcars$mpg_cat = ifelse(mtcars$mpg > 15, "A", "B") + expect_error({ cboost = Compboost$new(mtcars, "i_am_no_feature", loss = LossQuadratic$new()) }) expect_error({ cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic) }) expect_error({ cboost = Compboost$new(mtcars, "mpg", loss = LossAbsolute) }) expect_error({ cboost = Compboost$new(mtcars, "mpg", loss = LossBinomial) }) expect_error({ cboost = Compboost$new(mtcars, "mpg", loss = LossCustom) }) expect_error({ cboost = Compboost$new(mtcars, "mpg", loss = LossCustomCpp) }) - + expect_silent({ cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new()) }) expect_output(cboost$print()) @@ -22,23 +22,23 @@ test_that("train works", { expect_error(cboost$train(10)) expect_error(cboost$train(10, trace = 20)) expect_error( - cboost$addBaselearner(c("hp", "wt"), "spline", BaselearnerPSpline, degree = 3, + cboost$addBaselearner(c("hp", "wt"), "spline", BaselearnerPSpline, degree = 3, n.knots = 10, penalty = 2, differences = 2) ) expect_silent( - cboost$addBaselearner("mpg_cat", "linear", BaselearnerPolynomial, degree = 1, + cboost$addBaselearner("mpg_cat", "linear", BaselearnerPolynomial, degree = 1, 
intercept = FALSE) - cboost$addBaselearner("hp", "spline", BaselearnerPSpline, degree = 3, + cboost$addBaselearner("hp", "spline", BaselearnerPSpline, degree = 3, n.knots = 10, penalty = 2, differences = 2) }) @@ -93,18 +93,18 @@ test_that("predict works", { test_that("plot works", { - mtcars$mpg_cat = ifelse(mtcars$mpg > 15, "A", "B") + mtcars$mpg_cat = ifelse(mtcars$mpg > 15, "A", "B") - expect_silent({ + expect_silent({ cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new()) - cboost$addBaselearner("mpg_cat", "linear", BaselearnerPolynomial, degree = 1, + cboost$addBaselearner("mpg_cat", "linear", BaselearnerPolynomial, degree = 1, intercept = TRUE) - cboost$addBaselearner("hp", "spline", BaselearnerPSpline, degree = 3, + cboost$addBaselearner("hp", "spline", BaselearnerPSpline, degree = 3, n.knots = 10, penalty = 2, differences = 2) cboost$addBaselearner(c("hp", "wt"), "quadratic", BaselearnerPolynomial, degree = 2, intercept = TRUE) cboost$addBaselearner("wt", "linear", BaselearnerPolynomial, degree = 1, - intercept = TRUE) + intercept = TRUE) }) expect_error(cboost$plot("hp_spline")) @@ -126,29 +126,29 @@ test_that("plot works", { expect_s3_class(cboost$plot("hp_spline", from = 150, to = 250), "ggplot") expect_warning(cboost$plot("wt_linear", iters = c(1, 10))) - + expect_silent(cboost$train(200, trace = 0)) expect_error(cboost$plot("mpg_cat_A_linear")) - + }) test_that("multiple logger works", { - expect_silent({ + expect_silent({ cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new()) - cboost$addBaselearner("hp", "spline", BaselearnerPSpline, degree = 3, + cboost$addBaselearner("hp", "spline", BaselearnerPSpline, degree = 3, n.knots = 10, penalty = 2, differences = 2) cboost$addBaselearner(c("hp", "wt"), "quadratic", BaselearnerPolynomial, degree = 2, intercept = TRUE) }) expect_silent( - cboost$addLogger(logger = LoggerTime, use.as.stopper = FALSE, logger.id = "time", + cboost$addLogger(logger = LoggerTime, use.as.stopper = FALSE, logger.id = "time", max.time = 0, time.unit = "microseconds") ) expect_silent( cboost$addLogger(logger = LoggerOobRisk, use.as.stopper = TRUE, logger.id = "oob", - LossQuadratic$new(), 0.01, cboost$prepareData(mtcars), mtcars[["mpg"]]) + LossQuadratic$new(), 0.01, cboost$prepareData(mtcars), ResponseRegr$new("oob_response", as.matrix(mtcars[["mpg"]]))) ) expect_silent( cboost$addLogger(logger = LoggerInbagRisk, use.as.stopper = TRUE, logger.id = "inbag", @@ -180,9 +180,9 @@ test_that("custom base-learner works through api", { return(model) } - expect_silent({ - cboost$addBaselearner("hp", "custom", BaselearnerCustom, instantiate.fun = instantiateData, - train.fun = trainFun, predict.fun = predictFun, param.fun = extractParameter) + expect_silent({ + cboost$addBaselearner("hp", "custom", BaselearnerCustom, instantiate.fun = instantiateData, + train.fun = trainFun, predict.fun = predictFun, param.fun = extractParameter) }) expect_output(cboost$train(100)) @@ -203,10 +203,10 @@ test_that("custom base-learner works through api", { test_that("custom cpp base-learner works through api", { expect_silent({ cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new()) }) - expect_silent({ Rcpp::sourceCpp(code = getCustomCppExample(silent = TRUE)) }) - expect_silent({ - cboost$addBaselearner("hp", "custom", BaselearnerCustomCpp, instantiate.ptr = dataFunSetter(), - train.ptr = trainFunSetter(), predict.ptr = predictFunSetter()) + expect_silent({ Rcpp::sourceCpp(code = getCustomCppExample(silent = TRUE)) }) + expect_silent({ + 
cboost$addBaselearner("hp", "custom", BaselearnerCustomCpp, instantiate.ptr = dataFunSetter(), + train.ptr = trainFunSetter(), predict.ptr = predictFunSetter()) }) expect_output(cboost$train(100)) @@ -220,7 +220,6 @@ test_that("custom cpp base-learner works through api", { expect_equivalent(cboost$getEstimatedCoef(), cboost1$getEstimatedCoef()) expect_equal(cboost$predict(), cboost1$predict()) expect_equal(cboost$predict(), cboost$predict(mtcars)) - }) test_that("custom loss works through api", { @@ -228,15 +227,15 @@ test_that("custom loss works through api", { myLossFun = function (true.value, prediction) { return(0.5 * (true.value - prediction)^2) } myGradientFun = function (true.value, prediction) { return(prediction - true.value) } myConstantInitializerFun = function (true.value) { mean.default(true.value) } - + expect_silent({ custom.loss = LossCustom$new(myLossFun, myGradientFun, myConstantInitializerFun) }) expect_silent({ cboost = Compboost$new(mtcars, "mpg", loss = custom.loss) }) - expect_silent({ Rcpp::sourceCpp(code = getCustomCppExample(silent = TRUE)) }) - expect_silent({ + expect_silent({ Rcpp::sourceCpp(code = getCustomCppExample(silent = TRUE)) }) + expect_silent({ cboost$addBaselearner("hp", "linear", BaselearnerPolynomial, degree = 1, intercept = FALSE) cboost$addBaselearner("wt", "linear", BaselearnerPolynomial, degree = 1, - intercept = FALSE) + intercept = FALSE) cboost$addBaselearner("qsec", "linear", BaselearnerPolynomial, degree = 1, intercept = FALSE) }) @@ -247,7 +246,7 @@ test_that("custom loss works through api", { cboost1$addBaselearner("hp", "linear", BaselearnerPolynomial, degree = 1, intercept = FALSE) cboost1$addBaselearner("wt", "linear", BaselearnerPolynomial, degree = 1, - intercept = FALSE) + intercept = FALSE) cboost1$addBaselearner("qsec", "linear", BaselearnerPolynomial, degree = 1, intercept = FALSE) }) @@ -259,21 +258,20 @@ test_that("custom loss works through api", { expect_equal(cboost$predict(mtcars), cboost$predict()) expect_equal(cboost$predict(), cboost$predict(response = TRUE)) expect_equal(cboost$predict(mtcars, response = TRUE), cboost$predict(response = TRUE)) - }) test_that("custom cpp loss works through api", { expect_output(Rcpp::sourceCpp(code = getCustomCppExample(example = "loss"))) - + expect_silent({ custom.loss = LossCustomCpp$new(lossFunSetter(), gradFunSetter(), constInitFunSetter()) }) expect_silent({ cboost = Compboost$new(mtcars, "mpg", loss = custom.loss) }) - expect_silent({ Rcpp::sourceCpp(code = getCustomCppExample(silent = TRUE)) }) - expect_silent({ + expect_silent({ Rcpp::sourceCpp(code = getCustomCppExample(silent = TRUE)) }) + expect_silent({ cboost$addBaselearner("hp", "linear", BaselearnerPolynomial, degree = 1, intercept = FALSE) cboost$addBaselearner("wt", "linear", BaselearnerPolynomial, degree = 1, - intercept = FALSE) + intercept = FALSE) cboost$addBaselearner("qsec", "linear", BaselearnerPolynomial, degree = 1, intercept = FALSE) }) @@ -284,7 +282,7 @@ test_that("custom cpp loss works through api", { cboost1$addBaselearner("hp", "linear", BaselearnerPolynomial, degree = 1, intercept = FALSE) cboost1$addBaselearner("wt", "linear", BaselearnerPolynomial, degree = 1, - intercept = FALSE) + intercept = FALSE) cboost1$addBaselearner("qsec", "linear", BaselearnerPolynomial, degree = 1, intercept = FALSE) }) @@ -295,7 +293,6 @@ test_that("custom cpp loss works through api", { expect_equal(cboost$getSelectedBaselearner(), cboost1$getSelectedBaselearner()) expect_equal(cboost$predict(mtcars), cboost$predict()) 
expect_equal(cboost$predict(mtcars, response = TRUE), cboost$predict(response = TRUE)) - }) test_that("training with absolute loss works", { @@ -312,25 +309,24 @@ test_that("training with absolute loss works", { expect_equal(cboost$getEstimatedCoef()$offset, median(mtcars$hp)) expect_equal(cboost$predict(), cboost$predict(response = TRUE)) expect_equal(cboost$predict(mtcars), cboost$predict(mtcars, response = TRUE)) - }) test_that("training throws an error with pre-defined iteration logger", { - + expect_silent({ cboost = Compboost$new(mtcars, "hp", loss = LossAbsolute$new()) cboost$addLogger(LoggerIteration, use.as.stopper = TRUE, "iteration", max.iter = 1000) cboost$addBaselearner("wt", "linear", BaselearnerPolynomial, degree = 1, intercept = FALSE) }) - - expect_output(expect_warning(cboost$train(200))) + expect_output(expect_warning(cboost$train(200))) expect_length(cboost$getInbagRisk(), 1001) }) test_that("training with binomial loss works", { - mtcars$hp.cat = ifelse(mtcars$hp > 150, 1, -1) + hp.classes = ifelse(mtcars$hp > 150, 1, -1) + mtcars$hp.cat = factor(hp.classes, levels = c(1, -1)) expect_warning({ bin.loss = LossBinomial$new(2) }) @@ -340,12 +336,12 @@ test_that("training with binomial loss works", { intercept = FALSE) }) expect_output(cboost$train(100, trace = 50)) - + expect_output(cboost$print()) expect_length(cboost$getSelectedBaselearner(), 100) expect_length(cboost$getInbagRisk(), 101) - expect_equal(cboost$getEstimatedCoef()$offset, 0.5 * log(sum(mtcars$hp.cat > 0)/ sum(mtcars$hp.cat < 0))) + expect_equal(cboost$getEstimatedCoef()$offset, 0.5 * log(sum(hp.classes > 0)/ sum(hp.classes < 0))) expect_equal(1 / (1 + exp(-cboost$predict())), cboost$predict(response = TRUE)) expect_equal(1 / (1 + exp(-cboost$predict(mtcars))), cboost$predict(mtcars, response = TRUE)) @@ -386,20 +382,20 @@ test_that("custom poisson family does the same as mboost", { return (log(mean.default(truth))) } expect_silent({ my.poisson.loss = LossCustom$new(lossPoisson, gradPoisson, constInitPoisson) }) - + expect_silent({ cboost = Compboost$new(iris, "Sepal.Length", loss = my.poisson.loss) - cboost$addBaselearner("Sepal.Width", "linear", BaselearnerPolynomial, + cboost$addBaselearner("Sepal.Width", "linear", BaselearnerPolynomial, degree = 1, intercept = TRUE) - cboost$addBaselearner("Petal.Length", "spline", BaselearnerPSpline, + cboost$addBaselearner("Petal.Length", "spline", BaselearnerPSpline, degree = 3, n.knots = 10, penalty = 2, differences = 2) }) expect_output(cboost$train(100, trace = 10)) - - mod = mboost(Sepal.Length ~ bols(Sepal.Width) + bbs(Petal.Length, differences = 2, lambda = 2, - degree = 3, knots = 10), data = iris, family = Poisson(), + + mod = mboost(Sepal.Length ~ bols(Sepal.Width) + bbs(Petal.Length, differences = 2, lambda = 2, + degree = 3, knots = 10), data = iris, family = Poisson(), control = boost_control(mstop = 100, nu = 0.05)) - + expect_silent({ coef.cboost = cboost$getEstimatedCoef() coef.mboost = coef(mod) @@ -421,16 +417,16 @@ test_that("quadratic loss does the same as mboost", { expect_silent({ cboost = Compboost$new(iris, "Sepal.Width", loss = LossQuadratic$new()) - cboost$addBaselearner("Sepal.Length", "linear", BaselearnerPolynomial, + cboost$addBaselearner("Sepal.Length", "linear", BaselearnerPolynomial, degree = 1, intercept = TRUE) - cboost$addBaselearner("Petal.Length", "spline", BaselearnerPSpline, + cboost$addBaselearner("Petal.Length", "spline", BaselearnerPSpline, degree = 3, n.knots = 10, penalty = 2, differences = 2) }) 

@@ -386,20 +382,20 @@ test_that("custom poisson family does the same as mboost", {
     return (log(mean.default(truth)))
   }
   expect_silent({ my.poisson.loss = LossCustom$new(lossPoisson, gradPoisson, constInitPoisson) })
-
+
   expect_silent({
     cboost = Compboost$new(iris, "Sepal.Length", loss = my.poisson.loss)
-    cboost$addBaselearner("Sepal.Width", "linear", BaselearnerPolynomial,
+    cboost$addBaselearner("Sepal.Width", "linear", BaselearnerPolynomial,
       degree = 1, intercept = TRUE)
-    cboost$addBaselearner("Petal.Length", "spline", BaselearnerPSpline,
+    cboost$addBaselearner("Petal.Length", "spline", BaselearnerPSpline,
       degree = 3, n.knots = 10, penalty = 2, differences = 2)
   })
   expect_output(cboost$train(100, trace = 10))
-
-  mod = mboost(Sepal.Length ~ bols(Sepal.Width) + bbs(Petal.Length, differences = 2, lambda = 2,
-    degree = 3, knots = 10), data = iris, family = Poisson(),
+
+  mod = mboost(Sepal.Length ~ bols(Sepal.Width) + bbs(Petal.Length, differences = 2, lambda = 2,
+    degree = 3, knots = 10), data = iris, family = Poisson(),
     control = boost_control(mstop = 100, nu = 0.05))
-
+
   expect_silent({
     coef.cboost = cboost$getEstimatedCoef()
     coef.mboost = coef(mod)
@@ -421,16 +417,16 @@ test_that("quadratic loss does the same as mboost", {

   expect_silent({
     cboost = Compboost$new(iris, "Sepal.Width", loss = LossQuadratic$new())
-    cboost$addBaselearner("Sepal.Length", "linear", BaselearnerPolynomial,
+    cboost$addBaselearner("Sepal.Length", "linear", BaselearnerPolynomial,
       degree = 1, intercept = TRUE)
-    cboost$addBaselearner("Petal.Length", "spline", BaselearnerPSpline,
+    cboost$addBaselearner("Petal.Length", "spline", BaselearnerPSpline,
       degree = 3, n.knots = 10, penalty = 2, differences = 2)
   })
   expect_output(cboost$train(100, trace = 0))
-
-  mod = mboost(Sepal.Width ~ bols(Sepal.Length) + bbs(Petal.Length, differences = 2, lambda = 2,
+
+  mod = mboost(Sepal.Width ~ bols(Sepal.Length) + bbs(Petal.Length, differences = 2, lambda = 2,
     degree = 3, knots = 10), data = iris, control = boost_control(mstop = 100, nu = 0.05))
-
+
   expect_silent({
     coef.cboost = cboost$getEstimatedCoef()
     coef.mboost = coef(mod)
@@ -451,10 +447,10 @@ test_that("handler throws warnings", {
     cboost = Compboost$new(iris, "Sepal.Width", loss = LossQuadratic$new())
   })

-  expect_warning(cboost$addBaselearner("Sepal.Length", "linear", BaselearnerPolynomial,
+  expect_warning(cboost$addBaselearner("Sepal.Length", "linear", BaselearnerPolynomial,
     degree = 1, false.intercept = TRUE))
-
-  expect_warning(cboost$addBaselearner("Petal.Length", "spline", BaselearnerPSpline,
+
+  expect_warning(cboost$addBaselearner("Petal.Length", "spline", BaselearnerPSpline,
     degree = 3, n.knots = 10, penalty = 2, differences = 2, i.am.not.used = NULL))

   instantiateData = function (X) {
@@ -470,12 +466,12 @@ test_that("handler throws warnings", {
     return(model)
   }

-  expect_warning(cboost$addBaselearner("Sepal.Length", "custom", BaselearnerCustom, instantiate.fun = instantiateData,
+  expect_warning(cboost$addBaselearner("Sepal.Length", "custom", BaselearnerCustom, instantiate.fun = instantiateData,
     train.fun = trainFun, predict.fun = predictFun, param.fun = extractParameter, i.am.not.used = NULL))

-  expect_silent({ Rcpp::sourceCpp(code = getCustomCppExample(silent = TRUE)) })
-  expect_warning(cboost$addBaselearner("Sepal.Length", "custom", BaselearnerCustomCpp, instantiate.ptr = dataFunSetter(),
-    train.ptr = trainFunSetter(), predict.ptr = predictFunSetter(), i.am.not.used = NULL))
+  expect_silent({ Rcpp::sourceCpp(code = getCustomCppExample(silent = TRUE)) })
+  expect_warning(cboost$addBaselearner("Sepal.Length", "custom", BaselearnerCustomCpp, instantiate.ptr = dataFunSetter(),
+    train.ptr = trainFunSetter(), predict.ptr = predictFunSetter(), i.am.not.used = NULL))
 })
@@ -490,9 +486,9 @@ test_that("default values are used by handler", {
 })

 test_that("out of range values are set correctly", {
-
+
   data(cars)
-  nuisance = capture.output({
+  nuisance = capture.output({
     mod = boostSplines(data = cars, loss = LossQuadratic$new(), target = "speed",
       optimizer = OptimizerCoordinateDescent$new())
   })
@@ -502,6 +498,5 @@ test_that("out of range values are set correctly", {
   expect_warning({
     pred_broken = mod$predict(data.frame(dist = c(-10, 2, 100, 120, 200)))
   })
-
   expect_equal(pred_broken, mod$predict(data.frame(dist = c(2, 2, 100, 120, 120))))
 })
\ No newline at end of file
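# Aside: prediction data outside the training range of a spline base-learner
# is clamped to that range (with a warning); for cars$dist the range is
# [2, 120], which is exactly what the equality in the test above encodes.
# A standalone sketch of the same check:
nuisance = capture.output({
  mod = boostSplines(data = cars, loss = LossQuadratic$new(), target = "speed",
    optimizer = OptimizerCoordinateDescent$new())
})
p.raw = suppressWarnings(mod$predict(data.frame(dist = c(-10, 200))))
p.clamped = mod$predict(data.frame(dist = c(2, 120)))
stopifnot(all.equal(p.raw, p.clamped))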

diff --git a/tests/testthat/test_compboost_internal.R b/tests/testthat/test_compboost_internal.R
index ea304354..08358ed5 100644
--- a/tests/testthat/test_compboost_internal.R
+++ b/tests/testthat/test_compboost_internal.R
@@ -9,6 +9,8 @@ test_that("Compboost loggs correctly", {
   X.wt = as.matrix(df[["wt"]], ncol = 1)

   y = df[["mpg"]]
+  response = ResponseRegr$new("mpg", as.matrix(y))
+  response.oob = ResponseRegr$new("mpg_oob", as.matrix(y))

   expect_silent({ data.source.hp = InMemoryData$new(X.hp, "hp") })
   expect_silent({ data.source.wt = InMemoryData$new(X.wt, "wt") })
@@ -21,11 +23,11 @@ test_that("Compboost loggs correctly", {
   learning.rate = 0.05
   iter.max = 500

-  expect_silent({ linear.factory.hp = BaselearnerPolynomial$new(data.source.hp, data.target.hp1,
+  expect_silent({ linear.factory.hp = BaselearnerPolynomial$new(data.source.hp, data.target.hp1,
     list(degree = 1, intercept = FALSE)) })
-  expect_silent({ linear.factory.wt = BaselearnerPolynomial$new(data.source.wt, data.target.wt,
+  expect_silent({ linear.factory.wt = BaselearnerPolynomial$new(data.source.wt, data.target.wt,
     list(degree = 1, intercept = FALSE)) })
-  expect_silent({ quadratic.factory.hp = BaselearnerPolynomial$new(data.source.hp, data.target.hp2,
+  expect_silent({ quadratic.factory.hp = BaselearnerPolynomial$new(data.source.hp, data.target.hp2,
     list(degree = 2, intercept = FALSE)) })

   expect_silent({ factory.list = BlearnerFactoryList$new() })
   expect_silent({ factory.list$registerFactory(linear.factory.hp) })
@@ -38,7 +40,7 @@ test_that("Compboost loggs correctly", {
   expect_silent({ log.time.sec = LoggerTime$new("time.seconds", TRUE, 10, "seconds") })
   expect_silent({ log.time.min = LoggerTime$new("time.minutes", TRUE, 10, "minutes") })
   expect_silent({ log.inbag = LoggerInbagRisk$new("inbag.risk", FALSE, loss.quadratic, 0.01) })
-  expect_silent({ log.oob = LoggerOobRisk$new("oob.risk", FALSE, loss.quadratic, 0.01, eval.oob.test, y) })
+  expect_silent({ log.oob = LoggerOobRisk$new("oob.risk", FALSE, loss.quadratic, 0.01, eval.oob.test, response.oob) })

   expect_silent({ logger.list = LoggerList$new() })
   expect_silent({ logger.list$registerLogger(log.iterations) })
   expect_silent({ logger.list$registerLogger(log.time.ms) })
@@ -46,14 +48,14 @@ test_that("Compboost loggs correctly", {
   expect_silent({ logger.list$registerLogger(log.time.min) })
   expect_silent({ logger.list$registerLogger(log.inbag) })
   expect_silent({ logger.list$registerLogger(log.oob) })
-
+
   expect_output(show(log.inbag))
   expect_output(show(log.oob))
   expect_output(logger.list$printRegisteredLogger())

   expect_silent({
     cboost = Compboost_internal$new(
-      response = y,
+      response = response,
       learning_rate = learning.rate,
       stop_if_all_stopper_fulfilled = FALSE,
       factory_list = factory.list,
@@ -68,7 +70,7 @@ test_that("Compboost loggs correctly", {
   expect_equal(dim(logger.data$logger.data), c(iter.max, logger.list$getNumberOfRegisteredLogger()))
   expect_equal(cboost$getLoggerData()$logger.data[, 1], 1:500)
   expect_equal(cboost$getLoggerData()$logger.data[, 2], cboost$getLoggerData()$logger.data[, 3])
-
+
 })
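# Aside: loggers that evaluate on held-out data now receive a Response object
# rather than a raw target vector. A minimal sketch of the new wiring (it
# assumes the data-source list `eval.oob.test` and a quadratic loss, as in the
# test above):
y.oob = mtcars$mpg
response.oob = ResponseRegr$new("mpg_oob", as.matrix(y.oob))
log.oob = LoggerOobRisk$new("oob.risk", FALSE, LossQuadratic$new(), 0.01,
  eval.oob.test, response.oob)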

 test_that("compboost does the same as mboost", {
@@ -80,6 +82,7 @@ test_that("compboost does the same as mboost", {
   X.wt = as.matrix(df[["wt"]], ncol = 1)

   y = df[["mpg"]]
+  response = ResponseRegr$new("mpg", as.matrix(y))

   expect_silent({ data.source.hp = InMemoryData$new(X.hp, "hp") })
   expect_silent({ data.source.wt = InMemoryData$new(X.wt, "wt") })
@@ -92,11 +95,11 @@ test_that("compboost does the same as mboost", {
   learning.rate = 0.05
   iter.max = 500

-  expect_silent({ linear.factory.hp = BaselearnerPolynomial$new(data.source.hp, data.target.hp1,
+  expect_silent({ linear.factory.hp = BaselearnerPolynomial$new(data.source.hp, data.target.hp1,
     list(degree = 1, intercept = FALSE)) })
-  expect_silent({ linear.factory.wt = BaselearnerPolynomial$new(data.source.wt, data.target.wt,
+  expect_silent({ linear.factory.wt = BaselearnerPolynomial$new(data.source.wt, data.target.wt,
     list(degree = 1, intercept = FALSE)) })
-  expect_silent({ quadratic.factory.hp = BaselearnerPolynomial$new(data.source.hp, data.target.hp2,
+  expect_silent({ quadratic.factory.hp = BaselearnerPolynomial$new(data.source.hp, data.target.hp2,
     list(degree = 2, intercept = FALSE)) })

   expect_silent({ factory.list = BlearnerFactoryList$new() })
@@ -113,7 +116,7 @@ test_that("compboost does the same as mboost", {
   expect_silent({ logger.list$registerLogger(log.time) })

   expect_silent({
     cboost = Compboost_internal$new(
-      response = y,
+      response = response,
       learning_rate = learning.rate,
       stop_if_all_stopper_fulfilled = TRUE,
       factory_list = factory.list,
@@ -203,7 +206,8 @@ test_that("compboost does the same as mboost", {
   }
   expect_equal(cboost$getParameterMatrix()$parameter.matrix[idx, ], matrix.compare)
   expect_equal(cboost$predict(eval.oob.test, FALSE), predict(mod, df))
-  expect_equal(cboost$predictAtIteration(eval.oob.test, 200, FALSE), predict(mod.reduced, df))
+  expect_silent(cboost$setToIteration(200))
+  expect_equal(cboost$predict(eval.oob.test, FALSE), predict(mod.reduced, df))

   suppressWarnings({
     mod.new = mboost(
diff --git a/tests/testthat/test_factory_list.R b/tests/testthat/test_factory_list.R
index 99dd9e1b..44b49986 100644
--- a/tests/testthat/test_factory_list.R
+++ b/tests/testthat/test_factory_list.R
@@ -50,5 +50,4 @@ test_that("factory list works", {

   expect_silent(factory.list$clearRegisteredFactories())
   expect_equal(factory.list$getNumberOfRegisteredFactories(), 0)
-
 })
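# Aside: predictAtIteration() is gone; the model state is now set explicitly
# before predicting. A sketch of the replacement pattern, using the objects
# from the test above:
cboost$setToIteration(200)                       # set the model to iteration 200
pred.200 = cboost$predict(eval.oob.test, FALSE)  # predictions at that state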
diff --git a/tests/testthat/test_inbag_vs_oob.R b/tests/testthat/test_inbag_vs_oob.R
index 50373fa7..52c6a341 100644
--- a/tests/testthat/test_inbag_vs_oob.R
+++ b/tests/testthat/test_inbag_vs_oob.R
@@ -5,7 +5,7 @@ test_that("Internal oob is the same as the logger", {
   df = mtcars
   target_var = "mpg"
   char_vars = c("cyl", "vs", "am", "gear", "carb")
-
+
   for (feature in char_vars) {
     df[[feature]] = as.factor(df[[feature]])
   }
@@ -15,33 +15,33 @@ test_that("Internal oob is the same as the logger", {
   set.seed(31415)
   idx_test = sample(x = seq_len(n_data), size = floor(n_data * 0.25))
   idx_train = setdiff(x = seq_len(n_data), idx_test)
-
-  cboost = Compboost$new(data = df[idx_train, ], target = target_var,
+
+  cboost = Compboost$new(data = df[idx_train, ], target = target_var,
     loss = LossQuadratic$new(), learning.rate = 0.005)
-
+
   for (feature_name in setdiff(names(df), target_var)) {
     if (feature_name %in% char_vars) {
-      cboost$addBaselearner(feature = feature_name, id = "category",
+      cboost$addBaselearner(feature = feature_name, id = "category",
        bl.factory = BaselearnerPolynomial, intercept = FALSE)
     } else {
-      cboost$addBaselearner(feature = feature_name, id = "spline",
+      cboost$addBaselearner(feature = feature_name, id = "spline",
        bl.factory = BaselearnerPSpline, degree = 3, n.knots = 10)
     }
   }
-
+
   oob_data = cboost$prepareData(df[idx_test,])
-  oob_response = df[[target_var]][idx_test]
-
+  oob_response = ResponseRegr$new("oob_response", as.matrix(df[[target_var]][idx_test]))
+
   cboost$addLogger(logger = LoggerOobRisk, logger.id = "oob_risk",
     used.loss = LossQuadratic$new(), eps.for.break = 0, oob.data = oob_data,
     oob.response = oob_response)
-
+
   nuisance = capture.output(suppressWarnings({
     cboost$train(6000)
   }))

   set.seed(31415)
   nuisance = capture.output(suppressWarnings({
-    cboost1 = boostSplines(data = df, target = target_var, loss = LossQuadratic$new(), learning.rate = 0.005,
+    cboost1 = boostSplines(data = df, target = target_var, loss = LossQuadratic$new(), learning.rate = 0.005,
       iterations = 6000L, degree = 3, n.knots = 10, oob.fraction = 0.25)
   }))

   expect_equal(rownames(df)[idx_train], rownames(cboost1$data))
diff --git a/tests/testthat/test_logger_list.R b/tests/testthat/test_logger_list.R
index 50450c8c..4f9d5930 100644
--- a/tests/testthat/test_logger_list.R
+++ b/tests/testthat/test_logger_list.R
@@ -17,6 +17,5 @@ test_that("register and delete of logger entries works", {

   expect_silent(logger.list$clearRegisteredLogger())
   expect_equal(logger.list$getNumberOfRegisteredLogger(), 0)
-  expect_equal(logger.list$getNamesOfRegisteredLogger(), character(0L))
-
+  expect_equal(logger.list$getNamesOfRegisteredLogger(), character(0L))
 })
diff --git a/tests/testthat/test_loss.R b/tests/testthat/test_loss.R
index 81bcf2d1..43df76fe 100644
--- a/tests/testthat/test_loss.R
+++ b/tests/testthat/test_loss.R
@@ -1,24 +1,18 @@
 context("The implemented loss object")

 test_that("Quadratic loss works", {
-
   expect_silent({ quadratic.loss = LossQuadratic$new() })
   expect_silent({ quadratic.loss.custom = LossQuadratic$new(2) })
-
 })

 test_that("Absolute loss works", {
-
   expect_silent({ absolute.loss = LossAbsolute$new() })
   expect_silent({ absolute.loss.custom = LossAbsolute$new(pi) })
-
 })

 test_that("Binomial loss works", {
-
   expect_silent({ binomial.loss = LossBinomial$new() })
   expect_silent({ binomial.loss.custom = LossBinomial$new(0.7) })
-
 })

 test_that("Custom loss works", {
@@ -38,8 +32,7 @@ test_that("Custom loss works", {

 })

-test_that("Custom cpp loss works", {
-
+test_that("Custom cpp loss works", {
   expect_silent({ Rcpp::sourceCpp(code = getCustomCppExample(example = "loss", silent = TRUE)) })
   expect_silent({ custom.cpp.loss = LossCustomCpp$new(lossFunSetter(), gradFunSetter(), constInitFunSetter()) })
 })
\ No newline at end of file
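# Aside: each loss constructor optionally takes a numeric argument which, as
# these tests suggest, fixes the offset instead of estimating it from the
# data (out-of-range values warn, see LossBinomial$new(2) earlier). A sketch:
loss.estimated = LossQuadratic$new()   # offset estimated during training
loss.fixed = LossQuadratic$new(2)      # offset fixed at 2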
show(poly.factory.hp) })
   expect_equal(poly.factory.hp.printer, "BaselearnerPolynomialPrinter")
-
-  expect_silent({ spline.factory = BaselearnerPSpline$new(data.source.sp, data.target,
+
+  expect_silent({ spline.factory = BaselearnerPSpline$new(data.source.sp, data.target,
     list(degree = 3, n.knots = 5, penalty = 2.5, differences = 2)) })
   expect_output({ spline.printer = show(spline.factory) })
   expect_equal(spline.printer, "BaselearnerPSplinePrinter")
@@ -101,7 +101,7 @@ test_that("Baselearner factory printer works", {

   expect_silent({
     custom.factory = BaselearnerCustom$new(data.source, data.target,
-      list(instantiate.fun = instantiateData, train.fun = trainFun,
+      list(instantiate.fun = instantiateData, train.fun = trainFun,
         predict.fun = predictFun, param.fun = extractParameter))
   })
   expect_output({ custom.factory.printer = show(custom.factory) })
@@ -141,11 +141,12 @@ test_that("Logger(List) printer works", {
   })

   y = NA_real_
+  response.oob = ResponseRegr$new("mpg_oob", as.matrix(y))

   expect_silent({ log.iterations = LoggerIteration$new("iterations", TRUE, 500) })
   expect_silent({ log.time = LoggerTime$new("time", FALSE, 500, "microseconds") })
   expect_silent({ log.inbag = LoggerInbagRisk$new("inbag.risk", FALSE, loss.quadratic, 0.05) })
-  expect_silent({ log.oob = LoggerOobRisk$new("oob.risk", FALSE, loss.quadratic, 0.05, eval.oob.test, y) })
+  expect_silent({ log.oob = LoggerOobRisk$new("oob.risk", FALSE, loss.quadratic, 0.05, eval.oob.test, response.oob) })

   expect_silent({ logger.list = LoggerList$new() })
   expect_output({ logger.list.printer = show(logger.list) })
@@ -180,6 +181,8 @@ test_that("Compboost printer works", {
   X.wt = as.matrix(df[["wt"]], ncol = 1)

   y = df[["mpg"]]
+  response = ResponseRegr$new("mpg", as.matrix(y))
+  response.oob = ResponseRegr$new("mpg_oob", as.matrix(y))

   expect_silent({ data.source.hp = InMemoryData$new(X.hp, "hp") })
   expect_silent({ data.source.wt = InMemoryData$new(X.wt, "wt") })
@@ -193,11 +196,11 @@ test_that("Compboost printer works", {
   learning.rate = 0.05
   iter.max = 500

-  expect_silent({ linear.factory.hp = BaselearnerPolynomial$new(data.source.hp, data.target.hp1,
+  expect_silent({ linear.factory.hp = BaselearnerPolynomial$new(data.source.hp, data.target.hp1,
     list(degree = 1, intercept = FALSE)) })
-  expect_silent({ linear.factory.wt = BaselearnerPolynomial$new(data.source.wt, data.target.wt,
+  expect_silent({ linear.factory.wt = BaselearnerPolynomial$new(data.source.wt, data.target.wt,
     list(degree = 1, intercept = FALSE)) })
-  expect_silent({ quadratic.factory.hp = BaselearnerPolynomial$new(data.source.hp, data.target.hp2,
+  expect_silent({ quadratic.factory.hp = BaselearnerPolynomial$new(data.source.hp, data.target.hp2,
     list(degree = 2, intercept = FALSE)) })

   expect_silent({ factory.list = BlearnerFactoryList$new() })
@@ -213,7 +216,7 @@ test_that("Compboost printer works", {
   expect_silent({ log.time.sec = LoggerTime$new("time.sec", TRUE, 2, "seconds") })
   expect_silent({ log.time.min = LoggerTime$new("time.min", TRUE, 1, "minutes") })
   expect_silent({ log.inbag = LoggerInbagRisk$new("inbag.risk", FALSE, loss.quadratic, 0.01) })
-  expect_silent({ log.oob = LoggerOobRisk$new("oob.risk", FALSE, loss.quadratic, 0.01, eval.oob.test, y) })
+  expect_silent({ log.oob = LoggerOobRisk$new("oob.risk", FALSE, loss.quadratic, 0.01, eval.oob.test, response.oob) })

   expect_silent({ logger.list = LoggerList$new() })
   expect_silent({ logger.list$registerLogger(log.iterations) })
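# Aside: every exposed class ships a show() printer whose return value is a
# printer id string; the printer tests compare against that id. A sketch
# (assuming the data.source and data.target objects from the test above):
lin.factory = BaselearnerPolynomial$new(data.source, data.target,
  list(degree = 1, intercept = FALSE))
printer.id = show(lin.factory)   # prints the factory summary
stopifnot(printer.id == "BaselearnerPolynomialPrinter")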
@@ -225,7 +228,7 @@
   expect_silent({
     cboost = Compboost_internal$new(
-      response = y,
+      response = response,
       learning_rate = learning.rate,
       stop_if_all_stopper_fulfilled = FALSE,
       factory_list = factory.list,
diff --git a/tests/testthat/test_response.R b/tests/testthat/test_response.R
new file mode 100644
index 00000000..607b872f
--- /dev/null
+++ b/tests/testthat/test_response.R
@@ -0,0 +1,86 @@
+context("Response")
+
+test_that("Regression response works correctly", {
+
+  target = "x"
+  X = as.matrix(1:10)
+  loss = LossQuadratic$new()
+  loss.false = LossBinomial$new()
+
+  expect_silent({ response = ResponseRegr$new(target, X) })
+  expect_equal(response$getTargetName(), target)
+  expect_equal(response$getResponse(), X)
+  expect_equal(response$getPrediction(), X * 0)
+  expect_equal(response$getPredictionTransform(), X * 0)
+  expect_equal(response$getPredictionResponse(), X * 0)
+  expect_equal(response$calculateEmpiricalRisk(loss), mean(X^2) / 2)
+  expect_error(response$calculateEmpiricalRisk(loss.false))
+
+  expect_error({ response = ResponseRegr$new(target, X, cbind(X, X)) })
+
+  idx = 3:8
+  expect_silent(response$filter(idx))
+  expect_equal(response$getResponse(), X[idx, , drop = FALSE])
+  expect_equal(response$getPrediction(), (X * 0)[idx, , drop = FALSE])
+  expect_equal(response$getPredictionTransform(), (X * 0)[idx, , drop = FALSE])
+  expect_equal(response$getPredictionResponse(), (X * 0)[idx, , drop = FALSE])
+
+  expect_error(response$filter(1:100))
+})
+
+test_that("Regression response with weights works correctly", {
+
+  target = "x"
+  X = as.matrix(1:10)
+  weights = as.matrix(rep(c(0.5, 2), 5))
+  loss = LossQuadratic$new()
+
+  expect_silent({ response = ResponseRegr$new(target, X, weights) })
+  expect_equal(response$getWeights(), weights)
+  expect_equal(response$calculateEmpiricalRisk(loss), mean(weights * X^2) / 2)
+})
+
+test_that("Binary classification response works correctly", {
+
+  target = "x"
+  threshold = 0.5
+  X.false = as.matrix(1:10)
+  X.correct = as.matrix(sample(c(1, -1), 10, TRUE))
+  sigmoid = 1 / (1 + exp(-X.correct * 0))
+  pred_response = ifelse(sigmoid < threshold, -1, 1)
+  loss = LossBinomial$new()
+
+  expect_error({ response = ResponseBinaryClassif$new(target, X.false) })
+  expect_silent({ response = ResponseBinaryClassif$new(target, X.correct) })
+  expect_equal(response$getTargetName(), target)
+  expect_equal(response$getResponse(), X.correct)
+  expect_equal(response$getPrediction(), X.correct * 0)
+  expect_equal(response$getPredictionTransform(), sigmoid)
+  expect_equal(response$getPredictionResponse(), pred_response)
+  expect_equal(response$calculateEmpiricalRisk(loss), mean(log(1 + exp(-2 * X.correct * response$getPredictionTransform()))))
+
+  expect_error({ response = ResponseBinaryClassif$new(target, X.correct, cbind(X.correct, X.correct)) })
+
+  threshold = 0.8
+  pred_response = ifelse(sigmoid < threshold, -1, 1)
+  expect_silent({ response$setThreshold(threshold) })
+  expect_equal(response$getThreshold(), threshold)
+  expect_equal(response$getPredictionResponse(), pred_response)
+  expect_error(response$setThreshold(1.1))
+  expect_equal(response$getThreshold(), threshold)
+})
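# Aside: a response object wraps the target, optional weights, and the
# prediction state. For ResponseBinaryClassif the transform is the sigmoid of
# the score and the label prediction thresholds it (default 0.5); the
# empirical risk is the (weighted) mean of the pointwise binomial loss.
# Worked through for an untrained model (score 0), matching the expectations
# above:
prob = 1 / (1 + exp(-0))             # sigmoid of score 0 -> 0.5
label = ifelse(prob < 0.5, -1, 1)    # 0.5 is not below the threshold -> +1
w = rep(c(0.5, 2), 5)
risk = mean(w * log(1 + exp(0)))     # log(2) per observation, weighted
stopifnot(label == 1, all.equal(risk, log(2) * 1.25))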
+
+test_that("Binary classification response with weights works correctly", {
+
+  target = "x"
+  threshold = 0.5
+  X.correct = as.matrix(sample(c(1, -1), 10, TRUE))
+  weights = as.matrix(rep(c(0.5, 2), 5))
+  sigmoid = 1 / (1 + exp(-X.correct * 0))
+  pred_response = ifelse(sigmoid < threshold, -1, 1)
+  loss = LossBinomial$new()
+
+  expect_silent({ response = ResponseBinaryClassif$new(target, X.correct, weights) })
+  expect_equal(response$getWeights(), weights)
+  expect_equal(response$calculateEmpiricalRisk(loss), mean(weights * log(1 + exp(-2 * X.correct * response$getPredictionTransform()))))
+})
\ No newline at end of file
diff --git a/vignettes/.build.timestamp b/vignettes/.build.timestamp
new file mode 100644
index 00000000..e69de29b
diff --git a/vignettes/compboost.R b/vignettes/compboost.R
new file mode 100644
index 00000000..0e2c4687
--- /dev/null
+++ b/vignettes/compboost.R
@@ -0,0 +1,56 @@
+## ---- include=FALSE-----------------------------------------------------------
+knitr::opts_chunk$set(collapse = TRUE)
+# devtools::load_all()
+library(compboost)
+
+options(width = 80)
+
+required.pcks = c("ggplot2")
+
+dependencies = all(
+  unlist(lapply(required.pcks, requireNamespace, quietly = TRUE))
+)
+
+## -----------------------------------------------------------------------------
+# Store train and test data:
+df.train = na.omit(titanic::titanic_train)
+df.test = na.omit(titanic::titanic_test)
+
+str(df.train)
+
+## -----------------------------------------------------------------------------
+df.train$Survived = factor(df.train$Survived, labels = c("no", "yes"))
+
+# Train and evaluation split for training:
+set.seed(1111)
+
+idx.train = sample(x = seq_len(nrow(df.train)), size = 0.6 * nrow(df.train))
+idx.eval = setdiff(seq_len(nrow(df.train)), idx.train)
+
+## -----------------------------------------------------------------------------
+cboost = Compboost$new(data = df.train[idx.train, ], target = "Survived",
+  loss = LossBinomial$new())
+
+## -----------------------------------------------------------------------------
+# Spline base-learner of age:
+cboost$addBaselearner("Age", "spline", BaselearnerPSpline)
+
+# Linear base-learner of age (degree = 1 with intercept is default):
+cboost$addBaselearner("Age", "linear", BaselearnerPolynomial)
+
+## -----------------------------------------------------------------------------
+# Spline base-learner of fare:
+cboost$addBaselearner("Fare", "spline", BaselearnerPSpline, degree = 2,
+  n.knots = 14, penalty = 10, differences = 2)
+
+## -----------------------------------------------------------------------------
+cboost$addBaselearner("Sex", "categorical", BaselearnerPolynomial,
+  intercept = FALSE)
+
+## -----------------------------------------------------------------------------
+cboost$getBaselearnerNames()
+
+## -----------------------------------------------------------------------------
+cboost$addLogger(logger = LoggerTime, use.as.stopper = FALSE, logger.id = "time",
+  max.time = 0, time.unit = "microseconds")
+
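# Aside: this script looks like the code extracted from vignettes/compboost.Rmd
# (the usual knitr::purl() output, given the "## ----" chunk markers).
# Regenerating it after editing the vignette would then be a one-liner:
# knitr::purl("vignettes/compboost.Rmd", output = "vignettes/compboost.R")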
diff --git a/vignettes/compboost.Rmd b/vignettes/compboost.Rmd
index ad7b87ef..4f4ce067 100644
--- a/vignettes/compboost.Rmd
+++ b/vignettes/compboost.Rmd
@@ -2,7 +2,7 @@
 title: "compboost: Fast and Flexible Component-Wise Boosting Framework"
 author: "Daniel Schalk"
 date: "`r Sys.Date()`"
-output:
+output:
   rmarkdown::html_vignette:
     css: compboost.css
 vignette: >
@@ -12,7 +12,7 @@ vignette: >
 ---

 ```{r, include=FALSE}
-knitr::opts_chunk$set(collapse = TRUE)
+knitr::opts_chunk$set(collapse = TRUE, eval = FALSE)
 # devtools::load_all()
 library(compboost)

@@ -23,6 +23,7 @@ required.pcks = c("ggplot2")
 dependencies = all(
   unlist(lapply(required.pcks, requireNamespace, quietly = TRUE))
 )
+dependencies = FALSE
 ```

@@ -43,7 +44,7 @@ str(df.train)
 ```

 In the next step we transform the response to a factor with more intuitive levels:
-
+
 ```{r}
 df.train$Survived = factor(df.train$Survived, labels = c("no", "yes"))

@@ -51,17 +52,17 @@ df.train$Survived = factor(df.train$Survived, labels = c("no", "yes"))
 set.seed(1111)

 idx.train = sample(x = seq_len(nrow(df.train)), size = 0.6 * nrow(df.train))
-idx.eval = setdiff(seq_len(nrow(df.train)), idx.train)
+idx.eval = setdiff(seq_len(nrow(df.train)), idx.train)
 ```

-This split will be used while the training to calculate the out of bag risk.
+This split will be used during training to calculate the out-of-bag risk.

 ## Initializing Model

 Due to the `R6` API it is necessary to create a new class object which receives the data, the target as a character string, and the loss to be used. Note that it is important to pass an initialized loss object:

 ```{r}
-cboost = Compboost$new(data = df.train[idx.train, ], target = "Survived",
+cboost = Compboost$new(data = df.train[idx.train, ], target = "Survived",
   loss = LossBinomial$new())
 ```

@@ -93,7 +94,7 @@ cboost$addBaselearner("Fare", "spline", BaselearnerPSpline, degree = 2,
 When adding categorical features, each group is added as a single base-learner to avoid biased feature selection. Also note that we don't need an intercept here:

 ```{r}
-cboost$addBaselearner("Sex", "categorical", BaselearnerPolynomial,
+cboost$addBaselearner("Sex", "categorical", BaselearnerPolynomial,
   intercept = FALSE)
 ```

@@ -109,7 +110,7 @@ cboost$getBaselearnerNames()
 This logger logs the elapsed time. The time unit can be one of `microseconds`, `seconds`, or `minutes`. The logger stops if `max_time` is reached, but we do not use this logger as a stopper here:

 ```{r}
-cboost$addLogger(logger = LoggerTime, use.as.stopper = FALSE, logger.id = "time",
+cboost$addLogger(logger = LoggerTime, use.as.stopper = FALSE, logger.id = "time",
   max.time = 0, time.unit = "microseconds")
 ```

@@ -117,9 +118,10 @@ cboost$addLogger(logger = LoggerTime, use.as.stopper = FALSE, logger.id = "time"
 The out-of-bag risk logger does basically the same as the inbag risk logger but calculates the empirical risk using another data source. Therefore, the new data object has to be a list of data sources containing the evaluation data. This is automatically done by the `prepareData()` member of `Compboost`:

 ```{r}
+oob.response = ResponseBinaryClassif$new("oob_response", as.matrix(df.train[["Survived"]][idx.eval]))
 cboost$addLogger(logger = LoggerOobRisk, use.as.stopper = FALSE, logger.id = "oob",
-  LossBinomial$new(), 0.01, cboost$prepareData(df.train[idx.eval, ]),
-  df.train[["Survived"]][idx.eval])
+  LossBinomial$new(), 0.01, cboost$prepareData(df.train[idx.eval, ]),
+  oob.response)
 ```
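With the logger registered, training fills in the logged values; a minimal sketch of the next step (assuming the `cboost` object from above and the training API used in the tests):

```{r}
cboost$train(2000, trace = 500)
head(cboost$getInbagRisk())
```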