/
Compboost.Rd
379 lines (333 loc) · 14 KB
/
Compboost.Rd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/compboost.R
\name{Compboost}
\alias{Compboost}
\title{Compboost API}
\format{
\code{\link{R6Class}} object.
}
\description{
\code{Compboost} wraps the \code{S4} class system exposed by \code{Rcpp} to make defining
objects, adding objects, the training, calculating predictions, and plotting much easier.
As already mentioned, the \code{Compboost R6} class is just a wrapper and is compatible
with most \code{S4} classes.
}
\section{Usage}{
\preformatted{
# Constructor
cboost = Compboost$new(data, target, optimizer = OptimizerCoordinateDescent$new(), loss,
learning_rate = 0.05, oob_fraction = NULL)
# Member functions
cboost$addLogger(logger, use_as_stopper = FALSE, logger_id, ...)
cboost$addBaselearner(feature, id, bl_factory, data_source = InMemoryData,
data_target = InMemoryData, ...)
cboost$train(iteration = 100, trace = -1)
cboost$getCurrentIteration()
cboost$prepareData(newdata)
cboost$prepareResponse(response)
cboost$predict(newdata = NULL, as_response = FALSE)
cboost$getInbagRisk()
cboost$getSelectedBaselearner()
cboost$getEstimatedCoef()
cboost$plot(blearner_name = NULL, iters = NULL, from = NULL, to = NULL, length_out = 1000)
cboost$getBaselearnerNames()
cboost$getLoggerData()
cboost$calculateFeatureImportance(num_feats = NULL)
cboost$plotFeatureImportance(num_feats = NULL)
cboost$plotInbagVsOobRisk()
cboost$plotBlearnerTraces(value = 1, n_legend = 5L)
}
}
\section{Arguments}{
\strong{For Compboost$new()}:
\describe{
\item{\code{data}}{[\code{data.frame}]\cr
A data frame containing the data (features as well as target).
}
\item{\code{target}}{[\code{character(1)} or \code{S4 Response}]\cr
Character value containing the target variable or \code{Response} object. Note that the loss has to match the
data type of the target.
}
\item{\code{optimizer}}{[\code{S4 Optimizer}]\cr
An initialized \code{S4 Optimizer} object exposed by Rcpp (e.g. \code{OptimizerCoordinateDescent$new()})
to specify how features are selected in each iteration.
}
\item{\code{loss}}{[\code{S4 Loss}]\cr
Initialized \code{S4 Loss} object exposed by Rcpp which is used to calculate the risk and pseudo
residuals (e.g. \code{LossQuadratic$new()}).
}
\item{\code{learning_rate}}{[\code{numeric(1)}]\cr
Learning rate to shrink the new parameters in each iteration.
}
\item{\code{oob_fraction}}{[\code{numeric(1)}]\cr
Fraction of how much data are used to calculate the out of bag risk.
}
}
\strong{For cboost$addLogger()}:
\describe{
\item{\code{logger}}{[\code{S4 Logger}]\cr
Uninitialized \code{S4 Logger} class object that is registered in the model.
See the details for possible choices.
}
\item{\code{use_as_stopper}}{[\code{logical(1)}]\cr
Logical value indicating whether the new logger should also be used as stopper
(early stopping). Default value is \code{FALSE}.
}
\item{\code{logger_id}}{[\code{character(1)}]\cr
Id of the new logger. This is necessary to be able to register multiple loggers.
}
\item{}{\code{...}\cr
Further arguments passed to the constructor of the \code{S4 Logger} class specified in
\code{logger}. For possible arguments see details or the help pages (e.g. \code{?LoggerIteration}).
}
}
\strong{For cboost$addBaselearner()}:
\describe{
\item{\code{feature}}{[\code{character()}]\cr
Vector of column names that are used as input data matrix for a single base-learner. Note that not
every base-learner supports the use of multiple features (e.g. the spline base-learner does not).
}
\item{\code{id}}{[\code{character(1)}]\cr
Id of the base-learner. This is necessary since it is possible to define multiple base-learners using the same features.
}
\item{\code{bl_factory}}{[\code{S4 Factory}]\cr
Uninitialized base-learner factory given as \code{S4 Factory} class. See the details
for possible choices.
}
\item{\code{data_source}}{[\code{S4 Data}]\cr
Data source object. Just in memory data objects are supported at the moment.
}
\item{\code{data_target}}{[\code{S4 Data}]\cr
Data target object. Just in memory data objects are supported at the moment.
}
\item{}{\code{...}\cr
Further arguments passed to the constructor of the \code{S4 Factory} class specified in
\code{bl_factory}. For possible arguments see the help pages (e.g. \code{?BaselearnerPSplineFactory})
of the \code{S4} classes.
}
}
\strong{For cboost$train()}:
\describe{
\item{\code{iteration}}{[\code{integer(1)}]\cr
Number of iterations that are trained. If the model is already trained, it is set to the given number
of iterations, either by going back to already trained base-learners or by training new ones. Note: This function defines an
iteration logger with the id \code{_iterations} which is used as stopper for the new training.
}
\item{\code{trace}}{[\code{integer(1)}]\cr
Integer indicating after how many iterations a trace should be printed. With \code{trace = 10}, every
10th iteration is printed. If you do not want to print the trace, set \code{trace = 0}. Default is
-1 which means that in total 40 iterations are printed.
}
}
\strong{For cboost$predict()}:
\describe{
\item{\code{newdata}}{[\code{data.frame()}]\cr
Data to predict on. If newdata equals \code{NULL} predictions on the training data are returned.
}
}
\strong{For cboost$plot()}:
\describe{
\item{\code{blearner_name}}{[\code{character(1)}]\cr
Character name of the base-learner to plot the contribution to the response. For the available choices of
\code{blearner_name}, use \code{cboost$getBaselearnerNames()}.
}
\item{\code{iters}}{[\code{integer()}]\cr
Integer vector containing the iterations the user wants to visualize.
}
\item{\code{from}}{[\code{numeric(1)}]\cr
Lower bound for the x axis (should be smaller than \code{to}).
}
\item{\code{to}}{[\code{numeric(1)}]\cr
Upper bound for the x axis (should be greater than \code{from}).
}
\item{\code{length_out}}{[\code{integer(1)}]\cr
Number of equidistant points between \code{from} and \code{to} used for plotting.
}
}
\strong{For cboost$calculateFeatureImportance() and cboost$plotFeatureImportance()}:
\describe{
\item{\code{num_feats}}{[\code{integer(1)}]\cr
Number of features for which the importance will be returned.
}
}
\strong{For cboost$plotBlearnerTraces()}:
\describe{
\item{\code{value}}{[\code{numeric()}]\cr
Numeric value of length 1 or same length as the number of iterations which is accumulated by the selected base-learner.
}
\item{\code{n_legend}}{[\code{integer(1L)}]\cr
Number of base-learners that are highlighted (base-learners are highlighted by choosing the top \code{n_legend}
accumulated values).
}
}
}
\section{Details}{
\strong{Loss}\cr
Available choices for the loss are:
\itemize{
\item
\code{LossQuadratic} (Regression)
\item
\code{LossAbsolute} (Regression)
\item
\code{LossQuantile} (Regression)
\describe{
\item{\code{quantile}}{[\code{numeric(1)}]\cr
Quantile that is boosted.
}
}
\item
\code{LossHuber} (Regression)
\describe{
\item{\code{delta}}{[\code{numeric(1)}]\cr
Defining the interval [-d,d] around 0 for quadratic approximation.
}
}
\item
\code{LossBinomial} (Binary Classification)
\item
\code{LossCustom} (Custom)
}
(For each loss take also a look at the help pages (e.g. \code{?LossBinomial}))
\strong{Logger}\cr
Available choices for the logger are:
\itemize{
\item
\code{LoggerIteration}: Logs the current iteration. Additional arguments:
\describe{
\item{\code{max_iterations} [\code{integer(1)}]}{
Maximal number of iterations.
}
}
\item
\code{LoggerTime}: Logs the elapsed time. Additional arguments:
\describe{
\item{\code{max_time} [\code{integer(1)}]}{
Maximal time for the computation.
}
\item{\code{time_unit} [\code{character(1)}]}{
Character to specify the time unit. Possible choices are \code{minutes}, \code{seconds}, or \code{microseconds}.
}
}
\item
\code{LoggerInbagRisk}:
\describe{
\item{\code{used_loss} [\code{S4 Loss}]}{
Loss as initialized \code{S4 Loss} which is used to calculate the empirical risk. See the
details for possible choices.
}
\item{\code{eps_for_break} [\code{numeric(1)}]}{
This argument is used if the logger is also used as stopper. If the relative improvement
of the logged inbag risk falls below this boundary, then the stopper breaks the algorithm.
}
\item{\code{patience} [\code{integer(1)}]}{
Specifies how many iterations should fall consecutively below \code{eps_for_break} before we stop.
}
}
\item
\code{LoggerOobRisk}:
\describe{
\item{\code{used_loss} [\code{S4 Loss}]}{
Loss as initialized \code{S4 Loss} which is used to calculate the empirical risk. See the
details for possible choices.
}
\item{\code{eps_for_break} [\code{numeric(1)}]}{
This argument is used if the logger is also used as stopper. If the relative improvement
of the logged out of bag risk falls below this boundary, then the stopper breaks the algorithm.
}
\item{\code{oob_data} [\code{list}]}{
A list which contains data source objects which correspond to the source data of each registered factory.
The source data objects should contain the out of bag data. This data is then used to calculate the
new predictions in each iteration.
}
\item{\code{oob_response} [\code{vector}]}{
Vector which contains the response for the out of bag data given within \code{oob_data}.
}
\item{\code{patience} [\code{integer(1)}]}{
Specifies how many iterations should fall consecutively below \code{eps_for_break} before we stop.
}
}
}
\strong{Note}:
\itemize{
\item
Even if you do not use the logger as stopper you have to define the arguments such as \code{max_time}.
}
}
\section{Fields}{
\describe{
\item{\code{data} [\code{data.frame}]}{
Data used for training the algorithm.
}
\item{\code{data_oob} [\code{data.frame}]}{
Data used for out of bag tracking.
}
\item{\code{oob_fraction} [\code{numeric(1)}]}{
Fraction of how much data are used to track the out of bag risk.
}
\item{\code{response} [\code{vector}]}{
Response object that is created or passed in target for training the model.
}
\item{\code{response_oob} [\code{vector}]}{
Response object that is created by specifying the \code{oob_fraction} to evaluate each iteration.
}
\item{\code{target} [\code{character(1)}]}{
Name of the target variable.
}
\item{\code{id} [\code{character(1)}]}{
Name of the given dataset.
}
\item{\code{optimizer} [\code{S4 Optimizer}]}{
Optimizer used within the fitting process.
}
\item{\code{loss} [\code{S4 Loss}]}{
Loss used to calculate pseudo residuals and empirical risk.
}
\item{\code{learning_rate} [\code{numeric(1)}]}{
Learning rate used to shrink the estimated parameter in each iteration.
}
\item{\code{model} [\code{S4 Compboost_internal}]}{
\code{S4 Compboost_internal} class object from which the main operations (such as train) are called.
}
\item{\code{bl_factory_list} [\code{S4 FactoryList}]}{
List of all registered factories represented as \code{S4 FactoryList} class.
}
\item{\code{positive_category} [\code{character(1)}]}{
Character containing the name of the positive class in the case of (binary) classification.
}
\item{\code{stop_if_all_stoppers_fulfilled} [\code{logical(1)}]}{
Logical indicating whether all stoppers should be used simultaneously or if it is sufficient
to just use the first stopper to stop the algorithm.
}
}
}
\section{Methods}{
\describe{
\item{\code{addLogger}}{method to add a logger to the algorithm (Note: This is just possible before the training).}
\item{\code{addBaselearner}}{method to add a new base-learner to the algorithm (Note: This is just possible before the training).}
\item{\code{getCurrentIteration}}{method to get the current iteration on which the algorithm is set.}
\item{\code{train}}{method to train the algorithm.}
\item{\code{predict}}{method to predict on a trained object.}
\item{\code{getSelectedBaselearner}}{method to get a character vector of the selected base-learners.}
\item{\code{getEstimatedCoef}}{method to get a list of the estimated coefficients of each selected base-learner.}
\item{\code{plot}}{method to plot individual feature effects.}
\item{\code{getBaselearnerNames}}{method to get the names of the registered factories.}
\item{\code{prepareData}}{method to prepare data to track the out of bag risk of an arbitrary loss/performance function.}
\item{\code{getLoggerData}}{method to get the logged data from all registered loggers.}
\item{\code{calculateFeatureImportance}}{method to calculate feature importance.}
\item{\code{plotFeatureImportance}}{method to plot the feature importance calculated by \code{calculateFeatureImportance}.}
\item{\code{plotInbagVsOobRisk}}{method to plot the inbag vs the out of bag behavior. This is just applicable if a logger with name \code{oob_logger} was registered. This is automatically done if the \code{oob_fraction} is set.}
\item{\code{plotBlearnerTraces}}{method to plot traces of how the base-learners are selected in combination with a measure of interest, e.g. how the empirical risk was minimized throughout the selection process.}
}
}
\examples{
cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new(), oob_fraction = 0.3)
cboost$addBaselearner("hp", "spline", BaselearnerPSpline, degree = 3,
n_knots = 10, penalty = 2, differences = 2)
cboost$addBaselearner("wt", "spline", BaselearnerPSpline)
cboost$train(1000)
table(cboost$getSelectedBaselearner())
cboost$plot("hp_spline")
cboost$plotInbagVsOobRisk()
cboost$plotBlearnerTraces()
}