From fcf1408ca7dee7a15e69f3fe3d77241866e5f50d Mon Sep 17 00:00:00 2001 From: fweber144 Date: Tue, 4 Apr 2023 10:21:49 +0200 Subject: [PATCH] `subfit`s: Fix an error thrown if `newdata` contains only a single level of a `factor`-coerced `character` variable. Also fix a bug causing a re-ordering of the levels of a `factor` in `newdata` (compared to that `factor` in the original dataset) not to get recognized by `predict.subfit()`, which in turn causes `x %*% rbind(alpha, beta)` to be incorrect in such cases. Also fix an error thrown by `fit_glm_ridge_callback()` in case of unused levels of a `factor`, namely: ``` warning: solve(): system is singular; attempting approx solution Error: solve(): solution not found ``` --- R/divergence_minimizers.R | 25 ++++++++++++++++--------- R/search.R | 23 +++++++++++++++++------ 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/R/divergence_minimizers.R b/R/divergence_minimizers.R index f21a8eeb6..ed26c347b 100644 --- a/R/divergence_minimizers.R +++ b/R/divergence_minimizers.R @@ -102,7 +102,18 @@ fit_glm_ridge_callback <- function(formula, data, projpred_var = matrix(nrow = nrow(data)), projpred_regul = 1e-4, ...) { # Preparations: - fr <- model.frame(formula, data = data) + fr <- model.frame(formula, data = data, drop.unused.levels = TRUE) + da_classes <- attr(attr(fr, "terms"), "dataClasses") + nms_chr_fac <- names(da_classes)[da_classes %in% c("character", "factor")] + resp_nm <- all.vars(attr(fr, "terms"))[attr(attr(fr, "terms"), "response")] + nms_chr_fac <- setdiff(nms_chr_fac, resp_nm) + if (length(nms_chr_fac) > 0) { + xlvls <- lapply(setNames(nm = nms_chr_fac), function(nm_chr_fac) { + levels(as.factor(fr[[nm_chr_fac]])) + }) + } else { + xlvls <- NULL + } # TODO: In the following model.matrix() call, allow user-specified contrasts # to be passed to argument `contrasts.arg`. The `contrasts.arg` default # (`NULL`) uses `options("contrasts")` internally, but it might be more @@ -124,13 +135,8 @@ fit_glm_ridge_callback <- function(formula, data, )) # Post-processing: rownames(fit$beta) <- colnames(x) - sub <- nlist( - alpha = fit$beta0, - beta = fit$beta, - w = fit$w, - formula, - x, y - ) + sub <- nlist(alpha = fit$beta0, beta = fit$beta, w = fit$w, formula, x, y, + xlvls) class(sub) <- "subfit" return(sub) } @@ -1067,7 +1073,8 @@ predict.subfit <- function(subfit, newdata = NULL) { # (`NULL`) uses `options("contrasts")` internally, but it might be more # convenient to let users specify contrasts directly. At that occasion, # contrasts should also be tested thoroughly (not done until now). - x <- model.matrix(delete.response(terms(subfit$formula)), data = newdata) + x <- model.matrix(delete.response(terms(subfit$formula)), data = newdata, + xlev = subfit$xlvls) if (is.null(beta)) { return(as.matrix(rep(alpha, nrow(x)))) } else { diff --git a/R/search.R b/R/search.R index 16ea114b1..de3eaecf7 100644 --- a/R/search.R +++ b/R/search.R @@ -135,12 +135,26 @@ search_L1 <- function(p_ref, refmodel, nterms_max, penalty, opt) { stop("L1 search cannot be used for an empty (i.e. intercept-only) ", "full-model formula or `nterms_max = 0`.") } + # Preparations: + fr <- model.frame(refmodel$formula, data = refmodel$fetch_data(), + drop.unused.levels = TRUE) + da_classes <- attr(attr(fr, "terms"), "dataClasses") + nms_chr_fac <- names(da_classes)[da_classes %in% c("character", "factor")] + resp_nm <- all.vars(attr(fr, "terms"))[attr(attr(fr, "terms"), "response")] + nms_chr_fac <- setdiff(nms_chr_fac, resp_nm) + if (length(nms_chr_fac) > 0) { + xlvls <- lapply(setNames(nm = nms_chr_fac), function(nm_chr_fac) { + levels(as.factor(fr[[nm_chr_fac]])) + }) + } else { + xlvls <- NULL + } # TODO: In the following model.matrix() call, allow user-specified contrasts # to be passed to argument `contrasts.arg`. The `contrasts.arg` default # (`NULL`) uses `options("contrasts")` internally, but it might be more # convenient to let users specify contrasts directly. At that occasion, # contrasts should also be tested thoroughly (not done until now). - x <- model.matrix(refmodel$formula, data = refmodel$fetch_data()) + x <- model.matrix(refmodel$formula, data = fr) x <- x[, colnames(x) != "(Intercept)", drop = FALSE] ## it's important to keep the original order because that's the order ## in which lasso will estimate the parameters @@ -183,11 +197,8 @@ search_L1 <- function(p_ref, refmodel, nterms_max, penalty, opt) { # re-use of `colnames(x)` should provide another sanity check: x <- x[, colnames(x)[search_path$solution_terms[indices]], drop = FALSE] } - sub <- nlist(alpha = search_path$alpha[nterms + 1], - beta, - w = search_path$w[, nterms + 1], - formula, - x) + sub <- nlist(alpha = search_path$alpha[nterms + 1], beta, + w = search_path$w[, nterms + 1], formula, x, xlvls) class(sub) <- "subfit" return(list(sub)) })