From fcf1408ca7dee7a15e69f3fe3d77241866e5f50d Mon Sep 17 00:00:00 2001
From: fweber144 <fweber144@protonmail.com>
Date: Tue, 4 Apr 2023 10:21:49 +0200
Subject: [PATCH] `subfit`s: Fix an error thrown if `newdata` contains only a
 single

level of a `factor`-coerced `character` variable.

Also fix a bug causing a re-ordering of the levels of a `factor` in
`newdata` (compared to that `factor` in the original dataset) not to get
recognized by `predict.subfit()`, which in turn causes
`x %*% rbind(alpha, beta)` to be incorrect in such cases.

Also fix an error thrown by `fit_glm_ridge_callback()` in case of unused levels
of a `factor`, namely:
```
warning: solve(): system is singular; attempting approx solution
Error: solve(): solution not found
```
---
 R/divergence_minimizers.R | 25 ++++++++++++++++---------
 R/search.R                | 23 +++++++++++++++++------
 2 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/R/divergence_minimizers.R b/R/divergence_minimizers.R
index f21a8eeb6..ed26c347b 100644
--- a/R/divergence_minimizers.R
+++ b/R/divergence_minimizers.R
@@ -102,7 +102,18 @@ fit_glm_ridge_callback <- function(formula, data,
                                    projpred_var = matrix(nrow = nrow(data)),
                                    projpred_regul = 1e-4, ...) {
   # Preparations:
-  fr <- model.frame(formula, data = data)
+  fr <- model.frame(formula, data = data, drop.unused.levels = TRUE)
+  da_classes <- attr(attr(fr, "terms"), "dataClasses")
+  nms_chr_fac <- names(da_classes)[da_classes %in% c("character", "factor")]
+  resp_nm <- all.vars(attr(fr, "terms"))[attr(attr(fr, "terms"), "response")]
+  nms_chr_fac <- setdiff(nms_chr_fac, resp_nm)
+  if (length(nms_chr_fac) > 0) {
+    xlvls <- lapply(setNames(nm = nms_chr_fac), function(nm_chr_fac) {
+      levels(as.factor(fr[[nm_chr_fac]]))
+    })
+  } else {
+    xlvls <- NULL
+  }
   # TODO: In the following model.matrix() call, allow user-specified contrasts
   # to be passed to argument `contrasts.arg`. The `contrasts.arg` default
   # (`NULL`) uses `options("contrasts")` internally, but it might be more
@@ -124,13 +135,8 @@ fit_glm_ridge_callback <- function(formula, data,
   ))
   # Post-processing:
   rownames(fit$beta) <- colnames(x)
-  sub <- nlist(
-    alpha = fit$beta0,
-    beta = fit$beta,
-    w = fit$w,
-    formula,
-    x, y
-  )
+  sub <- nlist(alpha = fit$beta0, beta = fit$beta, w = fit$w, formula, x, y,
+               xlvls)
   class(sub) <- "subfit"
   return(sub)
 }
@@ -1067,7 +1073,8 @@ predict.subfit <- function(subfit, newdata = NULL) {
     # (`NULL`) uses `options("contrasts")` internally, but it might be more
     # convenient to let users specify contrasts directly. At that occasion,
     # contrasts should also be tested thoroughly (not done until now).
-    x <- model.matrix(delete.response(terms(subfit$formula)), data = newdata)
+    x <- model.matrix(delete.response(terms(subfit$formula)), data = newdata,
+                      xlev = subfit$xlvls)
     if (is.null(beta)) {
       return(as.matrix(rep(alpha, nrow(x))))
     } else {
diff --git a/R/search.R b/R/search.R
index 16ea114b1..de3eaecf7 100644
--- a/R/search.R
+++ b/R/search.R
@@ -135,12 +135,26 @@ search_L1 <- function(p_ref, refmodel, nterms_max, penalty, opt) {
     stop("L1 search cannot be used for an empty (i.e. intercept-only) ",
          "full-model formula or `nterms_max = 0`.")
   }
+  # Preparations:
+  fr <- model.frame(refmodel$formula, data = refmodel$fetch_data(),
+                    drop.unused.levels = TRUE)
+  da_classes <- attr(attr(fr, "terms"), "dataClasses")
+  nms_chr_fac <- names(da_classes)[da_classes %in% c("character", "factor")]
+  resp_nm <- all.vars(attr(fr, "terms"))[attr(attr(fr, "terms"), "response")]
+  nms_chr_fac <- setdiff(nms_chr_fac, resp_nm)
+  if (length(nms_chr_fac) > 0) {
+    xlvls <- lapply(setNames(nm = nms_chr_fac), function(nm_chr_fac) {
+      levels(as.factor(fr[[nm_chr_fac]]))
+    })
+  } else {
+    xlvls <- NULL
+  }
   # TODO: In the following model.matrix() call, allow user-specified contrasts
   # to be passed to argument `contrasts.arg`. The `contrasts.arg` default
   # (`NULL`) uses `options("contrasts")` internally, but it might be more
   # convenient to let users specify contrasts directly. At that occasion,
   # contrasts should also be tested thoroughly (not done until now).
-  x <- model.matrix(refmodel$formula, data = refmodel$fetch_data())
+  x <- model.matrix(refmodel$formula, data = fr)
   x <- x[, colnames(x) != "(Intercept)", drop = FALSE]
   ## it's important to keep the original order because that's the order
   ## in which lasso will estimate the parameters
@@ -183,11 +197,8 @@ search_L1 <- function(p_ref, refmodel, nterms_max, penalty, opt) {
       # re-use of `colnames(x)` should provide another sanity check:
       x <- x[, colnames(x)[search_path$solution_terms[indices]], drop = FALSE]
     }
-    sub <- nlist(alpha = search_path$alpha[nterms + 1],
-                 beta,
-                 w = search_path$w[, nterms + 1],
-                 formula,
-                 x)
+    sub <- nlist(alpha = search_path$alpha[nterms + 1], beta,
+                 w = search_path$w[, nterms + 1], formula, x, xlvls)
     class(sub) <- "subfit"
     return(list(sub))
   })