Identify columns with the same name, and either delete them (if they …

…have the same content) or rename them and copy their attributes (+ respective unit tests)
sjentsch · Nov 18, 2023 · fb79c34 · fb79c34
1 parent 83c4a6a
commit fb79c34
Show file tree

Hide file tree

Showing 2 changed files with 49 additions and 1 deletion.
diff --git a/R/merge_cols_omv.R b/R/merge_cols_omv.R
@@ -93,7 +93,7 @@ merge_cols_omv <- function(dtaInp = NULL, fleOut = "", typMrg = c("outer", "inne
     # store attributes and remove empty lines from the data sets
     attCol <- list()
     for (i in seq_along(dtaFrm)) {
-        attCol <- c(attCol, sapply(dtaFrm[[i]][, setdiff(names(dtaFrm[[i]]), names(attCol))], attributes))
+        attCol <- c(attCol, sapply(dtaFrm[[i]][, setdiff(names(dtaFrm[[i]]), names(attCol))], attributes, simplify = FALSE))
         dtaFrm[[i]] <- dtaFrm[[i]][!apply(is.na(dtaFrm[[i]]), 1, all), ]
     }
     attDF <- attributes(dtaFrm[[1]])
@@ -108,6 +108,23 @@ merge_cols_omv <- function(dtaInp = NULL, fleOut = "", typMrg = c("outer", "inne
     tmpMrg <- dtaFrm[[1]]
     for (i in setdiff(seq_along(dtaFrm), 1)) {
         tmpMrg <- do.call(merge, c(list(x = tmpMrg, y = dtaFrm[[i]], by.x = varBy[[1]], by.y = varBy[[i]]), crrArg[!grepl("^x$|^y$|^by.x$|^by.y$", names(crrArg))]))
+        # if there are duplicate columns (i.e., columns with the same name in two of the input data sets), unify them
+        for (unfClm in setdiff(names(attCol), names(tmpMrg))) {
+            dplClm <- grep(paste0(unfClm, "\\."), names(tmpMrg))
+            if (length(dplClm) == 0) next
+            names(tmpMrg)[dplClm[1]] <- unfClm
+            rmvClm <- NULL
+            for (i in seq(2, length(dplClm))) {
+                if (identical(tmpMrg[, dplClm[1]], tmpMrg[, dplClm[i]])) {
+                    rmvClm <- c(rmvClm, -dplClm[i])
+                } else {
+                    addClm <- sprintf("%s_%d", unfClm, sum(grepl(paste0(unfClm, "_"), names(tmpMrg))) + 2)
+                    names(tmpMrg)[dplClm[i]] <- addClm
+                    attCol[[addClm]] <- attCol[[unfClm]]
+                }
+            }
+            if (length(rmvClm) > 0) tmpMrg <- tmpMrg[, rmvClm]
+        }
     }
     dtaFrm <- tmpMrg
 

diff --git a/tests/testthat/test-merge_cols_omv.R b/tests/testthat/test-merge_cols_omv.R
@@ -32,6 +32,37 @@ test_that("merge_cols_omv works", {
     dtaFrm <- merge_cols_omv(dtaInp = nmeInp[-2], typMrg = "inner", varBy = "ID", varSrt = c("gender_3", "age_3"))
     expect_s3_class(dtaFrm, "data.frame")
     expect_equal(dim(dtaFrm), c(245, 57))
+    unlink(nmeInp)
+
+    nmeInp <- vector(mode = "character", length = 3)
+    dtaTmp <- jmvReadWrite::bfi_sample2
+    for (i in seq_along(nmeInp)) {
+        nmeInp[i] <- tempfile(fileext = ".rds")
+        saveRDS(dtaTmp, nmeInp[i])
+    }
+    dtaFrm <- merge_cols_omv(dtaInp = nmeInp, typMrg = "outer", varBy = "ID", varSrt = c("ID"))
+    expect_s3_class(dtaFrm, "data.frame")
+    expect_equal(dim(dtaFrm), c(250, 29))
+    expect_true(all(dtaFrm == dtaTmp[order(dtaTmp[, "ID"]), ]))
+    unlink(nmeInp)
+
+    nmeInp <- vector(mode = "character", length = 5)
+    dtaTmp <- jmvReadWrite::bfi_sample2
+    for (i in seq_along(nmeInp)) {
+        nmeInp[i] <- tempfile(fileext = ".rds")
+        strAtt <- attributes(dtaTmp[, "age"])
+        dtaTmp[, "age"] <- dtaTmp[sample(nrow(dtaTmp)), "age"]
+        attributes(dtaTmp[, "age"]) <- strAtt
+        saveRDS(dtaTmp, nmeInp[i])
+    }
+    dtaFrm <- merge_cols_omv(dtaInp = nmeInp, typMrg = "outer", varBy = "ID", varSrt = c("ID"))
+    dplClm <- gsub("age_1", "age", paste0("age_", seq(5)))
+    expect_s3_class(dtaFrm, "data.frame")
+    expect_equal(dim(dtaFrm), c(250, 33))
+    expect_true(all(dplClm %in% names(dtaFrm)))
+    expect_true(all(apply(sapply(dtaFrm[, dplClm], sort), 1, diff) == 0))
+    expect_true(all(diff(colMeans(dtaFrm[, dplClm])) == 0))
+    expect_true(all(sapply(dtaFrm[, dplClm], attributes) == "Age of the respondent (years)"))
 
     # test cases for code coverage ============================================================================================================================
     expect_error(merge_cols_omv(fleInp = nmeInp, typMrg = "outer", varBy = "ID"), regexp = "Please use the argument dtaInp instead of fleInp\\.")