From 5263a2f268bbfcdd3ec767a27ea5364757f01fcf Mon Sep 17 00:00:00 2001
From: Sebastian Jentschke <Sebastian.Jentschke@gmail.com>
Date: Fri, 14 Jul 2023 21:51:24 +0200
Subject: [PATCH] Preparation for CRAN-submission (v0.3.8)

---
 NEWS.md                             |  2 +-
 R/long2wide_omv.R                   | 17 +++++++++--------
 README.md                           |  4 ++--
 man/long2wide_omv.Rd                |  3 +++
 tests/testthat/test-long2wide_omv.R |  2 +-
 5 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 1e38e75..988c8a3 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -4,7 +4,7 @@
 * adjustments to make `jmvReadWrite` more suited for using it together with the jamovi-module `jTransform` (https://github.com/sjentsch/jTransform)
 * implement reading both data frames or file names for merge_cols_omv and merge_rows_omv (incl. phasing out fleInp as parameter for the helper functions: it now throws an error to prevent using it)
 * initial handling of weights in `read_omv` and `write_omv`
-* correction / bug fix for `long2wide_omv` (the original bug led to incorrect transformations when transforming complex data sets)
+* improvements and corrections for `long2wide_omv` (added aggregation of multiple occurrences, either calculating the mean or taking the first occurrence, and fixed a bug that led to incorrect naming of variables when transforming complex data sets)
 * improved unit tests (implementation of regular expressions for expect_error and expect_warning, bug-fixes and additional coverage)
 * improvements to setting attributes (`setAtt`, e.g., from the metadata to the data frame and vice versa)
 * improved handling of factors with numerical values (measureType Nominal or Ordinal and dataType Integer)
diff --git a/R/long2wide_omv.R b/R/long2wide_omv.R
index 6f00279..4fe8183 100644
--- a/R/long2wide_omv.R
+++ b/R/long2wide_omv.R
@@ -8,6 +8,7 @@
 #' @param varTgt Names of one or more variables to be transformed / reshaped (other variables are excluded, if empty(c()) all variables except varTme, varID and varExc are included; default: c())
 #' @param varSep Separator character when concatenating the fixed and time-varying part of the variable name ("VAR1_1", "VAR1_2"; default: "_")
 #' @param varOrd How variables / columns are organized: for "times" (default) the steps of the time varying variable are adjacent, for "vars" the steps of the original columns in the long dataset
+#' @param varAgg How multiple occurrences of particular combinations of time varying variables are aggregated: either "mean" (calculate the mean over occurrences), or "first" (take the first occurrence)
 #' @param varSrt Variable(s) that are used to sort the data frame (see Details; if empty, the order returned from reshape is kept; default: c())
 #' @param usePkg Name of the package: "foreign" or "haven" that shall be used to read SPSS, Stata and SAS files; "foreign" is the default (it comes with base R), but "haven" is newer and more comprehensive
 #' @param selSet Name of the data set that is to be selected from the workspace (only applies when reading .RData-files)
@@ -105,7 +106,7 @@ long2wide_omv <- function(dtaInp = NULL, fleOut = "", varID = "ID", varTme = c()
     # [b] store the original variable labels, the original time-varying / target variable,
     # and an empty vector for storing labels
     lstLbl <- list(orgLbl = sapply(dtaFrm, attr, "jmv-desc"), orgTgt = varTgt)
-    
+
     # [c] there might be several occurrences for each combination of varID and varTme; aggregate them
     dtaFrm <- aggDta(dtaFrm = dtaFrm, varAgg = varAgg, varID = varID, varTme = varTme, varExc = varExc, varTgt = varTgt)
 
@@ -148,16 +149,16 @@ aggDta <- function(dtaFrm = NULL, varAgg = "", varID = c(), varTme = c(), varExc
     # if there exists only one occurence of each possible combination of the variables in varID and
     # varTme, the data don't need to be aggregated, just return the data frame with the relevant
     # columns selected
-    if (!any(aggregate(dtaFrm[, varTgt[1]], by = dtaFrm[, c(varID, varTme)], FUN = length)[["x"]] > 1)) {
+    if (!any(stats::aggregate(dtaFrm[, varTgt[1]], by = dtaFrm[, c(varID, varTme)], FUN = length)[["x"]] > 1)) {
         dtaFrm[, c(varID, varTme, varExc, varTgt)]
     # otherwise (with more than one occurence), values are aggregate at each possible combination of the
     # variables in varID and varTme
     } else if (varAgg == "first") {
         # [1] if "first" is chosen as aggregation function, the first occurence at each step is returned
-        aggregate(x = dtaFrm[, c(varTgt, varExc), drop = FALSE], by = dtaFrm[, c(varID, varTme), drop = FALSE], FUN = "[[", 1)
+        stats::aggregate(x = dtaFrm[, c(varTgt, varExc), drop = FALSE], by = dtaFrm[, c(varID, varTme), drop = FALSE], FUN = "[[", 1)
     } else if (varAgg == "mean")  {
         # [2] if "mean" is chosen as aggregation function, it becomes (a little) more complicated
-        # [a] the target variables (for which the mean is calculated) should be numeric 
+        # [a] the target variables (for which the mean is calculated) should be numeric
         if (!all(sapply(dtaFrm[, varTgt], is.numeric))) {
             stop(paste("In order to calculate the mean when aggregating the data, all target variables (varTgt) need to be numeric. Use varAgg = \"first\" instead",
                        "(to use the first occuring value) or convert the target variables to numeric."))
@@ -170,12 +171,12 @@ aggDta <- function(dtaFrm = NULL, varAgg = "", varID = c(), varTme = c(), varExc
         # participant [ID]); finally the results from the two aggregate-functions are merged again
         # to return the complete data set
         if (length(varExc) > 0) {
-            merge(aggregate(x = dtaFrm[, c(varTgt), drop = FALSE], by = dtaFrm[, c(varID, varTme), drop = FALSE], FUN = mean),
-                  aggregate(x = dtaFrm[, c(varExc), drop = FALSE], by = dtaFrm[, c(varID, varTme), drop = FALSE], FUN = "[[", 1))
+            merge(stats::aggregate(x = dtaFrm[, c(varTgt), drop = FALSE], by = dtaFrm[, c(varID, varTme), drop = FALSE], FUN = mean),
+                  stats::aggregate(x = dtaFrm[, c(varExc), drop = FALSE], by = dtaFrm[, c(varID, varTme), drop = FALSE], FUN = "[[", 1))
         # [c] if there is no “excluded” variable, the mean is calculated for the target variables
         # at each possible combination of the variables varID and varTme
         } else {
-            aggregate(x = dtaFrm[, c(varTgt), drop = FALSE], by = dtaFrm[, c(varID, varTme), drop = FALSE], FUN = mean)
+            stats::aggregate(x = dtaFrm[, c(varTgt), drop = FALSE], by = dtaFrm[, c(varID, varTme), drop = FALSE], FUN = mean)
         }
     }
 }
@@ -191,7 +192,7 @@ rstLbl <- function(dtaFrm = NULL, lstLbl = list(), varTgt = c(), varTme = c(), v
                     attr(dtaFrm[[varTgt[i]]], "jmv-desc") <-
                       sprintf("%s (%s)", lstLbl$orgLbl[[crrNme]], paste0(apply(rbind(varTme, splTgt[[i]][-1]), 2, paste0, collapse = ": "), collapse = ", "))
                 }
-            }        
+            }
         }
     }
 
diff --git a/README.md b/README.md
index 5909ac5..5d2de10 100644
--- a/README.md
+++ b/README.md
@@ -198,9 +198,9 @@ list.files(".", "Trial.omv")
 #> [1] "Trial.omv"
 file.info("Trial.omv")
 #>           size isdir mode               mtime               ctime
-#> Trial.omv 1573 FALSE  664 2023-07-13 17:04:11 2023-07-13 17:04:11
+#> Trial.omv 1573 FALSE  664 2023-07-14 21:45:22 2023-07-14 21:45:22
 #>                         atime  uid  gid    uname   grname
-#> Trial.omv 2023-07-13 17:04:11 1000 1000 sjentsch sjentsch
+#> Trial.omv 2023-07-14 21:45:22 1000 1000 sjentsch sjentsch
 unlink("Trial.omv")
 ```
 
diff --git a/man/long2wide_omv.Rd b/man/long2wide_omv.Rd
index aa87b7a..09a0861 100644
--- a/man/long2wide_omv.Rd
+++ b/man/long2wide_omv.Rd
@@ -13,6 +13,7 @@ long2wide_omv(
   varTgt = c(),
   varSep = "_",
   varOrd = c("times", "vars"),
+  varAgg = c("mean", "first"),
   varSrt = c(),
   usePkg = c("foreign", "haven"),
   selSet = "",
@@ -36,6 +37,8 @@ long2wide_omv(
 
 \item{varOrd}{How variables / columns are organized: for "times" (default) the steps of the time varying variable are adjacent, for "vars" the steps of the original columns in the long dataset}
 
+\item{varAgg}{How multiple occurrences of particular combinations of time varying variables are aggregated: either "mean" (calculate the mean over occurrences), or "first" (take the first occurrence)}
+
 \item{varSrt}{Variable(s) that are used to sort the data frame (see Details; if empty, the order returned from reshape is kept; default: c())}
 
 \item{usePkg}{Name of the package: "foreign" or "haven" that shall be used to read SPSS, Stata and SAS files; "foreign" is the default (it comes with base R), but "haven" is newer and more comprehensive}
diff --git a/tests/testthat/test-long2wide_omv.R b/tests/testthat/test-long2wide_omv.R
index 992ab46..3286f1d 100644
--- a/tests/testthat/test-long2wide_omv.R
+++ b/tests/testthat/test-long2wide_omv.R
@@ -115,7 +115,7 @@ test_that("long2wide_omv works", {
     expect_equal(unname(as.matrix(df4Chk[3:8])), cbind(matrix(avgTmp[, "rspCrr"], ncol = 3), matrix(avgTmp[, "rspTme"], ncol = 3)))
     expect_equal(unname(unlist(sapply(df4Chk, attr, "jmv-desc"))), c(unname(unlist(lblTmp[1:2])), sprintf("%s (cond: %s)",
       rep(unname(unlist(lblTmp[6:7])), each = 3), rep(c("cong", "incong", "neutral"), times = 2))))
-   
+
     dtaTmp$rspCrr <- as.factor(dtaTmp$rspCrr)
     expect_error(long2wide_omv(dtaInp = dtaTmp, varID = "ID", varTme = "cond", varTgt = c("rspCrr", "rspTme"), varExc = "sex"),
       regexp = "^In order to calculate the mean when aggregating the data, all target variables \\(varTgt\\) need to be numeric\\.")