From 5263a2f268bbfcdd3ec767a27ea5364757f01fcf Mon Sep 17 00:00:00 2001 From: Sebastian Jentschke Date: Fri, 14 Jul 2023 21:51:24 +0200 Subject: [PATCH] Preparation for CRAN-submission (v0.3.8) --- NEWS.md | 2 +- R/long2wide_omv.R | 17 +++++++++-------- README.md | 4 ++-- man/long2wide_omv.Rd | 3 +++ tests/testthat/test-long2wide_omv.R | 2 +- 5 files changed, 16 insertions(+), 12 deletions(-) diff --git a/NEWS.md b/NEWS.md index 1e38e75..988c8a3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,7 +4,7 @@ * adjustments to make `jmvReadWrite` more suited for using it together with the jamovi-module `jTransform` (https://github.com/sjentsch/jTransform) * implement reading both data frames or file names for merge_cols_omv and merge_rows_omv (incl. phasing out fleInp as parameter for the helper functions: it now throws an error to prevent using it) * initial handling of weights in `read_omv` and `write_omv` -* correction / bug fix for `long2wide_omv` (the original bug led to incorrect transformations when transforming complex data sets) +* improvements and corrections for `long2wide_omv` (added aggregation, mean or take first, and fixed a bug that led to incorrect naming of variables when transforming complex data sets) * improved unit tests (implementation of regular expressions for expect_error and expect_warning, bug-fixes and additional coverage) * improvements to setting attributes (`setAtt`, e.g., from the metadata to the data frame and vice versa) * improved handling of factors with numerical values (measureType Nominal or Ordinal and dataType Integer) diff --git a/R/long2wide_omv.R b/R/long2wide_omv.R index 6f00279..4fe8183 100644 --- a/R/long2wide_omv.R +++ b/R/long2wide_omv.R @@ -8,6 +8,7 @@ #' @param varTgt Names of one or more variables to be transformed / reshaped (other variables are excluded, if empty(c()) all variables except varTme, varID and varExc are included; default: c()) #' @param varSep Separator character when concatenating the fixed and time-varying part of the variable name ("VAR1_1", "VAR1_2"; default: "_") #' @param varOrd How variables / columns are organized: for "times" (default) the steps of the time varying variable are adjacent, for "vars" the steps of the original columns in the long dataset +#' @param varAgg How multiple occurrences of particular combinations of time varying variables are aggregated: either "mean" (calculate the mean over occurrences), or "first" (take the first occurrence) #' @param varSrt Variable(s) that are used to sort the data frame (see Details; if empty, the order returned from reshape is kept; default: c()) #' @param usePkg Name of the package: "foreign" or "haven" that shall be used to read SPSS, Stata and SAS files; "foreign" is the default (it comes with base R), but "haven" is newer and more comprehensive #' @param selSet Name of the data set that is to be selected from the workspace (only applies when reading .RData-files) @@ -105,7 +106,7 @@ long2wide_omv <- function(dtaInp = NULL, fleOut = "", varID = "ID", varTme = c() # [b] store the original variable labels, the original time-varying / target variable, # and an empty vector for storing labels lstLbl <- list(orgLbl = sapply(dtaFrm, attr, "jmv-desc"), orgTgt = varTgt) - + # [c] there might be several occurrences for each combination of varID and varTme; aggregate them dtaFrm <- aggDta(dtaFrm = dtaFrm, varAgg = varAgg, varID = varID, varTme = varTme, varExc = varExc, varTgt = varTgt) @@ -148,16 +149,16 @@ aggDta <- function(dtaFrm = NULL, varAgg = "", varID = c(), varTme = c(), varExc # if there exists only one occurence of each possible combination of the variables in varID and # varTme, the data don't need to be aggregated, just return the data frame with the relevant # columns selected - if (!any(aggregate(dtaFrm[, varTgt[1]], by = dtaFrm[, c(varID, varTme)], FUN = length)[["x"]] > 1)) { + if (!any(stats::aggregate(dtaFrm[, varTgt[1]], by = dtaFrm[, c(varID, varTme)], FUN = length)[["x"]] > 1)) { dtaFrm[, c(varID, varTme, varExc, varTgt)] # otherwise (with more than one occurence), values are aggregate at each possible combination of the # variables in varID and varTme } else if (varAgg == "first") { # [1] if "first" is chosen as aggregation function, the first occurence at each step is returned - aggregate(x = dtaFrm[, c(varTgt, varExc), drop = FALSE], by = dtaFrm[, c(varID, varTme), drop = FALSE], FUN = "[[", 1) + stats::aggregate(x = dtaFrm[, c(varTgt, varExc), drop = FALSE], by = dtaFrm[, c(varID, varTme), drop = FALSE], FUN = "[[", 1) } else if (varAgg == "mean") { # [2] if "mean" is chosen as aggregation function, it becomes (a little) more complicated - # [a] the target variables (for which the mean is calculated) should be numeric + # [a] the target variables (for which the mean is calculated) should be numeric if (!all(sapply(dtaFrm[, varTgt], is.numeric))) { stop(paste("In order to calculate the mean when aggregating the data, all target variables (varTgt) need to be numeric. Use varAgg = \"first\" instead", "(to use the first occuring value) or convert the target variables to numeric.")) @@ -170,12 +171,12 @@ aggDta <- function(dtaFrm = NULL, varAgg = "", varID = c(), varTme = c(), varExc # participant [ID]); finally the results from the two aggregate-functions are merged again # to return the complete data set if (length(varExc) > 0) { - merge(aggregate(x = dtaFrm[, c(varTgt), drop = FALSE], by = dtaFrm[, c(varID, varTme), drop = FALSE], FUN = mean), - aggregate(x = dtaFrm[, c(varExc), drop = FALSE], by = dtaFrm[, c(varID, varTme), drop = FALSE], FUN = "[[", 1)) + merge(stats::aggregate(x = dtaFrm[, c(varTgt), drop = FALSE], by = dtaFrm[, c(varID, varTme), drop = FALSE], FUN = mean), + stats::aggregate(x = dtaFrm[, c(varExc), drop = FALSE], by = dtaFrm[, c(varID, varTme), drop = FALSE], FUN = "[[", 1)) # [c] if there is no “excluded” variable, the mean is calculated for the target variables # at each possible combination of the variables varID and varTme } else { - aggregate(x = dtaFrm[, c(varTgt), drop = FALSE], by = dtaFrm[, c(varID, varTme), drop = FALSE], FUN = mean) + stats::aggregate(x = dtaFrm[, c(varTgt), drop = FALSE], by = dtaFrm[, c(varID, varTme), drop = FALSE], FUN = mean) } } } @@ -191,7 +192,7 @@ rstLbl <- function(dtaFrm = NULL, lstLbl = list(), varTgt = c(), varTme = c(), v attr(dtaFrm[[varTgt[i]]], "jmv-desc") <- sprintf("%s (%s)", lstLbl$orgLbl[[crrNme]], paste0(apply(rbind(varTme, splTgt[[i]][-1]), 2, paste0, collapse = ": "), collapse = ", ")) } - } + } } } diff --git a/README.md b/README.md index 5909ac5..5d2de10 100644 --- a/README.md +++ b/README.md @@ -198,9 +198,9 @@ list.files(".", "Trial.omv") #> [1] "Trial.omv" file.info("Trial.omv") #> size isdir mode mtime ctime -#> Trial.omv 1573 FALSE 664 2023-07-13 17:04:11 2023-07-13 17:04:11 +#> Trial.omv 1573 FALSE 664 2023-07-14 21:45:22 2023-07-14 21:45:22 #> atime uid gid uname grname -#> Trial.omv 2023-07-13 17:04:11 1000 1000 sjentsch sjentsch +#> Trial.omv 2023-07-14 21:45:22 1000 1000 sjentsch sjentsch unlink("Trial.omv") ``` diff --git a/man/long2wide_omv.Rd b/man/long2wide_omv.Rd index aa87b7a..09a0861 100644 --- a/man/long2wide_omv.Rd +++ b/man/long2wide_omv.Rd @@ -13,6 +13,7 @@ long2wide_omv( varTgt = c(), varSep = "_", varOrd = c("times", "vars"), + varAgg = c("mean", "first"), varSrt = c(), usePkg = c("foreign", "haven"), selSet = "", @@ -36,6 +37,8 @@ long2wide_omv( \item{varOrd}{How variables / columns are organized: for "times" (default) the steps of the time varying variable are adjacent, for "vars" the steps of the original columns in the long dataset} +\item{varAgg}{How multiple occurrences of particular combinations of time varying variables are aggregated: either "mean" (calculate the mean over occurrences), or "first" (take the first occurrence)} + \item{varSrt}{Variable(s) that are used to sort the data frame (see Details; if empty, the order returned from reshape is kept; default: c())} \item{usePkg}{Name of the package: "foreign" or "haven" that shall be used to read SPSS, Stata and SAS files; "foreign" is the default (it comes with base R), but "haven" is newer and more comprehensive} diff --git a/tests/testthat/test-long2wide_omv.R b/tests/testthat/test-long2wide_omv.R index 992ab46..3286f1d 100644 --- a/tests/testthat/test-long2wide_omv.R +++ b/tests/testthat/test-long2wide_omv.R @@ -115,7 +115,7 @@ test_that("long2wide_omv works", { expect_equal(unname(as.matrix(df4Chk[3:8])), cbind(matrix(avgTmp[, "rspCrr"], ncol = 3), matrix(avgTmp[, "rspTme"], ncol = 3))) expect_equal(unname(unlist(sapply(df4Chk, attr, "jmv-desc"))), c(unname(unlist(lblTmp[1:2])), sprintf("%s (cond: %s)", rep(unname(unlist(lblTmp[6:7])), each = 3), rep(c("cong", "incong", "neutral"), times = 2)))) - + dtaTmp$rspCrr <- as.factor(dtaTmp$rspCrr) expect_error(long2wide_omv(dtaInp = dtaTmp, varID = "ID", varTme = "cond", varTgt = c("rspCrr", "rspTme"), varExc = "sex"), regexp = "^In order to calculate the mean when aggregating the data, all target variables \\(varTgt\\) need to be numeric\\.")