Add new merge_clin() function

- new function wrapper to allow users to merge in clinical variables to `soma_adat` objects easily - closes SomaLogic#80
stufield · Mar 12, 2024 · efad5e7 · efad5e7
1 parent e433267
commit efad5e7
Show file tree

Hide file tree

Showing 6 changed files with 227 additions and 0 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -106,6 +106,7 @@ export(loadAdatsAsList)
 export(locateSeqId)
 export(matchSeqIds)
 export(meltExpressionSet)
+export(merge_clin)
 export(mutate)
 export(parseHeader)
 export(pivotExpressionSet)
@@ -172,6 +173,7 @@ importFrom(tidyr,unite)
 importFrom(tools,md5sum)
 importFrom(utils,capture.output)
 importFrom(utils,head)
+importFrom(utils,read.csv)
 importFrom(utils,read.delim)
 importFrom(utils,tail)
 importFrom(utils,write.table)
diff --git a/R/merge-clin.R b/R/merge-clin.R
@@ -0,0 +1,95 @@
+#' Merge Clinical Data into SomaScan
+#'
+#' Occasionally, additional clinical data is obtained _after_ samples
+#' have been submitted to SomaLogic, or even after 'SomaScan'
+#' results have been delivered.
+#' This requires the new clinical, i.e. non-proteomic, data to be
+#' merged with 'SomaScan' data into a "new" ADAT prior to analysis.
+#' [merge_clin()] easily merges such clinical variables into an
+#' existing `soma_adat` object and is a simple wrapper around [dplyr::left_join()].
+#'
+#' This functionality also exists as a command-line tool (R script) contained
+#' in `merge_clin.R` that lives in the `cli/merge` system file directory.
+#' Please see:
+#' \itemize{
+#'   \item `dir(system.file("cli/merge", package = "SomaDataIO"), full.names = TRUE)`
+#'   \item `vignette("cli-merge-tool", package = "SomaDataIO")`
+#' }
+#'
+#' @inheritParams params
+#' @param clin_data One of 2 options:
+#' \itemize{
+#'   \item A data frame containing clinical variables to merge into `x`, or
+#'   \item A path to a file, typically a `*.csv`,
+#'     containing clinical variables to merge into `x`.
+#' }
+#' @param by A character vector of variables to join by.
+#'   See [dplyr::left_join()] for more details.
+#' @param by_class If `clin_data` is a file path, a named character vector
+#'   mapping the join variable to its class. This ensures the `by-key` is
+#'   compatible for the join. For example, `c(SampleId = "character")`.
+#'   See [read.table()] for details about the `colClasses` argument, and
+#'   the examples below.
+#' @param ... Additional parameters passed to [dplyr::left_join()].
+#' @return An object of the same class as `x` with new clinical
+#'   variables merged.
+#' @author Stu Field
+#' @seealso [dplyr::left_join()]
+#' @examples
+#' # retrieve clinical data
+#' clin_file <- system.file("cli/merge", "meta.csv",
+#'                          package = "SomaDataIO",
+#'                          mustWork = TRUE)
+#' clin_file
+#'
+#' # view clinical data to be merged:
+#' # 1) `group`
+#' # 2) `newvar`
+#' clin_df <- read.csv(clin_file, colClasses = c(SampleId = "character"))
+#' clin_df
+#'
+#' # create mini-adat
+#' apts <- withr::with_seed(123, sample(getAnalytes(example_data), 2L))
+#' adat <- head(example_data, 9L) |>   # 9 x 2
+#'   dplyr::select(SampleId, all_of(apts))
+#'
+#' # merge clinical variables
+#' merged <- merge_clin(adat, clin_df, by = "SampleId")
+#' merged
+#'
+#' # Alternative syntax:
+#' #   1) pass file path
+#' #   2) merge on different variable names
+#' #   3) convert join type on-the-fly
+#' clin_file2 <- system.file("cli/merge", "meta2.csv",
+#'                           package = "SomaDataIO",
+#'                           mustWork = TRUE)
+#'
+#' id_type <- typeof(adat$SampleId)
+#' merged2 <- merge_clin(adat, clin_file2,                # file path
+#'                       by = c(SampleId = "ClinKey"),    # differently named keys
+#'                       by_class = c(ClinKey = id_type)) # match types
+#' merged2
+#' @importFrom utils read.csv
+#' @importFrom dplyr left_join
+#' @export
+merge_clin <- function(x, clin_data, by = NULL, by_class = NULL, ...) {
+
+  # only `soma_adat` objects may receive the new clinical variables
+  stopifnot("`x` must be a `soma_adat`."  = is.soma_adat(x))
+
+  if ( inherits(clin_data, "data.frame") ) {
+    # already in memory; `by_class` is only used when reading from file
+    clin_df <- clin_data
+  } else if ( is.character(clin_data) &&
+              length(clin_data) == 1L &&
+              file.exists(clin_data) ) {
+    # `colClasses = NULL` is not a documented value for `read.csv()`;
+    # when the caller does not supply `by_class`, fall back to the
+    # `read.csv()` default (`NA`) so column types are auto-detected
+    col_classes <- if ( is.null(by_class) ) NA else by_class
+    clin_df <- normalizePath(clin_data, mustWork = TRUE) |>
+      utils::read.csv(header = TRUE, colClasses = col_classes,
+                      row.names = NULL, stringsAsFactors = FALSE)
+  } else {
+    # anything else: wrong type, wrong length, `NA`, or a non-existent path
+    stop(
+      "Invalid `clin_data` argument: ", .value(class(clin_data)),
+      "\n`clin_data` must be either a `data.frame` or file path.", call. = FALSE)
+  }
+
+  # `x` is the left-hand table; `by` and `...` are forwarded unchanged
+  dplyr::left_join(x, clin_df, by = by, ...)
+}
diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -136,6 +136,7 @@ reference:
     - starts_with("getAnalyte")
     - getMeta
     - diffAdats
+    - merge_clin
 
   - title: Transform Between SomaScan Versions
     desc: >

diff --git a/man/merge_clin.Rd b/man/merge_clin.Rd
diff --git a/tests/testthat/test-merge-clin.R b/tests/testthat/test-merge-clin.R
@@ -0,0 +1,32 @@
+
+# Setup ----
+# clinical metadata file from the package's `cli/merge` system directory
+clin_file <- system.file(
+  "cli/merge", "meta.csv",
+  package  = "SomaDataIO",
+  mustWork = TRUE
+)
+
+# force `SampleId` to be read as character
+clin_df <- read.csv(
+  clin_file,
+  header     = TRUE,
+  colClasses = c(SampleId = "character")
+)
+
+# mini-adat: first 9 rows, `SampleId` plus 2 seeded-random analytes
+apts <- withr::with_seed(123, sample(getAnalytes(example_data), 2L))
+adat <- head(example_data, 9L) |>
+  dplyr::select(SampleId, all_of(apts))
+
+# The description previously duplicated the error test's name although
+# this block checks a successful merge.
+test_that("merge_clin() merges clinical variables from a data frame", {
+  merged <- merge_clin(adat, clin_df, by = "SampleId")
+  # every original column survives the join
+  expect_true(all(names(adat) %in% names(merged)))
+  # exactly the two clinical variables are added, in this order
+  expect_equal(setdiff(names(merged), names(adat)), c("group", "newvar"))
+  # rows are unchanged (9); columns grow from 3 to 5
+  expect_equal(dim(merged), c(9L, 5L))
+  expect_equal(sum(is.na(merged)), 8L)
+  # compared with `expect_equal()`'s default numeric tolerance
+  expect_equal(sum(merged$newvar, na.rm = TRUE), -1.779255)
+})
+
+test_that("merge_clin() generates same result on `clin_data` argument", {
+  # a data frame and a file path to the same data must be interchangeable
+  from_df   <- merge_clin(adat, clin_df, by = "SampleId")
+  from_file <- merge_clin(adat, clin_file, by = "SampleId",
+                          by_class = c(SampleId = "character"))
+  expect_equal(from_df, from_file)
+})
+
+test_that("merge_clin() errors on bad `clin_data` argument", {
+  # neither a data frame nor a single path to an existing file
+  bad_inputs <- list(letters, 1:10L, "Samples", NA, NA_character_)
+  for ( bad in bad_inputs ) {
+    expect_error( merge_clin(adat, bad) )
+  }
+  # `x` is a plain data frame here and `clin_data` is not supplied
+  expect_error( merge_clin(data.frame(adat)) )
+})
diff --git a/vignettes/cli-merge-tool.Rmd b/vignettes/cli-merge-tool.Rmd
@@ -39,6 +39,9 @@ in the `cli/merge/` directory, which allows one to
 generate an updated `*.adat` file via the command-line without
 having to launch an integrated development environment ("IDE"), e.g. `RStudio`.
 
+To use `SomaDataIO`'s exported functionality from _within_ an R session,
+please see `merge_clin()`.
+
 
 ----------------