Update read_annotations() with 11k content

- closes SomaLogic#85
stufield · Feb 17, 2024 · c66a387 · c66a387
1 parent 29922f1
commit c66a387
Show file tree

Hide file tree

Showing 4 changed files with 50 additions and 16 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -152,6 +152,7 @@ importFrom(magrittr,"%>%")
 importFrom(methods,new)
 importFrom(methods,setGeneric)
 importFrom(methods,validObject)
+importFrom(readxl,read_xlsx)
 importFrom(stats,IQR)
 importFrom(stats,mad)
 importFrom(stats,median)
@@ -164,6 +165,7 @@ importFrom(tibble,tibble)
 importFrom(tidyr,pivot_longer)
 importFrom(tidyr,separate)
 importFrom(tidyr,unite)
+importFrom(tools,md5sum)
 importFrom(utils,capture.output)
 importFrom(utils,head)
 importFrom(utils,read.delim)

diff --git a/R/read-annotations.R b/R/read-annotations.R
@@ -2,23 +2,30 @@
 #'
 #' @param file A path to an annotations file.
 #'   This is a sanctioned, versioned file provided by
-#'   SomaLogic Operating Co., Inc. and should be an unmodified
+#'   SomaLogic Operating Co., Inc. and should be an _unmodified_
 #'   `*.xlsx` file.
 #' @return A `tibble` containing analyte-specific annotations and
-#'   related information (e.g. lift/scale information), keyed on SomaLogic
-#'   `"SeqId"` which is a unique analyte identifier.
+#'   related (e.g. lift/bridging) information, keyed on SomaLogic
+#'   [SeqId], the unique SomaScan analyte identifier.
 #' @examples
 #' \dontrun{
-#' anno <- read_annotations("~/Desktop/SomaScan_V4.1_7K_Annotated_Content_20210616.xlsx")
+#'   # for example
+#'   file <- "~/Desktop/SomaScan_V4.1_7K_Annotated_Content_20210616.xlsx"
+#'   anno_tbl <- read_annotations(file)
 #' }
+#' @importFrom readxl read_xlsx
+#' @importFrom tools md5sum
 #' @export
 read_annotations <- function(file) {
 
-  ext <- gsub("(.*)[.]([^.]+)$", "\\2", file)
-  stopifnot(ext %in% c("xlsx", "json"))
+  if ( !(endsWith(file, "xlsx") || endsWith(file, "json")) ) {
+    stop("Annotations file must be either ", .value("*.xlsx"),
+         " or ", .value("*.json"), ".", call. = FALSE)
+  }
 
   ver <- getAnnoVer(file)
 
+  # cannot determine version
   if ( !grepl("^SL-[0-9]+-rev[0-9]+", ver) ) {
     stop(
       "Unable to determine annotations file version: ", .value(ver),
@@ -27,13 +34,25 @@ read_annotations <- function(file) {
     )
   }
 
+  # do not recognize version
   if ( !ver %in% names(ver_dict) ) {
     stop(
       "Unknown version of the annotations file: ", .value(ver), ".",
       call. = FALSE
     )
   }
 
+  md5_file <- strtrim(md5sum(file), 7L) |> unname()
+  md5_true <- strtrim(ver_dict[[ver]]$sha, 7L)
+
+  # file modified
+  if ( !identical(md5_file, md5_true) ) {
+    warning(
+      "Checksum mismatch.", .value(basename(file)), " may have been modified.",
+      call. = FALSE
+    )
+  }
+
   skip <- ver_dict[[ver]]$skip
   tbl  <- readxl::read_xlsx(file, sheet = "Annotations", skip = skip)
 
@@ -64,11 +83,13 @@ getAnnoVer <- function(file) {
 
 # version dictionary of key-value pairs
 # for file characteristics
+# the `sha` entries are MD5 checksums calculated with `tools::md5sum()`
 ver_dict <- list(
-  # dummy version; v4.0 -> v4.1
+  # The first 2 entries are for testing
+  # dummy version; 5k -> 7k
   "SL-99999999-rev99-1999-01" = list(col_serum  = "Serum Scalar v4.0 to v4.1",
                                      col_plasma = "Plasma Scalar v4.0 to v4.1"),
-  # test-anno.xlsx file; v4.1 -> v4.0
+  # test-anno.xlsx file; 7k -> 5k
   "SL-12345678-rev0-2021-01" = list(sha = "8a345fa621377d0bac40fc8c47f5579d",
                                     col_serum  = "Serum Scalar v4.1 to v4.0",
                                     col_plasma = "Plasma Scalar v4.1 to v4.0",
@@ -77,7 +98,7 @@ ver_dict <- list(
                                     skip = 8L,
                                     rows = 1,
                                     cols = 43),
-  # v4.1 -> v4.0
+  # 7k -> 5k
   "SL-00000571-rev2-2021-06" = list(sha = "5fa46834ed826eb1e8dba88698cf7a76",
                                     col_serum  = "Serum Scalar v4.1 to v4.0",
                                     col_plasma = "Plasma Scalar v4.1 to v4.0",
@@ -86,13 +107,22 @@ ver_dict <- list(
                                     skip = 8L,
                                     rows = 7596,
                                     cols = 43),
-  # v4.0 -> v4.1
+  # 5k -> 7k
   "SL-00000246-rev5-2021-06" = list(sha = "7d92666369d4e33364b11804f2d1f8ce",
                                     col_serum  = "Serum Scalar v4.0 to v4.1",
                                     col_plasma = "Plasma Scalar v4.0 to v4.1",
                                     which_serum  = 40,
                                     which_plasma = 42,
                                     skip = 8L,
                                     rows = 5284,
+                                    cols = 43),
+  # 11k
+  "SL-00000246-rev3-2024-06" = list(sha = "",
+                                    col_serum  = "",
+                                    col_plasma = "",
+                                    which_serum  = 40,
+                                    which_plasma = 42,
+                                    skip = 8L,
+                                    rows = 11083,
                                     cols = 43)
 )
diff --git a/man/read_annotations.Rd b/man/read_annotations.Rd
diff --git a/tests/testthat/test-read-annotations.R b/tests/testthat/test-read-annotations.R
@@ -1,11 +1,11 @@
 
 file <- test_path("testdata", "test-anno.xlsx")
 
-test_that("getAnnoVer() parses the version correctly", {
+test_that("`getAnnoVer()` parses the version correctly", {
   expect_equal(getAnnoVer(file), "SL-12345678-rev0-2021-01")
 })
 
-test_that("read_annotations() parses the annotations file correctly", {
+test_that("`read_annotations()` parses the annotations file correctly", {
   tbl <- read_annotations(file)
   expect_s3_class(tbl, "tbl_df")
   expect_equal(dim(tbl), c(1L, 43L))