Update read_annotations() with 11k content

- closes SomaLogic#85
stufield · Feb 22, 2024 · 3090f8e · 3090f8e
1 parent 2fe5009
commit 3090f8e
Show file tree

Hide file tree

Showing 4 changed files with 93 additions and 19 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -153,6 +153,7 @@ importFrom(magrittr,"%>%")
 importFrom(methods,new)
 importFrom(methods,setGeneric)
 importFrom(methods,validObject)
+importFrom(readxl,read_xlsx)
 importFrom(stats,IQR)
 importFrom(stats,mad)
 importFrom(stats,median)
@@ -165,6 +166,7 @@ importFrom(tibble,tibble)
 importFrom(tidyr,pivot_longer)
 importFrom(tidyr,separate)
 importFrom(tidyr,unite)
+importFrom(tools,md5sum)
 importFrom(utils,capture.output)
 importFrom(utils,head)
 importFrom(utils,read.delim)

diff --git a/R/read-annotations.R b/R/read-annotations.R
@@ -2,23 +2,30 @@
 #'
 #' @param file A path to an annotations file location.
 #'   This is a sanctioned, versioned file provided by
-#'   SomaLogic Operating Co., Inc. and should be an unmodified
+#'   SomaLogic Operating Co., Inc. and should be an _unmodified_
 #'   `*.xlsx` file.
 #' @return A `tibble` containing analyte-specific annotations and
-#'   related information (e.g. lift/scale information), keyed on SomaLogic
-#'   `"SeqId"` which is a unique analyte identifier.
+#'   related (e.g. lift/bridging) information, keyed on SomaLogic
+#'   [SeqId], the unique SomaScan analyte identifier.
 #' @examples
 #' \dontrun{
-#' anno <- read_annotations("~/Desktop/SomaScan_V4.1_7K_Annotated_Content_20210616.xlsx")
+#'   # for example
+#'   file <- "~/Desktop/SomaScan_V4.1_7K_Annotated_Content_20210616.xlsx"
+#'   anno_tbl <- read_annotations(file)
 #' }
+#' @importFrom readxl read_xlsx
+#' @importFrom tools md5sum
 #' @export
 read_annotations <- function(file) {
 
-  ext <- gsub("(.*)[.]([^.]+)$", "\\2", file)
-  stopifnot(ext %in% c("xlsx", "json"))
+  if ( !(endsWith(file, ".xlsx") || endsWith(file, ".json")) ) {
+    stop("Annotations file must be either ", .value("*.xlsx"),
+         " or ", .value("*.json"), ".", call. = FALSE)
+  }  # suffix includes the dot so names like "fooxlsx" are rejected
 
   ver <- getAnnoVer(file)
 
+  # cannot determine version
   if ( !grepl("^SL-[0-9]+-rev[0-9]+", ver) ) {
     stop(
       "Unable to determine annotations file version: ", .value(ver),
@@ -27,13 +34,25 @@ read_annotations <- function(file) {
     )
   }
 
+  # do not recognize version
   if ( !ver %in% names(ver_dict) ) {
     stop(
       "Unknown version of the annotations file: ", .value(ver), ".",
       call. = FALSE
     )
   }
 
+  md5_file <- strtrim(md5sum(file), 7L) |> unname() # first 7 chars, file MD5
+  md5_true <- strtrim(ver_dict[[ver]]$sha, 7L)      # first 7 chars, recorded
+
+  # checksums differ: warn (do not stop) that the file may have been modified
+  if ( !identical(md5_file, md5_true) ) {
+    warning(
+      "Checksum mismatch. ", .value(basename(file)), " may have been modified.",
+      call. = FALSE
+    )
+  }
+
   skip <- ver_dict[[ver]]$skip
   tbl  <- readxl::read_xlsx(file, sheet = "Annotations", skip = skip)
 
@@ -64,11 +83,13 @@ getAnnoVer <- function(file) {
 
 # version dictionary of key-value pairs
 # for file characteristics
+# `sha` values are MD5 checksums calculated with `tools::md5sum()`
 ver_dict <- list(
-  # dummy version; v4.0 -> v4.1
+  # The first 2 are for testing
+  # dummy version; 5k -> 7k
   "SL-99999999-rev99-1999-01" = list(col_serum  = "Serum Scalar v4.0 to v4.1",
                                      col_plasma = "Plasma Scalar v4.0 to v4.1"),
-  # test-anno.xlsx file; v4.1 -> v4.0
+  # test-anno.xlsx file; 7k -> 5k
   "SL-12345678-rev0-2021-01" = list(sha = "8a345fa621377d0bac40fc8c47f5579d",
                                     col_serum  = "Serum Scalar v4.1 to v4.0",
                                     col_plasma = "Plasma Scalar v4.1 to v4.0",
@@ -77,22 +98,60 @@ ver_dict <- list(
                                     skip = 8L,
                                     rows = 1,
                                     cols = 43),
-  # v4.1 -> v4.0
+  # 7k -> 5k
   "SL-00000571-rev2-2021-06" = list(sha = "5fa46834ed826eb1e8dba88698cf7a76",
                                     col_serum  = "Serum Scalar v4.1 to v4.0",
                                     col_plasma = "Plasma Scalar v4.1 to v4.0",
                                     which_serum  = 40,
                                     which_plasma = 42,
                                     skip = 8L,
-                                    rows = 7596,
+                                    rows = 7605,
                                     cols = 43),
-  # v4.0 -> v4.1
+  # 5k -> 7k
   "SL-00000246-rev5-2021-06" = list(sha = "7d92666369d4e33364b11804f2d1f8ce",
                                     col_serum  = "Serum Scalar v4.0 to v4.1",
                                     col_plasma = "Plasma Scalar v4.0 to v4.1",
                                     which_serum  = 40,
                                     which_plasma = 42,
                                     skip = 8L,
-                                    rows = 5284,
-                                    cols = 43)
+                                    rows = 5293,
+                                    cols = 43),
+  # 11k
+  # SomaScan_V5.0_11K_Annotated_Content_20240214
+  "SL-906-rev3-2024-02" = list(sha = "44352af60dc0152d65f3dad1f0c54abc",
+                               col_serum  = c("Serum Scalar v5.0 11K to v4.1 7K",
+                                              "Serum Scalar v5.0 11K to v4.0 5K"),
+                               col_plasma = c("Plasma Scalar v5.0 11K to v4.1 7K",
+                                              "Plasma Scalar v5.0 11K to v4.0 5K"),
+                               which_serum  = c(43, 47),
+                               which_plasma = c(45, 49),
+                               skip = 8L,
+                               rows = 11092,
+                               cols = 51),
+
+  # 7k
+  # SomaScan_V4.1_7K_Annotated_Content_20240214
+  "SL-00000571-rev7-2024-02" = list(sha = "ab6ecbce3565c2c6049c0150f653f51b",
+                                    col_serum  = c("Serum Scalar v4.1 7K to v4.0 5K",
+                                                   "Serum Scalar v4.1 7K to v5.0 11K"),
+                                    col_plasma = c("Plasma Scalar v4.1 7K to v4.0 5K",
+                                                   "Plasma Scalar v4.1 7K to v5.0 11K"),
+                                    which_serum  = c(43, 47),
+                                    which_plasma = c(45, 49),
+                                    skip = 8L,
+                                    rows = 7605,
+                                    cols = 50)
+  # 5k
+  # SomaScan_V4.0_5K_Annotated_Content_20240216.xlsx
+  # (may not be released)
+  # "SL-00000246-revnotreleased-2024-02" = list(sha = "xxxxxx",
+  #                                   col_serum  = c("Serum Scalar v4.0 5K to v4.1 7K",
+  #                                                  "Serum Scalar v4.0 5K to v5.0 11K"),
+  #                                   col_plasma = c("Plasma Scalar v4.0 5K to v4.1 7K",
+  #                                                  "Plasma Scalar v4.0 5K to v5.0 11K"),
+  #                                   which_serum  = c(43, 47),
+  #                                   which_plasma = c(45, 49),
+  #                                   skip = 8L,
+  #                                   rows = 5293,
+  #                                   cols = 50)
 )
diff --git a/man/read_annotations.Rd b/man/read_annotations.Rd
diff --git a/tests/testthat/test-read-annotations.R b/tests/testthat/test-read-annotations.R
@@ -1,11 +1,22 @@
 
 file <- test_path("testdata", "test-anno.xlsx")
 
-test_that("getAnnoVer() parses the version correctly", {
+test_that("`ver_dict` is updated and correct", {
+  expect_length(ver_dict, 6L)   # 2 testing entries + 4 annotation versions
+  expect_named(ver_dict,        # keys are compared in this exact order
+               c("SL-99999999-rev99-1999-01",   # dummy version (testing)
+                 "SL-12345678-rev0-2021-01",    # test-anno.xlsx (testing)
+                 "SL-00000571-rev2-2021-06",    # 7k -> 5k
+                 "SL-00000246-rev5-2021-06",    # 5k -> 7k
+                 "SL-906-rev3-2024-02",         # 11k
+                 "SL-00000571-rev7-2024-02"))   # 7k
+})
+
+test_that("`getAnnoVer()` parses the version correctly", {
   expect_equal(getAnnoVer(file), "SL-12345678-rev0-2021-01")
 })
 
-test_that("read_annotations() parses the annotations file correctly", {
+test_that("`read_annotations()` parses the annotations file correctly", {
   tbl <- read_annotations(file)
   expect_s3_class(tbl, "tbl_df")
   expect_equal(dim(tbl), c(1L, 43L))