Skip to content

Commit

Permalink
Update read_annotations() with 11k content
Browse files Browse the repository at this point in the history
- closes SomaLogic#85
  • Loading branch information
stufield committed Feb 17, 2024
1 parent 29922f1 commit c66a387
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 16 deletions.
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ importFrom(magrittr,"%>%")
importFrom(methods,new)
importFrom(methods,setGeneric)
importFrom(methods,validObject)
importFrom(readxl,read_xlsx)
importFrom(stats,IQR)
importFrom(stats,mad)
importFrom(stats,median)
Expand All @@ -164,6 +165,7 @@ importFrom(tibble,tibble)
importFrom(tidyr,pivot_longer)
importFrom(tidyr,separate)
importFrom(tidyr,unite)
importFrom(tools,md5sum)
importFrom(utils,capture.output)
importFrom(utils,head)
importFrom(utils,read.delim)
Expand Down
50 changes: 40 additions & 10 deletions R/read-annotations.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,30 @@
#'
#' @param file A path to an annotations file location.
#' This is a sanctioned, versioned file provided by
#' SomaLogic Operating Co., Inc. and should be an unmodified
#' SomaLogic Operating Co., Inc. and should be an _unmodified_
#' `*.xlsx` file.
#' @return A `tibble` containing analyte-specific annotations and
#' related information (e.g. lift/scale information), keyed on SomaLogic
#' `"SeqId"` which is a unique analyte identifier.
#' related (e.g. lift/bridging) information, keyed on SomaLogic
#' [SeqId], the unique SomaScan analyte identifier.
#' @examples
#' \dontrun{
#' anno <- read_annotations("~/Desktop/SomaScan_V4.1_7K_Annotated_Content_20210616.xlsx")
#' # for example
#' file <- "~/Desktop/SomaScan_V4.1_7K_Annotated_Content_20210616.xlsx"
#' anno_tbl <- read_annotations(file)
#' }
#' @importFrom readxl read_xlsx
#' @importFrom tools md5sum
#' @export
read_annotations <- function(file) {

ext <- gsub("(.*)[.]([^.]+)$", "\\2", file)
stopifnot(ext %in% c("xlsx", "json"))
if ( !(endsWith(file, "xlsx") || endsWith(file, "json")) ) {
stop("Annotations file must be either ", .value("*.xlsx"),
" or ", .value("*.json"), ".", call. = FALSE)
}

ver <- getAnnoVer(file)

# cannot determine version
if ( !grepl("^SL-[0-9]+-rev[0-9]+", ver) ) {
stop(
"Unable to determine annotations file version: ", .value(ver),
Expand All @@ -27,13 +34,25 @@ read_annotations <- function(file) {
)
}

# do not recognize version
if ( !ver %in% names(ver_dict) ) {
stop(
"Unknown version of the annotations file: ", .value(ver), ".",
call. = FALSE
)
}

md5_file <- strtrim(md5sum(file), 7L) |> unname()
md5_true <- strtrim(ver_dict[[ver]]$sha, 7L)

# file modified
# Warn (rather than error) when the truncated MD5 checksum of the supplied
# file differs from the one recorded for this version in `ver_dict`.
# Note the trailing space after "mismatch." so the file name does not run
# into the preceding sentence in the emitted message.
if ( !identical(md5_file, md5_true) ) {
  warning(
    "Checksum mismatch. ", .value(basename(file)), " may have been modified.",
    call. = FALSE
  )
}

skip <- ver_dict[[ver]]$skip
tbl <- readxl::read_xlsx(file, sheet = "Annotations", skip = skip)

Expand Down Expand Up @@ -64,11 +83,13 @@ getAnnoVer <- function(file) {

# version dictionary of key-value pairs
# for file characteristics
# checksums (the `sha` entries) are MD5 digests calculated with `tools::md5sum()`
ver_dict <- list(
# dummy version; v4.0 -> v4.1
# The first 2 are for testing
# dummy version; 5k -> 7k
"SL-99999999-rev99-1999-01" = list(col_serum = "Serum Scalar v4.0 to v4.1",
col_plasma = "Plasma Scalar v4.0 to v4.1"),
# test-anno.xlsx file; v4.1 -> v4.0
# test-anno.xlsx file; 7k -> 5k
"SL-12345678-rev0-2021-01" = list(sha = "8a345fa621377d0bac40fc8c47f5579d",
col_serum = "Serum Scalar v4.1 to v4.0",
col_plasma = "Plasma Scalar v4.1 to v4.0",
Expand All @@ -77,7 +98,7 @@ ver_dict <- list(
skip = 8L,
rows = 1,
cols = 43),
# v4.1 -> v4.0
# 7k -> 5k
"SL-00000571-rev2-2021-06" = list(sha = "5fa46834ed826eb1e8dba88698cf7a76",
col_serum = "Serum Scalar v4.1 to v4.0",
col_plasma = "Plasma Scalar v4.1 to v4.0",
Expand All @@ -86,13 +107,22 @@ ver_dict <- list(
skip = 8L,
rows = 7596,
cols = 43),
# v4.0 -> v4.1
# 5k -> 7k
"SL-00000246-rev5-2021-06" = list(sha = "7d92666369d4e33364b11804f2d1f8ce",
col_serum = "Serum Scalar v4.0 to v4.1",
col_plasma = "Plasma Scalar v4.0 to v4.1",
which_serum = 40,
which_plasma = 42,
skip = 8L,
rows = 5284,
cols = 43),
# 11k
"SL-00000246-rev3-2024-06" = list(sha = "",
col_serum = "",
col_plasma = "",
which_serum = 40,
which_plasma = 42,
skip = 8L,
rows = 11083,
cols = 43)
)
10 changes: 6 additions & 4 deletions man/read_annotations.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions tests/testthat/test-read-annotations.R
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@

file <- test_path("testdata", "test-anno.xlsx")

test_that("getAnnoVer() parses the version correctly", {
test_that("`getAnnoVer()` parses the version correctly", {
expect_equal(getAnnoVer(file), "SL-12345678-rev0-2021-01")
})

test_that("read_annotations() parses the annotations file correctly", {
test_that("`read_annotations()` parses the annotations file correctly", {
tbl <- read_annotations(file)
expect_s3_class(tbl, "tbl_df")
expect_equal(dim(tbl), c(1L, 43L))
Expand Down

0 comments on commit c66a387

Please sign in to comment.