Skip to content

Commit

Permalink
Update read_annotations() with 11k content
Browse files Browse the repository at this point in the history
- closes SomaLogic#85
  • Loading branch information
stufield committed Feb 22, 2024
1 parent 2fe5009 commit 3090f8e
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 19 deletions.
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ importFrom(magrittr,"%>%")
importFrom(methods,new)
importFrom(methods,setGeneric)
importFrom(methods,validObject)
importFrom(readxl,read_xlsx)
importFrom(stats,IQR)
importFrom(stats,mad)
importFrom(stats,median)
Expand All @@ -165,6 +166,7 @@ importFrom(tibble,tibble)
importFrom(tidyr,pivot_longer)
importFrom(tidyr,separate)
importFrom(tidyr,unite)
importFrom(tools,md5sum)
importFrom(utils,capture.output)
importFrom(utils,head)
importFrom(utils,read.delim)
Expand Down
85 changes: 72 additions & 13 deletions R/read-annotations.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,30 @@
#'
#' @param file A path to an annotations file location.
#' This is a sanctioned, versioned file provided by
#' SomaLogic Operating Co., Inc. and should be an unmodified
#' SomaLogic Operating Co., Inc. and should be an _unmodified_
#' `*.xlsx` file.
#' @return A `tibble` containing analyte-specific annotations and
#' related information (e.g. lift/scale information), keyed on SomaLogic
#' `"SeqId"` which is a unique analyte identifier.
#' related (e.g. lift/bridging) information, keyed on SomaLogic
#' [SeqId], the unique SomaScan analyte identifier.
#' @examples
#' \dontrun{
#' anno <- read_annotations("~/Desktop/SomaScan_V4.1_7K_Annotated_Content_20210616.xlsx")
#' # for example
#' file <- "~/Desktop/SomaScan_V4.1_7K_Annotated_Content_20210616.xlsx"
#' anno_tbl <- read_annotations(file)
#' }
#' @importFrom readxl read_xlsx
#' @importFrom tools md5sum
#' @export
read_annotations <- function(file) {

ext <- gsub("(.*)[.]([^.]+)$", "\\2", file)
stopifnot(ext %in% c("xlsx", "json"))
if ( !(endsWith(file, "xlsx") || endsWith(file, "json")) ) {
stop("Annotations file must be either ", .value("*.xlsx"),
" or ", .value("*.json"), ".", call. = FALSE)
}

ver <- getAnnoVer(file)

# cannot determine version
if ( !grepl("^SL-[0-9]+-rev[0-9]+", ver) ) {
stop(
"Unable to determine annotations file version: ", .value(ver),
Expand All @@ -27,13 +34,25 @@ read_annotations <- function(file) {
)
}

# do not recognize version
if ( !ver %in% names(ver_dict) ) {
stop(
"Unknown version of the annotations file: ", .value(ver), ".",
call. = FALSE
)
}

md5_file <- strtrim(md5sum(file), 7L) |> unname()
md5_true <- strtrim(ver_dict[[ver]]$sha, 7L)

# file modified
if ( !identical(md5_file, md5_true) ) {
warning(
"Checksum mismatch.", .value(basename(file)), " may have been modified.",
call. = FALSE
)
}

skip <- ver_dict[[ver]]$skip
tbl <- readxl::read_xlsx(file, sheet = "Annotations", skip = skip)

Expand Down Expand Up @@ -64,11 +83,13 @@ getAnnoVer <- function(file) {

# version dictionary of key-value pairs
# for file characteristics
# the `sha` values are MD5 checksums calculated with `tools::md5sum()`
ver_dict <- list(
# dummy version; v4.0 -> v4.1
# The first 2 are for testing
# dummy version; 5k -> 7k
"SL-99999999-rev99-1999-01" = list(col_serum = "Serum Scalar v4.0 to v4.1",
col_plasma = "Plasma Scalar v4.0 to v4.1"),
# test-anno.xlsx file; v4.1 -> v4.0
# test-anno.xlsx file; 7k -> 5k
"SL-12345678-rev0-2021-01" = list(sha = "8a345fa621377d0bac40fc8c47f5579d",
col_serum = "Serum Scalar v4.1 to v4.0",
col_plasma = "Plasma Scalar v4.1 to v4.0",
Expand All @@ -77,22 +98,60 @@ ver_dict <- list(
skip = 8L,
rows = 1,
cols = 43),
# v4.1 -> v4.0
# 7k -> 5k
"SL-00000571-rev2-2021-06" = list(sha = "5fa46834ed826eb1e8dba88698cf7a76",
col_serum = "Serum Scalar v4.1 to v4.0",
col_plasma = "Plasma Scalar v4.1 to v4.0",
which_serum = 40,
which_plasma = 42,
skip = 8L,
rows = 7596,
rows = 7605,
cols = 43),
# v4.0 -> v4.1
# 5k -> 7k
"SL-00000246-rev5-2021-06" = list(sha = "7d92666369d4e33364b11804f2d1f8ce",
col_serum = "Serum Scalar v4.0 to v4.1",
col_plasma = "Plasma Scalar v4.0 to v4.1",
which_serum = 40,
which_plasma = 42,
skip = 8L,
rows = 5284,
cols = 43)
rows = 5293,
cols = 43),
# 11k
# SomaScan_V5.0_11K_Annotated_Content_20240214
"SL-906-rev3-2024-02" = list(sha = "44352af60dc0152d65f3dad1f0c54abc",
col_serum = c("Serum Scalar v5.0 11K to v4.1 7K",
"Serum Scalar v5.0 11K to v4.0 5K"),
col_plasma = c("Plasma Scalar v5.0 11K to v4.1 7K",
"Plasma Scalar v5.0 11K to v4.0 5K"),
which_serum = c(43, 47),
which_plasma = c(45, 49),
skip = 8L,
rows = 11092,
cols = 51),

# 7k
# SomaScan_V4.1_7K_Annotated_Content_20240214
"SL-00000571-rev7-2024-02" = list(sha = "ab6ecbce3565c2c6049c0150f653f51b",
col_serum = c("Serum Scalar v4.1 7K to v4.0 5K",
"Serum Scalar v4.1 7K to v5.0 11K"),
col_plasma = c("Plasma Scalar v4.1 7K to v4.0 5K",
"Plasma Scalar v4.1 7K to v5.0 11K"),
which_serum = c(43, 47),
which_plasma = c(45, 49),
skip = 8L,
rows = 7605,
cols = 50)
# 5k
# SomaScan_V4.0_5K_Annotated_Content_20240216.xlsx
# (may not be released)
# "SL-00000246-revnotreleased-2024-02" = list(sha = "xxxxxx",
# col_serum = c("Serum Scalar v4.0 5K to v4.1 7K",
# "Serum Scalar v4.0 5K to v5.0 11K"),
# col_plasma = c("Plasma Scalar v4.0 5K to v4.1 7K",
# "Plasma Scalar v4.0 5K to v5.0 11K"),
# which_serum = c(43, 47),
# which_plasma = c(45, 49),
# skip = 8L,
# rows = 5293,
# cols = 50)
)
10 changes: 6 additions & 4 deletions man/read_annotations.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 13 additions & 2 deletions tests/testthat/test-read-annotations.R
Original file line number Diff line number Diff line change
@@ -1,11 +1,22 @@

# Path to the `test-anno.xlsx` fixture in `testdata/`; passed to
# `getAnnoVer()` and `read_annotations()` by the tests in this file.
file <- test_path("testdata", "test-anno.xlsx")

test_that("getAnnoVer() parses the version correctly", {
test_that("`ver_dict` is updated and correct", {
  # Version keys expected in the dictionary, in definition order.
  # Adding or removing an entry in `ver_dict` must be mirrored here.
  expected_keys <- c(
    "SL-99999999-rev99-1999-01",
    "SL-12345678-rev0-2021-01",
    "SL-00000571-rev2-2021-06",
    "SL-00000246-rev5-2021-06",
    "SL-906-rev3-2024-02",
    "SL-00000571-rev7-2024-02"
  )
  expect_length(ver_dict, 6L)
  expect_named(ver_dict, expected_keys)
})

test_that("`getAnnoVer()` parses the version correctly", {
  # The version string is read from the test fixture file.
  parsed_ver <- getAnnoVer(file)
  expect_equal(parsed_ver, "SL-12345678-rev0-2021-01")
})

test_that("read_annotations() parses the annotations file correctly", {
test_that("`read_annotations()` parses the annotations file correctly", {
tbl <- read_annotations(file)
expect_s3_class(tbl, "tbl_df")
expect_equal(dim(tbl), c(1L, 43L))
Expand Down

0 comments on commit 3090f8e

Please sign in to comment.