-
Notifications
You must be signed in to change notification settings - Fork 0
/
spec.R
185 lines (177 loc) · 7.07 KB
/
spec.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#' Read in the variable specification sheet for a SDTM data set
#'
#' Reads the specified domain variable specification sheet from an MS Excel file.
#'
#' The [readxl::read_excel()] function will causes an access denied warning when
#' reading in a read-only specification file. This does not affect the data
#' import. Variables will be arranged in descending order per the `"Order"`
#' column in the specification.
#'
#' @param domain string, SDTM domain or supplemental domain code
#' @param dir string, specification directory
#' @param filename string, file name of the specification
#' @param arrange_by character vector, the column(s) by which to sort the domain
#' sheet, default is `"Order"`
#'
#' @returns a data frame of the variable specification for `domain`
#' @export
#'
#' @seealso [get_key_vars()], [get_codelist()], [assign_meta_data()]
#'
#' @examples
#' work_dir <- system.file("extdata", package = "sdtmval")
#' spec <- get_data_spec(domain = "XX",
#' dir = work_dir,
#' filename = "spec.xlsx")
#'
get_data_spec <- function(domain, dir, filename, arrange_by = "Order") {
readxl::read_excel(file.path(dir, filename), sheet = domain) %>%
dplyr::arrange(dplyr::across(tidyselect::all_of(arrange_by)))
}
#' Read in the key variables for a SDTM domain
#'
#' Reads the `"Key Variables"` column from the SDTM specification MS Excel
#' file's `"Datasets"` sheet for the specified `domain`.
#'
#' The `readxl::read_excel()` function will causes an access denied warning when
#' reading in a read-only specification file. This does not affect the data
#' import.
#'
#' @inheritParams get_data_spec
#' @param datasets_sheet a string, the sheet name in the specification Excel
#' file that has the key variables, default is `"Datasets"`
#' @param dataset_col a string, the column name of the domains in the table in
#' `datasets_sheet`, default is `"Dataset"`
#' @param keyvar_col a string, the column name of the key variables in the table
#' in `datasets_sheet`, default is `"Key Variables"`
#'
#' @returns a character vector of key variables for the specified `domain`
#' @export
#'
#' @seealso [get_data_spec()], [get_codelist()], [assign_meta_data()]
#'
#' @examples
#' work_dir <- system.file("extdata", package = "sdtmval")
#' key_vars <- get_key_vars(domain = "XX",
#' dir = work_dir,
#' filename = "spec.xlsx")
#'
get_key_vars <- function(domain,
dir,
filename,
datasets_sheet = "Datasets",
dataset_col = "Dataset",
keyvar_col = "Key Variables") {
readxl::read_excel(file.path(dir, filename),
sheet = datasets_sheet
) %>%
dplyr::filter(!!rlang::sym(dataset_col) %in% domain) %>%
dplyr::pull(!!rlang::sym(keyvar_col)) %>%
stringr::str_split(pattern = ", ") %>%
unlist()
}
#' Read in the code list from the specification for a specific domain
#'
#' Reads-in the `"Codelists"` sheet from the study's specification MS Excel file
#' and then filters that code list by the variables in the domain
#'
#' @inheritParams get_data_spec
#' @param var_col a string, the column name in the domain spec sheet that
#' contains the variables for that domain, default is `"Variable"`
#' @param codelist_sheet a string, the sheet name of the spec's code list from
#' the spec's .xlsx file, default is `"Codelists"`
#' @param varid_col a string, the column name in the `codelist_sheet` table
#' from the spec's .xlsx file that contains the variable names, default is
#' `"ID"`
#'
#' @returns a data frame with the code list
#' @export
#'
#' @seealso [get_data_spec()], [get_key_vars()], [assign_meta_data()]
#'
#' @examples
#' work_dir <- system.file("extdata", package = "sdtmval")
#' codelists <- get_codelist(domain = 'XX',
#' dir = work_dir,
#' filename = "spec.xlsx")
#'
get_codelist <- function(domain,
dir,
filename,
var_col = "Variable",
codelist_sheet = "Codelists",
varid_col = "ID") {
spec_vars <- get_data_spec(domain, dir, filename)[[var_col]]
readxl::read_excel(file.path(dir, filename), sheet = codelist_sheet) %>%
dplyr::filter(!!rlang::sym(varid_col) %in% spec_vars) %>%
dplyr::filter(!!rlang::sym(varid_col) != "DOMAIN")
}
#' Assign meta data to columns in a SDTM table based on specification file
#'
#' Trims the length of each text and date variable to the length specified in
#' the spec and then assigns the attributes `"label"` and `"width"` to each
#' column.
#'
#' @param tbl a data frame containing a SDTM table
#' @param spec a data frame with the columns `"Variable"` which has a value for
#' each column in `tbl`, `"Data Type"` which specifies data types by column,
#' `"Length"` which specifies the character limit for each column, and `"Label"`
#' which specifies the label for each column
#' @param datatype_col a string, the column in `spec` that contains the data
#' types (which should include the values `"text"` and `"date"`); default is
#' `"Data Type"`
#' @param var_col a string, the column in `spec` that contains the domain
#' variable names
#' @param length_col a string, the column in `spec` that contains the character
#' count limits for each variable
#' @param label_col a string, the column in `spec` that contains the labels for
#' each variable
#'
#' @returns a modified copy of `tbl` with the meta data per specification
#' @export
#'
#' @seealso [get_data_spec()], [get_key_vars()], [get_codelist()]
#'
#' @examples
#' work_dir <- system.file("extdata", package = "sdtmval")
#' spec <- get_data_spec(domain = "XX",
#' dir = work_dir,
#' filename = "spec.xlsx")
#' after_meta_data <- assign_meta_data(sdtmval::xx_no_meta_data, spec = spec)
#' labels <- colnames(after_meta_data) |>
#' purrr::map(~ attr(after_meta_data[[.]], "label")) |>
#' unlist()
#' lengths <- colnames(after_meta_data) |>
#' purrr::map(~ attr(after_meta_data[[.]], "width")) |>
#' unlist()
#' data.frame(
#' column = colnames(after_meta_data),
#' labels = labels,
#' lengths = lengths
#' )
#'
assign_meta_data <- function(tbl,
spec,
datatype_col = "Data Type",
var_col = "Variable",
length_col = "Length",
label_col = "Label") {
# for each column in the table
for (i in 1:ncol(tbl)) {
# trim character variables to max length specified
if (spec[[datatype_col]][which(spec[[var_col]] == names(tbl)[i])] %in%
c("text", "date")) {
tbl[[i]] <-
strtrim(
tbl[[i]],
spec[[length_col]][which(spec[[var_col]] == names(tbl)[i])]
)
}
# assign variable labels and lengths
attr(tbl[[names(tbl)[i]]], "label") <-
spec[[label_col]][which(spec[[var_col]] == names(tbl)[i])]
attr(tbl[[names(tbl)[i]]], "width") <-
spec[[length_col]][which(spec[[var_col]] == names(tbl)[i])]
}
return(tbl)
}