-
Notifications
You must be signed in to change notification settings - Fork 0
/
methods.R
330 lines (287 loc) · 11.3 KB
/
methods.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
#' Create a BLFL column
#'
#' Utilizes the BLFL method from the SDTM spec to create a baseline flag: Equal
#' to "Y" for last record with non-missing --ORRES on or before first dose date
#' (RFSTDTC); `NA` otherwise.
#'
#' @param tbl a data frame with the variables `USUBJID`, `[domain]ORRES`,
#' `RFSTDTC`, and to column specified in the `sort_date` argument
#' @param sort_date a string, the column name by which to sort records within
#' each `USUBJID` entry before assigning the BLFL value. This is also the date
#' compared against `RFSTDTC` to determine the BLFL value. This column should
#' either already have a date class or be a character vector in the format
#' YYYY-MM-DD
#' @param domain a string, the SDTM domain abbreviation
#' @param grouping_vars a character vector of columns to group by when assigning
#' the BLFL, default is `"USUBJID"`. The order of this vector matters.
#' @param RFSTDTC a string, the column to use for `RFSTDTC`, default is
#' `"RFSTDTC"`; this columns should either have a date class or a characer class
#' in the YYYY-MM-DD format
#' @param compare_date_method a string, one of `c("on or before", "before")`
#' indicating where the baseline measurement should be evaluated on or before
#' the study start date or just before; default is `"on or before"`
#'
#' @returns a modified copy of `tbl` with the new column `[domain]BLFL`
#' @export
#'
#' @seealso [create_EPOCH()], [calc_DY()]
#'
#' @examples
#' df <- dplyr::tibble(
#' USUBJID = c(
#' rep(1, 3),
#' rep(2, 3)
#' ),
#' XXORRES = c(
#' 1, 2, 2,
#' 1, 2, NA
#' ),
#' XXDTC = as.Date(c(
#' "2017-02-05", "2017-02-06", "2017-02-07",
#' "2017-02-05", "2017-02-06", "2017-02-07"
#' )),
#' RFSTDTC = as.Date(c(
#' rep("2017-02-05", 3),
#' rep("2017-02-07", 3)
#' ))
#' )
#' create_BLFL(df, sort_date = "XXDTC", domain = "XX")
#'
create_BLFL <- function(tbl,
sort_date,
domain,
grouping_vars = "USUBJID",
RFSTDTC = "RFSTDTC",
compare_date_method = "on or before") {
# create a name for a temporary column
PDVN <- paste0(sample(letters, size = 25), collapse = "")
while (any(colnames(tbl) == PDVN)) {
PDVN <- paste0(sample(letters, size = 25), collapse = "")
}
tbl <- tbl %>%
dplyr::arrange(dplyr::across(tidyselect::all_of(c(grouping_vars, sort_date)))) %>%
dplyr::group_by(dplyr::across(tidyselect::all_of(grouping_vars)))
# evaluate baseline differently based on `compare_date_method`
if (compare_date_method == "on or before") {
tbl <- tbl %>%
dplyr::mutate(
"{PDVN}" := dplyr::if_else(
as.Date((!!rlang::sym(sort_date))) <= as.Date((!!rlang::sym(RFSTDTC))) &
!is.na(!!rlang::sym(paste0(stringr::str_to_upper(domain), "ORRES"))),
dplyr::row_number(),
NA_integer_)
)
} else if (compare_date_method == "before") {
tbl <- tbl %>%
dplyr::mutate(
"{PDVN}" := dplyr::if_else(
as.Date((!!rlang::sym(sort_date))) < as.Date((!!rlang::sym(RFSTDTC))) &
!is.na(!!rlang::sym(paste0(stringr::str_to_upper(domain), "ORRES"))),
dplyr::row_number(),
NA_integer_)
)
} else {
stop("Argument `compare_date_method` must be one of `c('on or before', 'before')`")
}
tbl <- tbl %>%
dplyr::mutate(
"{stringr::str_to_upper(domain)}BLFL" := dplyr::case_when(
all(is.na(!!rlang::sym(PDVN))) ~ NA_character_,
is.na(!!rlang::sym(PDVN)) ~ NA_character_,
(!!rlang::sym(PDVN)) == suppressWarnings(max(!!rlang::sym(PDVN),
na.rm = T)) ~ "Y",
TRUE ~ NA_character_
)
) %>%
dplyr::select(-tidyselect::all_of(PDVN)) %>%
dplyr::ungroup()
return(tbl)
}
#' Create the EPOCH variable
#'
#' Utilizes the EPOCH method from the SDTM spec: Missing when `--DTC` is blank;
#' equal to `'SCREENING'` if `--DTC` if before `RFXSTDTC`; equal to `'TREATMENT'`
#' if `--DTC` is on or after `RFXSTDTC` and on or before `RFXENDTC`; equal to
#' `'FOLLOW-UP'` if `--DTC` is after `RFXENDTC`.
#'
#' @param tbl a data frame with date class columns `RFXSTDTC` and `RFXENDTC` and
#' the column given in the `date_col` argument
#' @param date_col a string, the column name of the event date used to determine
#' the EPOCH; this column can either have a date class or a character class in
#' the YYYY-MM-DD format
#' @param RFXSTDTC a string, the date column to use for `RFXSTDTC`, default is
#' `"RFXSTDTC"`; this column can either have a date class or a character class in
#' the YYYY-MM-DD format
#' @param RFXENDTC a string, the date column to use for `RFXENDTC`, default is
#' `"RFXENDTC"`; this column can either have a date class or a character class in
#' the YYYY-MM-DD format
#'
#' @returns a modified copy of `tbl` with the `EPOCH` column
#' @export
#'
#' @seealso [create_BLFL()], [calc_DY()]
#'
#' @examples
#' df <- data.frame(
#' DTC = c("2023-08-01", "2023-08-02", "2023-08-03", "2023-08-04"),
#' RFXSTDTC = rep("2023-08-02", 4),
#' RFXENDTC = rep("2023-08-03", 4)
#' )
#' create_EPOCH(df, date_col = "DTC")
#'
create_EPOCH <- function(tbl,
date_col,
RFXSTDTC = "RFXSTDTC",
RFXENDTC = "RFXENDTC") {
tbl %>%
dplyr::mutate(
EPOCH = dplyr::case_when(
is.na(!!rlang::sym(date_col)) ~ NA_character_,
as.Date(!!rlang::sym(date_col)) < as.Date(!!rlang::sym(RFXSTDTC)) ~
"SCREENING",
as.Date(!!rlang::sym(date_col)) >= as.Date(!!rlang::sym(RFXSTDTC)) &
as.Date(!!rlang::sym(date_col)) <= as.Date(!!rlang::sym(RFXENDTC)) ~
"TREATMENT",
as.Date(!!rlang::sym(date_col)) > as.Date(!!rlang::sym(RFXENDTC)) ~
"FOLLOW-UP"
)
)
}
#' Calculate a DY variable (day of study)
#'
#' Utilizes the DY method from the SDTM spec: `--DTC-RFSTDTC+1` if `--DTC` is on
#' or after RFSTDTC. `--DTC-RFSTDTC` if `--DTC` precedes `RFSTDTC`. This
#' function can also be used for the ENDY method from the spec which has the
#' same logic.
#'
#' @param tbl a data frame with the date column `RFSTDTC` and the column
#' specified by the `DTC_col` argument
#' @param DY_col string, the name of the new DY column to create
#' @param DTC_col string, the column in `tbl` which has the dates for which to
#' calculated the DY value; should either already have a date class or be a
#' character vector in the format YYYY-MM-DD
#' @param RFSTDTC a string, the column to use for `RFSTDTC`, default is
#' `"RFSTDTC"`; should either already have a date class or be a
#' character vector in the format YYYY-MM-DD
#'
#' @returns a modified copy of `tbl` with the new DY column
#' @export
#'
#' @seealso [create_BLFL()], [create_EPOCH()]
#'
#' @examples
#' df <- data.frame(
#' DTC = c("2023-08-01", "2023-08-02", "2023-08-03", "2023-08-04"),
#' RFSTDTC = rep("2023-08-02", 4)
#' )
#' calc_DY(df, DY_col = "XXDY", DTC_col = "DTC")
#'
calc_DY <- function(tbl, DY_col, DTC_col, RFSTDTC = "RFSTDTC") {
tbl %>%
dplyr::mutate(
"{DY_col}" := dplyr::case_when(
as.Date(!!rlang::sym(DTC_col)) >= as.Date((!!rlang::sym(RFSTDTC))) ~
as.numeric(as.Date(!!rlang::sym(DTC_col)) - as.Date(!!rlang::sym(RFSTDTC)) + 1),
(as.Date(!!rlang::sym(DTC_col))) < (!!rlang::sym(RFSTDTC)) ~
as.numeric(as.Date(!!rlang::sym(DTC_col)) - as.Date(!!rlang::sym(RFSTDTC))),
TRUE ~ NA_real_
)
)
}
#' Assign SEQ numbers for a SDTM data set
#'
#' Assigns the `"[DOMAIN]SEQ"` number by sorting the data set by the specified
#' variables and then grouping by `"USUBJID"`.
#'
#' @param tbl a data frame, the SDTM table
#' @param key_vars a character vector of the key variables to sort by
#' @param seq_prefix a string, the prefix for SEQ as per the spec (usually the
#' two letter domain abbreviation)
#' @param USUBJID a string, the column for the subject ID, USUBJID, default is
#' `"USUBJID"`
#'
#' @returns a sorted copy of the `tbl` data frame with the new SEQ column
#' @export
#'
#' @examples
#' df <- data.frame(
#' USUBJID = paste("Subject", c(rep(1, 3), rep(2, 3))),
#' XXTESTCD = paste("T", rep(c(2, 3, 1), 2))
#' )
#' assign_SEQ(df, key_vars = c("USUBJID", "XXTESTCD"), seq_prefix = "XX")
#'
assign_SEQ <- function(tbl, key_vars, seq_prefix, USUBJID = "USUBJID") {
tbl %>%
dplyr::arrange(dplyr::across(.cols = tidyselect::all_of(key_vars))) %>%
dplyr::group_by(dplyr::across(tidyselect::all_of(USUBJID))) %>%
dplyr::mutate("{seq_prefix}SEQ" := dplyr::row_number()) %>%
dplyr::ungroup()
}
#' Assign STAT 'NOT DONE' status
#'
#' Creates a --STAT variable and, if all measurements for a visit were not done,
#' also changes all --TESTCD values as "--ALL"
#'
#' @param df a data frame to modify
#' @param domain a string, the domain abbreviation in all caps
#' @param nd_ind a string, the variable name in `df` that indicates if a test
#' was not performed, usually a `"Yes"`/`"No"` or `"Y"`/`"N"` column
#' @param nd_ind_cd a string, the code from the `nd_ind` column that signifies
#' a test was not done, default is `"Yes"`
#' @param USUBJID a string, the variable name in `df` that contains the subject
#' identifier, default is `"USUBJID"`
#' @param VISIT a string, the variable name in `df` that indicates a VISIT field,
#' default is `"VISIT"`
#'
#' @returns a modified copy of `df`
#' @export
#'
#' @examples
#' df <- dplyr::tibble(
#' USUBJID = paste("Subject", c(rep("A", 2), rep("B", 4), rep("C", 2))),
#' VISIT = paste("Visit", c(1 , 2 , 1 , 1 , 2 , 2 , 2 , 2)),
#' XXTESTCD = paste("Test", c(1 , 2 , 1 , 2 , 1 , 2 , 1 , 2)),
#' ND = c("N", "N", "Y", "Y", "N", "N", "Y", "Y")
#' )
#' create_STAT(df = df, domain = "XX", nd_ind = "ND", nd_ind_cd = "Y")
#'
create_STAT <- function(df,
domain,
nd_ind,
nd_ind_cd = "Yes",
USUBJID = "USUBJID",
VISIT = "VISIT") {
# check the required variables are present
must_have_cols <- c(nd_ind, USUBJID, VISIT, paste0(domain, "TESTCD"))
stopifnot(all(must_have_cols %in% colnames(df)))
# create a temporary variable for storing row numbers
tmp_var <- paste0(sample(letters, 25), collapse = "")
while (tmp_var %in% colnames(df)) {
tmp_var <- sample(letters, 25)
}
df <- df %>%
# mark not done status and create STAT variable
dplyr::mutate("{domain}STAT" := dplyr::if_else(
!!rlang::sym(nd_ind) == nd_ind_cd,
"NOT DONE",
NA_character_
)) %>%
# consolidate all rows for the same subject and visit with NOT DONE tests
# into one --TESTCD with --ALL values
dplyr::group_by(dplyr::across(tidyselect::all_of(c(USUBJID, VISIT)))) %>%
dplyr::mutate("{domain}TESTCD" := dplyr::case_when(
all(!is.na(!!rlang::sym(paste0(domain, "STAT")))) ~ paste0(domain, "ALL"),
TRUE ~ !!rlang::sym(paste0(domain, "TESTCD"))
)) %>%
# remove duplicates of --ALL values
dplyr::mutate(
"{tmp_var}" := dplyr::row_number(),
"{tmp_var}" := dplyr::case_when(
!!rlang::sym(paste0(domain, "TESTCD")) != paste0(domain, "ALL") ~ NA_integer_,
TRUE ~ !!rlang::sym(tmp_var)
)) %>%
dplyr::filter(is.na(!!rlang::sym(tmp_var)) | (!!rlang::sym(tmp_var)) == 1) %>%
dplyr::select(-tidyselect::all_of(tmp_var)) %>%
dplyr::ungroup()
return(df)
}