-
Notifications
You must be signed in to change notification settings - Fork 4
/
coxpsmatch.R
261 lines (242 loc) · 9.17 KB
/
coxpsmatch.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
#' Propensity Score Matching with Time-Dependent Covariates
#'
#' Perform propensity score matching as described in Lu (2005) "Propensity Score
#' Matching with Time-Dependent Covariates". Given a longitudinal data frame
#' with covariate information, along with treatment time, match treated
#' individuals to those that haven't been treated yet (or are never treated)
#' based on time-dependent propensity scores from a Cox proportional hazards
#' model. Each treated individual is matched to one other individual, unless the
#' number of pairs is specified.
#'
#' @inheritParams brsmatch
#' @param options A list of additional parameters with the following components:
#' * `time_lag` A logical value indicating whether the matches should be made
#' on the time period preceding treatment. This can help avoid confounding if
#' treatment happens between two periods.
#'
#' @return A data frame containing the pair information. The data frame has
#' columns `id`, `pair_id`, and `type`. `id` matches the input parameter and
#' will contain all ids from the input data frame. `pair_id` refers to the id
#' of the computed pairs; `NA` values indicate unmatched individuals. `type`
#' indicates whether the individual in the pair is considered as treatment
#' ("trt") or control ("all") in that pair.
#'
#' @references Lu, Bo. 2005. "Propensity Score Matching with Time-Dependent
#' Covariates." Biometrics 61 (3): 721-28.
#' \doi{10.1111/j.1541-0420.2005.00356.x}
#'
#' @examples
#' if (requireNamespace("survival", quietly = TRUE) &
#' requireNamespace("nbpMatching", quietly = TRUE)) {
#' library(dplyr, quietly = TRUE)
#' pairs <- coxpsmatch(
#' n_pairs = 13,
#' data = oasis,
#' id = "subject_id",
#' time = "visit",
#' trt_time = "time_of_ad"
#' )
#'
#' na.omit(pairs)
#'
#' # evaluate the first match
#' first_match <- pairs$subject_id[which(pairs$pair_id == 1)]
#' oasis %>% dplyr::filter(subject_id %in% first_match)
#' }
#'
#' @export
#' @author Mitchell Paukner
coxpsmatch <- function(
n_pairs = 10^10,
data,
id = "id",
time = "time",
trt_time = "trt_time",
covariates = NULL,
exact_match = NULL,
options = list(
time_lag = FALSE
)) {
if (!requireNamespace("survival", quietly = TRUE) ||
!requireNamespace("nbpMatching", quietly = TRUE)) {
rlang::abort(c(
"Package 'survival' and 'nbpMatching' must be installed for `coxpsmatch()` to work.",
i = "Please install them."
))
}
if (!is.numeric(data[[trt_time]])) {
rlang::warn(c(
paste0("Treatment time `", trt_time, "` should be numeric."),
i = "Converting to a numeric column."
))
data[[trt_time]] <- as.numeric(data[[trt_time]])
}
id_list <- unique(data[[id]]) # compute before any NA removal
# Remove NA rows except those in `trt_time` column, with a warning
na_action <- stats::na.omit(data[, setdiff(colnames(data), trt_time)])
na_rows <- attributes(na_action)$na.action
if (!is.null(na_rows)) {
rlang::warn(c(
"ID, time, and covariates should not have NA entries.",
i = paste("Removed", length(na_rows), "rows.")
))
data <- data[-na_rows, ]
}
if (!is.null(exact_match)) {
balance_split <- split(data, data[, exact_match, drop = FALSE])
matches <- NULL
for (i in seq_along(balance_split)) {
matches <- rbind(matches, .coxps_match(
balance_split[[i]], id, time,
trt_time, covariates
))
}
} else {
matches <- .coxps_match(data, id, time, trt_time, covariates)
}
matches <- matches[order(matches$distance), ]
if (length(matches$trt.id) > n_pairs) {
matches <- matches[1:n_pairs, , drop = FALSE]
}
colnames(matches)[1:2] <- c("trt_id", "all_id")
return(.output_pairs(matches, id = id, id_list = id_list))
}
#' Propensity Score Matching with Time-Dependent Covariates
#'
#' Perform propensity score matching as described in Lu (2005) "Propensity Score
#' Matching with Time-Dependent Covariates". Given a longitudinal data frame
#' with covariate information, along with treatment time, build a MIP problem
#' that matches treated individuals to those that haven't been treated yet (or
#' are never treated) based on time-dependent propensity scores from a Cox
#' proportional hazards model. If balancing is desired, the model will try to
#' minimize the imbalance in terms of specified balancing covariates in the
#' final pair output. Each treated individual is matched to one other
#' individual.
#'
#' @inheritParams brsmatch
#' @return a data frame with matched pairs based on propensity scores generated
#' using a Cox proportional hazards model. This data frame will have three
#' columns. \code{trt.id} refers to the treated id, \code{con.id} refers to
#' the id that is a possible control, \code{distance} refers to the scaled
#' euclidean distance generated based on the propensity scores.
#'
#' @examples
#' df <- data.frame(
#' hhidpn = rep(1:3, each = 3),
#' wave = rep(1:3, 3),
#' treatment_time = rep(c(2, 3, NA), each = 3),
#' X1 = c(2, 2, 2, 3, 3, 3, 9, 9, 9),
#' X2 = rep(c("a", "a", "b"), each = 3),
#' X3 = c(9, 4, 5, 6, 7, 2, 3, 4, 8),
#' X4 = c(8, 9, 4, 5, 6, 7, 2, 3, 4)
#' )
#'
#' .coxps_match(df = df, id = "hhidpn", time = "wave", trt_time = "treatment_time")
#'
#' @importFrom stats predict
#' @noRd
.coxps_match <- function(
df,
id = "id",
time = "time",
trt_time = "trt_time",
covariates = NULL,
options = list(
time_lag = FALSE
)) {
if (is.null(covariates)) {
data <- df[, -which(names(df) %in% c(id, time, trt_time)), drop = FALSE]
} else {
data <- df[, c(covariates), drop = FALSE]
}
data$id <- as.factor(unlist(df[, id, drop = FALSE]))
data$time <- unlist(df[, time, drop = FALSE])
data$trt_time <- unlist(df[, trt_time, drop = FALSE])
data$trt <- ifelse(is.na(data$trt_time), 0, 1)
data[, covariates] <- dplyr::mutate_if(data[, covariates, drop = FALSE], is.numeric, scale)
time.max <- max(data$trt_time[!is.na(data$trt_time)])
data$trt_time <- ifelse(is.na(data$trt_time), time.max + 1, data$trt_time)
data <- transform(data, iid = as.numeric(id))
data.cox <-
data[which(data$time <= data$trt_time), , drop = FALSE]
data.cox$start <- data.cox$time - rep(1, length(data.cox$time))
form <- survival::Surv(start, time, trt) ~ . - id - trt_time - iid
model <- survival::coxph(form, data = data.cox)
data.cox$p <- predict(model)
nbp.distance <-
matrix(rep(NA, length(unique(data$iid)) * length(unique(data$iid))),
ncol = length(unique(data$iid))
)
nbp.t <-
matrix(rep(NA, length(unique(data$iid)) * length(unique(data$iid))),
ncol = length(unique(data$iid))
)
t.lag <- -1 * options$time_lag
for (i in seq_along(unique(data$iid))) {
for (j in seq_along(unique(data$iid))) {
if (i == j) {
nbp.distance[i, j] <- 999
}
if (i < j) {
a <- unique(data[which(data$iid == i), , drop = FALSE]$trt_time)
b <- unique(data[which(data$iid == j), , drop = FALSE]$trt_time)
if (a == b) {
nbp.distance[i, j] <- 999
} else {
trt_time <- min(a, b)
nbp.t[i, j] <- trt_time
if (length(data.cox[which(data.cox$iid == i &
data.cox$time == trt_time + t.lag), , drop = FALSE]$p) != 0 &&
length(data.cox[which(data.cox$iid == j &
data.cox$time == trt_time + t.lag), , drop = FALSE]$p) != 0 &&
length(data.cox[which(data.cox$iid == i &
data.cox$time == trt_time), , drop = FALSE]$p) != 0 &&
length(data.cox[which(data.cox$iid == j &
data.cox$time == trt_time), , drop = FALSE]$p) != 0) {
nbp.distance[i, j] <- (data.cox[which(data.cox$iid == i &
data.cox$time == trt_time + t.lag), , drop = FALSE]$p -
data.cox[which(data.cox$iid == j &
data.cox$time == trt_time + t.lag), , drop = FALSE]$p)^2
} else {
nbp.distance[i, j] <- 999
}
}
}
if (i > j) {
nbp.distance[i, j] <- nbp.distance[j, i, drop = FALSE]
}
}
}
nbp.distance1 <- nbp.distance * 1000
nbp.distance2 <- nbpMatching::distancematrix(nbp.distance1)
nbp <- nbpMatching::nonbimatch(nbp.distance2)
nbpp <- nbp$halves[which(nbp$halves$Distance != 999000), , drop = FALSE]
if (length(unique(data$id)) %% 2 != 0) {
nbpp <- nbpp[-which(nbpp$Group1.ID == "ghost" | nbpp$Group2.ID == "ghost"), , drop = FALSE]
}
nbp.id1 <- nbpp$Group1.Row
nbp.id2 <- nbpp$Group2.Row
trt.id <- NULL
con.id <- NULL
for (i in seq_along(nbp.id1)) {
if (data$trt_time[which(data$iid == nbp.id1[i])[1]] <
data$trt_time[which(data$iid == nbp.id2[i])[1]]) {
trt.id[i] <- as.numeric(as.character(nbp.id1[i]))
con.id[i] <- as.numeric(as.character(nbp.id2[i]))
} else {
trt.id[i] <- as.numeric(as.character(nbp.id2[i]))
con.id[i] <- as.numeric(as.character(nbp.id1[i]))
}
}
for (i in seq_along(nbp.id1)) {
trt.id[i] <-
as.character(data$id[which(data$iid == trt.id[i])[1]])
con.id[i] <-
as.character(data$id[which(data$iid == con.id[i])[1]])
}
coxph_matches <- data.frame(
"trt.id" = trt.id, "con.id" = con.id,
"distance" = nbpp$Distance
)
return(coxph_matches)
}