-
Notifications
You must be signed in to change notification settings - Fork 15
/
align.R
219 lines (208 loc) · 8.97 KB
/
align.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
#' Add reference dates and relative horizons to a dataframe of forecasts
#'
#' @param forecasts dataframe in format returned by \code{load_forecasts}
#' @param reference_dates a named list of vectors of reference dates for
#' forecasts, grouping forecasts that were made during the same week. The list
#' should have two components: "wk" providing reference dates for forecasts at
#' a weekly temporal resolution, and "day" providing reference dates for
#' forecasts at a daily temporal resolution. Dates may be a character vector in
#' the format "YYYY-MM-DD" or a vector of Dates. For example, if a forecast of
#' daily hospitalizations is issued with a forecast date that is Saturday,
#' 2021-09-04, and we want to align analyses around a Monday, the reference
#' date would be Monday, 2021-09-06. This can be accomplished more easily by
#' providing the `reference_weekday` argument below.
#' @param reference_weekday a named list or named character vector of weekdays
#' to use as reference dates. The list should contain entries named "wk" and
#' "day". The default uses "Saturday" as the reference weekday for forecasts
#' of weekly targets and "Monday" as the reference weekday for forecasts of
#' daily targets.
#' @param reference_windows a named list of integer vectors with offset values
#' giving the set of forecast dates that should be assigned to a particular
#' reference date, in units of number of days. The function defaults to using
#' `-4:2` for forecasts at a weekly temporal resolution so that a forecast
#' issued on a Monday will have a reference date of the previous Saturday, and
#' `-6:0` for forecasts at a daily temporal resolution so that all forecasts
#' with a `forecast_date` in the week leading up to a particular Monday will be
#' assigned a reference date of that Monday.
#' @param drop_nonpos_relative_horizons boolean indicating whether forecasts that have a
#' non-positive horizon relative to the reference date should be dropped.
#' Defaults to `TRUE`
#'
#' @return forecast dataframe augmented by columns reference_date and
#' relative_horizon
#'
#' @examples
#' \dontrun{
#' library(tidyverse)
#' library(covidHubUtils)
#' hub_repo_path <- "../covid19-forecast-hub"
#' dates <- seq.Date(as.Date("2021-05-01"), as.Date("2021-06-01"), by = 1)
#' forecasts <- load_forecasts(
#' models = get_all_models(source = "local_hub_repo", hub_repo_path = hub_repo_path),
#' dates = dates[!(weekdays(dates) %in% c("Monday", "Sunday"))],
#' date_window_size = 0,
#' locations = "US",
#' types = "quantile",
#' source = "local_hub_repo",
#' hub_repo_path = hub_repo_path,
#' verbose = FALSE,
#' as_of = NULL,
#' hub = c("US")
#' ) %>% filter(quantile == .5)
#' forecasts
#' forecasts %>% distinct(model)
#' forecasts %>% align_forecasts()
#' forecasts %>% align_forecasts(drop_nonpos_relative_horizons = FALSE) %>%
#' filter(relative_horizon <= 0)
#'}
#'
#' @export
align_forecasts <- function(
forecasts,
reference_dates = list("wk" = NULL, "day" = NULL),
reference_weekday = list("wk" = "Saturday", "day" = "Monday"),
reference_windows = list("wk" = -4:2, "day" = -6:0),
drop_nonpos_relative_horizons = TRUE
) {
if (!(
all(c("wk", "day") %in% (names(reference_dates))) &&
all(c("wk", "day") %in% (names(reference_weekday))) &&
all(c("wk", "day") %in% (names(reference_windows)))
)) {
stop(paste0("reference_dates, reference_weekday, and reference_windows ",
"must be named lists with entries for 'wk' and 'day'."))
}
aligned_forecasts <- purrr::map_dfr(
unique(forecasts$temporal_resolution),
function(temporal_res) {
align_forecasts_one_temporal_resolution(
forecasts = forecasts %>%
dplyr::filter(temporal_resolution == temporal_res),
reference_dates = reference_dates[[temporal_res]],
reference_weekday = reference_weekday[[temporal_res]],
reference_windows = reference_windows[[temporal_res]],
drop_nonpos_relative_horizons = drop_nonpos_relative_horizons
)
}
)
return(
# use join to retain order of original dataframe
dplyr::right_join(forecasts, aligned_forecasts, by = names(forecasts)) %>%
# place columns in a convenient order
dplyr::relocate(reference_date, .after = forecast_date) %>%
dplyr::relocate(relative_horizon, .after = horizon)
)
}
#' Internal function that add reference dates and relative horizons
#' to a dataframe of forecasts. This function requires that the forecasts
#' data frame contains only forecasts one temporal_resolution, i.e., only
#' at the daily resolution or the weekly resolution.
#'
#' @param forecasts dataframe in format returned by \code{load_forecasts}
#' @param reference_dates a vector of reference dates for forecasts, grouping
#' forecasts that were made during the same week. May be a character vector in
#' the format "YYYY-MM-DD" or a vector of Dates. For example, if a forecast of
#' daily hospitalizations is issued with a forecast date that is Saturday,
#' 2021-09-04, and we want to align analyses around a Monday, the reference
#' date would be Monday, 2021-09-06. This can be accomplished more easily by
#' providing the `reference_weekday` argument below.
#' @param reference_weekday capitalized weekday, e.g. "Monday". The day of week
#' to use for reference dates, if the `reference_dates` argument is missing or
#' `NULL`
#' @param reference_windows an integer vector of offset values indicating the
#' set of forecast dates should be assigned to a particular reference date,
#' in units of number of days. If the `reference_weekday` is "Monday", the
#' function defaults to `-6:0`, so that all forecasts with a `forecast_date`
#' in the week leading up to a particular Monday will be assigned a reference
#' date of that Monday. By default, if the `reference_weekday` is "Saturday",
#' the function defaults to `-4:2`. This is appropriate for forecasts of
#' weekly targets, in which case we might like to assign a forecast issued on
#' a Monday to have a reference date of the previous Saturday.
#' @param drop_nonpos_relative_horizons boolean indicating whether forecasts that have a
#' non-positive horizon relative to the reference date should be dropped.
#' Defaults to `TRUE`
#'
#' @return forecast dataframe augmented by columns reference_date and
#' relative_horizon
align_forecasts_one_temporal_resolution <- function(
forecasts,
reference_dates,
reference_weekday,
reference_windows,
drop_nonpos_relative_horizons
) {
if (length(unique(forecasts$temporal_resolution)) > 1) {
stop("standardize_forecasts_one_temporal_resolution only supports forecasts at a single temporal resolution.")
}
if (is.null(reference_windows)) {
if (reference_weekday == "Saturday") {
reference_windows <- -4:2
} else if (reference_weekday == "Monday") {
reference_windows <- -6:0
} else {
stop("Reference windows undefined")
}
}
if (!is.list(reference_windows)) {
reference_windows <- list(reference_windows)
}
if (!is.null(reference_dates)) {
# ensure we have dates
reference_dates <- as.Date(reference_dates)
} else {
# every date from that of first forecast - diameter of first window
# to that of last forecast + diameter of last window
all_dates <- seq(
min(forecasts$forecast_date) - (
max(sort(reference_windows[[1]])) -
min(sort(reference_windows[[1]]))
),
max(forecasts$forecast_date) + (
max(sort(reference_windows[[length(reference_windows)]])) -
min(sort(reference_windows[[length(reference_windows)]]))
),
by = 1
)
# keep the dates identified above that are the specified reference_weekday
reference_dates <- all_dates[weekdays(all_dates) == reference_weekday]
}
# create a tibble where each row contains:
# - a possible forecast date
# - a reference date to which that forecast date should be assigned
ref_df <- tibble(
reference_date = reference_dates,
forecast_date = purrr::map2(
reference_date,
reference_windows,
~.x+.y
)
) %>% unnest(cols = forecast_date)
# ensure that in the tibble constructed above, each forecast date is
# associated with at most one reference date
# this could be violated if some windows are overlapping
reps <- ref_df %>%
dplyr::group_by(forecast_date) %>%
dplyr::tally() %>%
dplyr::filter(n > 1)
if (nrow(reps) > 0) {
stop(paste0(
"The following forecast dates are associated with multiple reference dates: ",
paste(reps %>% dplyr::pull(forecast_date), collapse = ", ")
))
}
# join with the reference date lookup table above
# and calculate the relative horizon
forecasts <- forecasts %>%
dplyr::left_join(ref_df, by = "forecast_date") %>%
dplyr::mutate(
ts_days = ifelse(temporal_resolution == "wk", 7, 1),
relative_horizon =
ceiling(as.numeric((target_end_date - reference_date) / ts_days))
) %>%
dplyr::select(-ts_days)
if (drop_nonpos_relative_horizons) {
forecasts <- forecasts %>%
dplyr::filter(relative_horizon > 0)
}
return(forecasts)
}