This repository has been archived by the owner on Aug 29, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
13 changed files
with
32,863 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,10 @@ | ||
# dengue-thailand-2014-forecasts | ||
Code and public data for "Challenges in real-time prediction of infectious disease: a case study of dengue in Thailand" | ||
# Predicting Dengue Fever in Thailand, 2014 | ||
This repository houses code and public data for the manuscript "Challenges in real-time prediction of infectious disease: a case study of dengue in Thailand". This paper describes a practical computational infrastructure that generated multi-step predictions of dengue incidence in Thai provinces every two weeks throughout 2014. | ||
|
||
Due to restrictions in the data-sharing agreement with the owners of the case data from Thailand, we are not permitted to make the case data available on this repository. However, we have provided the code used in the analysis, the reproducible manuscript documents (although they are not fully reproducible as is here due to dependencies on data that we are not allowed to share), and the forecasts generated by our prediction model. | ||
|
||
The `code` folder houses code for analysis and figure making. | ||
|
||
The `data` folder contains csv files containing summaries of our forecasts. | ||
|
||
The `manuscript` folder contains the dynamic document used to create the final manuscript and supplement that were submitted for review. |
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
|
||
|
||
#' reformat a cntry.data.linelist object into a long-format dataset with delivery dates | ||
#' | ||
#' @param d a cntry.data.linelist object | ||
#' | ||
#' @return a tbl_df of biweekly case count data. | ||
#' Columns for province, date_sick_year, date_sick_biweek, delivery_biweek, | ||
#' and case_counts. | ||
#' | ||
make_long_aggregated_counts_through_2014 <- function(d) { | ||
require(dengueThailand) | ||
require(spatialpred) | ||
require(dplyr) | ||
require(tidyr) | ||
require(lubridate) | ||
require(reshape2) | ||
|
||
data(thai_prov_data) | ||
thai_prov_narrow <- select(thai_prov_data, Province, province, FIPS) %>% | ||
dplyr::rename(prov_name = Province) | ||
|
||
## put wide-form counts into long-form | ||
colnames(d) <- paste0("BW.", d@time.in.yr, ".", den_data@yr) | ||
elongated_counts <- tbl_df(reshape2::melt(d@.Data, | ||
varnames=c("prov_name", "char.time"))) %>% | ||
dplyr::rename(case_count = value) %>% | ||
separate(char.time, | ||
into = c("bw_label", "date_sick_biweek", "date_sick_year"), | ||
convert=TRUE) %>% | ||
filter(date_sick_year < 2014) %>% ## 2014 cases to be added with deliv_dates | ||
select(-bw_label) %>% ## drop unneeded label column | ||
mutate(case_count = round(case_count)) %>% ## round interpolated cases | ||
left_join(thai_prov_narrow) %>% | ||
mutate(prov_name = as.factor(prov_name)) | ||
## total records = 61256 = 76 provinces * 26 biweeks * 46 years | ||
|
||
## aggregate 2014 linelist cases | ||
aggregated_linelist_2014 <- tbl_df(d@line.list) %>% | ||
filter(!is.na(date_sick), ## remove NA date sick | ||
year(date_sick)==2014, ## only 2014 | ||
province != 38, ## remove Bueng Kan, already added to Nong Khai | ||
disease==26) %>% ## subset to DHF | ||
mutate(date_sick_biweek = date_to_biweek(date_sick), | ||
date_sick_year = year(date_sick), | ||
province = as.numeric(province)) %>% | ||
group_by(province, delivery_date, date_sick_year, date_sick_biweek) %>% | ||
summarize(case_count = n()) %>% | ||
ungroup() %>% | ||
arrange(province, date_sick_year, date_sick_biweek, delivery_date) | ||
|
||
## for 2014, make table with one row per province-biweek-year-deliv_date | ||
## total rows = 76 provinces * 26 biweeks * 29 delivery_dates * 1 year | ||
expanded_linelist_cols <- aggregated_linelist_2014 %>% | ||
filter(date_sick_year == 2014) %>% | ||
tidyr::expand(province, date_sick_year, date_sick_biweek, delivery_date) | ||
## could remove the rows with delivery_date < date_sick | ||
|
||
## merge expanded grid with observed data | ||
## total rows = 148954 | ||
full_aggregated_linelist <- left_join(expanded_linelist_cols, aggregated_linelist_2014) %>% | ||
left_join(thai_prov_narrow) | ||
|
||
## merge with old data | ||
all_dhf_data <- bind_rows(full_aggregated_linelist, elongated_counts) %>% | ||
dplyr::rename(prov_num = province) %>% | ||
mutate(prov_name = factor(prov_name)) | ||
|
||
return(all_dhf_data) | ||
} | ||
|
||
|
||
|
||
#' Plot all case data from an aggregated counts file | ||
#' | ||
#' @param counts results from a function like make_long_aggregated_counts_through_2014() | ||
#' @param type scale of data, either "incidence" or "raw" | ||
#' | ||
#' @return NULL | ||
make_all_data_raster <- function(counts, type=c("incidence", "raw")) { | ||
type <- match.arg(type) | ||
require(dengueThailand) | ||
require(dplyr) | ||
require(scales) | ||
require(grid) | ||
|
||
data(thai_prov_data) | ||
|
||
prov_data <- thai_prov_data %>% | ||
transmute(FIPS=FIPS, pop = Population, moph_region = MOPH_Admin_Code) | ||
|
||
## aggregate 2014 counts across delivery dates | ||
counts_aggregated <- counts %>% | ||
## for 2014 only, add sum counts with na.rm=TRUE | ||
filter(date_sick_year == 2014) %>% | ||
group_by(date_sick_year, date_sick_biweek, FIPS, prov_name) %>% | ||
summarize(case_count = sum(case_count, na.rm=TRUE)) %>% | ||
ungroup() %>% | ||
## add back other years | ||
bind_rows(filter(counts, date_sick_year != 2014)) | ||
|
||
## merge couts and province data | ||
counts_with_prov <- left_join(counts_aggregated, prov_data) %>% | ||
mutate(prov_name = reorder(prov_name, pop)) | ||
|
||
## rescale to avoid -Inf | ||
zero_idx <- which(counts_with_prov$case_count == 0) | ||
counts_with_prov[zero_idx, "case_count"] <- 1 | ||
|
||
## color schemes from 9-class OrRd: http://colorbrewer2.org/ | ||
if(type=="incidence") { | ||
ggplot(counts_with_prov) + | ||
geom_raster(aes(x=date_sick_year+(date_sick_biweek-1)/26, | ||
y=prov_name, | ||
fill=log10((case_count)/pop))) + | ||
#facet_grid(moph_region~., space="free", scales="free") + | ||
scale_fill_gradientn(colours=c("#fff7ec", "#fdbb84", "#7f0000"), #c("#ffffcc", "#fd8d3c", "#800026"), | ||
values = rescale(c(-7,-4,-2.2)), | ||
guide = "colorbar", limits=c(-7,-2.2), | ||
name="cases per 100,000 residents", | ||
na.value="lightgrey", | ||
labels=c(1, 10, 100, 500), breaks=c(-5, -4, -3, log10(5/1000)))+ | ||
scale_x_continuous(expand=c(0,0)) + | ||
theme_bw() + xlab("") + ylab("") + | ||
theme(panel.margin = unit(0, "lines")) | ||
|
||
} else if (type=="raw") { | ||
ggplot(counts_with_prov) + | ||
geom_raster(aes(x=date_sick_year+(date_sick_biweek-1)/26, | ||
y=prov_name, | ||
fill=log10(case_count))) + | ||
#facet_grid(moph_region~., space="free", scales="free") + | ||
scale_fill_gradientn(colours=c("#fff7ec", "#fc8d59", "#7f0000"), | ||
values = rescale(c(0, 4)), | ||
guide = "colorbar", limits=c(0,4), | ||
name="number of cases", | ||
na.value="lightgrey", | ||
labels=c(0, 10, 100, 1000), breaks=log10(c(1, 11, 101, 1001))) + | ||
scale_x_continuous(expand=c(0,0)) + | ||
theme_bw() + xlab("") + ylab("") + | ||
theme(panel.margin = unit(0, "lines")) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
## function to make outbreak probability plot for paper | ||
## Nicholas Reich | ||
## June 2015 | ||
|
||
## adapted from dengueThailand function new_plot_forecast_map.R | ||
|
||
#'@param forecast_data forecast file dataframe | ||
#'@param cdata cntry.data object with spatial polygons needed for plotting | ||
#'@param biweek_to_plot biweek to plot | ||
#'@param analysis_date analysis date to plot predictions from | ||
#'@param include_legend logical, whether to include legend | ||
#'@param plot_type one of either "incidence" or "outbreak" | ||
|
||
plot_outbreak_map <- function(forecast_data, | ||
wld.shape = "../../data/updated_adm/ne_10m_admin_1_states_provinces.shp", | ||
biweek_to_plot, | ||
analysis_date, | ||
include_legend=TRUE, | ||
plot_type=c("incidence", "outbreak")) { | ||
require(ggplot2) | ||
require(dplyr) | ||
require(rgeos) | ||
require(mapproj) | ||
require(maptools) | ||
|
||
|
||
data(thai_prov_data) | ||
|
||
## load in spatial polygons | ||
world <- readShapeSpatial(wld.shape) | ||
thai_poly <- world[world$admin=="Thailand",] | ||
|
||
## adjust for end of year biweeks | ||
if(biweek_to_plot>26) | ||
biweek_to_plot <- biweek_to_plot - 26 | ||
|
||
if(!(biweek_to_plot %in% unique(forecast_data$date_sick_biweek))) | ||
stop("date_sick_biweek must be in forecast_data.") | ||
|
||
if(nrow(filter(forecast_data, | ||
analysis_date == analysis_date, | ||
date_sick_biweek=biweek_to_plot)) < 1) | ||
stop("no match for plotting biweek and analysis date.") | ||
|
||
## merge thai_prov_data with forecasts to get population | ||
forecast_data_merged <- left_join(forecast_data, thai_prov_data, by = c("pid" = "FIPS")) %>% | ||
mutate(incidence = pred_median/Population) | ||
|
||
## foritfy polygon info | ||
thai_poly <- thai_poly[which(thai_poly@data$gns_adm1 != "TH81"),] | ||
thai_poly$ID_1 <- dense_rank(thai_poly$woe_name) | ||
thai_locs <- fortify(thai_poly, region="ID_1") | ||
thai_locs[['region']] <- thai_locs[['id']] | ||
|
||
## store loc.info@data | ||
loc_info <- thai_poly@data | ||
|
||
## combine polygon info with thai data | ||
data_to_plot <- left_join(loc_info, thai_prov_data, by = c("gns_adm1" = "FIPS")) %>% | ||
mutate(id = ID_1, | ||
pid = as.character(gns_adm1)) | ||
|
||
|
||
## plotting choices based on type | ||
if(plot_type=="incidence") { | ||
fill_var <- "log10(incidence)" | ||
plot_lims <- range(forecast_data_merged$incidence) | ||
plot_lims <- c(floor(log10(plot_lims)[1]), | ||
ceiling(log10(plot_lims)[2])) | ||
plot_breaks <- seq(plot_lims[1], plot_lims[2]) | ||
plot_midpoint <- mean(plot_lims) | ||
legend_title <- "incidence" | ||
plot_labels <- paste0("1e", plot_breaks) | ||
|
||
} else { | ||
fill_var <- "outbreak_prob" | ||
plot_lims <- c(0,1) | ||
plot_midpoint <- .5 | ||
plot_breaks <- c(0, .5, 1) | ||
plot_labels <- c(0, .5, 1) | ||
legend_title <- "outbreak probability" | ||
} | ||
|
||
## set legend position, if any | ||
legend_pos <- ifelse(include_legend, "right", "none") | ||
|
||
## text for map label | ||
forecast_data_subset <- subset(forecast_data_merged, | ||
date_sick_biweek == biweek_to_plot, | ||
analysis_date == analysis_date) | ||
map_date <- format(as.Date(biweek_to_date(biweek_to_plot, | ||
forecast_data_subset$date_sick_year[1])), "%d %b %Y") | ||
|
||
## merge forecast data with data_to_plot | ||
new_dp <- left_join(data_to_plot, forecast_data_subset) | ||
|
||
sp_map <- ggplot(new_dp, aes(map_id=id)) + | ||
geom_map(aes_string(fill=fill_var), map=thai_locs) + | ||
expand_limits(x = thai_locs$long, y = thai_locs$lat) + | ||
## use color blind friendly colors | ||
scale_fill_gradient2(low = "#053061", mid="#CCCCCC", high = "#FF2C19", | ||
name=legend_title, | ||
limits=plot_lims, | ||
midpoint=plot_midpoint, | ||
breaks=plot_breaks, | ||
labels=plot_labels) + | ||
theme_bw() + | ||
theme(axis.ticks = element_blank(), | ||
axis.text = element_blank(), | ||
panel.background = element_rect(fill = "transparent",colour = NA), | ||
legend.position = legend_pos) + # or element_blank() | ||
coord_map(projection="mercator") + ## to keep scaling right | ||
annotate("text", x = 103.5, y = 20.1, label = map_date, size=4) + | ||
xlab("") + ylab("") | ||
return(sp_map) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
## find amplitude of seasonal time series | ||
|
||
prov_seasonalities <- function(data, plot=FALSE) { | ||
require(mgcv) | ||
require(dplyr) | ||
require(dengueThailand) | ||
nprov <- length(unique(data$prov_num)) | ||
max_seas <- data_frame(prov_num = unique(data$prov_num), seasonality=NA, resid_sd_seas=NA) | ||
for (i in 1:nprov) { | ||
prov_dat <- filter(data, prov_num == unique(data$prov_num)[i], date_sick_year >= 1995) %>% | ||
mutate(date_sick = date_sick_year + (date_sick_biweek-1)/26) | ||
## fit seasonal model with secular trend data | ||
fm1 <- gam(case_count~s(date_sick, k=3) + s(date_sick_biweek, bs="cc"), data=prov_dat, family="poisson") | ||
fm1_output <- plot(fm1, select=2) | ||
max_seas[i, "seasonality"] <- max(fm1_output[[2]]$fit) | ||
max_seas[i, "resid_sd_seas"] <- sd(fm1$residuals) | ||
} | ||
return(max_seas) | ||
} | ||
|
||
library(dplyr) | ||
load("~/Documents/code_versioned/spamd/trunk/source/realtime-models/20150806-spamd-nomem-allyrs-10step-6lag-casedata.RData") | ||
dat <- dhf_data %>% | ||
group_by(prov_num, FIPS, prov_name, date_sick_year, date_sick_biweek) %>% | ||
summarize(case_count = sum(case_count, na.rm=TRUE)) | ||
|
||
prov_seasonalities <- prov_seasonalities(dat) | ||
|
||
save(prov_seasonalities, file="data/20151027-province-seasonalities.RData") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# Data codebook | ||
|
||
field name | description | ||
---------------------|-------------- | ||
`pid` | province ID | ||
`analysis_date` | date forecasts were generated | ||
`date_sick_year` | year of the date for which cases were reported | ||
`date_sick_biweek` | biweek of the date for which cases were reported | ||
`epidemic_threshold` | epidemic threshold for the `date_sick` | ||
`obs_count` | the observed case count for that `date_sick` | ||
`pred_lb` | the lower bound of the 95% prediction interval from the model | ||
`pred_ub` | the upper bound of the 95% prediction interval from the model | ||
`pred_median` | the median of the predictive distribution | ||
`AE_lb` | the absolute error of the lower bound | ||
`AE_ub` | the absolute error of the upper bound | ||
`outbreak_prob` | the outbreak probability | ||
`seasonal_median` | the seasonal median for the particular `date_sick` | ||
`last_obs` | the last observed count | ||
`obs_outbreak` | binary, whether incidence was above the epidemic threshold | ||
`pred_covered` | binary, whether the prediction interval covered the truth | ||
`AE_pred` | absolute error of the predicted median | ||
`AE_seas` | absolute error of the seasonal prediction | ||
`AE_last_obs` | absolute error of the last observation | ||
`step` | the prediction horizon, i.e. the difference in biweeks between `date_sick` and `analysis_date` |
Oops, something went wrong.