Skip to content
This repository has been archived by the owner on Aug 29, 2024. It is now read-only.

Commit

Permalink
first updates to repo.
Browse files Browse the repository at this point in the history
  • Loading branch information
nickreich committed May 12, 2016
1 parent bdc08bf commit 021e439
Show file tree
Hide file tree
Showing 13 changed files with 32,863 additions and 2 deletions.
12 changes: 10 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,10 @@
# dengue-thailand-2014-forecasts
Code and public data for "Challenges in real-time prediction of infectious disease: a case study of dengue in Thailand"
# Predicting Dengue Fever in Thailand, 2014
This repository houses code and public data for the manuscript "Challenges in real-time prediction of infectious disease: a case study of dengue in Thailand". This paper describes a practical computational infrastructure that generated multi-step predictions of dengue incidence in Thai provinces every two weeks throughout 2014.

Due to restrictions in the data-sharing agreement with the owners of the case data from Thailand, we are not permitted to make the case data available on this repository. However, we have provided the code used in the analysis, the reproducible manuscript documents (although they are not fully reproducible as is here due to dependencies on data that we are not allowed to share), and the forecasts generated by our prediction model.

The `code` folder houses code for analysis and figure making.

The `data` folder contains csv files containing summaries of our forecasts.

The `manuscript` folder contains the dynamic document used to create the final manuscript and supplement that were submitted for review.
1,204 changes: 1,204 additions & 0 deletions code/eval-aggregation-fxns.R

Large diffs are not rendered by default.

474 changes: 474 additions & 0 deletions code/eval-forecast-fxns.R

Large diffs are not rendered by default.

143 changes: 143 additions & 0 deletions code/make-aggregated-counts.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@


#' reformat a cntry.data.linelist object into a long-format dataset with delivery dates
#'
#' @param d a cntry.data.linelist object
#'
#' @return a tbl_df of biweekly case count data.
#' Columns for province, date_sick_year, date_sick_biweek, delivery_biweek,
#' and case_counts.
#'
make_long_aggregated_counts_through_2014 <- function(d) {
require(dengueThailand)
require(spatialpred)
require(dplyr)
require(tidyr)
require(lubridate)
require(reshape2)

data(thai_prov_data)
thai_prov_narrow <- select(thai_prov_data, Province, province, FIPS) %>%
dplyr::rename(prov_name = Province)

## put wide-form counts into long-form
colnames(d) <- paste0("BW.", d@time.in.yr, ".", den_data@yr)
elongated_counts <- tbl_df(reshape2::melt(d@.Data,
varnames=c("prov_name", "char.time"))) %>%
dplyr::rename(case_count = value) %>%
separate(char.time,
into = c("bw_label", "date_sick_biweek", "date_sick_year"),
convert=TRUE) %>%
filter(date_sick_year < 2014) %>% ## 2014 cases to be added with deliv_dates
select(-bw_label) %>% ## drop unneeded label column
mutate(case_count = round(case_count)) %>% ## round interpolated cases
left_join(thai_prov_narrow) %>%
mutate(prov_name = as.factor(prov_name))
## total records = 61256 = 76 provinces * 26 biweeks * 46 years

## aggregate 2014 linelist cases
aggregated_linelist_2014 <- tbl_df(d@line.list) %>%
filter(!is.na(date_sick), ## remove NA date sick
year(date_sick)==2014, ## only 2014
province != 38, ## remove Bueng Kan, already added to Nong Khai
disease==26) %>% ## subset to DHF
mutate(date_sick_biweek = date_to_biweek(date_sick),
date_sick_year = year(date_sick),
province = as.numeric(province)) %>%
group_by(province, delivery_date, date_sick_year, date_sick_biweek) %>%
summarize(case_count = n()) %>%
ungroup() %>%
arrange(province, date_sick_year, date_sick_biweek, delivery_date)

## for 2014, make table with one row per province-biweek-year-deliv_date
## total rows = 76 provinces * 26 biweeks * 29 delivery_dates * 1 year
expanded_linelist_cols <- aggregated_linelist_2014 %>%
filter(date_sick_year == 2014) %>%
tidyr::expand(province, date_sick_year, date_sick_biweek, delivery_date)
## could remove the rows with delivery_date < date_sick

## merge expanded grid with observed data
## total rows = 148954
full_aggregated_linelist <- left_join(expanded_linelist_cols, aggregated_linelist_2014) %>%
left_join(thai_prov_narrow)

## merge with old data
all_dhf_data <- bind_rows(full_aggregated_linelist, elongated_counts) %>%
dplyr::rename(prov_num = province) %>%
mutate(prov_name = factor(prov_name))

return(all_dhf_data)
}



#' Plot all case data from an aggregated counts file
#'
#' @param counts results from a function like make_long_aggregated_counts_through_2014()
#' @param type scale of data, either "incidence" or "raw"
#'
#' @return NULL
make_all_data_raster <- function(counts, type=c("incidence", "raw")) {
type <- match.arg(type)
require(dengueThailand)
require(dplyr)
require(scales)
require(grid)

data(thai_prov_data)

prov_data <- thai_prov_data %>%
transmute(FIPS=FIPS, pop = Population, moph_region = MOPH_Admin_Code)

## aggregate 2014 counts across delivery dates
counts_aggregated <- counts %>%
## for 2014 only, add sum counts with na.rm=TRUE
filter(date_sick_year == 2014) %>%
group_by(date_sick_year, date_sick_biweek, FIPS, prov_name) %>%
summarize(case_count = sum(case_count, na.rm=TRUE)) %>%
ungroup() %>%
## add back other years
bind_rows(filter(counts, date_sick_year != 2014))

## merge couts and province data
counts_with_prov <- left_join(counts_aggregated, prov_data) %>%
mutate(prov_name = reorder(prov_name, pop))

## rescale to avoid -Inf
zero_idx <- which(counts_with_prov$case_count == 0)
counts_with_prov[zero_idx, "case_count"] <- 1

## color schemes from 9-class OrRd: http://colorbrewer2.org/
if(type=="incidence") {
ggplot(counts_with_prov) +
geom_raster(aes(x=date_sick_year+(date_sick_biweek-1)/26,
y=prov_name,
fill=log10((case_count)/pop))) +
#facet_grid(moph_region~., space="free", scales="free") +
scale_fill_gradientn(colours=c("#fff7ec", "#fdbb84", "#7f0000"), #c("#ffffcc", "#fd8d3c", "#800026"),
values = rescale(c(-7,-4,-2.2)),
guide = "colorbar", limits=c(-7,-2.2),
name="cases per 100,000 residents",
na.value="lightgrey",
labels=c(1, 10, 100, 500), breaks=c(-5, -4, -3, log10(5/1000)))+
scale_x_continuous(expand=c(0,0)) +
theme_bw() + xlab("") + ylab("") +
theme(panel.margin = unit(0, "lines"))

} else if (type=="raw") {
ggplot(counts_with_prov) +
geom_raster(aes(x=date_sick_year+(date_sick_biweek-1)/26,
y=prov_name,
fill=log10(case_count))) +
#facet_grid(moph_region~., space="free", scales="free") +
scale_fill_gradientn(colours=c("#fff7ec", "#fc8d59", "#7f0000"),
values = rescale(c(0, 4)),
guide = "colorbar", limits=c(0,4),
name="number of cases",
na.value="lightgrey",
labels=c(0, 10, 100, 1000), breaks=log10(c(1, 11, 101, 1001))) +
scale_x_continuous(expand=c(0,0)) +
theme_bw() + xlab("") + ylab("") +
theme(panel.margin = unit(0, "lines"))
}
}
116 changes: 116 additions & 0 deletions code/plot_outbreak_map.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
## function to make outbreak probability plot for paper
## Nicholas Reich
## June 2015

## adapted from dengueThailand function new_plot_forecast_map.R

#'@param forecast_data forecast file dataframe
#'@param cdata cntry.data object with spatial polygons needed for plotting
#'@param biweek_to_plot biweek to plot
#'@param analysis_date analysis date to plot predictions from
#'@param include_legend logical, whether to include legend
#'@param plot_type one of either "incidence" or "outbreak"

plot_outbreak_map <- function(forecast_data,
wld.shape = "../../data/updated_adm/ne_10m_admin_1_states_provinces.shp",
biweek_to_plot,
analysis_date,
include_legend=TRUE,
plot_type=c("incidence", "outbreak")) {
require(ggplot2)
require(dplyr)
require(rgeos)
require(mapproj)
require(maptools)


data(thai_prov_data)

## load in spatial polygons
world <- readShapeSpatial(wld.shape)
thai_poly <- world[world$admin=="Thailand",]

## adjust for end of year biweeks
if(biweek_to_plot>26)
biweek_to_plot <- biweek_to_plot - 26

if(!(biweek_to_plot %in% unique(forecast_data$date_sick_biweek)))
stop("date_sick_biweek must be in forecast_data.")

if(nrow(filter(forecast_data,
analysis_date == analysis_date,
date_sick_biweek=biweek_to_plot)) < 1)
stop("no match for plotting biweek and analysis date.")

## merge thai_prov_data with forecasts to get population
forecast_data_merged <- left_join(forecast_data, thai_prov_data, by = c("pid" = "FIPS")) %>%
mutate(incidence = pred_median/Population)

## foritfy polygon info
thai_poly <- thai_poly[which(thai_poly@data$gns_adm1 != "TH81"),]
thai_poly$ID_1 <- dense_rank(thai_poly$woe_name)
thai_locs <- fortify(thai_poly, region="ID_1")
thai_locs[['region']] <- thai_locs[['id']]

## store loc.info@data
loc_info <- thai_poly@data

## combine polygon info with thai data
data_to_plot <- left_join(loc_info, thai_prov_data, by = c("gns_adm1" = "FIPS")) %>%
mutate(id = ID_1,
pid = as.character(gns_adm1))


## plotting choices based on type
if(plot_type=="incidence") {
fill_var <- "log10(incidence)"
plot_lims <- range(forecast_data_merged$incidence)
plot_lims <- c(floor(log10(plot_lims)[1]),
ceiling(log10(plot_lims)[2]))
plot_breaks <- seq(plot_lims[1], plot_lims[2])
plot_midpoint <- mean(plot_lims)
legend_title <- "incidence"
plot_labels <- paste0("1e", plot_breaks)

} else {
fill_var <- "outbreak_prob"
plot_lims <- c(0,1)
plot_midpoint <- .5
plot_breaks <- c(0, .5, 1)
plot_labels <- c(0, .5, 1)
legend_title <- "outbreak probability"
}

## set legend position, if any
legend_pos <- ifelse(include_legend, "right", "none")

## text for map label
forecast_data_subset <- subset(forecast_data_merged,
date_sick_biweek == biweek_to_plot,
analysis_date == analysis_date)
map_date <- format(as.Date(biweek_to_date(biweek_to_plot,
forecast_data_subset$date_sick_year[1])), "%d %b %Y")

## merge forecast data with data_to_plot
new_dp <- left_join(data_to_plot, forecast_data_subset)

sp_map <- ggplot(new_dp, aes(map_id=id)) +
geom_map(aes_string(fill=fill_var), map=thai_locs) +
expand_limits(x = thai_locs$long, y = thai_locs$lat) +
## use color blind friendly colors
scale_fill_gradient2(low = "#053061", mid="#CCCCCC", high = "#FF2C19",
name=legend_title,
limits=plot_lims,
midpoint=plot_midpoint,
breaks=plot_breaks,
labels=plot_labels) +
theme_bw() +
theme(axis.ticks = element_blank(),
axis.text = element_blank(),
panel.background = element_rect(fill = "transparent",colour = NA),
legend.position = legend_pos) + # or element_blank()
coord_map(projection="mercator") + ## to keep scaling right
annotate("text", x = 103.5, y = 20.1, label = map_date, size=4) +
xlab("") + ylab("")
return(sp_map)
}
29 changes: 29 additions & 0 deletions code/province_seasonality_analysis.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
## find amplitude of seasonal time series

prov_seasonalities <- function(data, plot=FALSE) {
require(mgcv)
require(dplyr)
require(dengueThailand)
nprov <- length(unique(data$prov_num))
max_seas <- data_frame(prov_num = unique(data$prov_num), seasonality=NA, resid_sd_seas=NA)
for (i in 1:nprov) {
prov_dat <- filter(data, prov_num == unique(data$prov_num)[i], date_sick_year >= 1995) %>%
mutate(date_sick = date_sick_year + (date_sick_biweek-1)/26)
## fit seasonal model with secular trend data
fm1 <- gam(case_count~s(date_sick, k=3) + s(date_sick_biweek, bs="cc"), data=prov_dat, family="poisson")
fm1_output <- plot(fm1, select=2)
max_seas[i, "seasonality"] <- max(fm1_output[[2]]$fit)
max_seas[i, "resid_sd_seas"] <- sd(fm1$residuals)
}
return(max_seas)
}

library(dplyr)
load("~/Documents/code_versioned/spamd/trunk/source/realtime-models/20150806-spamd-nomem-allyrs-10step-6lag-casedata.RData")
dat <- dhf_data %>%
group_by(prov_num, FIPS, prov_name, date_sick_year, date_sick_biweek) %>%
summarize(case_count = sum(case_count, na.rm=TRUE))

prov_seasonalities <- prov_seasonalities(dat)

save(prov_seasonalities, file="data/20151027-province-seasonalities.RData")
24 changes: 24 additions & 0 deletions data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Data codebook

field name | description
---------------------|--------------
`pid` | province ID
`analysis_date` | date forecasts were generated
`date_sick_year` | year of the date for which cases were reported
`date_sick_biweek` | biweek of the date for which cases were reported
`epidemic_threshold` | epidemic threshold for the `date_sick`
`obs_count` | the observed case count for that `date_sick`
`pred_lb` | the lower bound of the 95% prediction interval from the model
`pred_ub` | the upper bound of the 95% prediction interval from the model
`pred_median` | the median of the predictive distribution
`AE_lb` | the absolute error of the lower bound
`AE_ub` | the absolute error of the upper bound
`outbreak_prob` | the outbreak probability
`seasonal_median` | the seasonal median for the particular `date_sick`
`last_obs` | the last observed count
`obs_outbreak` | binary, whether incidence was above the epidemic threshold
`pred_covered` | binary, whether the prediction interval covered the truth
`AE_pred` | absolute error of the predicted median
`AE_seas` | absolute error of the seasonal prediction
`AE_last_obs` | absolute error of the last observation
`step` | the prediction horizon, i.e. the difference in biweeks between `date_sick` and `analysis_date`
Loading

0 comments on commit 021e439

Please sign in to comment.