-
Notifications
You must be signed in to change notification settings - Fork 1
/
diseasystore-google-covid-19.Rmd
156 lines (129 loc) · 5.38 KB
/
diseasystore-google-covid-19.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
---
title: "diseasystore: Google Health COVID-19 Open Data"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{diseasystore: Google Health COVID-19 Open Data}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---
```{r, include = FALSE}
# Chunk defaults for the whole vignette: collapse source and output of a chunk
# into one block, and prefix printed output lines with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
```
```{r setup}
library(diseasystore)
```
```{r hidden_options, include = FALSE}
# Limit tibble printing to 5 rows and switch off the `diseasystore.verbose`
# option while the vignette renders.
# With withr installed the options are set through withr::local_options();
# otherwise the previous values are kept in `opts` so they can be restored.
if (rlang::is_installed("withr")) {
  withr::local_options(
    "tibble.print_min" = 5,
    "diseasystore.verbose" = FALSE
  )
} else {
  opts <- options(
    "tibble.print_min" = 5,
    "diseasystore.verbose" = FALSE
  )
}

# We have a "hard" dependency for RSQLite to render parts of this vignette
suggests_available <- rlang::is_installed("RSQLite")

# TRUE when running interactively, or when NOT_CRAN or CI is set to "true"
not_on_cran <- interactive() ||
  identical(Sys.getenv("NOT_CRAN"), "true") ||
  identical(Sys.getenv("CI"), "true")
```
The Google COVID-19 data repository is a comprehensive open repository of COVID-19 data.
This vignette shows how to use (some of) this data through the `diseasystore` package.
First, it is a good idea to copy the relevant Google COVID-19 data files locally and store that location as an option
for the package.
`DiseasystoreGoogleCovid19` uses only the age-stratified metrics for COVID-19, so only a subset of the repository
needs to be downloaded.
```{r download_data, eval = FALSE}
# First we set the path we want to use as an option
options(
  "diseasystore.DiseasystoreGoogleCovid19.source_conn" =
    file.path("local", "path")
)

# Ensure folder exists
source_conn <- diseasyoption("source_conn", "DiseasystoreGoogleCovid19")
if (!dir.exists(source_conn)) {
  dir.create(source_conn, recursive = TRUE, showWarnings = FALSE)
}

# Define the Google files to download
google_files <- c("by-age.csv", "demographics.csv", "index.csv", "weather.csv")

# Look up the remote location once; it is the same for every file
remote_conn <- diseasyoption("remote_conn", "DiseasystoreGoogleCovid19")

# Download each file that is not already present in the local folder
purrr::walk(google_files, \(file) {
  url <- paste0(remote_conn, file)
  destfile <- file.path(source_conn, file)

  # Skip files that were downloaded previously
  if (!file.exists(destfile)) {
    download.file(url, destfile)
  }
})
```
```{r download_data_hidden, include = FALSE, eval = not_on_cran}
# Files used by this vignette and the remote location they are fetched from
google_files <- c("by-age.csv", "demographics.csv", "index.csv", "weather.csv")
remote_conn <- diseasyoption("remote_conn", "DiseasystoreGoogleCovid19")

# In practice, it is best to make a local copy of the data which is stored in the "vignette_data" folder
# This folder can either be in the package folder (preferred, please create the folder) or in the tempdir()
local_conn <- purrr::detect("vignette_data", checkmate::test_directory_exists, .default = tempdir())

# Point the diseasystore at the local copy and cap the number of rows via `n_max`
if (rlang::is_installed("withr")) {
  withr::local_options(
    "diseasystore.DiseasystoreGoogleCovid19.source_conn" = local_conn,
    "diseasystore.DiseasystoreGoogleCovid19.n_max" = 1000
  )
} else {
  opts <- c(
    opts,
    options(
      "diseasystore.DiseasystoreGoogleCovid19.source_conn" = local_conn,
      "diseasystore.DiseasystoreGoogleCovid19.n_max" = 1000
    )
  )
}

# Best-effort download: fetch the first rows of every file that is not yet stored locally.
# Failures are tolerated here; availability is checked explicitly afterwards.
try({
  google_files |>
    purrr::discard(\(file) checkmate::test_file_exists(file.path(local_conn, file))) |>
    purrr::walk(\(file) {
      remote_data <- readr::read_csv(
        paste0(remote_conn, file),
        n_max = 1000,
        show_col_types = FALSE,
        progress = FALSE
      )
      readr::write_csv(remote_data, file.path(local_conn, file))
    })
})

# The data counts as available only if every file exists locally after the download attempt
data_available <- purrr::every(
  google_files,
  \(file) checkmate::test_file_exists(file.path(local_conn, file))
)
```
A `diseasystore` requires a database to store its features in.
This database should be configured before use, and its connection can be stored in the package's options.
```{r configure_diseasystore, eval = FALSE}
# `target_conn` is defined as a function that opens a DBIConnection to the DB
target_conn <- function() {
  DBI::dbConnect(RSQLite::SQLite())
}

# Store the connection function in the package options
options("diseasystore.DiseasystoreGoogleCovid19.target_conn" = target_conn)
```
```{r configure_diseasystore_hidden, include = FALSE, eval = not_on_cran && suggests_available && data_available}
# Same configuration as shown to the reader, but set so it can be undone after rendering
target_conn <- function() {
  DBI::dbConnect(RSQLite::SQLite())
}

if (rlang::is_installed("withr")) {
  withr::local_options(
    "diseasystore.DiseasystoreGoogleCovid19.target_conn" = target_conn
  )
} else {
  # Without withr, remember the previous value in `opts` for manual restoration
  opts <- c(
    opts,
    options("diseasystore.DiseasystoreGoogleCovid19.target_conn" = target_conn)
  )
}
```
Once the files are downloaded and the target DB is configured, we can initialize the `diseasystore` that uses the
Google COVID-19 data.
```{r initializing_diseasystore, eval = not_on_cran && suggests_available && data_available}
# Create the diseasystore for the Google COVID-19 data
# (per the text above, this relies on the downloaded files and the configured target DB)
ds <- DiseasystoreGoogleCovid19$new()
```
Once configured in this way, we can use the feature store directly to get data.
```{r using_diseasystore_example_1, eval = not_on_cran && suggests_available && data_available}
# We can list all the features that are available in the feature store
ds$available_features
```
```{r using_diseasystore_example_2, eval = not_on_cran && suggests_available && data_available}
# A single feature can then be retrieved from the feature store for a chosen period
ds$get_feature(
  feature = "n_hospital",
  start_date = as.Date("2020-01-01"),
  end_date = as.Date("2020-06-01")
)
```
```{r cleanup, include = FALSE}
# Drop the diseasystore object (if it was created) and run the garbage collector
if (exists("ds")) {
  rm(ds)
}
gc()

# Without withr, the options changed during rendering are restored manually from `opts`
if (!rlang::is_installed("withr")) {
  options(opts)
}
```