-
Notifications
You must be signed in to change notification settings - Fork 19
/
cprd_import.R
executable file
·144 lines (131 loc) · 6.24 KB
/
cprd_import.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#' Reads a zipped data file to a dataframe
#'
#' This function will unzip a zipped text file and read it in to an R data frame
#'
#' Default behaviour is to read in as a standard read.delim call.
#' extra arguments to read.delim can be passed to the function
#'
#' @export
#'
#' @param file character a file to read in
#' @param \dots extra arguments to pass to read.delim
#' @return a dataframe
read_zip <- function(file, ...) {
zipFileInfo <- unzip(file, list=TRUE)
if(nrow(zipFileInfo) > 1)
stop("More than one data file inside zip")
else
read.delim(unz(file, as.character(zipFileInfo$Name)), ...)
}
#' Wrapper for dbConnect
#'
#' Connects to a SQLite database or creates one if it does not already exist
#'
#' If the '.sqlite' file extension is ommited from the dbname argument it is automatically added.
#'
#' @export
#'
#' @param dbname character name path to database file
#' @return SQLiteConnection object
#' @examples \dontrun{
#' db <- database("mydb")
#' }
database <- function(dbname){
if(!str_detect(dbname, "\\.sqlite$")) {
dbname <- paste(dbname, "sqlite", sep = ".")
}
dbConnect(SQLite(), dbname)
}
#' Adds a series of files to a database
#'
#' This function can be used to import a CPRD file or files into a SQLite database connection.
#'
#' Will automatically unzip files before calling them in
#' If practid is TRUE, a practid variable is constructed by converting the last 3 digits of the patient id (if supplied) to a numeric.
#' If filenames is TRUE, source data filenames are included as a variable with the filetypes stripped away.
#'
#' @export
#'
#' @param db a database connection object
#' @param files a character vector of filenames to files to be imported
#' @param table_name a name for the table to import to
#' @param dateformat the format that dates are stored in the CPRD data. If this is wrong it won't break but all dates are likely to be NA
#' @param yob_origin value to add yob values to to get actual year of birth (Generally 1800)
#' @param practid logical should practice id variable be constructed from the patient ids?
#' @param filenames logical should the filename be included as a variable?
add_to_database <- function(db, files, table_name, dateformat = "%d/%m/%Y", yob_origin = 1800, practid = TRUE, filenames = FALSE){
date_fields <- c("eventdate", "sysdate", "lcd", "uts", "frd", "crd", "tod", "deathdate")
for(f in files){
if(str_detect(f, "zip$")){
message(sprintf("Unzipping %s...", f), appendLF = FALSE)
dat <- read_zip(f, stringsAsFactors = FALSE)
} else {
message(sprintf("Reading %s...", f), appendLF = FALSE)
dat <- read.delim(f, stringsAsFactors = FALSE)
}
f_dates <- intersect(names(dat), date_fields)
if(length(f_dates)){
message(" Converting date formats...", appendLF = FALSE)
for(column in f_dates){
dat[[column]] <- as.character(as.Date(as.character(dat[[column]]),
format = dateformat))
}
}
if("yob" %in% names(dat)) dat$yob <- dat$yob + yob_origin
if(practid && "patid" %in% names(dat)){
message(" Adding practid variable...", appendLF = FALSE)
dat$practid <- as.integer(str_extract(dat$patid, "[0-9]{3}$"))
}
if(filenames) dat$filename <- str_replace(basename(f), "\\..*", "")
message(sprintf(" Importing to table '%s'...", table_name))
if(dbExistsTable(db, table_name)){
dbWriteTable(conn = db,
name = table_name,
value = dat,
row.names = FALSE,
append = TRUE)
} else dbWriteTable(conn = db,
name = table_name,
value = dat,
row.names = FALSE)
}
}
#' Imports all selected CPRD data into an sqlite database
#'
#' This function can import from both cohorts downloaded via the CPRD online tool and CPRD GOLD builds
#'
#' Note that if you chose to import all the filetype, you may end up with a very large database file.
#' You may then chose only to import the files you want to use. You can always import the rest of the files later.
#' This function may take a long time to process because it unzips (potentially large) files, reads into R where it converts the date formats
#' before importing to SQLite. However, this initial data preparation step will greatly accelerate downstream processing.
#'
#' @export
#'
#' @param db a database connection
#' @param data_dir the directory containing the CPRD cohort data
#' @param filetypes character vector of filetypes to be imported
#' @param dateformat the format that dates are stored in the CPRD data. If this is wrong it won't break but all dates are likely to be NA
#' @param yob_origin value to add yob values to to get actual year of birth (Generally 1800)
#' @param regex character regular expression to identify data files in the directory. This is separated from the filetype by an underscore. e.g. 'p[0-9]{3}' in CPRD GOLD
#' @param recursive logical should files be searched for recursively under the data_dir?
#' @param \dots arguments to be passed to add_to_database
import_CPRD_data <- function(db, data_dir,
filetypes = c("Additional", "Clinical", "Consultation",
"Immunisation", "Patient", "Practice",
"Referral", "Staff", "Test", "Therapy"),
dateformat = "%d/%m/%Y",
yob_origin = 1800,
regex = "PET",
recursive = TRUE, ...){
all_files <- list.files(data_dir, recursive = recursive)
for(filetype in filetypes){
current <- all_files[str_detect(all_files, paste(regex, filetype, sep = "_"))]
current <- file.path(data_dir, current)
if(!length(current)){
message(sprintf("No %s files to import.", filetype))
} else {
add_to_database(db, files = current, table_name = filetype,
dateformat = dateformat, yob_origin = yob_origin, ...)
}
}
}