This repository has been archived by the owner on Dec 26, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
create.R
348 lines (260 loc) · 8.91 KB
/
create.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
#' Prepare Raw Data
#'
#' @description Data downloaded from the St. Louis Metropolitan Police Department
#' are downloaded with incorrect file paths - e.g. \code{January2008.CSV.html}.
#' This function iterates over all files in a given path and replaces their
#' file extensions. Thus \code{January2008.CSV.html} will be replaced by
#' \code{january2008.csv}. There should be no more than 12 files in a given path,
#' and all should correspond to the same year.
#'
#' @usage cs_prep_year(path, verbose = FALSE)
#'
#' @param path File path where raw STLMPD data are
#' @param verbose If \code{TRUE}, returns a tibble with results; otherwise if \code{FALSE}, no
#' output is returned.
#'
#' @return A tibble containing old file names and new file names for reference is \code{verbose = TRUE}.
#' Otherwise, no output is returned. This function will change all problematic filenames in the
#' specified path.
#'
#' @examples
#' # create temporary directory
#' tmpdir <- tempdir()
#' fs::dir_create(paste0(tmpdir,"/data/"))
#'
#' # load sample files into temporary directory
#' cs_example(path = paste0(tmpdir,"/data/"))
#'
#' # list files
#' list.files(paste0(tmpdir,"/data/"))
#'
#' # prep sample files
#' cs_prep_year(path = paste0(tmpdir,"/data/"))
#'
#' # list files again
#' list.files(paste0(tmpdir,"/data/"))
#'
#' # delete data
#' fs::dir_delete(paste0(tmpdir,"/data/"))
#'
#' # create temporary directory
#' fs::dir_create(paste0(tmpdir,"/data/"))
#'
#' # load sample files into temporary directory
#' cs_example(path = paste0(tmpdir,"/data/"))
#'
#' # prep sample files
#' cs_prep_year(path = paste0(tmpdir,"/data/"), verbose = TRUE)
#'
#' # delete data again
#' fs::dir_delete(paste0(tmpdir,"/data/"))
#'
#' @importFrom dplyr as_tibble
#' @importFrom dplyr filter
#' @importFrom fs file_move
#' @importFrom purrr map
#' @importFrom purrr map_chr
#' @importFrom stringr str_c
#' @importFrom stringr str_detect
#' @importFrom stringr str_replace
#'
#' @export
cs_prep_year <- function(path, verbose = FALSE){
# check parameters
if (is.logical(verbose) == FALSE){
stop("The 'verbose' parameter only accepts 'TRUE' or 'FALSE' as valid arguments.")
}
# create vector of filenames
files <- list.files(path)
# check number of files
if (length(files) > 12){
stop('There are too many files in the specified folder. Edit crime files in yearly batches of 12 monthly files.')
} else if (length(files) < 12){
warning('There are fewer than 12 files in the specified folder. You are only editing a partial year.')
}
# detect html in file extensions
html <- stringr::str_detect(files, pattern = ".html$")
# create data frame of files and file extensions
data <- data.frame(files = files, html = html, stringsAsFactors = FALSE)
# subset data frame and convert to vector
data <- dplyr::filter(data, html == TRUE)
problemFiles <- as.vector(data$files)
# iterate and produce optional output
if (verbose == TRUE){
# iterate over each filename, renaming it and coverting to lowercase
problemFiles %>%
split(problemFiles) %>%
purrr::map_chr(~ cs_edit_filename(path = path, file = .x)) -> changes
# create vector of new filenames
orignal <- names(changes)
names(changes) <- NULL
# create output
out <- dplyr::as_tibble(data.frame(
original = orignal,
new = changes,
stringsAsFactors = FALSE
))
# return output
return(out)
} else if (verbose == FALSE){
# iterate over each filename, renaming it and coverting to lowercase
problemFiles %>%
split(problemFiles) %>%
purrr::map(~ cs_edit_filename(path = path, file = .x)) -> out
}
}
# edit an individual file name
cs_edit_filename <- function(path, file){
# construct a new file name that is all lower case, removes .html from end
newFile <- tolower(stringr::str_replace(file, pattern = ".html$", replacement = ""))
# create new file paths, adding a forward slash between path and filename if necessary
if (stringr::str_detect(path, pattern = "/$") == FALSE){
filePath <- stringr::str_c(path, "/", file)
newPath <- stringr::str_c(path, "/", newFile)
} else {
filePath <- stringr::str_c(path, file)
newPath <- stringr::str_c(path, newFile)
}
# rename file
fs::file_move(filePath, newPath)
# return output
return(newFile)
}
#' Create Year List Object
#'
#' @description \code{cs_load_year} is used to load a set of \code{.csv} files
#' contained in the given directory. This should be used to load a full
#' year worth of data or a partial year. There should be no more than 12
#' files in a given path, and all should correspond to the same year. All
#' columns will be read in as character data in order to address inconsistencies
#' in how the data are created. When \link{cs_collapse} is executed, variables
#' will be converted numeric when doing so is applicable.
#'
#' @usage cs_load_year(path)
#'
#' @param path A file path
#'
#' @return A year-list object containing 12 tibbles - one per month - worth
#' of crime data stored within a list.
#'
#' @examples
#' # create temporary directory
#' tmpdir <- tempdir()
#' fs::dir_create(paste0(tmpdir,"/data/"))
#'
#' # load sample files into temporary directory
#' cs_example(path = paste0(tmpdir,"/data/"))
#'
#' # prep sample files
#' cs_prep_year(path = paste0(tmpdir,"/data/"))
#'
#' # load sample files
#' yearList17 <- cs_load_year(path = paste0(tmpdir,"/data/"))
#'
#' # delete data
#' fs::dir_delete(paste0(tmpdir,"/data/"))
#'
#' # print year-list object
#' yearList17
#'
#' @importFrom dplyr %>%
#' @importFrom purrr map
#' @importFrom readr cols
#' @importFrom readr col_character
#' @importFrom readr read_csv
#' @importFrom stringr str_sub
#'
#' @export
cs_load_year <- function(path){
# create list of all files at path that are csv
files <- dir(path = path, pattern = "*.csv")
# check number of files
if (length(files) > 12){
stop('There are too many files in the specified folder. Load crime files in yearly batches of 12 monthly files.')
} else if (length(files) < 12){
warning('There are fewer than 12 files in the specified folder. You are only loading a partial year.')
}
# read csv files into year list objects
files %>%
purrr::map(~ suppressMessages(suppressWarnings(readr::read_csv(file.path(path, .), col_types = readr::cols(.default = readr::col_character()))))) -> out
# clean-up variable names
out %>%
purrr::map(janitor::clean_names) -> out
# create list of months associated with year list object items
out %>%
purrr::map(cs_identifyMonth) -> nameList
# convert list of months to vector
nameVector <- unlist(nameList, recursive = TRUE, use.names = TRUE)
# apply vector to data
names(out) <- nameVector
# add new class
class(out) <- append(class(out), "cs_year_list")
# return year list object
return(out)
}
# Extract Month of a Given Year List Object Item
#
# @description This uses the value of the first observation's coded month as the basis for
# identifying which month the data are from.
#
# @param .data A year list object name
# @param read A logical scalar; if \code{TRUE}, return output structured for initial reading of
# data into R. If \code{FALSE}, use for data validation.
#
cs_identifyMonth <- function(.data, read = TRUE){
# depending on number of columns, the CodedMonth variable is named differently
# the if elseif statements pull the first value from CodedMonth
if (length(.data) == 18){
monthVal <- .data$month_reportedto_mshp[1]
} else if (length(.data) == 20 | length(.data) == 26){
monthVal <- .data$coded_month[1]
}
# extract the last two digits from the coded month value
monthString <- stringr::str_sub(monthVal, start = -2)
if (read == TRUE){
# convert those last two digits into a string month name
out <- cs_matchMonth(monthString)
} else if (read == FALSE){
out <- as.numeric(monthString)
}
# return output
return(out)
}
# Match Extract Month with Month Name
#
# @description Based on the result of cs_identifyMonth, this function returns the
# appropriate string name.
#
# @param x The last two characters of the first observation's coded month value
#
cs_matchMonth <- function(x){
# the last two digits from two digits from the coded month value are passed to this function as x
# depending on the value, the correct month string name is returned
if (x == "01") {
name <- "January"
} else if (x == "02") {
name <- "February"
} else if (x == "03") {
name <- "March"
} else if (x == "04") {
name <- "April"
} else if (x == "05") {
name <- "May"
} else if (x == "06") {
name <- "June"
} else if (x == "07") {
name <- "July"
} else if (x == "08") {
name <- "August"
} else if (x == "09") {
name <- "September"
} else if (x == "10") {
name <- "October"
} else if (x == "11") {
name <- "November"
} else if (x == "12") {
name <- "December"
}
# return output
return(name)
}