This repository has been archived by the owner on Jun 10, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 22
/
validate-gtfs-structure.R
304 lines (219 loc) · 11.4 KB
/
validate-gtfs-structure.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
#' Create validation list for a gtfs_obj. It provides an overview of the structure of all files that were imported.
#'
#' @param gtfs_obj A GTFS list object with components agency_df, etc.
#' @param return_gtfs_obj Boolean. If TRUE, returns gtfs_obj list with a 'validate' attribute appended (TRUE by default),
#' if FALSE, returns validate list only
#' @param quiet Boolean. Option to suppress any messages, prints, etc
#'
#' @return A gtfs_obj list object with attribute 'validate' or just a list containing validation data
#'
#' @export
validate_gtfs_structure <- function(gtfs_obj, return_gtfs_obj = TRUE, quiet = FALSE) {
if (length(gtfs_obj) == 0) {
warning('Empty gtfs_obj.')
return(NULL)
}
if(!quiet) message(gtfs_obj$agency_df$agency_name)
val_files <- validate_files_provided(gtfs_obj = gtfs_obj)
all_val_df <- validate_vars_provided(val_files, gtfs_obj = gtfs_obj) %>%
calendar_exception_fix
probs_subset <- all_val_df %>% dplyr::filter(validation_status == 'problem') # subset of only problems
ok_subset <- all_val_df %>% dplyr::filter(validation_status != 'problem') # subset of ok values
validate_list <- list(all_req_files = !('missing_req_file' %in% probs_subset$validation_details),
all_req_fields_in_req_files = !('missing_req_field' %in% probs_subset$validation_details),
all_req_fields_in_opt_files = !('missing_req_field' %in% ok_subset$validation_details),
validate_df = all_val_df)
# get subset of problem req files
if(!validate_list$all_req_files) {
validate_list$problem_req_files <- probs_subset %>%
dplyr::filter(grepl('missing_req_*', validation_details))%>%
dplyr::select(file, file_spec, file_provided_status, field, field_spec, field_provided_status)
}
# get subset of problem opt files
if(any(!validate_list$all_req_fields_in_req_files, !validate_list$all_req_fields_in_opt_files)) {
validate_list$problem_opt_files <- ok_subset %>%
dplyr::filter(grepl('missing_opt_*', validation_details)) %>%
dplyr::select(file, file_spec, file_provided_status, field, field_spec, field_provided_status)
}
# get subset of extra files
if('ext' %in% all_val_df$file_spec) {
validate_list$extra_files <- all_val_df %>%
dplyr::filter(grepl('ext', file_spec)) %>%
dplyr::select(file, file_spec, file_provided_status, field, field_spec, field_provided_status)
}
# update gtfs_obj attributes with validation data
attributes(gtfs_obj) <- append(attributes(gtfs_obj), list(validate = validate_list))
if (return_gtfs_obj) {
return(gtfs_obj)
} else {
return(validate_list)
}
}
# Purpose -----------------------------------------------------------------
# Functions to validate/assess data quality of a gtfs class object
# These should collectively allow a user to
# - assess whether raw data from a feed has quality problems such that a gtfs class object can/should not be created
# - get a summary of data quality for a gtfs class object with various categories/levels of importance
# - save data quality metadata in a structured way
#' For a single 'gtfs' class object, check provided files
#'
#' @param gtfs_obj A 'gtfs' class object with components agency_df, etc.
#'
#' @return Dataframe will one row for all required and optional files per spec, plus one row for any other files provided (file), with an indication of these categories (spec), and a yes/no/empty status (provided_status)
#' @noRd
validate_files_provided <- function(gtfs_obj) {
# Per spec, these are the required and optional files
all_req_files <- c('agency', 'stops', 'routes', 'trips', 'stop_times', 'calendar')
all_opt_files <- c(
'calendar_attributes',
'calendar_dates',
'directions',
'fare_attributes',
'fare_rider_categories',
'fare_rules',
'farezone_attributes',
'feed_info',
'frequencies',
'rider_categories',
'route_directions',
'shapes',
'stop_attributes',
'timetable_stop_order',
'timetables',
'transfers'
)
all_spec_files <- c(all_req_files, all_opt_files)
# Get the names of all the dfs in the list for a gtfs_obj
feed_names <- names(gtfs_obj)
# Strip the _df from the names to get to file components
feed_names_file <- gsub('_df', '', feed_names)
# Determine whether any of the files provided are neither required nor optional
extra_files <- feed_names_file[!(feed_names_file %in% all_spec_files)]
all_files <- c(all_spec_files, extra_files)
val_files <- dplyr::data_frame(file = all_files, spec = c(rep('req', times = length(all_req_files)), rep('opt', times = length(all_opt_files)), rep('ext', times = length(extra_files))))
val_files <- val_files %>%
dplyr::mutate(provided_status = ifelse(!(file %in% feed_names_file), 'no',
ifelse(sapply(gtfs_obj, dim)[2,] == 0, 'empty',
'yes')))
return(val_files)
}
#' Create dataframe of GTFS variable spec info
#' @noRd
make_var_val <- function() {
nms <- get_gtfs_meta() %>% names() # get names of each envir element
all_val_df <- get_gtfs_meta() %>%
lapply(. %>% as.data.frame(stringsAsFactors=FALSE) %>% dplyr::tbl_df(.)) # convert list data to tbl_df
all_val_df <- mapply(function(x,y) dplyr::mutate(.data=x, file = y), x = all_val_df, y = nms, SIMPLIFY=FALSE) %>%
dplyr::bind_rows() # assign new variable 'file' to each tbl_df based on names
return(all_val_df)
}
#' Validate variables provided vs spec
#'
#' @param val_files The dataframe output of validate_files_provided for a single feed
#' @param gtfs_obj A 'gtfs' class object with components agency_df, etc.
#'
#' @return Dataframe with one record per file x column (columns in spec + any extra provided),
#' with file and field specs and file and field provided statuses
#'
#' @noRd
validate_vars_provided <- function(val_files, gtfs_obj) {
stopifnot(any(class(val_files) == 'tbl_df'))
# Generate the df of files and fields per the GTFS spec
spec_vars_df <- make_var_val() %>%
dplyr::rename(field_spec = spec)
# the files that are provided for this gtfs_obj
val_files_df <- val_files %>%
dplyr::rename(file_spec = spec, file_provided_status = provided_status)
# Separately store any extra files and get their variables (since these aren't in spec_vars_df)
extra_files_df <- val_files_df %>% dplyr::filter(file_spec == 'ext')
# For extra files provided, get all file names and add them into spec_vars_df
if (nrow(extra_files_df) > 0) {
extra_files <- extra_files_df$file
extra_files_df <- paste0(extra_files, '_df')
extra_vars_df <- dplyr::data_frame()
for (i in extra_files_df) {
temp_df <- gtfs_obj[[i]]
temp_names <- names(temp_df)
# create dataframe of names
temp_vars_df <- dplyr::data_frame(file = rep(gsub('_df', '', i), length(temp_names)), field = temp_names, field_spec = 'ext')
extra_vars_df <- dplyr::bind_rows(extra_vars_df, temp_vars_df)
}
spec_vars_df <- dplyr::bind_rows(spec_vars_df, extra_vars_df)
}
spec_vars_df <- spec_vars_df %>% dplyr::select(field, field_spec, file)
# Join file level data with variable level data - for files that were provided
vars_df <- suppressMessages(dplyr::left_join(val_files_df, spec_vars_df))
val_vars_df <- dplyr::data_frame()
val_cols <- val_files_df$file
val_cols_df <- paste0(val_cols, '_df')
# List variables provided for each file and join to determine field_provided
for (j in val_cols_df) {
temp_df <- gtfs_obj[[j]]
temp_names <- names(temp_df)
# Handle the case where a required file is missing
if (is.null(temp_df) || is.null(temp_names) || all(is.na(temp_names))) {
temp_vars_df <- dplyr::data_frame(file = gsub('_df', '', j), field = NA, field_provided_status = 'none')
} else {
provided_status <- temp_df %>%
dplyr::summarize_all(dplyr::funs(is_empty = all(is.na(.))))
temp_vars_df <- dplyr::data_frame(file = rep(gsub('_df', '', j), length(temp_names)), field = temp_names, field_provided_status = ifelse(provided_status, 'empty', 'yes'))
}
val_vars_df <- dplyr::bind_rows(val_vars_df, temp_vars_df)
}
# Join observed field provided status with spec info
all_val_df <- suppressMessages(dplyr::full_join(vars_df, val_vars_df))
orig_rows <- nrow(all_val_df)
# Now fill in special cases from join results
# 1) Extra fields provided in spec files
sub_all_val_df <- all_val_df %>%
dplyr::filter(!is.na(field_spec))
case_df <- all_val_df %>%
dplyr::filter(is.na(field_spec))
case_df <- case_df %>%
dplyr::mutate(field_spec = 'ext') %>%
dplyr::group_by(file) %>%
dplyr::mutate(file_spec = unique(sub_all_val_df$file_spec[sub_all_val_df$file == unique(file)]),
file_provided_status = unique(sub_all_val_df$file_provided_status[sub_all_val_df$file == unique(file)]))
# 2) field_provided_status is NA for spec fields not provided in files provided
sub_all_val_df$field_provided_status[is.na(sub_all_val_df$field_provided_status)] <- 'no'
# Put filled in parts back together
all_val_df <- dplyr::bind_rows(sub_all_val_df, case_df)
if (nrow(all_val_df) != orig_rows) stop('Handling of special cases failed.')
# Check overall status for required files/fields
all_val_df <- all_val_df %>%
dplyr::mutate(validation_status = 'ok') # default ok
all_val_df <- all_val_df %>%
dplyr::mutate(validation_details = NA) %>% # default to NA
dplyr::mutate(validation_details = replace(validation_details, file_provided_status != 'yes' & file_spec == 'opt', 'missing_opt_file')) %>% # optional file missing
dplyr::mutate(validation_details = replace(validation_details, file_provided_status != 'yes' & file_spec == 'req', 'missing_req_file')) %>% # req file missing
dplyr::mutate(validation_details = replace(validation_details, file_provided_status == 'yes' & field_spec == 'req' & field_provided_status != 'yes', 'missing_req_field')) %>%
dplyr::mutate(validation_details = replace(validation_details, file_provided_status == 'yes' & field_spec == 'opt' & field_provided_status != 'yes', 'missing_opt_field'))
all_val_df <- all_val_df %>%
dplyr::mutate(validation_status = replace(validation_status, grepl('req_file', validation_details), 'problem'))
return(all_val_df)
}
#' checks for the missing calendar.txt exception (see https://developers.google.com/transit/gtfs/reference/calendar_dates-file). if the exception is TRUE, then calendar has its file spec set to 'extra'.
#' @param all_val_df dataframe of all files and fields. returned from validate_vars_provided()
#' @return corrected dataframe.
#' @noRd
calendar_exception_fix <- function(all_val_df) {
# see if missing calendar
missing_calendar <- all_val_df %>%
dplyr::filter(file == 'calendar') %>%
'$'('file_provided_status') %>%
unique() %>%
`==`('no')
# see if calendar_dates file includes field `date`
has_date_field <- all_val_df %>%
dplyr::filter(file == 'calendar_dates') %>%
dplyr::filter(field == 'date') %>%
'$'('field_provided_status') %>%
`==`('yes')
# if missing calendar.txt but calendar_dates.txt has 'date' field, then update validation status of calendar to 'ok'
if(all(missing_calendar, has_date_field)) {
all_val_df <- all_val_df %>%
dplyr::mutate(validation_status = dplyr::if_else(file == "calendar", "ok", validation_status)) %>%
dplyr::mutate(file_spec = dplyr::if_else(file == "calendar", "ext", file_spec))
}
return(all_val_df)
}