Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

stop distance analysis and stop_name clustering #181

Merged
merged 6 commits into from
Jan 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
S3method(plot,tidygtfs)
S3method(print,tidygtfs)
S3method(summary,tidygtfs)
export(cluster_stops)
export(filter_feed_by_area)
export(filter_feed_by_date)
export(filter_feed_by_stops)
Expand All @@ -22,6 +23,8 @@ export(set_api_key)
export(set_servicepattern)
export(sf_as_tbl)
export(shapes_as_sf)
export(stop_distances)
export(stop_group_distances)
export(stops_as_sf)
export(travel_times)
export(validate_gtfs)
Expand Down Expand Up @@ -55,6 +58,7 @@ importFrom(sf,st_cast)
importFrom(sf,st_transform)
importFrom(stats,"na.omit")
importFrom(stats,"setNames")
importFrom(stats,kmeans)
importFrom(stats,median)
importFrom(stats,reshape)
importFrom(stats,sd)
Expand Down
256 changes: 256 additions & 0 deletions R/geo.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
#' Calculate distances between a given set of stops
#'
#' @param gtfs_stops gtfs stops table either as data frame (with at least `stop_id`,
#' `stop_lon` and `stop_lat` columns) or as `sf` object.
#'
#' @return Returns a data.frame with each row containing a pair of stop_ids (columns
#' `from_stop_id` and `to_stop_id`) and the `distance` between them (in meters)
#'
#' @note The resulting data.frame has nrow(gtfs_stops)^2 rows, distances calculations
#' among all stops for large feeds should be avoided
#'
#' @examples
#' library(dplyr)
#'
#' nyc_path <- system.file("extdata", "google_transit_nyc_subway.zip", package = "tidytransit")
#' nyc <- read_gtfs(nyc_path)
#'
#' nyc$stops %>%
#' filter(stop_name == "Borough Hall") %>%
#' stop_distances() %>%
#' arrange(desc(distance))
#'
#' #> # A tibble: 36 × 3
#' #> from_stop_id to_stop_id distance
#' #> <chr> <chr> <dbl>
#' #> 1 423 232 91.5
#' #> 2 423N 232 91.5
#' #> 3 423S 232 91.5
#' #> 4 423 232N 91.5
#' #> 5 423N 232N 91.5
#' #> 6 423S 232N 91.5
#' #> 7 423 232S 91.5
#' #> 8 423N 232S 91.5
#' #> 9 423S 232S 91.5
#' #> 10 232 423 91.5
#' #> # … with 26 more rows
#'
#' @export
stop_distances = function(gtfs_stops) {
stopifnot(nrow(gtfs_stops) > 1)
from_stop_id <- NULL
if(!inherits(gtfs_stops, "data.frame")) {
stop("Please pass a stops data.frame (i.e. with gtfs_obj$stops)")
}
if(inherits(gtfs_stops, "sf")) {
dist_matrix = sf::st_distance(gtfs_stops)
} else {
dist_matrix = geodist::geodist(gtfs_stops[,c("stop_id", "stop_lon", "stop_lat")])
}

rownames(dist_matrix) <- gtfs_stops$stop_id
colnames(dist_matrix) <- gtfs_stops$stop_id
dist_matrix_df = dplyr::as_tibble(dist_matrix, rownames = "from_stop_id")

# replace gather (no dependency on tidyr)
# dists_gathered = gather(dplyr::as_tibble(dist_matrix_df), "to_stop_id", "dist", -from_stop_id)
dists = reshape(as.data.frame(dist_matrix_df), direction = "long",
idvar = "from_stop_id", timevar = "to_stop_id", v.names = "distance",
varying = dist_matrix_df$from_stop_id)

dists$to_stop_id <- rep(dist_matrix_df$from_stop_id, each = length(dist_matrix_df$from_stop_id))
dists$distance <- as.numeric(dists$distance)

dplyr::as_tibble(dists)
}

geodist_list = function(lon, lat, names = NULL) {
# second fastest measure after cheap
dists = geodist::geodist(data.frame(lon, lat), measure = "haversine")
if(!is.null(names)) {
colnames(dists) <- rownames(dists) <- names
}
list(dists)
}

geodist_list_sf = function(pts, names = NULL) {
dists = as.numeric(sf::st_distance(pts))
if(!is.null(names)) {
colnames(dists) <- rownames(dists) <- names
}
list(dists)
}

prep_dist_mtrx = function(dist_list) {
mtrx = dist_list[[1]]
if(nrow(mtrx) == 1) return(0)
diag(mtrx) <- NA
v = c(mtrx)
v[!is.na(v)]
}

#' Calculates distances among stop within the same group column
#'
#' By default calculates distances among stop_ids with the same stop_name.
#'
#' @inheritParams stop_distances
#' @param by group column, default: stop_name
#'
#' @returns data.frame with one row per group containing a distance matrix (distances),
#' number of stop ids within that group (n_stop_ids) and distance summary values
#' (dist_mean, dist_median and dist_max).
#'
#' @examples
#' library(dplyr)
#'
#' nyc_path <- system.file("extdata", "google_transit_nyc_subway.zip", package = "tidytransit")
#' nyc <- read_gtfs(nyc_path)
#'
#' stop_group_distances(nyc$stops)
#' #> # A tibble: 380 × 6
#' #> stop_name distances n_stop_ids dist_mean dist_median dist_max
#' #> <chr> <list> <dbl> <dbl> <dbl> <dbl>
#' #> 1 86 St <dbl [18 × 18]> 18 5395. 5395. 21811.
#' #> 2 79 St <dbl [6 × 6]> 6 19053. 19053. 19053.
#' #> 3 Prospect Av <dbl [6 × 6]> 6 18804. 18804. 18804.
#' #> 4 77 St <dbl [6 × 6]> 6 16947. 16947. 16947.
#' #> 5 59 St <dbl [6 × 6]> 6 14130. 14130. 14130.
#' #> 6 50 St <dbl [9 × 9]> 9 7097. 7097. 14068.
#' #> 7 36 St <dbl [6 × 6]> 6 12496. 12496. 12496.
#' #> 8 8 Av <dbl [6 × 6]> 6 11682. 11682. 11682.
#' #> 9 7 Av <dbl [9 × 9]> 9 5479. 5479. 10753.
#' #> 10 111 St <dbl [9 × 9]> 9 3877. 3877. 7753.
#' #> # … with 370 more rows
#'
#' @export
stop_group_distances = function(gtfs_stops, by = "stop_name") {
distances <- n_stop_ids <- dist_mean <- dist_median <- dist_max <- NULL
if(inherits(gtfs_stops, "sf")) {
gtfs_stops <- sf_points_to_df(gtfs_stops, c("stop_lon", "stop_lat"), TRUE)
}
if(!by %in% colnames(gtfs_stops)) {
stop("column ", by, " does not exist in ", deparse(substitute(gtfs_stops)))
}
n_stops = table(gtfs_stops$stop_name)

gtfs_single_stops = gtfs_stops %>% filter(stop_name %in% names(n_stops)[n_stops == 1])
gtfs_multip_stops = gtfs_stops %>% filter(stop_name %in% names(n_stops)[n_stops != 1])

gtfs_multip_stops <- gtfs_multip_stops %>%
dplyr::group_by_at(by) %>%
dplyr::summarise(distances = geodist_list(stop_lon, stop_lat, stop_id), .groups = "keep") %>%
dplyr::mutate(n_stop_ids = nrow(distances[[1]]),
dist_mean = median(prep_dist_mtrx(distances)),
dist_median = median(prep_dist_mtrx(distances)),
dist_max = max(prep_dist_mtrx(distances))) %>% ungroup()

# tidytable version
# gtfs_multip_stops <- gtfs_multip_stops %>%
# tidytable::summarise.(distances = geodist_list(stop_lon, stop_lat, stop_id), .by = by) %>%
# tidytable::mutate.(n_stop_ids = nrow(distances[[1]]),
# dist_mean = median(prep_dist_mtrx(distances)),
# dist_median = median(prep_dist_mtrx(distances)),
# dist_max = max(prep_dist_mtrx(distances)), .by = "stop_name")

gtfs_single_stops <- gtfs_single_stops %>%
select(stop_name) %>%
dplyr::mutate(distances = list(matrix(0)), n_stop_ids = 1, dist_mean = 0, dist_median = 0, dist_max = 0)

dists = dplyr::as_tibble(dplyr::bind_rows(gtfs_single_stops, gtfs_multip_stops))
dists[order(dists$dist_max, dists$n_stop_ids, dists[[by]], decreasing = T),]
}

#' Cluster nearby stops within a group
#'
#' Finds clusters of stops for each unique value in `group_col` (e.g. stop_name). Can
#' be used to find different groups of stops that share the same name but are located more
#' than `max_dist` apart. `gtfs_stops` is assigned a new column (named `cluster_colname`)
#' which contains the `group_col` value and the cluster number.
#'
#' [stats::kmeans()] is used for clustering.
#'
#' @param gtfs_stops Stops table of a gtfs object. It is also possible to pass a
#' tidygtfs object to enable piping.
#' @param max_dist Only stop groups that have a maximum distance among them above this
#' threshold (in meters) are clustered.
#' @param group_col Clusters for are calculated for each set of stops with the same value
#' in this column (default: stop_name)
#' @param cluster_colname Name of the new column name. Can be the same as group_col to overwrite.
#'
#' @return Returns a stops table with an added cluster column. If `gtfs_stops` is a tidygtfs object, a
#' modified tidygtfs object is return
#'
#' @importFrom stats kmeans
#' @examples \donttest{
#' library(dplyr)
#' nyc_path <- system.file("extdata", "google_transit_nyc_subway.zip", package = "tidytransit")
#' nyc <- read_gtfs(nyc_path)
#' nyc <- cluster_stops(nyc)
#'
#' # There are 6 stops with the name "86 St" that are far apart
#' stops_86_St = nyc$stops %>%
#' filter(stop_name == "86 St")
#'
#' table(stops_86_St$stop_name_cluster)
#' #> 86 St [1] 86 St [2] 86 St [3] 86 St [4] 86 St [5] 86 St [6]
#' #> 3 3 3 3 3 3
#'
#' stops_86_St %>% select(stop_id, stop_name, parent_station, stop_name_cluster) %>% head()
#' #> # A tibble: 6 × 4
#' #> stop_id stop_name parent_station stop_name_cluster
#' #> <chr> <chr> <chr> <chr>
#' #> 1 121 86 St "" 86 St [3]
#' #> 2 121N 86 St "121" 86 St [3]
#' #> 3 121S 86 St "121" 86 St [3]
#' #> 4 626 86 St "" 86 St [4]
#' #> 5 626N 86 St "626" 86 St [4]
#' #> 6 626S 86 St "626" 86 St [4]
#'
#' library(ggplot2)
#' ggplot(stops_86_St) +
#' geom_point(aes(stop_lon, stop_lat, color = stop_name_cluster))
#' }
#' @export
cluster_stops = function(gtfs_stops,
max_dist = 300,
group_col = "stop_name",
cluster_colname = "stop_name_cluster") {
if(inherits(gtfs_stops, "tidygtfs")) {
gstops = gtfs_stops$stops
} else {
gstops = gtfs_stops
}

is_sf = inherits(gstops, "sf")
stops_clusters = lapply(unique(gstops[[group_col]]), function(sn) {
stop_name_set = gstops[gstops[[group_col]] == sn,]
stop_name_set[[cluster_colname]] <- sn
if(nrow(stop_name_set) == 1) return(stop_name_set)

dists = stop_distances(stop_name_set)
if(max(dists$distance) > max_dist) {
if(is_sf) {
stop_name_lonlat = do.call(rbind, sf::st_geometry(stop_name_set))
} else {
stop_name_lonlat = stop_name_set[,c("stop_lon", "stop_lat")]
}

stops_unique_coords = unique(stop_name_lonlat)
n_dists = min(length(unique(dists$distance)), nrow(stops_unique_coords))
n_clusters = min(n_dists, nrow(stop_name_set)-1)

kms = kmeans(stop_name_lonlat, n_clusters)

stop_name_set[[cluster_colname]] <- paste0(sn, " [", kms$cluster, "]")
}
stop_name_set
})
stops_clusters = dplyr::bind_rows(stops_clusters)

if(inherits(gtfs_stops, "tidygtfs")) {
gtfs_stops$stops <- stops_clusters
return(gtfs_stops)
} else {
return(stops_clusters)
}
}
43 changes: 34 additions & 9 deletions R/raptor.R
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ raptor = function(stop_times,
if(!is.character(stop_ids) && !is.null(stop_ids)) {
stop("stop_ids must be a character vector (or NULL)")
}

# check and params ####
# stop ids need to be a character vector
# use data.table for faster manipulation
Expand Down Expand Up @@ -298,12 +298,17 @@ raptor = function(stop_times,
#' Calculate shortest travel times from a stop to all reachable stops
#'
#' Function to calculate the shortest travel times from a stop (given by `stop_name`)
#' to all other stops of a feed. `filtered_stop_times` needs to be created before with
#' to all other stop_names of a feed. `filtered_stop_times` needs to be created before with
#' [filter_stop_times()] or [filter_feed_by_date()].
#'
#' This function allows easier access to [raptor()] by using stop names instead of ids and
#' returning shortest travel times by default.
#'
#' Note however that stop_name might not be a suitable identifier for a feed. It is possible
#' that multiple stops have the same name while not being related or geographically close to
#' each other. [stop_group_distances()] and [cluster_stops()] can help identify and fix
#' issues with stop_names.
#'
#' @param filtered_stop_times stop_times data.table (with transfers and stops tables as
#' attributes) created with [filter_stop_times()] where the
#' departure or arrival time has been set. Alternatively,
Expand All @@ -324,15 +329,26 @@ raptor = function(stop_times,
#' @param return_coords Returns stop coordinates as columms. Default is FALSE.
#' @param return_DT travel_times() returns a data.table if TRUE. Default is FALSE which
#' returns a tibble/tbl_df.
#'
#' @param stop_dist_check stop_names are not structured identifiers like
#' stop_ids or parent_stations, so it's possible that
#' stops with the same name are far apart. travel_times()
#' errors if the distance among stop_ids with the same name is
#' above this threshold (in meters).
#' Use FALSE to turn check off. However, it is recommended to
#' either use [raptor()] or fix the feed (see [cluster_stops()]).
#'
#' @return A table with travel times to/from all stops reachable by `stop_name` and their
#' corresponding journey departure and arrival times.
#'
#' @importFrom data.table fifelse
#' @export
#' @examples \donttest{
#' nyc_path <- system.file("extdata", "google_transit_nyc_subway.zip", package = "tidytransit")
#' nyc <- read_gtfs(nyc_path)
#'
#' # stop_names in this feed are not restricted to an area, create clusters of stops to fix
#' nyc <- cluster_stops(nyc, group_col = "stop_name", cluster_colname = "stop_name")
#'
#' # Use journeys departing after 7 AM with arrival time before 9 AM on 26th June
#' stop_times <- filter_stop_times(nyc, "2018-06-26", 7*3600, 9*3600)
#'
Expand All @@ -355,8 +371,10 @@ travel_times = function(filtered_stop_times,
max_transfers = NULL,
max_departure_time = NULL,
return_coords = FALSE,
return_DT = FALSE) {
return_DT = FALSE,
stop_dist_check = 300) {
travel_time <- journey_arrival_time <- journey_departure_time <- NULL
stop_names = stop_name; rm(stop_name)
if("tidygtfs" %in% class(filtered_stop_times)) {
gtfs_obj = filtered_stop_times
if(is.null(attributes(gtfs_obj$stop_times)$extract_date)) {
Expand Down Expand Up @@ -387,11 +405,18 @@ travel_times = function(filtered_stop_times,
}

# get stop_ids of names
stop_ids = NULL
if(!is.null(stop_name)) {
stop_ids = stops$stop_id[which(stops$stop_name %in% stop_name)]
if(length(stop_ids) == 0) {
stop(paste0("Stop name '", stop_name, "' not found in stops table"))
stop_ids = stops$stop_id[which(stops$stop_name %in% stop_names)]
if(length(stop_ids) == 0) {
stop(paste0("Stop name '", stop_names, "' not found in stops table"))
}

# Check stop_name integrity
if(length(stop_ids) > 1 & !is.null(stop_dist_check) & !isFALSE(stop_dist_check)) {
stop_dists = stop_group_distances(stops, "stop_name")

if(max(stop_dists$dist_max) > stop_dist_check) {
stop("Some stops with the same name are more than ", stop_dist_check, " meters apart, see stop_group_distances().\n",
"Using travel_times() might lead to unexpected results. Set stop_dist_check=FALSE to ignore this error.")
}
}

Expand Down
Loading