Skip to content

Commit

Permalink
Merge pull request #181 from r-transit/dev/stop-name-distances
Browse files Browse the repository at this point in the history
stop distance analysis and stop_name clustering
  • Loading branch information
polettif committed Jan 27, 2022
2 parents bd587fc + 0b700f1 commit 2593862
Show file tree
Hide file tree
Showing 9 changed files with 560 additions and 11 deletions.
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
S3method(plot,tidygtfs)
S3method(print,tidygtfs)
S3method(summary,tidygtfs)
export(cluster_stops)
export(filter_feed_by_area)
export(filter_feed_by_date)
export(filter_feed_by_stops)
Expand All @@ -22,6 +23,8 @@ export(set_api_key)
export(set_servicepattern)
export(sf_as_tbl)
export(shapes_as_sf)
export(stop_distances)
export(stop_group_distances)
export(stops_as_sf)
export(travel_times)
export(validate_gtfs)
Expand Down Expand Up @@ -55,6 +58,7 @@ importFrom(sf,st_cast)
importFrom(sf,st_transform)
importFrom(stats,"na.omit")
importFrom(stats,"setNames")
importFrom(stats,kmeans)
importFrom(stats,median)
importFrom(stats,reshape)
importFrom(stats,sd)
Expand Down
256 changes: 256 additions & 0 deletions R/geo.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
#' Calculate distances between a given set of stops
#'
#' @param gtfs_stops gtfs stops table either as data frame (with at least `stop_id`,
#' `stop_lon` and `stop_lat` columns) or as `sf` object.
#'
#' @return Returns a data.frame with each row containing a pair of stop_ids (columns
#' `from_stop_id` and `to_stop_id`) and the `distance` between them (in meters)
#'
#' @note The resulting data.frame has nrow(gtfs_stops)^2 rows, distances calculations
#' among all stops for large feeds should be avoided
#'
#' @examples
#' library(dplyr)
#'
#' nyc_path <- system.file("extdata", "google_transit_nyc_subway.zip", package = "tidytransit")
#' nyc <- read_gtfs(nyc_path)
#'
#' nyc$stops %>%
#' filter(stop_name == "Borough Hall") %>%
#' stop_distances() %>%
#' arrange(desc(distance))
#'
#' #> # A tibble: 36 × 3
#' #> from_stop_id to_stop_id distance
#' #> <chr> <chr> <dbl>
#' #> 1 423 232 91.5
#' #> 2 423N 232 91.5
#' #> 3 423S 232 91.5
#' #> 4 423 232N 91.5
#' #> 5 423N 232N 91.5
#' #> 6 423S 232N 91.5
#' #> 7 423 232S 91.5
#' #> 8 423N 232S 91.5
#' #> 9 423S 232S 91.5
#' #> 10 232 423 91.5
#' #> # … with 26 more rows
#'
#' @export
stop_distances = function(gtfs_stops) {
stopifnot(nrow(gtfs_stops) > 1)
from_stop_id <- NULL
if(!inherits(gtfs_stops, "data.frame")) {
stop("Please pass a stops data.frame (i.e. with gtfs_obj$stops)")
}
if(inherits(gtfs_stops, "sf")) {
dist_matrix = sf::st_distance(gtfs_stops)
} else {
dist_matrix = geodist::geodist(gtfs_stops[,c("stop_id", "stop_lon", "stop_lat")])
}

rownames(dist_matrix) <- gtfs_stops$stop_id
colnames(dist_matrix) <- gtfs_stops$stop_id
dist_matrix_df = dplyr::as_tibble(dist_matrix, rownames = "from_stop_id")

# replace gather (no dependency on tidyr)
# dists_gathered = gather(dplyr::as_tibble(dist_matrix_df), "to_stop_id", "dist", -from_stop_id)
dists = reshape(as.data.frame(dist_matrix_df), direction = "long",
idvar = "from_stop_id", timevar = "to_stop_id", v.names = "distance",
varying = dist_matrix_df$from_stop_id)

dists$to_stop_id <- rep(dist_matrix_df$from_stop_id, each = length(dist_matrix_df$from_stop_id))
dists$distance <- as.numeric(dists$distance)

dplyr::as_tibble(dists)
}

geodist_list = function(lon, lat, names = NULL) {
# second fastest measure after cheap
dists = geodist::geodist(data.frame(lon, lat), measure = "haversine")
if(!is.null(names)) {
colnames(dists) <- rownames(dists) <- names
}
list(dists)
}

geodist_list_sf = function(pts, names = NULL) {
dists = as.numeric(sf::st_distance(pts))
if(!is.null(names)) {
colnames(dists) <- rownames(dists) <- names
}
list(dists)
}

prep_dist_mtrx = function(dist_list) {
mtrx = dist_list[[1]]
if(nrow(mtrx) == 1) return(0)
diag(mtrx) <- NA
v = c(mtrx)
v[!is.na(v)]
}

#' Calculates distances among stop within the same group column
#'
#' By default calculates distances among stop_ids with the same stop_name.
#'
#' @inheritParams stop_distances
#' @param by group column, default: stop_name
#'
#' @returns data.frame with one row per group containing a distance matrix (distances),
#' number of stop ids within that group (n_stop_ids) and distance summary values
#' (dist_mean, dist_median and dist_max).
#'
#' @examples
#' library(dplyr)
#'
#' nyc_path <- system.file("extdata", "google_transit_nyc_subway.zip", package = "tidytransit")
#' nyc <- read_gtfs(nyc_path)
#'
#' stop_group_distances(nyc$stops)
#' #> # A tibble: 380 × 6
#' #> stop_name distances n_stop_ids dist_mean dist_median dist_max
#' #> <chr> <list> <dbl> <dbl> <dbl> <dbl>
#' #> 1 86 St <dbl [18 × 18]> 18 5395. 5395. 21811.
#' #> 2 79 St <dbl [6 × 6]> 6 19053. 19053. 19053.
#' #> 3 Prospect Av <dbl [6 × 6]> 6 18804. 18804. 18804.
#' #> 4 77 St <dbl [6 × 6]> 6 16947. 16947. 16947.
#' #> 5 59 St <dbl [6 × 6]> 6 14130. 14130. 14130.
#' #> 6 50 St <dbl [9 × 9]> 9 7097. 7097. 14068.
#' #> 7 36 St <dbl [6 × 6]> 6 12496. 12496. 12496.
#' #> 8 8 Av <dbl [6 × 6]> 6 11682. 11682. 11682.
#' #> 9 7 Av <dbl [9 × 9]> 9 5479. 5479. 10753.
#' #> 10 111 St <dbl [9 × 9]> 9 3877. 3877. 7753.
#' #> # … with 370 more rows
#'
#' @export
stop_group_distances = function(gtfs_stops, by = "stop_name") {
distances <- n_stop_ids <- dist_mean <- dist_median <- dist_max <- NULL
if(inherits(gtfs_stops, "sf")) {
gtfs_stops <- sf_points_to_df(gtfs_stops, c("stop_lon", "stop_lat"), TRUE)
}
if(!by %in% colnames(gtfs_stops)) {
stop("column ", by, " does not exist in ", deparse(substitute(gtfs_stops)))
}
n_stops = table(gtfs_stops$stop_name)

gtfs_single_stops = gtfs_stops %>% filter(stop_name %in% names(n_stops)[n_stops == 1])
gtfs_multip_stops = gtfs_stops %>% filter(stop_name %in% names(n_stops)[n_stops != 1])

gtfs_multip_stops <- gtfs_multip_stops %>%
dplyr::group_by_at(by) %>%
dplyr::summarise(distances = geodist_list(stop_lon, stop_lat, stop_id), .groups = "keep") %>%
dplyr::mutate(n_stop_ids = nrow(distances[[1]]),
dist_mean = median(prep_dist_mtrx(distances)),
dist_median = median(prep_dist_mtrx(distances)),
dist_max = max(prep_dist_mtrx(distances))) %>% ungroup()

# tidytable version
# gtfs_multip_stops <- gtfs_multip_stops %>%
# tidytable::summarise.(distances = geodist_list(stop_lon, stop_lat, stop_id), .by = by) %>%
# tidytable::mutate.(n_stop_ids = nrow(distances[[1]]),
# dist_mean = median(prep_dist_mtrx(distances)),
# dist_median = median(prep_dist_mtrx(distances)),
# dist_max = max(prep_dist_mtrx(distances)), .by = "stop_name")

gtfs_single_stops <- gtfs_single_stops %>%
select(stop_name) %>%
dplyr::mutate(distances = list(matrix(0)), n_stop_ids = 1, dist_mean = 0, dist_median = 0, dist_max = 0)

dists = dplyr::as_tibble(dplyr::bind_rows(gtfs_single_stops, gtfs_multip_stops))
dists[order(dists$dist_max, dists$n_stop_ids, dists[[by]], decreasing = T),]
}

#' Cluster nearby stops within a group
#'
#' Finds clusters of stops for each unique value in `group_col` (e.g. stop_name). Can
#' be used to find different groups of stops that share the same name but are located more
#' than `max_dist` apart. `gtfs_stops` is assigned a new column (named `cluster_colname`)
#' which contains the `group_col` value and the cluster number.
#'
#' [stats::kmeans()] is used for clustering.
#'
#' @param gtfs_stops Stops table of a gtfs object. It is also possible to pass a
#' tidygtfs object to enable piping.
#' @param max_dist Only stop groups that have a maximum distance among them above this
#' threshold (in meters) are clustered.
#' @param group_col Clusters for are calculated for each set of stops with the same value
#' in this column (default: stop_name)
#' @param cluster_colname Name of the new column name. Can be the same as group_col to overwrite.
#'
#' @return Returns a stops table with an added cluster column. If `gtfs_stops` is a tidygtfs object, a
#' modified tidygtfs object is return
#'
#' @importFrom stats kmeans
#' @examples \donttest{
#' library(dplyr)
#' nyc_path <- system.file("extdata", "google_transit_nyc_subway.zip", package = "tidytransit")
#' nyc <- read_gtfs(nyc_path)
#' nyc <- cluster_stops(nyc)
#'
#' # There are 6 stops with the name "86 St" that are far apart
#' stops_86_St = nyc$stops %>%
#' filter(stop_name == "86 St")
#'
#' table(stops_86_St$stop_name_cluster)
#' #> 86 St [1] 86 St [2] 86 St [3] 86 St [4] 86 St [5] 86 St [6]
#' #> 3 3 3 3 3 3
#'
#' stops_86_St %>% select(stop_id, stop_name, parent_station, stop_name_cluster) %>% head()
#' #> # A tibble: 6 × 4
#' #> stop_id stop_name parent_station stop_name_cluster
#' #> <chr> <chr> <chr> <chr>
#' #> 1 121 86 St "" 86 St [3]
#' #> 2 121N 86 St "121" 86 St [3]
#' #> 3 121S 86 St "121" 86 St [3]
#' #> 4 626 86 St "" 86 St [4]
#' #> 5 626N 86 St "626" 86 St [4]
#' #> 6 626S 86 St "626" 86 St [4]
#'
#' library(ggplot2)
#' ggplot(stops_86_St) +
#' geom_point(aes(stop_lon, stop_lat, color = stop_name_cluster))
#' }
#' @export
cluster_stops = function(gtfs_stops,
max_dist = 300,
group_col = "stop_name",
cluster_colname = "stop_name_cluster") {
if(inherits(gtfs_stops, "tidygtfs")) {
gstops = gtfs_stops$stops
} else {
gstops = gtfs_stops
}

is_sf = inherits(gstops, "sf")
stops_clusters = lapply(unique(gstops[[group_col]]), function(sn) {
stop_name_set = gstops[gstops[[group_col]] == sn,]
stop_name_set[[cluster_colname]] <- sn
if(nrow(stop_name_set) == 1) return(stop_name_set)

dists = stop_distances(stop_name_set)
if(max(dists$distance) > max_dist) {
if(is_sf) {
stop_name_lonlat = do.call(rbind, sf::st_geometry(stop_name_set))
} else {
stop_name_lonlat = stop_name_set[,c("stop_lon", "stop_lat")]
}

stops_unique_coords = unique(stop_name_lonlat)
n_dists = min(length(unique(dists$distance)), nrow(stops_unique_coords))
n_clusters = min(n_dists, nrow(stop_name_set)-1)

kms = kmeans(stop_name_lonlat, n_clusters)

stop_name_set[[cluster_colname]] <- paste0(sn, " [", kms$cluster, "]")
}
stop_name_set
})
stops_clusters = dplyr::bind_rows(stops_clusters)

if(inherits(gtfs_stops, "tidygtfs")) {
gtfs_stops$stops <- stops_clusters
return(gtfs_stops)
} else {
return(stops_clusters)
}
}
43 changes: 34 additions & 9 deletions R/raptor.R
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ raptor = function(stop_times,
if(!is.character(stop_ids) && !is.null(stop_ids)) {
stop("stop_ids must be a character vector (or NULL)")
}

# check and params ####
# stop ids need to be a character vector
# use data.table for faster manipulation
Expand Down Expand Up @@ -298,12 +298,17 @@ raptor = function(stop_times,
#' Calculate shortest travel times from a stop to all reachable stops
#'
#' Function to calculate the shortest travel times from a stop (given by `stop_name`)
#' to all other stops of a feed. `filtered_stop_times` needs to be created before with
#' to all other stop_names of a feed. `filtered_stop_times` needs to be created before with
#' [filter_stop_times()] or [filter_feed_by_date()].
#'
#' This function allows easier access to [raptor()] by using stop names instead of ids and
#' returning shortest travel times by default.
#'
#' Note however that stop_name might not be a suitable identifier for a feed. It is possible
#' that multiple stops have the same name while not being related or geographically close to
#' each other. [stop_group_distances()] and [cluster_stops()] can help identify and fix
#' issues with stop_names.
#'
#' @param filtered_stop_times stop_times data.table (with transfers and stops tables as
#' attributes) created with [filter_stop_times()] where the
#' departure or arrival time has been set. Alternatively,
Expand All @@ -324,15 +329,26 @@ raptor = function(stop_times,
#' @param return_coords Returns stop coordinates as columms. Default is FALSE.
#' @param return_DT travel_times() returns a data.table if TRUE. Default is FALSE which
#' returns a tibble/tbl_df.
#'
#' @param stop_dist_check stop_names are not structured identifiers like
#' stop_ids or parent_stations, so it's possible that
#' stops with the same name are far apart. travel_times()
#' errors if the distance among stop_ids with the same name is
#' above this threshold (in meters).
#' Use FALSE to turn check off. However, it is recommended to
#' either use [raptor()] or fix the feed (see [cluster_stops()]).
#'
#' @return A table with travel times to/from all stops reachable by `stop_name` and their
#' corresponding journey departure and arrival times.
#'
#' @importFrom data.table fifelse
#' @export
#' @examples \donttest{
#' nyc_path <- system.file("extdata", "google_transit_nyc_subway.zip", package = "tidytransit")
#' nyc <- read_gtfs(nyc_path)
#'
#' # stop_names in this feed are not restricted to an area, create clusters of stops to fix
#' nyc <- cluster_stops(nyc, group_col = "stop_name", cluster_colname = "stop_name")
#'
#' # Use journeys departing after 7 AM with arrival time before 9 AM on 26th June
#' stop_times <- filter_stop_times(nyc, "2018-06-26", 7*3600, 9*3600)
#'
Expand All @@ -355,8 +371,10 @@ travel_times = function(filtered_stop_times,
max_transfers = NULL,
max_departure_time = NULL,
return_coords = FALSE,
return_DT = FALSE) {
return_DT = FALSE,
stop_dist_check = 300) {
travel_time <- journey_arrival_time <- journey_departure_time <- NULL
stop_names = stop_name; rm(stop_name)
if("tidygtfs" %in% class(filtered_stop_times)) {
gtfs_obj = filtered_stop_times
if(is.null(attributes(gtfs_obj$stop_times)$extract_date)) {
Expand Down Expand Up @@ -387,11 +405,18 @@ travel_times = function(filtered_stop_times,
}

# get stop_ids of names
stop_ids = NULL
if(!is.null(stop_name)) {
stop_ids = stops$stop_id[which(stops$stop_name %in% stop_name)]
if(length(stop_ids) == 0) {
stop(paste0("Stop name '", stop_name, "' not found in stops table"))
stop_ids = stops$stop_id[which(stops$stop_name %in% stop_names)]
if(length(stop_ids) == 0) {
stop(paste0("Stop name '", stop_names, "' not found in stops table"))
}

# Check stop_name integrity
if(length(stop_ids) > 1 & !is.null(stop_dist_check) & !isFALSE(stop_dist_check)) {
stop_dists = stop_group_distances(stops, "stop_name")

if(max(stop_dists$dist_max) > stop_dist_check) {
stop("Some stops with the same name are more than ", stop_dist_check, " meters apart, see stop_group_distances().\n",
"Using travel_times() might lead to unexpected results. Set stop_dist_check=FALSE to ignore this error.")
}
}

Expand Down
Loading

0 comments on commit 2593862

Please sign in to comment.