# Setup

In [1]:
from shapely.geometry import box

from utils import *

# GTFS
Prepare Google Transit Feed Specification (GTFS) data for all the target cities (USA top 50).

## Get the download links
Download a spreadsheet from https://bit.ly/catalogs-csv on the [mobilitydatabase.org](https://mobilitydatabase.org) website. Then, download all the GTFS feeds which have a direct download URL packaged as a zip file for easy and standardized download.

In [2]:
all_urls = view(
    pd.read_csv('https://bit.ly/catalogs-csv')
    .rename(columns={'urls.direct_download': 'url'})
    .rename(columns=lambda x: x.replace('location.', ''))
    .pipe(lambda df: df[(df['country_code'] == 'US') &
                        (df['url'].str.endswith('.zip')) &
                        (~df['status'].isin(['inactive', 'deprecated']))])
    .set_index('mdb_source_id').rename_axis('mdb_id'))

704 rows x 25 cols; Memory: 0.7 MiB


Unnamed: 0,data_type,entity_type,country_code,subdivision_name,municipality,provider,name,note,feed_contact_email,static_reference,...,urls.license,bounding_box.minimum_latitude,bounding_box.maximum_latitude,bounding_box.minimum_longitude,bounding_box.maximum_longitude,bounding_box.extracted_on,status,features,redirect.id,redirect.comment
mdb_id,<object>,<object>,<object>,<object>,<object>,<object>,<object>,<object>,<object>,<float64>,...,<object>,<float64>,<float64>,<float64>,<float64>,<object>,<object>,<object>,<object>,<object>
5,gtfs,,US,New York,Canton,St Lawrence County Public Transit,,,,,...,https://data.ny.gov/download/77gx-ii52/applica...,44.148476,44.979369,-75.75695,-74.611761,2022-03-14T20:02:31+00:00,,,,


## Bounds-based filter
Only include the transit feeds lying within the bounding boxes of the target cities (50 largest) to improve routing performance.

In [3]:
# tracts = gpd.read_parquet('data/zones.parquet', filters=[
#     ('level', '==', 'Tract')], columns=['city', 'geometry'])
# tracts['geometry'] = tracts.bounds.apply(lambda x: box(*x), axis=1)
# city_bounds = view(tracts.dissolve('city').reset_index())
# city_bounds.to_parquet('data/city_bounds.parquet') # 5s
city_bounds = view(gpd.read_parquet('data/city_bounds.parquet'))

50 rows x 2 cols; Memory: 0.0 MiB; <Geographic 2D CRS: EPSG:4326>


Unnamed: 0,city,geometry
,<category>,<geometry>
0.0,Atlanta,"POLYGON ((-84.745213 33.469068, -84.745213 33...."


In [4]:
bounds = all_urls[['bounding_box.' + x for x in [
    'minimum_longitude', 'minimum_latitude',
    'maximum_longitude', 'maximum_latitude']]].dropna()
bounds.columns = ['minx', 'miny', 'maxx', 'maxy']
# remove large intercity GTFS feeds which span very large
# areas (such as Amtrak), maybe like 3° E-W and 3° N-S
bounds = bounds[(bounds.maxx - bounds.minx <= 3) &
                (bounds.maxy - bounds.miny <= 3)]
bounds['geometry'] = [box(*r) for _, r in bounds.iterrows()]
bounds = Gdf(bounds, crs=CRS_DEG, index=bounds.index)
bounds = bounds.sjoin(city_bounds).drop(columns='index_right')
bounds = bounds.drop(columns='city').merge(
    bounds.groupby('mdb_id').agg(D(city=list)), on='mdb_id')
feeds = view(bounds.merge(all_urls[['url']], on='mdb_id').reset_index())
feeds.to_parquet('data/gtfs/gtfs_feeds.parquet')

363 rows x 8 cols; Memory: 0.1 MiB; <Geographic 2D CRS: EPSG:4326>


Unnamed: 0,mdb_id,minx,miny,maxx,maxy,geometry,city,url
,<int64>,<float64>,<float64>,<float64>,<float64>,<geometry>,<object>,<object>
0.0,13,-117.277924,32.542819,-116.184458,33.256887,"POLYGON ((-116.184458 32.542819, -116.184458 3...",[San Diego],https://www.sdmts.com/google_transit_files/goo...


## Download data

Problems with:
- 50 (San Francisco): Max retries
- 521 (New York): Max retries

In [5]:
# pbar = tqdm([(r.mdb_id, r.city, r.url) for _, r in urls.iterrows()
#              if not Path(f'data/gtfs/all/{r.mdb_id}.zip').exists()])
# for mdb_id, cities, url in pbar:
#     pbar.set_description(f'{mdb_id} ({cities})')
#     try:
#         fpath = Path(f'data/gtfs/all/{mdb_id}.zip')
#         if fpath.exists():
#             continue
#         req = requests.get(url, stream=True)
#         with open(fpath, 'wb') as f:
#             for chunk in req.iter_content(chunk_size=128):
#                 f.write(chunk)
#     except Exception as e:
#         print('ERROR:', mdb_id, cities, e)