In [1]:
# This notebook is going to use the data-dashboard code to add SCHEDULED_HEADWAYS from gtfs to events (ARR/DEP) data.
# It's complicated and I don't want to re-write the logic, so I"m going to use it as is.
# Thus, we import bus2train and call it on our data.

import bus2train
import pandas as pd
import pathlib
import glob
import numpy as np

In [2]:
input_file = "./data/in-data/2023/MBTA-Bus-Arrival-Departure-Times_2023-06.csv"
output_dir = "./data/out-data/"

raw_dat = pd.read_csv(input_file)
all_routes = [r.lstrip("0") for r in raw_dat.route_id.value_counts().index]

In [4]:
# this will take a few minutes to run since it has a lot of data to process
# and a lot of files to write to disk
# It will start by downloading a bunch of GTFS feeds, then calculate and add the scheduled_headways 
# to the events, and save the output in output_dir as gzipped csv.

pathlib.Path(output_dir).mkdir(exist_ok=True)

data = bus2train.load_data(input_file, all_routes)
events = bus2train.process_events(data)
bus2train.to_disk(events, output_dir, nozip=False) 
# set nozip to True if you want to be able to read the files. The data is large though... 59M compressed.
# Maybe run on a subset of routes if you want to experiment with unzipped.

Archive for 20230601 already exists: 20230530
Archive for 20230602 already exists: 20230602
Archive for 20230603 already exists: 20230602
Archive for 20230604 already exists: 20230602
Archive for 20230605 already exists: 20230602
Archive for 20230606 already exists: 20230602
Archive for 20230607 already exists: 20230607
Archive for 20230608 already exists: 20230607
Archive for 20230609 already exists: 20230609
Archive for 20230610 already exists: 20230609
Archive for 20230611 already exists: 20230609
Archive for 20230612 already exists: 20230609
Archive for 20230613 already exists: 20230609
Archive for 20230614 already exists: 20230614
Archive for 20230615 already exists: 20230614
Archive for 20230616 already exists: 20230614
Archive for 20230617 already exists: 20230614
Archive for 20230618 already exists: 20230614
Archive for 20230619 already exists: 20230619
Archive for 20230620 already exists: 20230620
Archive for 20230621 already exists: 20230620
Archive for 20230622 already exist

In [None]:
# While it is possible to load all this data into a single dataframe,
# the volume means its easier to work with if it's broken into chunks
# That you can then read/load/process individually.

In [40]:
# routes = ["111", "39"]
routes = all_routes

direction = "*" # use 0 and 1 if you want to specify outbound and inbound. * will use both.

OUTPUT = dict()

for ROUTE in routes:
    files = glob.glob(f"{output_dir}/Events/monthly-bus-data/{ROUTE}-{direction}-*/**/events.csv.gz", recursive=True)

    route_output = []
    for f in files:
        df = pd.read_csv(f, parse_dates=['service_date',"event_time"])
        df = df[(df.event_type == "ARR") & (df.service_date.dt.dayofweek < 5) & (df.event_time.dt.hour >= 7) & (df.event_time.dt.hour < 19)]
        df['actual_headway'] = df.groupby('service_date').event_time.diff().dt.seconds

        df = df[['actual_headway', 'scheduled_headway']].dropna()

        df['bunched'] = df.actual_headway / df.scheduled_headway < 0.25
        
        route_output.append(df)

    output = pd.concat(route_output)
    n_trips = len(output)
    output = output.agg("mean")
    output.columns = ["mean_actual_headway", "mean_scheduled_headway", "bunched_ratio"]
    output["trips"] = n_trips
    output["route_id"] = ROUTE

    OUTPUT[ROUTE] = output

        

In [41]:
data = pd.concat(OUTPUT.values(), axis=1).T.set_index("route_id")
data

Unnamed: 0_level_0,actual_headway,scheduled_headway,bunched,trips
route_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
111,396.506665,320.206604,0.163708,33010.0
743,896.567702,659.767984,0.062607,16292.0
23,682.475378,534.82454,0.113742,24450.0
28,731.504401,577.676126,0.136046,25337.0
SL1,,,,0.0
...,...,...,...,...
191,,,,0.0
192,,,,0.0
193,,,,0.0
171,,,,0.0


In [42]:
data[data.trips > 100].sort_values(by='bunched', ascending=False).head(10)

Unnamed: 0_level_0,actual_headway,scheduled_headway,bunched,trips
route_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
442,2168.864718,3217.001982,0.183102,4036.0
1,745.209439,553.333333,0.165883,19791.0
111,396.506665,320.206604,0.163708,33010.0
57,640.619771,469.360103,0.149046,20222.0
66,724.455027,576.724817,0.143829,21324.0
28,731.504401,577.676126,0.136046,25337.0
22,831.935275,633.520778,0.117303,17783.0
23,682.475378,534.82454,0.113742,24450.0
70,1007.959788,894.847565,0.103325,14498.0
9,775.613981,663.239849,0.102302,11945.0


In [47]:
data.loc["39"]

actual_headway       806.298196
scheduled_headway     660.70002
bunched                0.075199
trips                   19628.0
Name: 39, dtype: object

In [48]:
data.sort_values(by='trips', ascending=False).head(10)

Unnamed: 0_level_0,actual_headway,scheduled_headway,bunched,trips
route_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
111,396.506665,320.206604,0.163708,33010.0
28,731.504401,577.676126,0.136046,25337.0
23,682.475378,534.82454,0.113742,24450.0
66,724.455027,576.724817,0.143829,21324.0
57,640.619771,469.360103,0.149046,20222.0
1,745.209439,553.333333,0.165883,19791.0
39,806.298196,660.70002,0.075199,19628.0
22,831.935275,633.520778,0.117303,17783.0
743,896.567702,659.767984,0.062607,16292.0
77,955.938066,700.011637,0.100078,15468.0
