In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Route 66 Travel Time Distributions


In [None]:
# Load the raw MBTA bus arrival records; the CSV is decoded as Latin-1.
arrival_df = pd.read_csv(
    'MBTA_BusArrivalData.csv',
    encoding='latin-1',
)

In [None]:
# Preview the first five rows to check the columns that were loaded.
arrival_df.head(n=5)

In [None]:
# Index labels of every record whose RouteName is not 66.
# NOTE(review): the comparison is against the integer 66, which assumes the
# RouteName column was parsed as a numeric dtype — confirm against the CSV.
to_drop = arrival_df[arrival_df['RouteName'].ne(66)].index

In [None]:
# Remove the rows whose labels are listed in `to_drop`.
# The frame is rebound instead of mutated in place, and labels that are
# already absent are ignored, so re-running this cell is a no-op rather than
# a KeyError (the in-place version failed on its second execution).
arrival_df = arrival_df.drop(index=to_drop, errors='ignore')

In [None]:
# Stop names kept for the analysis; matched against the StopName column.
stop_list = [
    'Harvard Sq @ Garden St - Dawes Island',
    'Harvard Ave @ Commonwealth Ave',
    'Washington St @ Walnut',
    'Dudley Station',
    'Washington St @ Pearl St',
]

In [None]:
# Keep only the records at the stops named in `stop_list`; the explicit copy
# makes `df` independent of `arrival_df`.
df = (
    arrival_df[arrival_df['StopName'].isin(stop_list)]
    .copy()
)

In [None]:
# Distinct vehicle identifiers present in the filtered records.
df['Vehicle'].unique()

In [None]:
# Pool stop-to-stop travel times over all vehicles, keyed by segment position.
travel_times = {}
for bus in df['Vehicle'].unique():
    # This vehicle's records, ordered by scheduled time.
    bus_events = df[df['Vehicle'] == bus].sort_values('ScheduledTimeInMin')
    # Next row's actual arrival minus this row's actual departure.
    next_arrival = bus_events['ActArrivalTimeInMin'].shift(-1)
    leg_times = next_arrival - bus_events['ActDepartureTimeInMin']
    # Rows are read in runs of 8: positions 0-2 and 4-6 are kept, 3 and 7 are
    # skipped. NOTE(review): presumably the skipped positions are terminal /
    # turnaround legs of a round trip — confirm against the route pattern.
    for segment in (0, 1, 2, 4, 5, 6):
        every_eighth = leg_times.iloc[segment::8]
        # Only strictly positive durations are recorded (NaN is excluded too).
        travel_times.setdefault(segment, []).extend(every_eighth[every_eighth > 0])


In [None]:
# Draw one density-normalised histogram per segment. Every figure gets a title
# and axis labels so the segments can be told apart when the output is skimmed.
for key, values in travel_times.items():
    fig, ax = plt.subplots()
    ax.hist(values, density=True)
    ax.set(
        title='Segment {}: observed travel times'.format(key),
        xlabel='Travel time (min)',
        ylabel='Density',
    )

In [None]:
# For each segment: fit a kernel density estimate to the travel times, plot it
# over the data histogram, and store the estimator's `icdf` values as a list.
tt_icdfs = {}
for segment_id, data in travel_times.items():
    kde = sm.nonparametric.KDEUnivariate(data)
    # Triangular kernel evaluated directly (no FFT) on a 1024-point grid;
    # cut=0 asks statsmodels not to extend the grid past the data's extremes.
    kde.fit(kernel='tri', fft=False, cut=0, gridsize=1024)

    # Label each figure so the per-segment plots are distinguishable.
    fig, ax = plt.subplots()
    ax.plot(kde.support, kde.density, c='r', label='KDE (triangular kernel)')
    ax.hist(data, density=True, label='Observed')
    ax.set(
        title='Segment {}: travel-time density'.format(segment_id),
        xlabel='Travel time (min)',
        ylabel='Density',
    )
    ax.legend()

    tt_icdfs[segment_id] = list(kde.icdf)

In [None]:
# Optional export, left disabled: uncomment the lines below to write `tt_icdfs`
# to 'travel_time_icdfs.json' in the current working directory.
# Note: json.dump writes the dictionary's integer keys as strings.
# import json
# with open('travel_time_icdfs.json', 'w') as f:
#     json.dump(tt_icdfs, f)