# notebook to validate headway calculations
## comparing selected stops to agency timetables

In [1]:
import pandas as pd, numpy as np, os

In [2]:
# field data types for a more efficient load

df_types = {
    'Agency': np.dtype('O'),
    'IB AM Peak': np.dtype('float64'),
    'IB PM Peak': np.dtype('float64'),
    'IB Routes': np.dtype('O'),
    'IB Saturday': np.dtype('float64'),
    'IB Sunday': np.dtype('float64'),
    'IB Weekday': np.dtype('float64'),
    'Name': np.dtype('O'),
    'OB AM Peak': np.dtype('float64'),
    'OB PM Peak': np.dtype('float64'),
    'OB Routes': np.dtype('O'),
    'OB Saturday': np.dtype('float64'),
    'OB Sunday': np.dtype('float64'),
    'OB Weekday': np.dtype('float64'),
    'Stop ID': np.dtype('O'),
    'Unnamed: 0': np.dtype('int64'),
    'X': np.dtype('float64'),
    'Y': np.dtype('float64')
}

In [6]:
df = pd.read_csv('output/AllBus.csv', dtype=df_types)

In [7]:
list_of_agencies = list(os.walk('gtfs'))[0][1]

path = "gtfs/" + list_of_agencies[np.random.randint(0, len(list_of_agencies))] + "/"
path

'gtfs/golden-empire-transit-district--340/'

In [5]:
print(path)
print("-------")

agency_name = pd.read_csv(path + 'agency.txt')['agency_name'][0]
agency_name = agency_name.replace("/","-")
print(agency_name)

trips = pd.read_csv(path + 'trips.txt')
print(len(trips), "trips")

routes = pd.read_csv(path + 'routes.txt')
print(len(routes), "routes")

stops = pd.read_csv(path + 'stops.txt')
print(len(stops), "stops")

stop_times = pd.read_csv(path + 'stop_times.txt')
print(len(stop_times), "stop times")

calendar = pd.read_csv(path + "calendar.txt")
print(len(calendar), "schedules")

calendar_dates = pd.read_csv(path + "calendar_dates.txt")
print(len(calendar_dates), "exception dates")

gtfs/marin-transit--345/
-------
Marin Transit
2174 trips
27 routes
587 stops
52012 stop times
32 schedules
47 exception dates


In [40]:
trips2routes = dict(zip(trips['trip_id'], trips['route_id']))
routes2headsigns = dict(zip(routes['route_id'], routes['route_short_name']))

In [36]:
stop_times.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled
0,778652,18:47:00,18:47:00,290,1,,,,
1,778652,18:48:00,18:48:00,1600,2,,,,
2,778652,18:48:00,18:48:00,1602,3,,,,
3,778652,18:49:00,18:49:00,1603,4,,,,
4,778652,18:50:00,18:50:00,1604,5,,,,


In [65]:
tempDF = stop_times.query("stop_sequence ==1")[['trip_id', 'stop_id']]
tempDF['route_short_name'] = [routes2headsigns[trips2routes[x]] for x in list(tempDF['trip_id'])]
tempDF['route_id'] = [trips2routes[x] for x in list(tempDF['trip_id'])]

In [68]:
tempDF.drop_duplicates(subset=['stop_id', 'route_short_name'],inplace=True)

tempDF['agency'] = agency_name

tempDF



Unnamed: 0,trip_id,stop_id,route_short_name,route_id,agency
0,t_209005_b_6600_tn_0,782812,C,1965,El Dorado Transit
232,t_209119_b_6600_tn_0,782847,C,1965,El Dorado Transit
418,t_209128_b_6600_tn_0,782799,C,1965,El Dorado Transit
437,t_209129_b_6600_tn_0,782752,C,1965,El Dorado Transit
459,t_209311_b_6600_tn_0,782805,C,1965,El Dorado Transit
548,t_255298_b_7159_tn_0,782723,60,1961,El Dorado Transit
571,t_255299_b_7159_tn_0,782761,60,1961,El Dorado Transit
1063,t_255324_b_7159_tn_0,782847,50x,1968,El Dorado Transit
1080,t_255325_b_7159_tn_0,782761,50x,1968,El Dorado Transit
1258,t_255336_b_7159_tn_1,782761,30,1963,El Dorado Transit


In [8]:
concatList = []

for agency in list_of_agencies:
    path = "gtfs/" + agency + "/" 
    #print(path)
    
    print("-------")

    agency_name = pd.read_csv(path + 'agency.txt')['agency_name'][0]
    agency_name = agency_name.replace("/","-")
    print(agency_name)

    # load in GTFS data
    trips = pd.read_csv(path + 'trips.txt')
    routes = pd.read_csv(path + 'routes.txt')
    stop_times = pd.read_csv(path + 'stop_times.txt')
    
    # store trip/route information in dictionary
    trips2routes = dict(zip(trips['trip_id'], trips['route_id']))
    routes2headsigns = dict(zip(routes['route_id'], routes['route_short_name']))
    routes2route_type = dict(zip(routes['route_id'], routes['route_type']))
    
    tempDF = stop_times.query("stop_sequence ==1")[['trip_id', 'stop_id']]
    if len(tempDF) != 0:
        try:
            tempDF['route_short_name'] = [routes2headsigns[trips2routes[x]] for x in list(tempDF['trip_id'])]
            tempDF['route_id'] = [trips2routes[x] for x in list(tempDF['trip_id'])]
            tempDF['route_type'] = [routes2route_type[x] for x in list(tempDF['route_id'])]
        except KeyError:
            tempDF['route_short_name'] = [routes2headsigns[trips2routes[str(x)]] for x in list(tempDF['trip_id'])]
            tempDF['route_id'] = [trips2routes[str(x)] for x in list(tempDF['trip_id'])]
            tempDF['route_type'] = [routes2route_type[str(x)] for x in list(tempDF['route_id'])]
        
        tempDF.query("route_type == 3", inplace=True)
        
        tempDF.drop_duplicates(subset=['stop_id', 'route_short_name'],inplace=True)
        tempDF['agency'] = agency_name
        tempDF.drop(['trip_id', 'route_type'], axis=1, inplace=True)
        
        concatList.append(tempDF)
    else:
        pass

origins = pd.concat(concatList, ignore_index=True)
origins.head()


    

-------
Caltrain
-------
Marin Transit
-------
Bay Area Rapid Transit
-------
Spirit Bus
-------
El Dorado Transit
-------
Blue & Gold Fleet
-------
Big Blue Bus


  interactivity=interactivity, compiler=compiler, result=result)


-------
North County Transit District
-------
Sacramento Regional Transit


  interactivity=interactivity, compiler=compiler, result=result)


-------
Redding Area Bus Authority
-------
Trinity Transit
-------
Riverside Transit Agency
-------
Sunline Transit Agency
-------
Metro - Los Angeles


  interactivity=interactivity, compiler=compiler, result=result)


-------
eTrans
-------
Corona Cruiser
-------
Norwalk Transit System
-------
LADOT
-------
Anaheim Resort Transportation
-------
MTS


  interactivity=interactivity, compiler=compiler, result=result)


-------
Eureka Transit Service
-------
Lake Transit
-------
Madera County Connection
-------
Lassen Rural Bus
-------
Yuba-Sutter Transit
-------
Laguna Beach Transit
-------
Metrolink Trains
-------
Mendocino Transit Authority
-------
Altamont Corridor Express
-------
Glendale Beeline
-------
Yosemite Valley Shuttle System
-------
Airport Valet Express
-------
Gold Coast Transit
-------
Livermore Amador Valley Transit Authority
-------
Palo Verde Valley Transit Agency
-------
SamTrans
-------
San Francisco Bay Ferry
-------
San Joaquin Regional Transit District (RTD)
-------
Petaluma Transit
-------
Emery Go-Round
-------
Metro - Los Angeles
-------
Golden Empire Transit District
-------
Orange County Transportation Authority
-------
MVgo Mountain View
-------
Santa Cruz Metro
-------
Redwood Coast Transit
-------
Modesto Area Express
-------
Turlock Transit
-------
Fresno Public Transportation (FAX)
-------
San Francisco Municipal Transportation Agency
-------
County Connection
-----

Unnamed: 0,stop_id,route_short_name,route_id,agency
0,777402,TaSJ-Shuttle,TaSj-130,Caltrain
1,777403,TaSJ-Shuttle,TaSj-130,Caltrain
2,2452018,17,10897,Marin Transit
3,2451865,17,10897,Marin Transit
4,2452018,22,10898,Marin Transit


In [9]:
df.columns

Index(['Unnamed: 0', 'Stop ID', 'Name', 'Agency', 'X', 'Y', 'OB Routes',
       'OB AM Peak', 'OB PM Peak', 'OB Weekday', 'OB Saturday', 'OB Sunday',
       'IB Routes', 'IB AM Peak', 'IB PM Peak', 'IB Weekday', 'IB Saturday',
       'IB Sunday'],
      dtype='object')

In [10]:
origins.shape

(4948, 4)

In [12]:
merged = origins.merge(df, how='inner', left_on=["stop_id","agency"], right_on=["Stop ID", "Agency"])

In [13]:
list(merged.columns)

['stop_id',
 'route_short_name',
 'route_id',
 'agency',
 'Unnamed: 0',
 'Stop ID',
 'Name',
 'Agency',
 'X',
 'Y',
 'OB Routes',
 'OB AM Peak',
 'OB PM Peak',
 'OB Weekday',
 'OB Saturday',
 'OB Sunday',
 'IB Routes',
 'IB AM Peak',
 'IB PM Peak',
 'IB Weekday',
 'IB Saturday',
 'IB Sunday']

merged.head()

In [87]:
keeperCols = ['stop_id',
 'route_short_name',
 'route_id',
 'Stop ID',
 'Name',
 'Agency',
 'OB Routes',
 'OB AM Peak',
 'OB PM Peak',
 'OB Weekday',
 'OB Saturday',
 'OB Sunday',
 'IB Routes',
 'IB AM Peak',
 'IB PM Peak',
 'IB Weekday',
 'IB Saturday',
 'IB Sunday']
merged[keeperCols].to_csv("ValidateTest.csv")

In [17]:
df.head()

Unnamed: 0.1,Unnamed: 0,Stop ID,Name,Agency,X,Y,OB Routes,OB AM Peak,OB PM Peak,OB Weekday,OB Saturday,OB Sunday,IB Routes,IB AM Peak,IB PM Peak,IB Weekday,IB Saturday,IB Sunday
0,0,777402,San Jose Caltrain Station,Caltrain,-121.901985,37.330196,TaSJ-Shuttle,,,,93.33,93.33,TaSJ-Shuttle,,,,70.0,70.0
1,1,777403,Tamien Caltrain Station,Caltrain,-121.883403,37.311638,TaSJ-Shuttle,,,,93.33,93.33,TaSJ-Shuttle,,,,70.0,70.0
2,0,2452018,San Rafael Transit Center,Marin Transit,-122.523102,37.971081,"125, 245, 29, 22, 49, 228, 35, 68, 233, 36, 25...",8.0,7.5,8.5,10.91,10.91,"125, 245, 29, 22, 49, 228, 35, 68, 233, 36, 25...",7.74,8.89,8.97,9.44,9.55
3,1,2452379,Hwy 101 @ Lucky Dr Bus Pad,Marin Transit,-122.51703,37.937937,,,,,,,"36, 17",18.46,17.14,20.43,25.45,25.45
4,2,2452231,Hwy 101 @ Tamalpais Dr Bus Pad,Marin Transit,-122.515205,37.926495,119,,240.0,480.0,,,"36, 17",18.46,17.14,20.43,25.45,25.45


In [18]:
len(df)

98913

In [29]:
(len(df) - len(df.drop_duplicates()))/len(df)

0.1798954636903137

In [22]:
df['Stop ID'].value_counts()

111                                     20
127                                     18
108                                     18
112                                     17
232                                     17
128                                     17
360                                     17
132                                     17
356                                     17
103                                     17
119                                     17
213                                     16
353                                     16
331                                     16
229                                     16
187                                     16
125                                     16
205                                     16
143                                     16
231                                     16
334                                     16
307                                     16
110                                     16
126        

In [26]:
df[df['Stop ID'] == "111"].sort_values("Agency")

Unnamed: 0.1,Unnamed: 0,Stop ID,Name,Agency,X,Y,OB Routes,OB AM Peak,OB PM Peak,OB Weekday,OB Saturday,OB Sunday,IB Routes,IB AM Peak,IB PM Peak,IB Weekday,IB Saturday,IB Sunday
1695,818,111,GRAND SB & VENICE NS,Big Blue Bus,-118.266338,34.03581,,,,,,,R10,48.0,21.82,34.29,,
77852,5,111,Johnson at La Cita,City of San Luis Obispo Transit,-120.642441,35.269775,1A,,,,70.0,70.0,1B,,,,,
63407,733,111,MLK - JENSEN,Fresno Public Transportation (FAX),-119.799956,36.706467,32,30.0,30.0,33.1,36.52,36.52,,,,,,
91954,21,111,Brand & Monterey,Glendale Beeline,-118.25523,34.157197,,,,,,,1,240.0,240.0,960.0,840.0,840.0
35272,21,111,Brand & Monterey,Glendale Beeline,-118.25523,34.157197,,,,,,,1,240.0,240.0,960.0,840.0,840.0
54544,321,111,Virginia & Washington,Golden Empire Transit District,-118.975832,35.36121,46,34.29,30.0,33.1,38.18,38.18,,,,,,
94147,280,111,BATTAAN & CARSON SE,Long Beach Transit,-118.223366,33.831577,,,,,,,191,21.82,26.67,29.09,42.0,42.0
19463,6784,111,Arleta / Pierce,Metro - Los Angeles,-118.435393,34.250737,158,48.0,40.0,53.33,70.0,60.0,,,,,,
47045,6784,111,Arleta / Pierce,Metro - Los Angeles,-118.435393,34.250737,158,48.0,40.0,53.33,70.0,60.0,,,,,,
76765,1557,111,Mt Vernon @ Centrepointe Sb Ns,OMNITRANS,-117.309261,34.050133,19,30.0,34.29,32.0,64.62,76.36,215,24.0,21.82,26.67,38.18,44.21


In [31]:
df.drop_duplicates(subset=["Stop ID", "Agency"]).to_csv('output/AllBus_DeDuped.csv')