In [2]:
# version of the evaluator script that runs all CA transit agencies
# no visualizations

In [3]:
import pandas as pd, numpy as np, time, datetime, csv, os

In [4]:
# define functions

def minSinceMidnight(t):
    if type(t) == str:
        tt = t.split(':')
        msm = (int(tt[0])*60) + int(tt[1]) + (int(tt[2])/60)
    else:
        msm = t
    return msm

## REWROTE on 9/20 TO DEAL WITH DUPLICATE TIMES

def interpolator(trip):
    
    lsst = list(stop_times[stop_times['trip_id'] == trip]['arrival_time'])
    lsst = [minSinceMidnight(x) for x in lsst]
    
    last_scheduled_time = lsst[0]
    lst_index = 0
    
    values = [last_scheduled_time]
    
    for x in range(1,len(lsst)):
        #print(x)
        if pd.isnull(lsst[x]) == False:
            #idx = lsst.index(x)
            #print(idx)
            yy = list(np.linspace(last_scheduled_time, lsst[x], num=(x-lst_index+1)))

            values += yy[1:]
            last_scheduled_time = lsst[x]
            lst_index = x
        else:
            pass
    return values # returns linear interpolation of times

In [5]:
# unchanging data

# days to evaluate
weekday = "20180918"
saturday = "20180922"
sunday = "20180923"

days_of_week = {0:"monday", 1:"tuesday", 2:"wednesday", 3:"thursday", 4:"friday", 5:"saturday", 6:"sunday"}

# modes
bus_types = [3]
rail_types = [0,1,2]
ferry_types = [4]

# queries to filter by various time criteria
# strings for pd.DF.query

am_peak_query = "interpolated >= 360 and interpolated <= 600"
pm_peak_query = "interpolated >= 900 and interpolated <= 1140"
weekday_range_query = "interpolated >= 360 and interpolated <= 1320"
saturday_range_query = "interpolated >= 480 and interpolated <= 1320"
sunday_range_query = "interpolated >= 480 and interpolated <= 1320"

# defines the schema for the dict object holding headways

dataHolderSpec = {
                  "name":'',
                  "longitude":np.NaN,
                  "latitude":np.NaN,
                0: {
                    "served_by": [],
                    "AM Peak": np.NaN,
                    "PM Peak": np.NaN,
                    "Weekdays": np.NaN,
                    "Saturday": np.NaN,
                    "Sunday": np.NaN
                     },
                 1: {
                    "served_by": [],
                    "AM Peak": np.NaN,
                    "PM Peak": np.NaN,
                    "Weekdays": np.NaN,
                    "Saturday": np.NaN,
                    "Sunday": np.NaN
                     }
                 }


# starting index
start_index = 0

In [6]:
# specify agency
list_of_agencies = list(os.walk('gtfs'))[0][1]

In [35]:
for agency in list_of_agencies[start_index:]:

    # LOAD REQUIRED FILES
    
    path = "gtfs/" + agency + "/"

    #print(path)
    print("-------")

    agency_name = pd.read_csv(path + 'agency.txt')['agency_name'][0]
    agency_name = agency_name.replace("/","-")
    print(agency_name, ", agency", str(list_of_agencies.index(agency)+1), "of", str(len(list_of_agencies)))

    trips = pd.read_csv(path + 'trips.txt')
    print(len(trips), "trips")

    routes = pd.read_csv(path + 'routes.txt')
    print(len(routes), "routes")

    stops = pd.read_csv(path + 'stops.txt')
    print(len(stops), "stops")

    stop_times = pd.read_csv(path + 'stop_times.txt')
    print(len(stop_times), "stop times")
    
    try:
        calendar = pd.read_csv(path + "calendar.txt")
        print(len(calendar), "schedules")
        calend = True
    except FileNotFoundError:
        calend = False

    calendar_dates = pd.read_csv(path + "calendar_dates.txt")
    print(len(calendar_dates), "exception dates")
    
    
    # STORE STOP ATTRIBUTES IN A USEFUL WAY
    try:
        trips2directions = dict(zip(trips['trip_id'], trips['direction_id']))
    except KeyError:
        trips2directions = dict(zip(trips['trip_id'], np.zeros(len(trips))))
    trips2service = dict(zip(trips['trip_id'], trips['service_id']))
    trips2routes = dict(zip(trips['trip_id'], trips['route_id']))
    routes2headsigns = dict(zip(routes['route_id'], routes['route_short_name']))
    routes2route_type = dict(zip(routes['route_id'], routes['route_type']))
    
    # INTERPOLATION
    did_i_interp = False

    #agency_trips = list(trips['trip_id'].unique())
    # changed agency_trips definition to preserve order of trips
    agency_trips = list(stop_times.drop_duplicates('trip_id')['trip_id'])

    if (sum(pd.isnull(stop_times['arrival_time']))/len(stop_times)) > 0.05:
        allInterpolatedTimes = []

        intrS = time.time()
        for trip in agency_trips:
            allInterpolatedTimes += interpolator(trip)
        intrF = time.time()

        did_i_interp = True
        print("** interpolated " + str(len(agency_trips)), "trips ("+ str(len(stop_times))+" vehicle stops) in", str(intrF - intrS), "seconds")
    else:
        allInterpolatedTimes = [minSinceMidnight(x) for x in stop_times['arrival_time']]
        print("** did not interpolate")
    
    # ADD ATTRIBUTES TO STOP_TIMES CSV
    useStrings = False # variable to decide if we need to use string versions of route_id or integer
    
    # clean up directionless route files
    if "direction_id" not in routes.columns:
        routes['direction_id'] = np.zeros(len(routes))
    else:
        pass
    
    if type(trips['trip_id'][0]) != str:
        stop_times['interpolated'] = allInterpolatedTimes
        stop_times['direction_id'] = [trips2directions[x] for x in stop_times['trip_id']]
        stop_times['service_id'] = [trips2service[x] for x in stop_times['trip_id']]
        stop_times['route_type'] = [routes2route_type[trips2routes[x]] for x in stop_times['trip_id']]
    else:
        stop_times['interpolated'] = allInterpolatedTimes
        stop_times['direction_id'] = [trips2directions[str(x)] for x in stop_times['trip_id']]
        stop_times['service_id'] = [trips2service[str(x)] for x in stop_times['trip_id']]
        stop_times['route_type'] = [routes2route_type[trips2routes[str(x)]] for x in stop_times['trip_id']]
        useStrings = True

    # deal with routes that don't have directions specified
    stop_times['direction_id'] = [0 if pd.isnull(x) == True else x for x in list(stop_times['direction_id'])]
    
    # create dictionary object with exceptions dates for different types of service

    exceptions = {}

    for x in range(len(calendar_dates)):
        exception_date = str(calendar_dates['date'][x])
        exception_service_id = calendar_dates['service_id'][x]
        exception_type = calendar_dates['exception_type'][x]

        if exception_date not in exceptions:
            exceptions[exception_date] = {1:[],2:[]}
            exceptions[exception_date][exception_type].append(exception_service_id)
        else:
            exceptions[exception_date][exception_type].append(exception_service_id)
    
    # GENERATES service_ids IN USE ON SPECIFIED DAY OF WEEK
    # ALSO ENSURES THAT SCHEDULES ARE ACTIVE DURING SPECIFIED TIME FRAME

    weekday_day = days_of_week[datetime.datetime(int(weekday[:4]), int(weekday[4:6]), int(weekday[6:])).weekday()]
    saturday_day = days_of_week[datetime.datetime(int(saturday[:4]), int(saturday[4:6]), int(saturday[6:])).weekday()]
    sunday_day = days_of_week[datetime.datetime(int(sunday[:4]), int(sunday[4:6]), int(sunday[6:])).weekday()]

    # this if-else statement checks to make sure that calendar.txt file is up to date
    # if none of the service_id are currently active, we take them anyway
    # otherwise, we filter out any inactive service_id

    if len([x for x in list(calendar['end_date']) if x > np.min([int(weekday), int(saturday), int(sunday)])]) == 0:
        weekday_query = "(%s == 1) & (%s >= start_date)" % (weekday_day, weekday)
        saturday_query = "(%s == 1) & (%s >= start_date)" % (saturday_day, saturday)
        sunday_query = "(%s == 1) & (%s >= start_date)" % (sunday_day, sunday)
    else:
        weekday_query = "(%s == 1) & (%s >= start_date) & (%s <= end_date)" % (weekday_day, weekday, weekday)
        saturday_query = "(%s == 1) & (%s >= start_date) & (%s <= end_date)" % (saturday_day, saturday, saturday)
        sunday_query = "(%s == 1) & (%s >= start_date) & (%s <= end_date)" % (sunday_day, sunday, sunday)
    
    # LISTS OF SERVICE_IDs FOR USE IN ANALYSIS
    if calend == True:
        weekday_service = list(calendar.query(weekday_query)['service_id'])
        saturday_service = list(calendar.query(saturday_query)['service_id'])
        sunday_service = list(calendar.query(sunday_query)['service_id'])
    elif calend == False:
        weekday_service = exceptions[weekday][1]
        saturday_service = exceptions[saturday][1]
        sunday_service = exceptions[sunday][1]
    # create dictionary object to hold information about stops
    # dictionary will be used for quick access

    stops_data = {}

    for i in range(len(stops)):
        stop_id = stops['stop_id'][i]
        stop_name = stops['stop_name'][i]
        stop_lon = stops['stop_lon'][i]
        stop_lat = stops['stop_lat'][i]

        stops_data[stop_id] = {
            "name": stop_name,
            "lon": stop_lon,
            "lat": stop_lat
        }
        
    # make sure directory for saving outputs exists
    if os.path.isdir("output") == False:
        os.mkdir("output")
    else:
        pass

    timeS = time.time() # start time, for evaluation

    # path for output CSV
    filename = "output/%s_test.csv" % agency_name

    # split up stops by transit mode
    bus_stops = stop_times[stop_times['route_type'].isin(bus_types)].reset_index()
    rail_stops = stop_times[stop_times['route_type'].isin(rail_types)].reset_index()
    ferry_stops = stop_times[stop_times['route_type'].isin(ferry_types)].reset_index()

    if len(bus_stops) > 1:
        # CSV writer
        f = open(filename, 'w')
        writer = csv.writer(f)
        writer.writerow(["Stop ID", "Name", "Agency", "X", "Y", "OB Routes", "OB AM Peak", "OB PM Peak", "OB Weekday", "OB Saturday", "OB Sunday", "IB Routes", "IB AM Peak", "IB PM Peak", "IB Weekday", "IB Saturday", "IB Sunday"])

        # ordered list of different time windows to evaluate
        filter_options = ['AM Peak', "PM Peak", 'Weekdays', 'Saturday', 'Sunday']


        # the following lists are in the same order as filter_options, so indices would be common across lists
        # e.g. filter_options[0] is "AM Peak" and filters[0] is am_peak_query, and filter_sched[0] is the weekday schedule, etc.
        # this feels kind of messy and we may want to play with cleaning this up
        filters = [am_peak_query, pm_peak_query, weekday_range_query, saturday_range_query, sunday_range_query]
        filter_sched = [0,0,0,1,2] # index of service_id in schedules array
        filter_time_per = [4,4,16,14,14] # number of hours in each evaluative window
        filter_crit = [16,16,48,28,28] # number of required buses during evaluative window
        filter_services = [weekday_service, weekday_service, weekday_service, saturday_service, sunday_service]
        filter_day = [weekday, weekday, weekday, saturday, sunday]

        # list of unique stops in that transit system
        unique_stops = bus_stops['stop_id'].unique()

        # number of stops to evaluate
        # to run for the entire agency, set num_of_stops = len(unique_stops)
        num_of_stops = len(unique_stops)
        updater = "%s has %s unique bus stops." % (agency_name, num_of_stops)
        print(updater)

        #CHECK THIS OUT
        for stop in unique_stops[:num_of_stops]:
            # stop characteristics

            if agency_name == "AC Transit":
                # add a "0" to stop_ids because AC Transit stores stop_ids as seven-character strings
                # this will need to be made flexible when this is reworked to accomodate other agencies
                if len(str(stop)) < 7:
                    stringstop = "0" + str(stop)
                else:
                    stringstop = str(stop)
            elif agency_name == "Foothill Transit" and len(str(stop)) < 5:
                stringstop = str(stop).zfill(5)
            else:
                stringstop = stop

            try:
                sub_stops_data = stops_data[stringstop] # subset of larger dictionary to make data storage syntax cleaner
            except KeyError:
                sub_stops_data = stops_data[str(stringstop)]
            # use defined dictionary variable from previous cell and populate with name and location info from stops DF
            dataHolder = dataHolderSpec 
            dataHolder['name'] = sub_stops_data['name']
            dataHolder['longitude'] = sub_stops_data['lon']
            dataHolder['latitude'] = sub_stops_data['lat']

            for direction in [0,1]: # iterate through both directions
                # create a DF that contains one stop in one direction
                # ex. stop_id 300315, direction_id  0
                tempDF = bus_stops[(bus_stops['stop_id'] == stop) & (bus_stops['direction_id'] == direction)]

                # generate a list of all the short names for the routes that stop at the defined stop in the defined direction 
                if useStrings == False:
                    served_by = [str(x) for x in list(set([routes2headsigns[trips2routes[x]] for x in  tempDF['trip_id']]))]
                else:
                    served_by = [str(x) for x in list(set([routes2headsigns[trips2routes[str(x)]] for x in  tempDF['trip_id']]))]

                if len(served_by) == sum(pd.isnull(served_by)):
                    served_by == ['']
                else:
                    pass

                #print()
                #print(stop, sub_stops_data['name'], served_by, "-", direction)


                for option in filter_options:

                    i = filter_options.index(option) # list index for that time window
                    fltr = filters[i]
                    #fltr_sched = schedules[filter_sched[i]]
                    try:
                        relevent_schedules = [x for x in filter_services[i] if x not in exceptions[filter_day[i]][2]] + exceptions[filter_day[i]][1]
                    except KeyError:
                        relevent_schedules = filter_services[i]

                    # integer for the number of buses that stop at a given stop in a given direction 
                    # during the time period defined by the query expression
                    #num_buses = len(tempDF[tempDF['service_id'] == fltr_sched].query(fltr))
                    num_buses = len(tempDF[tempDF['service_id'].isin(relevent_schedules)].query(fltr).drop_duplicates(subset="arrival_time"))

                    # right now, headway calculation doesn't work when service doesn't run for the entirety of the evaluative window
                    # ex. AC TRANSIT stop 300315, line 65 ends at 6:30pm on weekends (does not run until 10pm)


                    # crudely calculate average headway by dividing the total number of minutes during the defined time period
                    # by the number of buses that stopped during that time period (num_buses)
                    try:
                        headway = round(((filter_time_per[i] * 60) / num_buses), 2)
                    except ZeroDivisionError:
                        headway = np.NAN
                    try:
                        if pd.isnull(served_by[0]) == False:
                            dataHolder[direction]['served_by'] = served_by
                        else:
                            dataHolder[direction]['served_by'] = ['']
                    except IndexError:
                        dataHolder[direction]['served_by'] = ['']
                    dataHolder[direction][filter_options[i]] = headway

                    #print(filter_options[i] + ": " + str(num_buses) + " of " + str(filter_crit[i]) + " required. (" + str(headway) + " min headway)")

            ob = dataHolder[0] # subset of larger dictionary (just outbound) to make data storage syntax cleaner
            ib = dataHolder[1] # subset of larger dictionary (just inbound) to make data storage syntax cleaner

            # write data to CSV
            row_contents = [str(stop), dataHolder['name'], agency_name, dataHolder['longitude'], dataHolder['latitude'], ', '.join(ob['served_by']), ob['AM Peak'], ob['PM Peak'], ob['Weekdays'], ob['Saturday'], ob['Sunday'], ', '.join(ib['served_by']), ib['AM Peak'], ib['PM Peak'], ib['Weekdays'], ib['Saturday'], ib['Sunday']]
            writer.writerow(row_contents)
        timeF = time.time() # finish time, for evaluation
        f.close()

        print()  
        # print duration of process, number of stops, and rate
        try:
            print(str(round((timeF - timeS),2)), "seconds //", str(num_of_stops), "stops //", str(round((round((timeF - timeS),2)/num_of_stops),4)), "seconds per stop")
        except ZeroDivisionError:
            pass
    else:
        pass

    # create rail CSV
    if len(rail_stops) > 0:
        print(agency_name, "has", str(len(rail_stops['stop_id'].unique())), "rail stops.")
        railfile = "output/%s_RAIL_test.csv" % agency_name
        g = open(railfile, 'w')
        writer = csv.writer(g)
        writer.writerow(["Stop ID", "Name", "Agency", "X", "Y"])
        for stop in rail_stops['stop_id'].unique():
            try:
                writer.writerow([stop, stops_data[stop]['name'], agency_name, stops_data[stop]['lon'], stops_data[stop]['lat']])
            except KeyError:
                stop = str(stop)
                writer.writerow([stop, stops_data[stop]['name'], agency_name, stops_data[stop]['lon'], stops_data[stop]['lat']])
        g.close()
    else:
        pass



    # create ferry CSV
    if len(ferry_stops) > 0:
        print(agency_name, "has", str(len(ferry_stops['stop_id'].unique())), "ferry stops.")
        ferryfile = "output/%s_FERRY_test.csv" % agency_name
        g = open(ferryfile, 'w')
        writer = csv.writer(g)
        writer.writerow(["Stop ID", "Name", "Agency", "X", "Y"])
        for stop in ferry_stops['stop_id'].unique():
            try:
                writer.writerow([stop, stops_data[stop]['name'], agency_name, stops_data[stop]['lon'], stops_data[stop]['lat']])
            except KeyError:
                stop = str(stop)
                writer.writerow([stop, stops_data[stop]['name'], agency_name, stops_data[stop]['lon'], stops_data[stop]['lat']])
        g.close()
    else:
        pass
    start_index += 1

-------
Monterey-Salinas Transit , agency 71 of 99
3275 trips
57 routes
1317 stops
84679 stop times
4723 exception dates
** did not interpolate
Monterey-Salinas Transit has 1317 unique bus stops.

57.07 seconds // 1317 stops // 0.0433 seconds per stop
-------
Unitrans (Davis) , agency 72 of 99
2516 trips
20 routes
298 stops
56828 stop times
8 schedules
2 exception dates
** interpolated 2516 trips (56828 vehicle stops) in 2.1431329250335693 seconds
Unitrans (Davis) has 298 unique bus stops.

16.02 seconds // 298 stops // 0.0538 seconds per stop
-------
Capitol Corridor Joint Powers Authority , agency 73 of 99
104 trips
2 routes
19 stops
757 stop times
2 schedules
0 exception dates
** did not interpolate
Capitol Corridor Joint Powers Authority has 19 rail stops.
-------
Sage Stage , agency 74 of 99
10 trips
5 routes
31 stops
76 stop times
5 schedules
13 exception dates
** did not interpolate
Sage Stage has 31 unique bus stops.

1.48 seconds // 31 stops // 0.0477 seconds per stop
-------
