# Headway Analysis v3
## <i>f.k.a. LA Metro Checker</i>
### Another try at getting this right
Exploring new approaches in pandas for determining which stops qualify as high-quality transit.

In [None]:
import pandas as pd, numpy as np, time, datetime, csv, os
from matplotlib import pyplot as plt

## static bits
These cells define functions and variables that will not change over the course of analysis.

In [None]:
# define functions

def minSinceMidnight(t):
    """Convert an ``HH:MM:SS`` time string to minutes since midnight.

    Parameters
    ----------
    t : str or any other value
        A colon-separated time string. Hours are not limited to 0-23, so a
        value such as ``"25:10:00"`` is simply 1510 minutes. Any non-string
        value (for example a missing-value marker) is returned untouched.

    Returns
    -------
    float or the original value
        Minutes since midnight, with seconds expressed as a fraction of a
        minute, when ``t`` is a string; otherwise ``t`` itself.

    Raises
    ------
    IndexError
        If the string has fewer than three colon-separated parts.
    ValueError
        If one of the first three parts is not an integer.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses, which would otherwise slip through unconverted.
    if not isinstance(t, str):
        return t

    parts = t.split(':')
    return (int(parts[0]) * 60) + int(parts[1]) + (int(parts[2]) / 60)

## Rewritten on 9/20 to deal with duplicate times

def interpolator(trip):
    """Linearly interpolate the missing arrival times of one trip.

    Reads the module-level ``stop_times`` table, takes the rows whose
    ``trip_id`` equals ``trip`` (in table order) and converts their
    ``arrival_time`` values with ``minSinceMidnight``. Every run of missing
    times between two known times is filled with evenly spaced values.

    Parameters
    ----------
    trip : value comparable with ``stop_times['trip_id']``
        Identifier of the trip to process.

    Returns
    -------
    list
        One value per row of the trip, in minutes since midnight. An empty
        list when the trip has no rows. Rows after the last known time cannot
        be interpolated and are returned as NaN so the list always lines up
        with the trip's rows.
    """
    raw_times = stop_times.loc[stop_times['trip_id'] == trip, 'arrival_time']
    times = [minSinceMidnight(x) for x in raw_times]

    # a trip without any rows has nothing to interpolate
    if not times:
        return []

    last_scheduled_time = times[0]
    last_index = 0
    values = [last_scheduled_time]

    for position in range(1, len(times)):
        if pd.isnull(times[position]):
            continue

        # spread the gap evenly from the previous known time to this one;
        # the first point duplicates the previous known time, so skip it
        segment = np.linspace(last_scheduled_time, times[position],
                              num=(position - last_index + 1))
        values.extend(segment[1:])

        last_scheduled_time = times[position]
        last_index = position

    # rows after the last known time have no right-hand anchor: keep them as
    # NaN instead of silently dropping them and shortening the result
    values.extend([np.nan] * (len(times) - len(values)))

    return values

In [None]:
# unchanging data

# Service dates to evaluate, written as YYYYMMDD strings:
# one weekday, one Saturday and one Sunday.
weekday = "20181101"
saturday = "20181103"
sunday = "20181104"

# Lookup from datetime's weekday() number (0-6) to the lowercase day name;
# the names are used further down as calendar column names in queries.
days_of_week = dict(enumerate([
    "monday",
    "tuesday",
    "wednesday",
    "thursday",
    "friday",
    "saturday",
    "sunday",
]))

# route_type values grouped by mode
bus_types = [3]
rail_types = list(range(3))
ferry_types = [4]

# Time-window filters, as strings for pd.DataFrame.query.
# Bounds are in the units of the "interpolated" column (minutes since
# midnight), e.g. 360 = 06:00 and 1320 = 22:00; both ends are inclusive.
_TIME_WINDOW = "interpolated >= %d and interpolated <= %d"

am_peak_query = _TIME_WINDOW % (360, 600)
pm_peak_query = _TIME_WINDOW % (900, 1140)
weekday_range_query = _TIME_WINDOW % (360, 1320)
saturday_range_query = _TIME_WINDOW % (480, 1320)
sunday_range_query = _TIME_WINDOW % (480, 1320)

# Schema for the dict object holding headways for a single stop.
# Top level: stop name and coordinates, plus one entry per direction (keys 0
# and 1). Each direction entry lists what serves the stop and holds one value
# per analysis period; every numeric field starts out as NaN.
# np.nan is used rather than the np.NaN alias, which NumPy 2.0 removed.

dataHolderSpec = {
    "name": '',
    "longitude": np.nan,
    "latitude": np.nan,
    0: {
        "served_by": [],
        "AM Peak": np.nan,
        "PM Peak": np.nan,
        "Weekdays": np.nan,
        "Saturday": np.nan,
        "Sunday": np.nan,
    },
    1: {
        "served_by": [],
        "AM Peak": np.nan,
        "PM Peak": np.nan,
        "Weekdays": np.nan,
        "Saturday": np.nan,
        "Sunday": np.nan,
    },
}


# starting index
# NOTE(review): not referenced anywhere else in the visible part of this
# file -- confirm whether it is still needed.
start_index = 0

## Define agencies
These cells offer a few different ways to define the agency to be analyzed.

In [None]:
# Names of the folders sitting directly inside gtfs/.
# Only the first os.walk() entry is consumed, so the rest of the tree is not
# traversed; if gtfs/ does not exist the list is simply empty.
list_of_agencies = next(os.walk('gtfs'), (None, [], []))[1]

In [None]:
# Feed folder to analyse, chosen by hand; `path` is the prefix used when the
# feed's files are read.
# Alternative feed: "gtfs/sfmta--60/"
path = agency = "gtfs/la-metro--184/"

In [None]:
# Alternative to the hand-picked feed: choose one of the folders found under
# gtfs/ at random.
rand_agency = list_of_agencies[np.random.randint(len(list_of_agencies))]

path = "gtfs/" + rand_agency + "/"

# Keep `agency` in step with `path`: the service-issue diagnostic further
# down prints `agency`, which would otherwise still name a different feed.
agency = path

path

## Load GTFS data

In [None]:
# Timestamp taken before loading; subtracted from finishTime when the total
# run time is printed.
startTime = time.time()

In [None]:
# Read the feed's tables from `path` and report how many rows each one has.

# The agency name is used in output file names, so "/" is replaced.
agency_name = pd.read_csv(path + 'agency.txt')['agency_name'][0]
agency_name = agency_name.replace("/", "-")
print(agency_name)

trips = pd.read_csv(path + 'trips.txt')
print(len(trips), "trips")

routes = pd.read_csv(path + 'routes.txt')
print(len(routes), "routes")

stops = pd.read_csv(path + 'stops.txt')
print(len(stops), "stops")

stop_times = pd.read_csv(path + 'stop_times.txt')
print(len(stop_times), "stop times")

# calendar.txt may be absent; `calend` records whether it was loaded so that
# later cells can fall back to calendar_dates.txt alone.
try:
    calendar = pd.read_csv(path + "calendar.txt")
except FileNotFoundError:
    calend = False
else:
    calend = True
    print(len(calendar), "schedules")

# calendar_dates.txt may be absent as well; an empty table with the columns
# read later keeps the rest of the notebook running (zero exception dates).
try:
    calendar_dates = pd.read_csv(path + "calendar_dates.txt")
except FileNotFoundError:
    calendar_dates = pd.DataFrame(columns=['service_id', 'date', 'exception_type'])
print(len(calendar_dates), "exception dates")

## Determine the schedules (service_ids) to be used for analysis

In [None]:
# Index the calendar_dates rows by date:
#   exceptions[date string][exception_type] -> list of service_ids
# Every date starts with an empty list for exception types 1 and 2.

exceptions = {}

exception_rows = zip(
    calendar_dates['date'].values,
    calendar_dates['service_id'].values,
    calendar_dates['exception_type'].values,
)

for exception_date, exception_service_id, exception_type in exception_rows:
    services_by_type = exceptions.setdefault(str(exception_date), {1: [], 2: []})
    services_by_type[exception_type].append(exception_service_id)

# Work out the day-of-week name of each evaluation date; the names are the
# calendar columns tested when the service_ids in use are selected.

def _day_name(yyyymmdd):
    """Return the lowercase day-of-week name for a YYYYMMDD date string."""
    year = int(yyyymmdd[:4])
    month = int(yyyymmdd[4:6])
    day = int(yyyymmdd[6:])
    return days_of_week[datetime.datetime(year, month, day).weekday()]

weekday_day, saturday_day, sunday_day = (
    _day_name(date) for date in (weekday, saturday, sunday)
)

# Select the service_ids used for each evaluation date.
#
# With calendar.txt loaded: a schedule qualifies when it runs on that day of
# the week and has started by the evaluation date. The end_date check is
# applied only when calendar.txt looks current, i.e. at least one schedule
# ends after the earliest evaluation date; if none does, the schedules are
# taken anyway.
# Without calendar.txt: service is defined by calendar_dates additions
# (exception type 1) only.

def _calendar_query(day_column, date, enforce_end_date):
    """Build the calendar filter string for one evaluation date."""
    clauses = ["(%s == 1)" % day_column, "(%s >= start_date)" % date]
    if enforce_end_date:
        clauses.append("(%s <= end_date)" % date)
    return " & ".join(clauses)


if calend:
    # `calendar` exists only when calendar.txt was loaded, so it is touched
    # exclusively inside this branch
    earliest_date = min(int(weekday), int(saturday), int(sunday))
    calendar_is_current = bool((calendar['end_date'] > earliest_date).any())

    weekday_query = _calendar_query(weekday_day, weekday, calendar_is_current)
    saturday_query = _calendar_query(saturday_day, saturday, calendar_is_current)
    sunday_query = _calendar_query(sunday_day, sunday, calendar_is_current)

    # LISTS OF SERVICE_IDs FOR USE IN ANALYSIS
    weekday_service = list(calendar.query(weekday_query)['service_id'])
    saturday_service = list(calendar.query(saturday_query)['service_id'])
    sunday_service = list(calendar.query(sunday_query)['service_id'])
else:
    # copies are taken so later edits to these lists leave `exceptions`
    # untouched; a date without any exception yields an empty list
    weekday_service = list(exceptions.get(weekday, {}).get(1, []))
    saturday_service = list(exceptions.get(saturday, {}).get(1, []))
    sunday_service = list(exceptions.get(sunday, {}).get(1, []))

# flag feeds for which any of the three days ended up without service
if not (weekday_service and saturday_service and sunday_service):
    print(agency.upper(), "SERVICE ISSUE -- PLEASE FIX")

Merge route, trip, and stop_time data together to produce a comprehensive dataset for analysis.

In [None]:
# Build the combined table: one row per stop_time, carrying the route,
# service, direction and route type of the trip it belongs to.

trip_info = trips[["route_id", "service_id", "trip_id", "direction_id"]]
route_info = routes[['route_id', 'route_type']]

# attach route_type to every trip, then put trip_id first
trip_route_merged = trip_info.merge(route_info, on="route_id")
trip_route_merged = trip_route_merged[["trip_id", "route_id", "service_id", "direction_id", "route_type"]]

# When the trips table holds trip_id as text, join on a text copy of the
# stop_times trip_id; otherwise join on trip_id directly.
if trips['trip_id'].dtype == object:
    stop_times['trip_id_str'] = stop_times['trip_id'].astype("str")
    stop_times_key = "trip_id_str"
else:
    stop_times_key = "trip_id"

# left join: every stop_time row is kept, matched or not
stop_times_merged = stop_times[[stop_times_key, 'arrival_time', 'stop_id']].merge(
    trip_route_merged,
    right_on="trip_id",
    left_on=stop_times_key,
    how='left',
)

# Calculate an interpolated (float) time, in minutes since midnight, for
# future analysis.
#
# When more than 5% of the rows have no arrival_time, the gaps are filled
# trip by trip with interpolator(); otherwise the times are only converted.

# share of rows without an arrival time (NaN for an empty table, which
# compares False below and so takes the plain-conversion branch)
missing_share = stop_times_merged['arrival_time'].isnull().mean()

if missing_share > 0.05:
    interpolated_times = np.full(len(stop_times), np.nan)

    # Write each trip's values back at the row positions of that trip, so the
    # result stays aligned even if a trip's rows are not stored contiguously.
    # NOTE(review): interpolator() still uses the rows in table order -- it
    # assumes each trip's rows are already in stop order; confirm.
    trip_positions = stop_times.groupby('trip_id', sort=False).indices
    for trip, positions in trip_positions.items():
        interpolated_times[positions] = interpolator(trip)

    # stop_times_merged comes from a left merge of stop_times, so its rows
    # are taken to be in the same order as stop_times
    stop_times_merged['interpolated'] = interpolated_times
    print("INTERP-O-LATED")
else:
    stop_times_merged['interpolated'] = [minSinceMidnight(x) for x in stop_times_merged['arrival_time']]

stop_times_merged.head()

Dump all rail and ferry stops into their own CSVs. Remove rail and ferry stops from the <i>stop_times_merged</i> dataset.

In [None]:
def _export_mode_stops(mode_times, subfolder):
    """Write the distinct stops found in ``mode_times`` to a per-agency CSV.

    The stop ids are joined with their name and coordinates from ``stops``
    and saved to ``output/v2/<subfolder>/<agency_name>.csv``; the folder is
    created when missing. Returns the exported table, or None (and writes
    nothing) when ``mode_times`` has no rows.
    """
    if len(mode_times) == 0:
        return None

    mode_stops = pd.DataFrame(mode_times['stop_id'].unique(), columns=['stop_id'])
    mode_stops = mode_stops.merge(stops[['stop_id', 'stop_name', 'stop_lon', 'stop_lat']], on="stop_id")

    out_dir = "output/v2/" + subfolder + "/"
    os.makedirs(out_dir, exist_ok=True)
    mode_stops.to_csv(out_dir + agency_name + ".csv")
    return mode_stops


# The mode lists defined with the other constants (rail_types, ferry_types,
# bus_types) are used here instead of repeating the route_type numbers.
rail_times_merged = stop_times_merged[stop_times_merged['route_type'].isin(rail_types)]
rail_stops = _export_mode_stops(rail_times_merged, "rail")

ferry_times_merged = stop_times_merged[stop_times_merged['route_type'].isin(ferry_types)]
ferry_stops = _export_mode_stops(ferry_times_merged, "ferry")

# only bus rows take part in the headway analysis that follows
stop_times_merged = stop_times_merged[stop_times_merged['route_type'].isin(bus_types)]

Create new dataframes for stops for each time period: weekdays, Saturdays, Sundays

In [None]:
def _apply_exceptions(service_ids, date):
    """Return ``service_ids`` adjusted by the exceptions recorded for ``date``.

    Services listed under exception type 2 for that date are dropped and
    services listed under type 1 are added (without duplicates). When the
    date has no exceptions a plain copy of ``service_ids`` is returned. The
    input list is never modified.
    """
    day_exceptions = exceptions.get(date)
    if day_exceptions is None:
        return list(service_ids)

    removed = set(day_exceptions[2])
    adjusted = [service for service in service_ids if service not in removed]
    adjusted += [service for service in day_exceptions[1] if service not in adjusted]
    return adjusted


# list.remove() returns None, so collecting its results would turn the
# service list into Nones; each list is rebuilt by the helper above instead.

# WEEKDAY

weekday_service = _apply_exceptions(weekday_service, weekday)

weekday_stops = stop_times_merged[stop_times_merged['service_id'].isin(weekday_service)]
print(weekday_stops.shape)

# SATURDAY

saturday_service = _apply_exceptions(saturday_service, saturday)

saturday_stops = stop_times_merged[stop_times_merged['service_id'].isin(saturday_service)]
print(saturday_stops.shape)

# SUNDAY

sunday_service = _apply_exceptions(sunday_service, sunday)

sunday_stops = stop_times_merged[stop_times_merged['service_id'].isin(sunday_service)]
print(sunday_stops.shape)

## Headway analysis

In [None]:
# For every stop, direction and period, estimate the headway as
#   (length of the period in minutes) / (number of arrivals in the period)
# and save one row per stop with columns <period>_dir0 / <period>_dir1.

output_path = "output/v2/bus/" + agency_name + ".csv"

headways = pd.DataFrame(data=stops['stop_id'])

# parallel lists: one entry per analysis period
queries = [am_peak_query, pm_peak_query, weekday_range_query, saturday_range_query, sunday_range_query]
periods = ['am_pk', 'pm_pk', 'wkdy', 'sat', 'sun']
minute_ranges = [(600-360), (1140-900), (1320-360), (1320-480), (1320-480)]
dfs = [weekday_stops, weekday_stops, weekday_stops, saturday_stops, sunday_stops]


for period, query, minutes, df in zip(periods, queries, minute_ranges, dfs):

    # arrivals that fall inside this period's time window
    in_window = df.query(query)

    for direction in [0, 1]:
        counts = in_window.loc[in_window['direction_id'] == direction, 'stop_id'].value_counts()

        # Build the two columns explicitly from the counts' index and values;
        # the names value_counts() gives its result differ between pandas
        # versions, so they are not relied on.
        results = pd.DataFrame({
            "stop_id": counts.index,
            "%s_dir%d" % (period, direction): minutes / counts.values,
        })

        # left join: stops without arrivals keep NaN for this column
        headways = headways.merge(results, on="stop_id", how='left')

output = stops[["stop_id", "stop_name", "stop_lon", "stop_lat"]].merge(headways, on="stop_id", how="outer")

# make sure the destination folder exists before writing
os.makedirs(os.path.dirname(output_path), exist_ok=True)
output.to_csv(output_path)

# output.head()

In [None]:
# Timestamp taken once the analysis cells have run; compared with startTime
# to report the total run time.
finishTime = time.time()

In [None]:
# Report the run time for this agency (difference of the two time.time()
# readings, i.e. seconds).
elapsed_seconds = finishTime - startTime
print(agency_name.upper(), "-- total time:", elapsed_seconds)

## Load and evaluate the headway data

In [None]:
# Read back the headway table saved to output_path by the analysis cell.
# NOTE(review): that file was written with to_csv() defaults, so it
# presumably carries the row index as an extra unnamed first column --
# confirm before relying on column positions.
loaded = pd.read_csv(output_path)
# loaded.head()

In [None]:
# Filter string selecting the high-quality-transit stops: for every period
# at least one direction must have a headway at or below the period's limit.
# (period column prefix, maximum headway)
_hqt_limits = [
    ("am_pk", 15),
    ("pm_pk", 15),
    ("wkdy", 20),
    ("sat", 30),
    ("sun", 30),
]

hqt_filter = " & ".join(
    "((%s_dir0 <= %d) | (%s_dir1 <= %d))" % (prefix, limit, prefix, limit)
    for prefix, limit in _hqt_limits
)
# hqt_filter

In [None]:
# Keep the stops that satisfy the filter and save them, unless none qualify.
hqt = loaded.query(hqt_filter)

if len(hqt):
    hqt_path = "output/v2/bus-hqt/" + agency_name + " (HQT).csv"
    hqt.to_csv(hqt_path)

In [None]:
# Share of stops that qualify. pct_qual itself is a 0-1 fraction; it is
# scaled to a percentage for printing so the number matches the "%" label.
# An empty table reports 0 instead of dividing by zero.
pct_qual = len(hqt) / len(loaded) if len(loaded) else 0.0
print(len(hqt), "of", len(loaded), "stops --", round((pct_qual*100), 1), "%")

In [None]:
# Summary line plus a map-like scatter plot: all stops in black, the
# qualifying (HQT) stops drawn over them in red.

# fraction of qualifying stops; 0 for an empty table instead of a
# division by zero
pct_qual = len(hqt) / len(loaded) if len(loaded) else 0.0
print("###############")
print(len(hqt), "of", len(loaded), "stops --", round((pct_qual*100),1), "%")
print("###############")

# Start directly with the figure that is drawn on: clearing "the current
# figure" first would create an extra blank one when none is open.
plt.figure(figsize=(20,20))

plt.plot(loaded['stop_lon'], loaded['stop_lat'], 'ok')
plt.plot(hqt['stop_lon'], hqt['stop_lat'], 'or')

plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.legend(["All", "HQT"])
plt.title(agency_name)

# make sure the destination folder exists before saving
os.makedirs("output/v2/img/", exist_ok=True)
plt.savefig("output/v2/img/" + agency_name + ".pdf")

plt.show()