# Load up our libraries

In [3]:
# all purpose
import datetime, geoplotlib, re
from math import radians, cos, sin, asin, sqrt

# for talking to SQL databases
import psycopg2
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database

# json and XML parsing
import json
from pprint import pprint
from urllib2 import urlopen
from pyquery import PyQuery as pq

# for making maps
import geoplotlib
from geoplotlib.utils import BoundingBox
from IPython.display import Image

# all purpose data analysis and plotting
from scipy import stats
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Define this function for computing distances in meters from (lat,lon) coordinates

In [4]:
# We will need this function to compute the distance between two (lat,lon) points, in meters
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    meters = 6367 * c * 1000
    return meters

## Load data from SQL database (necessary only to load Muni routes)

We need to connect to the PostgresSQL database that I am reading the NextBus Muni data into, which is called 'sf_muni_arrivals' in our case

In [5]:
dbname = 'sf_muni_arrivals'
username = 'dstone'
table = 'nextbus_write_2016_01_15'

# Open up an engine, that we will use to create the database if it doesn't exist
engine = create_engine('postgres://%s@localhost/%s'%(username,dbname))

if not database_exists(engine.url):
    create_database(engine.url)

The following command loads the ENTIRE SQL database from above into a pandas dataframe. In the future, I will want to sort the data first with SQL commands, then load it into a pandas dataframe. I can do that with the commented out text.

In [6]:
# If I want to filter the data first:
# connect:
db_con = None
db_con = psycopg2.connect(database = dbname, user = username)
# the table name is 'nextbus':
# only run this code if the list_of_muni_routes needs to be reestablished
# sql_query = """
# SELECT * FROM {table};
# """.format(table=table)
# nbdata = pd.read_sql_query(sql_query,db_con)

In [28]:
# routeslist = pd.unique(nbdata['route'].ravel())
# list_of_muni_routes = pd.DataFrame(data=routeslist, index = range(len(routeslist)), columns=['route'])

# # Clean up a bit
# list_of_muni_routes = list_of_muni_routes[(list_of_muni_routes.route.isnull() == False) & (list_of_muni_routes.route != 'Inspectors') & (list_of_muni_routes.route != 'Training')]
# list_of_muni_routes = list_of_muni_routes[list_of_muni_routes.route != '']
# np.save('list_of_muni_routes',np.asarray(list_of_muni_routes).ravel())
list_of_muni_routes = np.load('list_of_muni_routes.npy')

## Compute distance distributions for each route

In [116]:
# All documentation for this function can be found in the exploration I did above for route 30 specifically
def write_dists_for_all_routes_to_disk():
    def compute_dists_for_route(route_num, database_connection, table_to_read):
        sql_query = '''SELECT * FROM {table} WHERE route = \'{route}\';'''.format(table=table_to_read,route=route_num)
        
        route_dat = pd.read_sql_query(sql_query, database_connection)
        
        # Need to coarse-grain the time
        route_dat.time = route_dat.apply(lambda row: compute_datetime(row['time']), axis=1)

        trips_in_order_for_route = np.sort(pd.unique(route_dat[route_dat.trip != ''].trip.ravel()))

        coords_dict = {}
        for i in range(len(trips_in_order_for_route)):
            my_key = "trip_"+str(i)
            if my_key not in coords_dict:
                coords_dict[my_key] = 0

        for i in range(len(trips_in_order_for_route)):
            trip_temp = 'trip_'+str(i)
            tmp_vehicles = pd.unique(route_dat[(route_dat.trip == trips_in_order_for_route[i])].vehicle.ravel())
            coords_dict[trip_temp] = route_dat[(route_dat.trip == trips_in_order_for_route[i]) & (route_dat.vehicle == tmp_vehicles[0])][['lat','lon','time']]
            coords_dict[trip_temp].drop_duplicates(subset='time', inplace=True)
            coords_dict[trip_temp].index = coords_dict[trip_temp].time

        dist_tmp = pd.DataFrame()

        for i in range(len(trips_in_order_for_route)-1):
            trip_now = 'trip_'+str(i)
            trip_next = 'trip_'+str(i+1)
            #print 'Merging {trip1} and {trip2}'.format(trip1=trip_now,trip2=trip_next)
            tmp = pd.merge(left=coords_dict[trip_now],right=coords_dict[trip_next],on='time')
            if tmp.shape[0] != 0:
                tmp['dist'] = tmp.apply(lambda row: haversine(row['lat_x'],row['lon_x'],row['lat_y'],row['lon_y']), axis=1)
                dist_tmp = pd.concat([dist_tmp, tmp[['time','dist']]])
            else:
                continue

        dist_tmp.index = range(len(dist_tmp))

        return dist_tmp
    
    route_hist_dict={}
    for i in list_of_muni_routes:
            my_key = 'route_'+str(i)
            if my_key not in route_hist_dict:
                route_hist_dict[my_key] = 0
    #build a dict here, then fill in each element of the dict 
    #   with the corresponding 'compute_dists_for_route' so 
    #   we have everything ready to go for each route
    for route_num in list_of_muni_routes:
        temp_Route = 'route_'+str(route_num)
        route_hist_dict[temp_Route] = compute_dists_for_route(route_num, db_con, 'nextbus_write_2016_01_15')
        if len(route_hist_dict[temp_Route]) != 0:
            path_to_file = 'muni_route_distance_distributions/'+temp_Route+'_distribution'
            np.save(path_to_file,np.asarray(route_hist_dict[temp_Route]['dist']).ravel())

For each route in the Muni list, write the array of distances between pairs of buses to disk for reading later from the directory './muni_route_distance_distributions'

In [117]:
write_dists_for_all_routes_to_disk()

## Write distance pairs to database with generate_route_pair function 
Pulls data from SQL database, computes entry in final database, which is pairs subsequent buses on the same route

Now we write the function here. Idea: run over 'list_of_muni_routes', for each route, create the pairs (there will be many) of entries (this is done in various sections of code below), write them to a SQL database, db and table of your choice

In [118]:
# To convert the timestamps from SQL database to datetime objects
def compute_datetime(timestamp):
    return datetime.datetime(pd.to_datetime(timestamp).year, 
                   pd.to_datetime(timestamp).month, 
                   pd.to_datetime(timestamp).day, 
                   pd.to_datetime(timestamp).hour, 
                   pd.to_datetime(timestamp).minute)

In [119]:
# this function generates a table of pairs for a given route
# assume a database connection is already established
def generate_route_pairs(route, psycopg2_engine, database_connection, table_to_read, table_to_write):
    '''route: Muni route you want to generate data for
    psycopg2_engine: engine (postgreSQL) you wish to use to write data to
    database_connection: database that engine provides connection to where you will write
    table_to_read: grab route data from this table
    table_to_write: write bus pair data to this table'''
    sql_query = '''SELECT * FROM {table} WHERE route = \'{route}\';'''.format(table=table_to_read,route=route)
    df_route = pd.read_sql_query(sql_query, database_connection)
    
    # Need to coarse-grain the time
    df_route.time = df_route.apply(lambda row: compute_datetime(row['time']), axis=1)
    
    route_trips = np.sort(pd.unique(df_route[df_route.trip != ''].trip.ravel()))
   
    coords_dict = {}
    for i in range(len(route_trips)):
        my_key = "trip_"+str(i)
        if my_key not in coords_dict:
            coords_dict[my_key] = 0
            
    # some editing to be done here
    for i in range(len(route_trips)):
        trip_temp = 'trip_'+str(i)
        tmp_vehicles = pd.unique(df_route[(df_route.trip == route_trips[i])].vehicle.ravel())
        coords_dict[trip_temp] = df_route[(df_route.trip == route_trips[i]) & (df_route.vehicle == tmp_vehicles[0])]
        coords_dict[trip_temp].drop_duplicates(subset='time', inplace=True)
        coords_dict[trip_temp].index = coords_dict[trip_temp].time
        
    
    #dist_tmp = pd.DataFrame()
    
    # Compute the distance between each
    def compute_distance_percentile(route_num, lat1, lon1, lat2, lon2):
        tmp_dist = haversine(lat1,lon1,lat2,lon2)
        path_to_dist_distribution = 'muni_route_distance_distributions/route_'+str(route_num)+'_distribution.npy'
        route_dist_distribution = np.load(path_to_dist_distribution)
        percentile_score = 1 - stats.percentileofscore(route_dist_distribution, tmp_dist)/100
        return percentile_score
    
    for i in range(len(route_trips)-1):
        trip_now = 'trip_'+str(i)
        trip_next = 'trip_'+str(i+1)
        #print 'Merging {trip1} and {trip2}'.format(trip1=trip_now,trip2=trip_next)
        tmp = pd.merge(left=coords_dict[trip_now],right=coords_dict[trip_next],on='time')
        if tmp.shape[0] != 0:
            tmp['dist_percentile'] = tmp.apply(lambda row: compute_distance_percentile(route, row['lat_x'],row['lon_x'],row['lat_y'],row['lon_y']), axis=1)
            # Write this tuple to the SQL table
            tmp.to_sql(table_to_write, psycopg2_engine, if_exists='append')
        else:
            continue

Now for each route, add to the 'final' database an entry for each pair in that route

In [121]:
for route_num in list_of_muni_routes:
    generate_route_pairs(str(route_num), engine, db_con, 'nextbus_write_2016_01_15', 'vehicle_pairs_2016_01_15')

## Reading in data from Google Maps API

From there get stop ID of starting bus, from there get bus data from NextBus

In [20]:
with open('google_maps_api/second_dirs.json') as data_file:
    data = json.load(data_file)

#pprint(data)

In [25]:
route_name = str(data['routes'][0]['legs'][0]['steps'][0]['transit_details']['line']['short_name'])
departure_stop = str(data['routes'][0]['legs'][0]['steps'][0]['transit_details']['departure_stop']['name'])
departure_lat = float(data['routes'][0]['legs'][0]['steps'][0]['transit_details']['departure_stop']['location']['lat'])
departure_lon = float(data['routes'][0]['legs'][0]['steps'][0]['transit_details']['departure_stop']['location']['lng'])

In [26]:
print route_name
print departure_stop
print departure_lat
print departure_lon

19
26th St & De Haro St
37.75086
-122.40015


In [38]:
url_get_route_config='http://webservices.nextbus.com/service/publicXMLFeed?command=routeConfig&a=sf-muni&r='+str(route_name)
route_config = pq(urlopen(url_get_route_config).read())

In [39]:
for bus_stop_obj in route_config('stop'):
    bus_stop = pq(bus_stop_obj)
    if bus_stop.attr('lat') is not None:
        stop_name = str(bus_stop.attr('title'))
        stop_lat = round(float(bus_stop.attr('lat')),5)
        stop_lon = round(float(bus_stop.attr('lon')),5)
        if stop_name == departure_stop and stop_lat == departure_lat and stop_lon == departure_lon:
            stop_id = str(bus_stop.attr('stopId'))
            print 'Match at '+stop_name+' with stop id: '+stop_id
            print 'Coordinates of stop: ('+str(stop_lat)+','+str(stop_lon)+')'

Match at 26th St & De Haro St with stop id: 13516
Coordinates of stop: (37.75086,-122.40015)


In [40]:
url_get_stop_info='http://webservices.nextbus.com/service/publicXMLFeed?command=predictions&a=sf-muni&stopId='+stop_id
stop_config = pq(urlopen(url_get_stop_info).read())

In [43]:
for next_bus_obj in stop_config('predictions'):
    if str(pq(next_bus_obj).attr('routeTag')) == route_name:
        for upcoming_trips in pq(next_bus_obj)
        trip_id = pq(next_bus_obj).attr('tripTag')
        print 'Found correct bus route '+str(pq(next_bus_obj).attr('routeTag'))+', trip ID = '+ str(trip_id)

Found correct bus route 19, trip ID = None


In [44]:
!head nextbus_one_day_sf_muni_dump.csv > tmp_dump.txt

In [77]:
with open('tmp_dump.txt') as my_file:
    data = my_file.readlines()
my_file.close()

for i, line in enumerate(data,0):
    # get rid of whitespaces, parentheses in coordinates, replace comma in coordinates with '|' so it splits,
    # get rid of newlines, split on '|'
    data[i] = line.replace(',','|').replace(' ','').replace('(','').replace(')','').replace('\n','').split('|')

In [79]:
print data[0]
print data[3][1]

['vehiclid', 'receivedtimestamp', 'gpstimestamp', 'gpsfix', 'gpsspeed', 'gpsheading', 'route', 'trip']
2016-01-1400:00:00.099-08


## Build distance distributions (nightly), save to disk

## Generate database of pairs
With distance quantiles built in from previous function