# Load up our libraries

In [2]:
# all purpose
import datetime, geoplotlib, re
from math import radians, cos, sin, asin, sqrt

# for talking to SQL databases
import psycopg2
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database

# json and XML parsing
import json
from pprint import pprint
from urllib2 import urlopen
from pyquery import PyQuery as pq

# for making maps
import geoplotlib
from geoplotlib.utils import BoundingBox
from IPython.display import Image

# all purpose data analysis and plotting
from scipy import stats
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns



Define this function for computing distances in meters from (lat,lon) coordinates

In [19]:
# We will need this function to compute the distance between two (lat,lon) points, in meters
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    meters = 6367 * c * 1000
    return meters

## Load data from SQL database (necessary only to load Muni routes and estlabish connection to database)

We need to connect to the PostgresSQL database that I am reading the NextBus Muni data into, which is called 'sf_muni_arrivals' in our case.

**This cell must be run.**

In [6]:
dbname = 'sf_muni_arrivals'
username = 'dstone'
table = 'nextbus_write_2016_01_15'

# Open up an engine, that we will use to create the database if it doesn't exist
engine = create_engine('postgres://%s@localhost/%s'%(username,dbname))

if not database_exists(engine.url):
    create_database(engine.url)
    
# If I want to filter the data first:
# connect:
db_con = None
db_con = psycopg2.connect(database = dbname, user = username)

These commands load the entire database into a pandas dataframe. I only used this to grab the list of Muni routes. It is not necessary below.

In [6]:
# the table name is 'nextbus':
# only run this code if the list_of_muni_routes needs to be reestablished
# sql_query = """
# SELECT * FROM {table};
# """.format(table=table)
# nbdata = pd.read_sql_query(sql_query,db_con)

In [3]:
# routeslist = pd.unique(nbdata['route'].ravel())
# list_of_muni_routes = pd.DataFrame(data=routeslist, index = range(len(routeslist)), columns=['route'])

# # Clean up a bit
# list_of_muni_routes = list_of_muni_routes[(list_of_muni_routes.route.isnull() == False) & (list_of_muni_routes.route != 'Inspectors') & (list_of_muni_routes.route != 'Training')]
# list_of_muni_routes = list_of_muni_routes[list_of_muni_routes.route != '']
# np.save('list_of_muni_routes',np.asarray(list_of_muni_routes).ravel())
list_of_muni_routes = np.load('list_of_muni_routes.npy')

## Building algorithm to construct database entries with predictions times as bunching qualifier


Start with route 30

Get last stops of inbound and outbound buses for this route

In [160]:
route_name = 10
url_get_route_config='http://webservices.nextbus.com/service/publicXMLFeed?command=routeConfig&a=sf-muni&r='+str(route_name)
route_config = pq(urlopen(url_get_route_config).read())

In [161]:
for direction in pq(route_config('direction')):
    dirIO = pq(direction).attr('tag')
    stop_tag = pq(direction[-1]).attr('tag')
    if 'I' in str(dirIO):
        print 'Found inbound'
        stop_tag_inbound = stop_tag
    if 'O' in str(dirIO):
        print 'Found outbound'
        stop_tag_outbound = stop_tag

Found outbound
Found inbound


Get predictions in pairs

In [162]:
url_get_predictions_inbound='http://webservices.nextbus.com/service/publicXMLFeed?command=predictions&a=sf-muni&s='+str(stop_tag_inbound)+'&r='+str(route_name)
predictions_I = pq(urlopen(url_get_predictions_inbound).read())

In [163]:
df_I = pd.DataFrame()
indexer=0
# Deal with lack of pairs later, but throw that as an error and don't display bunching
for i in range(len(predictions_I('prediction'))-1):
    if pq(predictions_I('prediction')[i]).attr.affectedByLayover is None or pq(predictions_I('prediction')[i+1]).attr.affectedByLayover is None: 
        prediction1 = pq(predictions_I('prediction')[i])
        prediction2 = pq(predictions_I('prediction')[i+1])
        df_tmp = pd.DataFrame({'vehicle1': prediction1.attr.vehicle, \
                            'vehicle2': prediction2.attr.vehicle, \
                            'pred1': prediction1.attr.minutes, \
                            'pred2': prediction2.attr.minutes}, index=[indexer])
        df_I = df_I.append(df_tmp)
        indexer += 1

In [164]:
url_get_predictions_outbound='http://webservices.nextbus.com/service/publicXMLFeed?command=predictions&a=sf-muni&s='+str(stop_tag_outbound)+'&r='+str(route_name)
predictions_O = pq(urlopen(url_get_predictions_outbound).read())

In [165]:
df_O = pd.DataFrame()
indexer=0
# Deal with lack of pairs later, but throw that as an error and don't display bunching
for i in range(len(predictions_O('prediction'))-1):
    if pq(predictions('prediction')[i]).attr.affectedByLayover is None or pq(predictions_O('prediction')[i+1]).attr.affectedByLayover is None: 
        prediction1 = pq(predictions_O('prediction')[i])
        prediction2 = pq(predictions_O('prediction')[i+1])
        df_tmp = pd.DataFrame({'vehicle1': prediction1.attr.vehicle, \
                            'vehicle2': prediction2.attr.vehicle, \
                            'pred1': prediction1.attr.minutes, \
                            'pred2': prediction2.attr.minutes}, index=[indexer])
        df_O = df_O.append(df_tmp)
        indexer += 1

Now go to realtime predictions and match the vehicle numbers (vehicles should have a unique ID)

In [166]:
url_get_realtime_posits='http://webservices.nextbus.com/service/publicXMLFeed?command=vehicleLocations&a=sf-muni&t=0&r='+str(route_name)
realtime_posits = pq(urlopen(url_get_realtime_posits).read())
1453877206275

In [167]:
df_total = pd.DataFrame()
for df_tmp in [df_I, df_O]: 
    time_stamp = datetime.datetime.utcfromtimestamp(int(pq(pq(realtime_posits('vehicle')[-1]).siblings()[-1]).attr('time'))/1000)
    # for each pair in df_I, find the match in the realtime_posits
    df1 = pd.DataFrame()
    df2 = pd.DataFrame()
    for i in range(df_tmp.shape[0]):
        for vehicle in realtime_posits('vehicle'):
            v = pq(vehicle)
            if v.attr.id == df_tmp.loc[i]['vehicle1']:
                df1 = pd.DataFrame({'ind':i, 'time': time_stamp,'lat_x': float(v.attr.lat), 'lon_x': float(v.attr.lon), 'speed_x': float(v.attr.speedKmHr), 'route_x': str(v.attr.routeTag), 'pred_x': df_tmp.loc[i]['pred1']},index=[0])
                #df1 = df1.append(df1_tmp)
            elif v.attr.id == df_tmp.loc[i]['vehicle2']:
                df2 = pd.DataFrame({'ind':i, 'lat_y': float(v.attr.lat), 'lon_y': float(v.attr.lon), 'speed_y': float(v.attr.speedKmHr), 'pred_y': df_tmp.loc[i]['pred2']},index=[0])
                #df2 = df2.append(df2_tmp)
        if df1.empty or df2.empty:
            continue
        else:
            df_tmp1 = pd.merge(left=df1, right=df2)
            df_total = df_total.append(df_tmp1)

# some cleaning
df_total.drop('ind', inplace=True, axis=1)
df_total.index = np.arange(df_total.shape[0])
df_total['dist'] = df_total.apply(lambda row: haversine(row['lat_x'],row['lon_x'],row['lat_y'],row['lon_y']), axis=1)

In [168]:
df_total['dist'] = df_total.apply(lambda row: haversine(row['lat_x'],row['lon_x'],row['lat_y'],row['lon_y']), axis=1)

In [169]:
df_total

Unnamed: 0,lat_x,lon_x,pred_x,route_x,speed_x,time,lat_y,lon_y,pred_y,speed_y,dist
0,37.79248,-122.43439,0,10,7,2016-01-27 01:49:22,37.79278,-122.40137,22,9,3669.39477
1,37.79278,-122.40137,22,10,9,2016-01-27 01:49:22,37.7775,-122.39444,33,18,1191.949541
2,37.7775,-122.39444,33,10,18,2016-01-27 01:49:22,37.75455,-122.40207,53,25,1608.146356
3,37.75455,-122.40207,53,10,25,2016-01-27 01:49:22,37.75305,-122.40611,70,0,457.74569
4,37.75105,-122.39845,5,10,12,2016-01-27 01:49:22,37.7886,-122.40049,33,0,2247.300688
5,37.7886,-122.40049,33,10,0,2016-01-27 01:49:22,37.79866,-122.4014,44,0,607.501916
6,37.79866,-122.4014,44,10,0,2016-01-27 01:49:22,37.79243,-122.43514,65,0,3767.686087


## Defining the 'enter_pairs_for_route_into_database(route)'
Now make one big function that does this for each route

In [206]:
def temp_func(specified_route):
    # Get last stops of inbound and outbound buses for this route
    url_get_route_config='http://webservices.nextbus.com/service/publicXMLFeed?command=routeConfig&a=sf-muni&r='+str(specified_route)
    route_config = pq(urlopen(url_get_route_config).read())
    for direction in pq(route_config('direction')):
        dirIO = pq(direction).attr('tag')
        stop_tag = pq(direction[-1]).attr('tag')
        print 'stop_tag = '+str(stop_tag)
        if 'I' in str(dirIO):
            stop_tag_inbound = stop_tag
        if 'O' in str(dirIO):
            stop_tag_outbound = stop_tag

In [209]:
list_of_muni_routes[:3]

array(['29', '21', '28'], dtype=object)

In [20]:
def enter_pairs_for_route_into_database(specified_route, specified_engine, specified_table):
    # Get last stops of inbound and outbound buses for this route
    url_get_route_config='http://webservices.nextbus.com/service/publicXMLFeed?command=routeConfig&a=sf-muni&r='+str(specified_route)
    route_config = pq(urlopen(url_get_route_config).read())
    print 'route: '+str(specified_route)
    print url_get_route_config
    for direction in pq(route_config('direction')):
        dirIO = pq(direction).attr('tag')
        stop_tag = pq(direction[-1]).attr('tag')
        print 'stop_tag: '+str(stop_tag)
        if 'I' in str(dirIO):
            stop_tag_inbound = stop_tag
            print 'stop_tag_inbound = '+str(stop_tag_inbound)
        if 'O' in str(dirIO):
            stop_tag_outbound = stop_tag
            print 'stop_tag_outbound = '+str(stop_tag_outbound)

    # Get predictions in pairs
    # First inbound
    try:
        stop_tag_inbound
    except:
        stop_tag_inbound = None
        df_I = pd.DataFrame()
    if stop_tag_inbound is not None:
        print 'stop_tag_inbound here'
        url_get_predictions_inbound='http://webservices.nextbus.com/service/publicXMLFeed?command=predictions&a=sf-muni&s='+str(stop_tag_inbound)+'&r='+str(specified_route)
        predictions_I = pq(urlopen(url_get_predictions_inbound).read())
        df_I = pd.DataFrame()
        indexer=0
        # Deal with lack of pairs later, but throw that as an error and don't display bunching
        for i in range(len(predictions_I('prediction'))-1):
            if pq(predictions_I('prediction')[i]).attr.affectedByLayover is None or pq(predictions_I('prediction')[i+1]).attr.affectedByLayover is None: 
                prediction1 = pq(predictions_I('prediction')[i])
                prediction2 = pq(predictions_I('prediction')[i+1])
                #print 'prediction1: '+str(prediction1)
                #print 'prediction2: '+str(prediction2)
                df_tmp = pd.DataFrame({'vehicle1': prediction1.attr.vehicle, \
                                    'vehicle2': prediction2.attr.vehicle, \
                                    'pred1': prediction1.attr.minutes, \
                                    'pred2': prediction2.attr.minutes}, index=[indexer])
                df_I = df_I.append(df_tmp)
                indexer += 1
 
    # Now outbound
    try:
        stop_tag_outbound
    except:
        stop_tag_outbound = None
        df_O = pd.DataFrame()
    if stop_tag_outbound is not None:
        print 'stop_tag_outbound here'
        url_get_predictions_outbound='http://webservices.nextbus.com/service/publicXMLFeed?command=predictions&a=sf-muni&s='+str(stop_tag_outbound)+'&r='+str(specified_route)
        predictions_O = pq(urlopen(url_get_predictions_outbound).read())
        df_O = pd.DataFrame()
        indexer=0
        # Deal with lack of pairs later, but throw that as an error and don't display bunching
        for i in range(len(predictions_O('prediction'))-1):
            if pq(predictions_O('prediction')[i]).attr.affectedByLayover is None or pq(predictions_O('prediction')[i+1]).attr.affectedByLayover is None: 
                prediction1 = pq(predictions_O('prediction')[i])
                prediction2 = pq(predictions_O('prediction')[i+1])
                df_tmp = pd.DataFrame({'vehicle1': prediction1.attr.vehicle, \
                                    'vehicle2': prediction2.attr.vehicle, \
                                    'pred1': prediction1.attr.minutes, \
                                    'pred2': prediction2.attr.minutes}, index=[indexer])
                df_O = df_O.append(df_tmp)
                indexer += 1
                
    if df_I.empty and df_O.empty:
        print 'Not doing anything'
        return
    elif df_I.empty:
        df_array = [df_O]
    elif df_O.empty:
        df_array = [df_I]
    else:
        df_array = [df_I, df_O]
    # Now go to realtime predictions and match the vehicle numbers (vehicles should have a unique ID)
    url_get_realtime_posits='http://webservices.nextbus.com/service/publicXMLFeed?command=vehicleLocations&a=sf-muni&t=0&r='+str(specified_route)
    realtime_posits = pq(urlopen(url_get_realtime_posits).read())
    df_total = pd.DataFrame()
    for df_tmp in df_array: 
        time_stamp = datetime.datetime.utcfromtimestamp(int(pq(pq(realtime_posits('vehicle')[-1]).siblings()[-1]).attr('time'))/1000)
        # for each pair in df_I, find the match in the realtime_posits
        df1 = pd.DataFrame()
        df2 = pd.DataFrame()
        for i in range(df_tmp.shape[0]):
            for vehicle in realtime_posits('vehicle'):
                v = pq(vehicle)
                if v.attr.id == df_tmp.loc[i]['vehicle1']:
                    df1 = pd.DataFrame({'ind':i, 'time': time_stamp,'lat_x': float(v.attr.lat), 'lon_x': float(v.attr.lon), 'speed_x': float(v.attr.speedKmHr), 'route_x': str(v.attr.routeTag), 'pred_x': df_tmp.loc[i]['pred1']},index=[0])
                    #df1 = df1.append(df1_tmp)
                elif v.attr.id == df_tmp.loc[i]['vehicle2']:
                    df2 = pd.DataFrame({'ind':i, 'lat_y': float(v.attr.lat), 'lon_y': float(v.attr.lon), 'speed_y': float(v.attr.speedKmHr), 'pred_y': df_tmp.loc[i]['pred2']},index=[0])
                    #df2 = df2.append(df2_tmp)
            if df1.empty or df2.empty:
                continue
            else:
                df_tmp1 = pd.merge(left=df1, right=df2)
                df_total = df_total.append(df_tmp1)

    if not df_total.empty:
        print 'yay!'
        
    # check to make sure we've actually found something
    if df_total.empty:
        # don't add anything this time around (remember, this just adds buses from this route for this specific call)
        return
    else:
        # some cleaning
        df_total.drop('ind', inplace=True, axis=1)
        df_total.index = np.arange(df_total.shape[0])
        # add distance after computing for this route
        df_total['dist'] = df_total.apply(lambda row: haversine(row['lat_x'],row['lon_x'],row['lat_y'],row['lon_y']), axis=1)

        # Now right this beast to a SQL database specified by input
        df_total.to_sql(specified_table, specified_engine, if_exists='append')

In [204]:
enter_pairs_for_route_into_database('30', engine, 'nextbus_realtime_with_preds')

Not doing anything


In [205]:
enter_pairs_for_route_into_database('21', engine, 'nextbus_realtime_with_preds')

Not doing anything


In [21]:
for route in list_of_muni_routes[:3]:
    enter_pairs_for_route_into_database(route, engine, 'testing_loop')

route: 29
http://webservices.nextbus.com/service/publicXMLFeed?command=routeConfig&a=sf-muni&r=29
stop_tag: 34648
stop_tag_outbound = 34648
stop_tag: 33706
stop_tag_inbound = 33706
stop_tag_inbound here
stop_tag_outbound here
yay!
route: 21
http://webservices.nextbus.com/service/publicXMLFeed?command=routeConfig&a=sf-muni&r=21
stop_tag: 37499
stop_tag_outbound = 37499
stop_tag: 37832
stop_tag_inbound = 37832
stop_tag_inbound here
stop_tag_outbound here
yay!
route: 28
http://webservices.nextbus.com/service/publicXMLFeed?command=routeConfig&a=sf-muni&r=28
stop_tag: 34341
stop_tag_outbound = 34341
stop_tag: 7799
stop_tag_inbound = 7799
stop_tag_inbound here
stop_tag_outbound here
yay!
