# Load up our libraries

In [8]:
# all purpose
import datetime, geoplotlib, re
from math import radians, cos, sin, asin, sqrt

# for talking to SQL databases
import psycopg2
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database

# json and XML parsing
import json
from pprint import pprint
from urllib2 import urlopen
from pyquery import PyQuery as pq

# for making maps
import geoplotlib
from geoplotlib.utils import BoundingBox
from IPython.display import Image

# all purpose data analysis and plotting
from scipy import stats
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Define this function for computing distances in meters from (lat,lon) coordinates

In [5]:
# We will need this function to compute the distance between two (lat,lon) points, in meters
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    meters = 6367 * c * 1000
    return meters

# Data from PostgreSQL database I've built with API calls
### Initial exploratory analysis

## Write generate_route_pair function 
Pulls data from SQL database, computes entry in final database, which is pairs subsequent buses on the same route

We need to connect to the PostgresSQL database that I am reading the NextBus Muni data into, which is called 'sf_muni_arrivals' in our case

In [None]:
dbname = 'sf_muni_arrivals'
username = 'dstone'
table = 'nextbus_write_2016_01_15'

# Open up an engine, that we will use to create the database if it doesn't exist
engine = create_engine('postgres://%s@localhost/%s'%(username,dbname))

if not database_exists(engine.url):
    create_database(engine.url)

The following command loads the ENTIRE SQL database from above into a pandas dataframe. In the future, I will want to sort the data first with SQL commands, then load it into a pandas dataframe. I can do that with the commented out text.

In [None]:
# If I want to filter the data first:
# connect:
db_con = None
db_con = psycopg2.connect(database = dbname, user = username)
# the table name is 'nextbus':
sql_query = """
SELECT * FROM {table};
""".format(table=table)
nbdata = pd.read_sql_query(sql_query,db_con)

In [6]:
nbdata.drop(nbdata.index[0], inplace=True)
nbdata.drop(nbdata.index[-1], inplace=True)
#nbdata.columns = ['vehicle','received_time','gps_time','gpsfix','speed','heading','route','trip']

In [182]:
routeslist = pd.unique(nbdata['route'].ravel())
list_of_muni_routes = pd.DataFrame(data=routeslist, index = range(len(routeslist)), columns=['route'])

# Clean up a bit
list_of_muni_routes = list_of_muni_routes[(list_of_muni_routes.route.isnull() == False) & (list_of_muni_routes.route != 'Inspectors') & (list_of_muni_routes.route != 'Training')]

Now we write the function here. Idea: run over 'list_of_muni_routes', for each route, create the pairs (there will be many) of entries (this is done in various sections of code below), write them to a SQL database, db and table of your choice

In [None]:
# this function generates a table of pairs for a given route
# assume a database connection is already established
def generate_route_pairs(route, database_connection, table):
    sql_query = '''SELECT * FROM {table} WHERE route = {route};'''.format(table=table,route=route)
    df_route = pd.read_sql_query(sql_query, database_connection)

Some clean up of this data: remove all whitespaces in file, drop first row, rename columns so they are free of whitespaces, get GPS coordinates separately

Next we redefine the lat, lon coordinates into separate columns, and drop the old 'gpsfix' column. Finally, we reinterpret the times and coarse-grain to the minute (LATER: do to the half-minute)

Plan: merge the two tables with only their lat longs (time stamps included in index), compute distance on that table

In [331]:
dist1 = pd.merge(left=trip_0[['lat','lon','time']],right=trip_1[['lat','lon','time']],on='time').apply(lambda row: haversine(row['lat_x'],row['lon_x'],row['lat_y'],row['lon_y']), axis=1)
dist2 = pd.merge(left=trip_1[['lat','lon','time']],right=trip_2[['lat','lon','time']],on='time').apply(lambda row: haversine(row['lat_x'],row['lon_x'],row['lat_y'],row['lon_y']), axis=1)

In [28]:
coords_dict = {}
for i in range(len(trips_in_order_30)):
    my_key = "trip_"+str(i)
    if my_key not in coords_dict:
        coords_dict[my_key] = 0

for i in range(len(trips_in_order_30)):
    trip_temp = 'trip_'+str(i)
    tmp_vehicles = pd.unique(route30[(route30.trip == trips_in_order_30[i])].vehicle.ravel())
    coords_dict[trip_temp] = route30[(route30.trip == trips_in_order_30[i]) & (route30.vehicle == tmp_vehicles[0])][['lat','lon','time']]
    coords_dict[trip_temp].drop_duplicates(subset='time', inplace=True)
    coords_dict[trip_temp].index = coords_dict[trip_temp].time

Now make a dataframe that has time stamps and distances, by computing successive differences between each pair of trips, then appending (with pd.concat) each collection of (time, distance) of each pair of trips to the empty dist dataframe, which will end up containing all distances from pairs of trips on the route 30

In [70]:
dist=pd.DataFrame()

In [86]:
# substract one from length of trip_in_order_30 so that final pair of trips doesn't go outside index range
#for i in range(len(trips_in_order_30)-1):
for i in range(len(trips_in_order_30)-1):
    trip_now = 'trip_'+str(i)
    trip_next = 'trip_'+str(i+1)
    #print 'Merging {trip1} and {trip2}'.format(trip1=trip_now,trip2=trip_next)
    tmp = pd.merge(left=coords_dict[trip_now],right=coords_dict[trip_next],on='time')
    if tmp.shape[0] != 0:
        tmp['dist'] = tmp.apply(lambda row: haversine(row['lat_x'],row['lon_x'],row['lat_y'],row['lon_y']), axis=1)
        dist = pd.concat([dist, tmp[['time','dist']]])
    else:
        continue

dist.index = range(len(dist))

## Let's combine this into a single function that takes a given route and spits out the above dataframe of distances for that route

The dataframe has a list of distances between successive trips for that route.

This presumes that you are using the cleaned nbdata dataframe from above (which had all the precleaning done and takes ~10 minutes to run each time one restarts the notebook).

In [115]:
# All documentation for this function can be found in the exploration I did above for route 30 specifically
def compute_dists_for_route(route_num):
    route_dat = nbdata[(nbdata.route == route_num)]
    vehicles_for_route = pd.unique(route_dat.vehicle.ravel())
    trips_in_order_for_route = np.sort(pd.unique(route_dat.trip.ravel()))
    
    coords_dict = {}
    for i in range(len(trips_in_order_for_route)):
        my_key = "trip_"+str(i)
        if my_key not in coords_dict:
            coords_dict[my_key] = 0

    for i in range(len(trips_in_order_for_route)):
        trip_temp = 'trip_'+str(i)
        tmp_vehicles = pd.unique(route_dat[(route_dat.trip == trips_in_order_for_route[i])].vehicle.ravel())
        coords_dict[trip_temp] = route_dat[(route_dat.trip == trips_in_order_for_route[i]) & (route_dat.vehicle == tmp_vehicles[0])][['lat','lon','time']]
        coords_dict[trip_temp].drop_duplicates(subset='time', inplace=True)
        coords_dict[trip_temp].index = coords_dict[trip_temp].time
        
    dist_tmp = pd.DataFrame()
    
    for i in range(len(trips_in_order_for_route)-1):
        trip_now = 'trip_'+str(i)
        trip_next = 'trip_'+str(i+1)
        #print 'Merging {trip1} and {trip2}'.format(trip1=trip_now,trip2=trip_next)
        tmp = pd.merge(left=coords_dict[trip_now],right=coords_dict[trip_next],on='time')
        if tmp.shape[0] != 0:
            tmp['dist'] = tmp.apply(lambda row: haversine(row['lat_x'],row['lon_x'],row['lat_y'],row['lon_y']), axis=1)
            dist_tmp = pd.concat([dist_tmp, tmp[['time','dist']]])
        else:
            continue

    dist_tmp.index = range(len(dist_tmp))
    
    return dist_tmp

List of all routes available in the Muni system, so we can just choose from this list.

Create a dict of these dataframes for **each** route in the list_of_muni_routes

## Reading in data from Google Maps API

From there get stop ID of starting bus, from there get bus data from NextBus

In [20]:
with open('google_maps_api/second_dirs.json') as data_file:
    data = json.load(data_file)

#pprint(data)

In [25]:
route_name = str(data['routes'][0]['legs'][0]['steps'][0]['transit_details']['line']['short_name'])
departure_stop = str(data['routes'][0]['legs'][0]['steps'][0]['transit_details']['departure_stop']['name'])
departure_lat = float(data['routes'][0]['legs'][0]['steps'][0]['transit_details']['departure_stop']['location']['lat'])
departure_lon = float(data['routes'][0]['legs'][0]['steps'][0]['transit_details']['departure_stop']['location']['lng'])

In [26]:
print route_name
print departure_stop
print departure_lat
print departure_lon

19
26th St & De Haro St
37.75086
-122.40015


In [38]:
url_get_route_config='http://webservices.nextbus.com/service/publicXMLFeed?command=routeConfig&a=sf-muni&r='+str(route_name)
route_config = pq(urlopen(url_get_route_config).read())

In [39]:
for bus_stop_obj in route_config('stop'):
    bus_stop = pq(bus_stop_obj)
    if bus_stop.attr('lat') is not None:
        stop_name = str(bus_stop.attr('title'))
        stop_lat = round(float(bus_stop.attr('lat')),5)
        stop_lon = round(float(bus_stop.attr('lon')),5)
        if stop_name == departure_stop and stop_lat == departure_lat and stop_lon == departure_lon:
            stop_id = str(bus_stop.attr('stopId'))
            print 'Match at '+stop_name+' with stop id: '+stop_id
            print 'Coordinates of stop: ('+str(stop_lat)+','+str(stop_lon)+')'

Match at 26th St & De Haro St with stop id: 13516
Coordinates of stop: (37.75086,-122.40015)


In [40]:
url_get_stop_info='http://webservices.nextbus.com/service/publicXMLFeed?command=predictions&a=sf-muni&stopId='+stop_id
stop_config = pq(urlopen(url_get_stop_info).read())

In [43]:
for next_bus_obj in stop_config('predictions'):
    if str(pq(next_bus_obj).attr('routeTag')) == route_name:
        for upcoming_trips in pq(next_bus_obj)
        trip_id = pq(next_bus_obj).attr('tripTag')
        print 'Found correct bus route '+str(pq(next_bus_obj).attr('routeTag'))+', trip ID = '+ str(trip_id)

Found correct bus route 19, trip ID = None


In [44]:
!head nextbus_one_day_sf_muni_dump.csv > tmp_dump.txt

In [77]:
with open('tmp_dump.txt') as my_file:
    data = my_file.readlines()
my_file.close()

for i, line in enumerate(data,0):
    # get rid of whitespaces, parentheses in coordinates, replace comma in coordinates with '|' so it splits,
    # get rid of newlines, split on '|'
    data[i] = line.replace(',','|').replace(' ','').replace('(','').replace(')','').replace('\n','').split('|')

In [79]:
print data[0]
print data[3][1]

['vehiclid', 'receivedtimestamp', 'gpstimestamp', 'gpsfix', 'gpsspeed', 'gpsheading', 'route', 'trip']
2016-01-1400:00:00.099-08


## Build distance distributions (nightly), save to disk

## Generate database of pairs
With distance quantiles built in from previous function