# Load up our libraries

In [1]:
# all purpose
import datetime, geoplotlib, re
from math import radians, cos, sin, asin, sqrt

# for talking to SQL databases
import psycopg2
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database

# json and XML parsing
import json
from pprint import pprint
from urllib2 import urlopen
from pyquery import PyQuery as pq

# for making maps
import geoplotlib
from geoplotlib.utils import BoundingBox
from IPython.display import Image

# all purpose data analysis and plotting
from scipy import stats
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns



Define this function for computing distances in meters from (lat,lon) coordinates

In [2]:
# We will need this function to compute the distance between two (lat,lon) points, in meters
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    meters = 6367 * c * 1000
    return meters

## Load data from SQL database (necessary only to load Muni routes and estlabish connection to database)

We need to connect to the PostgresSQL database that I am reading the NextBus Muni data into, which is called 'sf_muni_arrivals' in our case.

**This cell must be run.**

In [3]:
dbname = 'sf_muni_arrivals'
username = 'dstone'
table = 'nextbus_write_2016_01_15'

# Open up an engine, that we will use to create the database if it doesn't exist
engine = create_engine('postgres://%s@localhost/%s'%(username,dbname))

if not database_exists(engine.url):
    create_database(engine.url)
    
# If I want to filter the data first:
# connect:
db_con = None
db_con = psycopg2.connect(database = dbname, user = username)

These commands load the entire database into a pandas dataframe. I only used this to grab the list of Muni routes. It is not necessary below.

In [4]:
# the table name is 'nextbus':
# only run this code if the list_of_muni_routes needs to be reestablished
# sql_query = """
# SELECT * FROM {table};
# """.format(table=table)
# nbdata = pd.read_sql_query(sql_query,db_con)

In [5]:
# routeslist = pd.unique(nbdata['route'].ravel())
# list_of_muni_routes = pd.DataFrame(data=routeslist, index = range(len(routeslist)), columns=['route'])

# # Clean up a bit
# list_of_muni_routes = list_of_muni_routes[(list_of_muni_routes.route.isnull() == False) & (list_of_muni_routes.route != 'Inspectors') & (list_of_muni_routes.route != 'Training')]
# list_of_muni_routes = list_of_muni_routes[list_of_muni_routes.route != '']
# np.save('list_of_muni_routes',np.asarray(list_of_muni_routes).ravel())
list_of_muni_routes = np.load('list_of_muni_routes.npy')

## Reading in data from Google Maps API

From there get stop ID of starting bus, from there get bus data from NextBus

In [107]:
def get_googlemaps_json(start_loc, end_loc):
    my_googlemaps_auth = 'AIzaSyDWQv6WWQptI-6rjbavkoZ1TpVZhHKOm4w'
    # Note we specify transit_mode=bus, so we are ignoring tram lines for now
    googlemaps_url = 'https://maps.googleapis.com/maps/api/directions/json?origin='+str(start_loc).replace(' ','+')+'&destination='+str(end_loc).replace(' ','+')+'&mode=transit&key='+str(my_googlemaps_auth)
    return json.load(urlopen(googlemaps_url))

In [126]:
data = get_googlemaps_json('850 Shrader St, San Francisco, CA, United States','24th St. Mission BART Station, Mission Street, San Francisco, CA, United States')

In [44]:
! echo $data > google_directions_presidio.txt

In [127]:
data

{u'geocoded_waypoints': [{u'geocoder_status': u'OK',
   u'place_id': u'ChIJLSGqC1SHhYAR143fd6GI4K4',
   u'types': [u'street_address']},
  {u'geocoder_status': u'OK',
   u'place_id': u'ChIJxe_XM0d-j4ARVoVsrEVn4wA',
   u'types': [u'subway_station',
    u'transit_station',
    u'point_of_interest',
    u'establishment']}],
 u'routes': [{u'bounds': {u'northeast': {u'lat': 37.76946300000001,
     u'lng': -122.4180637},
    u'southwest': {u'lat': 37.7517094, u'lng': -122.451368}},
   u'copyrights': u'Map data \xa92016 Google',
   u'fare': {u'currency': u'USD', u'text': u'$2.25', u'value': 2.25},
   u'legs': [{u'arrival_time': {u'text': u'1:10am',
      u'time_zone': u'America/Los_Angeles',
      u'value': 1454058636},
     u'departure_time': {u'text': u'12:40am',
      u'time_zone': u'America/Los_Angeles',
      u'value': 1454056857},
     u'distance': {u'text': u'3.1 mi', u'value': 5038},
     u'duration': {u'text': u'30 mins', u'value': 1779},
     u'end_address': u'24th St Mission Station

TO DO: FIND TYPE BESIDES 'BUS' FOR VEHICLE TYPE, THEN GET BACK TO MODEL

Get stop name and (lat, lon) from Google Maps

In [122]:
# Need to walk through steps, since if directions require you to walk 
# to a transit stop, then first step is walking, not transit
steps = data['routes'][0]['legs'][0]['steps']

IndexError: list index out of range

In [110]:
steps

[{u'distance': {u'text': u'0.1 mi', u'value': 183},
  u'duration': {u'text': u'2 mins', u'value': 146},
  u'end_location': {u'lat': 37.775675, u'lng': -122.446828},
  u'html_instructions': u'Walk to Fulton St & Masonic Ave',
  u'polyline': {u'points': u'k|peFbejjVJ`BaCXw@HHnANC'},
  u'start_location': {u'lat': 37.774944, u'lng': -122.4457843},
  u'steps': [{u'distance': {u'text': u'141 ft', u'value': 43},
    u'duration': {u'text': u'1 min', u'value': 34},
    u'end_location': {u'lat': 37.774885, u'lng': -122.4462667},
    u'html_instructions': u'Head <b>west</b> on <b>Grove St</b> toward <b>Masonic Ave</b>',
    u'polyline': {u'points': u'k|peFbejjVJ`B'},
    u'start_location': {u'lat': 37.774944, u'lng': -122.4457843},
    u'travel_mode': u'WALKING'},
   {u'distance': {u'text': u'344 ft', u'value': 105},
    u'duration': {u'text': u'1 min', u'value': 79},
    u'end_location': {u'lat': 37.7758143, u'lng': -122.4464548},
    u'html_instructions': u'Turn <b>right</b> onto <b>Masonic Ave

In [112]:
for i, step in enumerate(steps):
    if step['travel_mode'] == 'TRANSIT':
        transit_step = i
        break
 
route_name = str(steps[transit_step]['transit_details']['line']['short_name'])
departure_stop = str(steps[transit_step]['transit_details']['departure_stop']['name'])
departure_lat = round(float(steps[transit_step]['transit_details']['departure_stop']['location']['lat']),5)
departure_lon = round(float(steps[transit_step]['transit_details']['departure_stop']['location']['lng']),5)

In [113]:
print route_name
print departure_stop
print departure_lat
print departure_lon

5
Fulton St & Masonic Ave
37.77567
-122.44683


In [116]:
steps[transit_step]['transit_details']['line']['vehicle']['type'] == 'BUS'

True

In [111]:
u'short_name': u'5',
    u'vehicle': {u'icon': u'//maps.gstatic.com/mapfiles/transit/iw2/6/bus.png',
     u'name': u'Bus',
     u'type': u'BUS'}

SyntaxError: invalid syntax (<ipython-input-111-4f53a3eec825>, line 1)

Get StopID from Nextbus 'routeConfig'

In [58]:
url_get_route_config='http://webservices.nextbus.com/service/publicXMLFeed?command=routeConfig&a=sf-muni&r='+str(route_name)
route_config = pq(urlopen(url_get_route_config).read())

In [59]:
for bus_stop_obj in route_config('stop'):
    bus_stop = pq(bus_stop_obj)
    if bus_stop.attr('lat') is not None:
        stop_name = str(bus_stop.attr('title'))
        stop_lat = round(float(bus_stop.attr('lat')),5)
        stop_lon = round(float(bus_stop.attr('lon')),5)
        print 'Coordinates of stop: ('+str(departure_lat)+','+str(departure_lon)+'), name: '+str(departure_stop)
        print 'Real stop coordinates ('+str(stop_lat)+', '+str(stop_lon)+'), name: '+str(stop_name)
        if stop_name == departure_stop and stop_lat == departure_lat and stop_lon == departure_lon:
            stop_id = str(bus_stop.attr('stopId'))

Coordinates of stop: (37.77395,-122.44568), name: Hayes St & Masonic Ave
Real stop coordinates (37.79151, -122.39547), name: Mission St & Main
Coordinates of stop: (37.77395,-122.44568), name: Hayes St & Masonic Ave
Real stop coordinates (37.79165, -122.39815), name: Fremont St & Market St
Coordinates of stop: (37.77395,-122.44568), name: Hayes St & Masonic Ave
Real stop coordinates (37.79016, -122.40041), name: Market St & Sansome St
Coordinates of stop: (37.77395,-122.44568), name: Hayes St & Masonic Ave
Real stop coordinates (37.78468, -122.40731), name: Market St & Powell St
Coordinates of stop: (37.77395,-122.44568), name: Hayes St & Masonic Ave
Real stop coordinates (37.78285, -122.40964), name: Market St & Mason St
Coordinates of stop: (37.77395,-122.44568), name: Hayes St & Masonic Ave
Real stop coordinates (37.78057, -122.41244), name: Market St & 7th St North
Coordinates of stop: (37.77395,-122.44568), name: Hayes St & Masonic Ave
Real stop coordinates (37.77878, -122.41472),

In [37]:
for bus_stop_obj in route_config('stop'):
    bus_stop = pq(bus_stop_obj)
    if bus_stop.attr('lat') is not None:
        stop_name = str(bus_stop.attr('title'))
        stop_lat = round(float(bus_stop.attr('lat')),5)
        stop_lon = round(float(bus_stop.attr('lon')),5)
        if stop_lat == departure_lat and stop_lon == departure_lon:
            stop_id = str(bus_stop.attr('stopId'))
            print 'Coordinates matched for stop: ('+str(departure_lat)+','+str(departure_lon)+'),  with google name: '+str(departure_stop)
            print 'Coordinates matched for real stop ('+str(stop_lat)+', '+str(stop_lon)+'), with Nextbus name: '+str(stop_name)
#             print 'Match at '+stop_name+' with stop id: '+stop_id
#             print 'Coordinates of stop: ('+str(stop_lat)+','+str(stop_lon)+')'
#         elif stop_name == departure_stop:
#             print 'Real stop coordinates ('+str(stop_lat)+', '+str(stop_lon)+'), name: '+str(stop_name)
#             print 'Tried stop coordinates ('+str(departure_lat)+', '+str(departure_lon)+'), name: '+str(departure_stop)

Get next two vehicles from Nextbus 'predictions'

In [105]:
stop_id

NameError: name 'stop_id' is not defined

In [371]:
url_get_stop_info='http://webservices.nextbus.com/service/publicXMLFeed?command=predictions&a=sf-muni&stopId='+stop_id+'&r='+str(route_name)
stop_config = pq(urlopen(url_get_stop_info).read())

In [372]:
print stop_config

<body copyright="All data copyright San Francisco Muni 2016.">
<predictions agencyTitle="San Francisco Muni" routeTitle="19-Polk" routeTag="19" stopTitle="26th St &amp; De Haro St" stopTag="3516">
  <direction title="Inbound to Fisherman's Wharf">
  <prediction epochTime="1453490682980" seconds="446" minutes="7" isDeparture="false" dirTag="19___I_F00" vehicle="8316" block="1901" tripTag="6697906"/>
  <prediction epochTime="1453491532715" seconds="1296" minutes="21" isDeparture="false" affectedByLayover="true" dirTag="19___I_F00" vehicle="8344" block="1909" tripTag="6697862"/>
  <prediction epochTime="1453492432715" seconds="2196" minutes="36" isDeparture="false" affectedByLayover="true" dirTag="19___I_F00" vehicle="8125" block="1904" tripTag="6697863"/>
  <prediction epochTime="1453493332715" seconds="3096" minutes="51" isDeparture="false" affectedByLayover="true" dirTag="19___I_F00" vehicle="8352" block="1906" tripTag="6697864"/>
  <prediction epochTime="1453494232715" seconds="3996" 

In [386]:
vehicle_array = []
arrival_time_array = []
for prediction in stop_config('prediction'):
    vehicle_array.append(pq(prediction).attr.vehicle)
    arrival_time_array.append(pq(prediction).attr.minutes)

vehicle_1 = vehicle_array[0]
vehicle_2 = vehicle_array[1]
arrival_time_1 = arrival_time_array[0]
arrival_time_2 = arrival_time_array[1]

In [387]:
arrival_time_1

'7'

Get both vehicles' information from Nextbus 'vehicleLocations'

In [374]:
url_get_realtime_info='http://webservices.nextbus.com/service/publicXMLFeed?command=vehicleLocations&a=sf-muni&r='+str(route_name)+'&t=0'
realtime_posits = pq(urlopen(url_get_realtime_info).read())

In [375]:
print realtime_posits('vehicle')

<vehicle id="8125" routeTag="19" dirTag="19___O_F00" lat="37.73504" lon="-122.37939" secsSinceReport="8" predictable="true" heading="180" speedKmHr="14"/>
<vehicle id="8344" routeTag="19" dirTag="19___O_F00" lat="37.72891" lon="-122.36726" secsSinceReport="14" predictable="true" heading="127" speedKmHr="0"/>
<vehicle id="8352" routeTag="19" dirTag="19___O_F00" lat="37.771454" lon="-122.405716" secsSinceReport="22" predictable="true" heading="135" speedKmHr="0"/>
<vehicle id="8361" routeTag="19" dirTag="19___O_F00" lat="37.79426" lon="-122.42144" secsSinceReport="62" predictable="true" heading="171" speedKmHr="22"/>
<vehicle id="8353" routeTag="19" dirTag="19___I_F00" lat="37.7928" lon="-122.42114" secsSinceReport="22" predictable="true" heading="350" speedKmHr="16"/>
<vehicle id="8174" routeTag="19" dirTag="19___I_F00" lat="37.76306" lon="-122.4014" secsSinceReport="13" predictable="true" heading="355" speedKmHr="24"/>
<vehicle id="8156" routeTag="19" dirTag="19___I_F00" lat="37.7804" 

In [376]:
time_stamp = datetime.datetime.utcfromtimestamp(int(pq(pq(realtime_posits('vehicle')[-1]).siblings()[-1]).attr('time'))/1000)
for vehicle in realtime_posits('vehicle'):
    v = pq(vehicle)
    if v.attr.id == vehicle_1:
        df1 = pd.DataFrame({'ind': 0,'time': time_stamp,'lat_x': float(v.attr.lat), 'lon_x': float(v.attr.lon), 'speed_x': float(v.attr.speedKmHr), 'route_x': str(v.attr.routeTag)},index=[0])
    elif v.attr.id == vehicle_2:
        df2 = pd.DataFrame({'ind': 0,'lat_y': float(v.attr.lat), 'lon_y': float(v.attr.lon), 'speed_y': float(v.attr.speedKmHr)},index=[0])

df_tmp = pd.merge(left=df1, right=df2)

In [82]:
df_tmp

NameError: name 'df_tmp' is not defined

In [378]:
def compute_distance_percentile(route_num, lat1, lon1, lat2, lon2):
        tmp_dist = haversine(lat1,lon1,lat2,lon2)
        path_to_dist_distribution = 'muni_route_distance_distributions/route_'+str(route_num)+'_distribution.npy'
        route_dist_distribution = np.load(path_to_dist_distribution)
        percentile_score = 1 - stats.percentileofscore(route_dist_distribution, tmp_dist)/100
        return percentile_score

In [379]:
df_tmp['dist_percentile'] = compute_distance_percentile(route_name, float(df_tmp['lat_x'][0]), float(df_tmp['lon_x'][0]), float(df_tmp['lat_y'][0]), float(df_tmp['lon_y'][0]))

In [388]:
df_tmp['arrival_x'] = arrival_time_1
df_tmp['arrival_y'] = arrival_time_2

In [381]:
df_tmp['bunched'] = 0

In [390]:
df_tmp['arrival_x'][0]

'7'

In [334]:
formula_for_realtime = 'bunched ~ time.dt.hour + time.dt.minute + lat_x + lon_x + speed_x  + lat_y + lon_y + speed_y + dist_percentile'

ytmp, xtmp = patsy.dmatrices(formula_for_realtime, data=df_tmp, return_type='dataframe')

In [342]:
df_tmp

Unnamed: 0,ind,lat_x,lon_x,route_x,speed_x,time,lat_y,lon_y,speed_y,dist_percentile,bunched
0,0,37.78368,-122.39866,91,38,2016-01-22 09:36:38,37.71344,-122.41531,9,0.817531,0


## Finding schedule frequency

And saving to disk

In [90]:
import pickle

In [103]:
def write_route_frequencies(route_name):
    url_get_schedule='http://webservices.nextbus.com/service/publicXMLFeed?command=schedule&a=sf-muni&r='+str(route_name)
    schedule_config = pq(urlopen(url_get_schedule).read())
    # A bit arbitrary, just choosing the last stop of the first block of addresses, but should be representative

    df_schedule = pd.DataFrame()
    stop_to_check = pq(pq(schedule_config('route')[0])('stop')[-1]).attr.tag

    indexer=0
    for set_of_stops in schedule_config('route'):
        for stop in pq(set_of_stops)('stop'):
            if str(pq(stop).attr('tag')) == str(stop_to_check):
                if pq(stop).attr.epochTime is not None:
                    df_tmp = pd.DataFrame({'time': datetime.datetime.utcfromtimestamp(int(pq(stop).attr.epochTime)/1000)}, index=[indexer])
                    df_schedule = df_schedule.append(df_tmp)
                    indexer += 1

    df_schedule['freq_pre'] = df_schedule.diff(1)
    df_schedule['freq'] = df_schedule[df_schedule['freq_pre'].isnull() == False].apply(lambda row: (row['freq_pre'].seconds)/60, axis=1)
    fake_freq = int(df_schedule.iloc[1]['freq'])
    df_schedule.loc[0,'freq'] = fake_freq
    df_schedule.drop('freq_pre', inplace=True, axis=1)
    # if frequency is above 120 minutes or below 4, which it shouldn't be, probably getting garbage
    df_schedule = df_schedule[(df_schedule.freq <= 120) & (df_schedule.freq >= 4)]
    df_schedule.index = df_schedule['time']
    
    #pickle time
    filename = 'route_frequencies/route_'+str(route_name)+'_frequencies.pkl'
    with open(filename,'wb') as output:
        pickle.dump(df_schedule, output, pickle.HIGHEST_PROTOCOL)

In [104]:
for route in list_of_muni_routes:
    write_route_frequencies(route)

In [100]:
write_route_frequencies(10)

In [93]:
with open('route_frequencies/route_30_frequencies.pkl','rb') as input:
        dftmp = pickle.load(input)

In [94]:
dftmp[dftmp.freq > 100]

Unnamed: 0_level_0,time,freq
time,Unnamed: 1_level_1,Unnamed: 2_level_1
1970-01-01 05:18:00,1970-01-01 05:18:00,25
1970-01-01 05:43:00,1970-01-01 05:43:00,25
1970-01-01 06:02:00,1970-01-01 06:02:00,19
1970-01-01 06:22:00,1970-01-01 06:22:00,20
1970-01-01 06:41:00,1970-01-01 06:41:00,19
1970-01-01 06:59:00,1970-01-01 06:59:00,18
1970-01-01 07:13:00,1970-01-01 07:13:00,14
1970-01-01 07:27:00,1970-01-01 07:27:00,14
1970-01-01 07:41:00,1970-01-01 07:41:00,14
1970-01-01 07:54:00,1970-01-01 07:54:00,13


## Scratch

In [61]:
with open('route_frequencies/route_10_frequencies.pkl','rb') as input:
    route10freqs = pickle.load(input)

In [10]:
t0 = datetime.datetime(1970,1,1,12,4)

In [62]:
route10freqs = route10freqs.drop_duplicates(subset='time')

In [63]:
route10freqs['diff'] = route10freqs.apply(lambda row: abs(row['time']-t0), axis=1)
loc = route10freqs.loc[route10freqs['diff'].idxmin()]
loc['freq']

In [79]:
str(t1)

'1970-01-01 12:34:00'

In [68]:
t1=t0 + pd.Timedelta(minutes=30)

In [89]:
route10freqs.loc[str(t0):str(t0+pd.Timedelta(hours=3, minutes=2))]['freq'][0]

20.0