# Streetcar Delay Prediction - Geocode Bounding boxes

Use a dataset covering Toronto Transit Commission (TTC) streetcar delays from 2014 to the present to predict future delays and to recommend ways of avoiding them.

Source dataset: https://www.toronto.ca/city-government/data-research-maps/open-data/open-data-catalogue/#e8f359f0-2f47-3058-bf64-6ec488de52da

This notebook contains the steps to get geo bounding boxes for routes.

# Streetcar routes

From https://www.ttc.ca/PDF/Maps/TTC_StreetcarMap.pdf

<table style="border: none" align="left">
   <tr style="border: none">
       <th style="border: none"><img src="https://raw.githubusercontent.com/ryanmark1867/manning/master/ttc_sc_map.jpg" width="900" alt="TTC streetcar route map"> </th>
   </tr>
</table>


In [1]:
! pip install -U folium
import folium

Requirement already up-to-date: folium in /opt/conda/envs/fastai/lib/python3.6/site-packages (0.9.1)


# Load libraries

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# import seaborn as sns
import datetime
import os
from folium.plugins import MarkerCluster
import folium
import pixiedust
from folium.plugins import HeatMap

# NOTE(review): remove_bad_values and city_name are not referenced by any cell
# visible in this notebook - confirm whether they are still needed
remove_bad_values = False
city_name = 'Toronto'
# base name of the pickle file that receives the per-route bounding boxes
pickled_output_dataframe = 'bounding_box_df_july15'

# define bounding box of valid streetcar locations
# (inclusive limits applied by overall_bounding; corner landmarks are listed in the
#  comments of the cell that defines that function)
min_lat = 43.58735      # southern limit: Lakeshore and Etobicoke Creek
max_lat = 43.687840     # northern limit: St Clair and Mt. Pleasant
min_long = -79.547860   # western limit: Lakeshore and Etobicoke Creek
max_long = -79.280260   # eastern limit: Queen and Victoria Park


Pixiedust database opened successfully


In [4]:
# get the current working directory of the kernel (the directory this notebook runs in)
rawpath = os.getcwd()
print("raw path is",rawpath)

raw path is /storage/manning/notebooks


In [5]:
# data is in a directory called "data" that is a sibling to the directory containing the notebook
# the trailing "/" is appended so that a file name can be concatenated directly onto path
path = os.path.abspath(os.path.join(rawpath, '..', 'data')) + "/"
print("path is", path)

path is /storage/manning/data/


# Load dataset

In [6]:
# CSV of delay records read directly from GitHub; the file name indicates cleaned
# 2014-2018 data that keeps bad locations and carries geocoded coordinates
url="https://raw.githubusercontent.com/ryanmark1867/manning/master/2014_2018_df_cleaned_keep_bad_loc_geocoded_apr23.csv"

df=pd.read_csv(url)
# preview the first rows; NOTE(review): some rows show 0.0 / 0.0 coordinates,
# presumably locations that could not be geocoded - confirm
df.head()


Unnamed: 0.1,Unnamed: 0,Report Date,Route,Time,Day,Location,Incident,Min Delay,Min Gap,Direction,Vehicle,Report Date Time,lat_long,latitude,longitude
0,0,2016-01-01 00:00:00,505,00:00:00,Friday,dundas west stationt to broadview station,General Delay,7.0,14.0,w,4028,2016-01-01 00:00:00,"[0.0, 0.0]",0.0,0.0
1,1,2016-01-01 00:00:00,511,02:14:00,Friday,fleet st. and strachan,Mechanical,10.0,20.0,e,4018,2016-01-01 02:14:00,"[43.6362976, -79.4096351]",43.636298,-79.409635
2,2,2016-01-01 00:00:00,301,02:22:00,Friday,queen st. west and roncesvalles,Mechanical,9.0,18.0,w,4201,2016-01-01 02:22:00,"[43.64533489999999, -79.4131843]",43.645335,-79.413184
3,3,2016-01-01 00:00:00,301,03:28:00,Friday,lake shore blvd. and superior st.,Mechanical,20.0,40.0,e,4251,2016-01-01 03:28:00,"[43.61496169999999, -79.4886581]",43.614962,-79.488658
4,4,2016-01-01 00:00:00,501,14:28:00,Friday,roncesvalles to neville park,Mechanical,6.0,12.0,e,4242,2016-01-01 14:28:00,"[0.0, 0.0]",0.0,0.0


In [7]:
# (number of rows, number of columns) of the loaded dataset
df.shape

(69603, 15)

# Scope the dataset down to valid locations
Use the boundaries of the streetcar network to limit the dataset to just the locations that are covered by the streetcar network.

In [8]:
# remove locations outside of portion of Toronto with streetcar routes
# latitude NS (higher north), longitude EW (higher east)

# west of Queen and Victoria Park: 43.674280, -79.280260
# east and north of Lakeshore and Etobicoke Creek: 43.587350, -79.547860
# south of St Clair and Mt. Pleasant: 43.687840,-79.399800


def overall_bounding(df, lat_min=None, lat_max=None, long_min=None, long_max=None):
    """Keep only the rows whose coordinates lie inside a lat/long rectangle.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``latitude`` and ``longitude`` columns.
    lat_min, lat_max, long_min, long_max : float, optional
        Inclusive limits of the rectangle. A limit left as ``None`` falls back
        to the notebook-level constant ``min_lat`` / ``max_lat`` /
        ``min_long`` / ``max_long``, so ``overall_bounding(df)`` behaves as
        it always did.

    Returns
    -------
    pandas.DataFrame
        The rows inside the rectangle, with their original index labels.
        The input frame is not modified.
    """
    # the globals are only looked up for limits that were not supplied
    lat_lo = min_lat if lat_min is None else lat_min
    lat_hi = max_lat if lat_max is None else lat_max
    long_lo = min_long if long_min is None else long_min
    long_hi = max_long if long_max is None else long_max

    # one combined mask instead of four successive filtered copies;
    # rows with missing coordinates compare False and are dropped
    inside = (
        (df.latitude >= lat_lo)
        & (df.latitude <= lat_hi)
        & (df.longitude >= long_lo)
        & (df.longitude <= long_hi)
    )
    return df[inside]



# Define bounding boxes

In [9]:
# function that produces a dataframe with just the max and min latitude and longitude for each route
def def_min_max(df):
    """Build the latitude/longitude bounding box of every route.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Route``, ``latitude`` and ``longitude`` columns. Any
        other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per route with the columns ``Route``, ``min_lat``,
        ``min_long``, ``max_lat`` and ``max_long``. Each limit is taken
        independently, so the four values of a route may come from four
        different rows. Routes appear in ascending order of ``min_lat``.
    """
    def _extreme(column, ascending, new_name):
        # After sorting, the first row kept for each route holds that route's
        # extreme value. Only Route and the renamed extreme are retained, so
        # the merges below cannot collide on other columns.
        return (
            df.sort_values(column, ascending=ascending)
              .drop_duplicates(['Route'])[['Route', column]]
              .rename(columns={column: new_name})
        )

    df_min_lat = _extreme('latitude', True, 'min_lat')
    df_min_long = _extreme('longitude', True, 'min_long')
    df_max_lat = _extreme('latitude', False, 'max_lat')
    df_max_long = _extreme('longitude', False, 'max_long')

    # join the min dataframes, then the max dataframes
    df_min = pd.merge(df_min_lat, df_min_long, on='Route', how='left')
    df_max = pd.merge(df_max_lat, df_max_long, on='Route', how='left')
    # join the intermediate dataframes to get the df with the bounding boxes
    df_bounding_box = pd.merge(df_min, df_max, on='Route', how='left')
    return df_bounding_box


In [10]:
# restrict the dataset to locations inside the streetcar service area
df = overall_bounding(df)
# Bounding boxes only need the route and its coordinates. Select those columns
# into a separate frame rather than dropping everything else from df:
#  - df keeps 'Min Delay', which the delay-duration heat map cell reads
#  - the cell can be re-run without failing on columns that are already gone
#  - no positional axis argument to DataFrame.drop, which pandas 2.x rejects
route_locations = df[['Route', 'latitude', 'longitude']]
bounding_box_df = def_min_max(route_locations)
bounding_box_df.head()

Unnamed: 0,Route,min_lat,min_long,max_lat,max_long
0,501,43.588204,-79.546264,43.687095,-79.28135
1,301,43.591972,-79.544865,43.680364,-79.281542
2,bad route,43.591972,-79.543895,43.684692,-79.281542
3,504,43.591972,-79.543895,43.686952,-79.281542
4,502,43.591972,-79.543895,43.686952,-79.281542


In [12]:
# pickle the bounding box dataframe
# path already ends with "/", so plain concatenation yields the full file name
file_name = path + pickled_output_dataframe
bounding_box_df.to_pickle(file_name)

In [13]:
# read the pickle written by the previous cell back in and preview it
# NOTE(review): unpickling can execute arbitrary code - only load pickle files you created or trust
dfn = pd.read_pickle(file_name)
dfn.head()

Unnamed: 0,Route,min_lat,min_long,max_lat,max_long
0,501,43.588204,-79.546264,43.687095,-79.28135
1,301,43.591972,-79.544865,43.680364,-79.281542
2,bad route,43.591972,-79.543895,43.684692,-79.281542
3,504,43.591972,-79.543895,43.686952,-79.281542
4,502,43.591972,-79.543895,43.686952,-79.281542


# Visualize using Folium: clustering delay incidents
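Use Folium to draw the bounding box of each route on a map of Toronto. (The marker-cluster layer of individual delay incidents is not added to the map in the cell below.)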

In [21]:
def point_box(min_tuple,max_tuple):
    """Return the closed outline of the rectangle spanned by two corners.

    min_tuple and max_tuple are indexable pairs. The result is a list of five
    points: min_tuple, (min_tuple[0], max_tuple[1]), max_tuple,
    (max_tuple[0], min_tuple[1]) and min_tuple again to close the outline.

    With (latitude, longitude) pairs, as the route loop in this notebook
    passes, that order is south-west, south-east, north-east, north-west and
    back to south-west.
    """
    # the two corners that mix one coordinate from each input
    second_corner = (min_tuple[0], max_tuple[1])
    fourth_corner = (max_tuple[0], min_tuple[1])
    return [min_tuple, second_corner, max_tuple, fourth_corner, min_tuple]

In [22]:
# NOTE(review): scratch inspection cell - among the cells visible here, `points` is first
# assigned in the map cell further down, so this raises NameError on Restart & Run All
points

[(43.588204, -79.546264),
 (43.588204, -79.28135),
 (43.687095, -79.28135),
 (43.687095, -79.546264)]

In [26]:
# NOTE(review): scratch inspection cell - depends on `points`, which among the cells visible
# here is first assigned in the map cell further down
points[0][0]

43.588204

In [27]:
# NOTE(review): scratch inspection cell - depends on `points`, which among the cells visible
# here is first assigned in the map cell further down; the captured output is long
points

[(43.588204, -79.546264),
 (43.588204, -79.28135),
 (43.687095, -79.28135),
 (43.687095, -79.546264),
 (43.588204, -79.546264),
 [(43.588203799999995, -79.54626379999998),
  (43.588203799999995, -79.28135040000001),
  (43.687095, -79.28135040000001),
  (43.687095, -79.54626379999998),
  (43.588203799999995, -79.54626379999998)],
 [(43.5919725, -79.5448645),
  (43.5919725, -79.28154219999998),
  (43.68036370000001, -79.28154219999998),
  (43.68036370000001, -79.5448645),
  (43.5919725, -79.5448645)],
 [(43.5919725, -79.54389520000001),
  (43.5919725, -79.28154219999998),
  (43.68469220000001, -79.28154219999998),
  (43.68469220000001, -79.54389520000001),
  (43.5919725, -79.54389520000001)],
 [(43.5919725, -79.54389520000001),
  (43.5919725, -79.28154219999998),
  (43.68695169999999, -79.28154219999998),
  (43.68695169999999, -79.54389520000001),
  (43.5919725, -79.54389520000001)],
 [(43.5919725, -79.54389520000001),
  (43.5919725, -79.28154219999998),
  (43.68695169999999, -79.2815421

In [28]:
# centre the map on the mean coordinates of the delay records in df
TOR_COORDINATES = (df['latitude'].mean(), df['longitude'].mean())

# One closed rectangle per route, built in a single pass over the bounding-box
# frame. Each (latitude, longitude) pair is a corner passed to point_box.
points = [
    point_box((row.min_lat, row.min_long), (row.max_lat, row.max_long))
    for row in dfn.itertuples(index=False)
]

# create empty map zoomed in on Toronto and overlay the route rectangles
map_tor = folium.Map(location=TOR_COORDINATES, zoom_start=12)
folium.PolyLine(points).add_to(map_tor)

display(map_tor)

# Visualize using Folium: heatmap of delay counts
Use Folium to display a heat map view of delay counts

In [9]:
# centre the map on the mean coordinates of the delay records in df
TOR_COORDINATES = (df['latitude'].mean(), df['longitude'].mean())

# create empty map zoomed in on Toronto
map_tor = folium.Map(location=TOR_COORDINATES, zoom_start=12)

# Number of delay records at each distinct (latitude, longitude) pair.
# groupby().size() produces the counts directly, so df no longer gets a
# helper 'count' column added in place (df is a filtered frame, and writing a
# column into it can trigger pandas' SettingWithCopyWarning).
delay_counts = (
    df.groupby(['latitude', 'longitude'])
      .size()
      .reset_index(name='count')
)

# define heat map: each entry is [latitude, longitude, weight]
HeatMap(data=delay_counts.values.tolist(), radius=8, max_zoom=13).add_to(map_tor)

display(map_tor)

# Visualize using Folium: heatmap of delay durations
Use Folium to display a heat map view of delay durations

In [10]:
# centre the map on the mean coordinates of the delay records in df
TOR_COORDINATES = (df['latitude'].mean(), df['longitude'].mean())

# blank Toronto map that receives the heat layer
map_tor = folium.Map(location=TOR_COORDINATES, zoom_start=12)

# total 'Min Delay' at each distinct (latitude, longitude) pair
# NOTE(review): this needs df to still contain 'Min Delay'; if an earlier cell
# has dropped that column from df, reload the data before running this cell
delay_totals = (
    df[['latitude', 'longitude', 'Min Delay']]
    .groupby(['latitude', 'longitude'])
    .sum()
    .reset_index()
)

# each heat map entry is [latitude, longitude, weight]
HeatMap(data=delay_totals.values.tolist(), radius=8, max_zoom=13).add_to(map_tor)

display(map_tor)

# Tableau rendering of the same dataset

Here is an example of the same dataset rendered in Tableau:

<table style="border: none" align="left">
   <tr style="border: none">
       <th style="border: none"><img src="https://raw.githubusercontent.com/ryanmark1867/manning/master/tableau_smalldots.jpg" width="900" alt="Tableau rendering of the streetcar delay dataset"> </th>
   </tr>
</table>

# Tableau rendering using size and colour

<table style="border: none" align="left">
   <tr style="border: none">
       <th style="border: none"><img src="https://raw.githubusercontent.com/ryanmark1867/manning/master/tableau_size_colour_zoom.jpg" width="900" alt="Tableau rendering of the streetcar delay dataset using size and colour"> </th>
   </tr>
</table>

This notebook demonstrated using Pixiedust and Folium to visualize a dataset including latitude and longitude values.