# Streetcar Delay Prediction - Data Preparation Geocode Specific

Use a dataset covering Toronto Transit Commission (TTC) streetcar delays from 2014 to the present to predict future delays and to come up with recommendations for avoiding them.

Source dataset: https://www.toronto.ca/city-government/data-research-maps/open-data/open-data-catalogue/#e8f359f0-2f47-3058-bf64-6ec488de52da

This notebook contains the data preparation steps specific to mapping free-form location descriptions to latitude and longitude values:

- use the Google Maps API Web Services for Python  https://github.com/googlemaps/google-maps-services-python
- generate the latitude and longitude values for locations and create new columns in the output dataset

# Streetcar routes

From https://www.ttc.ca/Routes/Streetcars.jsp

<table style="border: none" align="left">
   <tr style="border: none">
       <th style="border: none"><img src="https://raw.githubusercontent.com/ryanmark1867/streetcarnov3/master/streetcar%20routes.jpg" width="600" alt="Map of TTC streetcar routes"> </th>
   </tr>
</table>

In [7]:
# show the current working directory of the notebook kernel (shell escape)
! pwd

/notebooks/manning/notebooks


# Get path and load dataframe saved from previous data preparation step

In [17]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# import seaborn as sns
import datetime
import os

# when True, rows carrying the 'bad vehicle' / 'bad direction' / 'bad route' placeholder values
# are dropped in the "Remove bad rows" section further down
remove_bad_values = False
# appended to every free-form location string before it is sent to the geocoder
city_name = 'Toronto'


In [3]:
# get the kernel's current working directory
# (assumed to be the directory that this notebook is in - the data path is derived from it)
rawpath = os.getcwd()
print("raw path is",rawpath)

raw path is /notebooks/manning/notebooks


In [4]:
# the data lives in a "data" directory that sits next to the directory containing the notebook;
# the trailing "/" is kept because file names are appended to path by plain string concatenation
data_dir = os.path.abspath(os.path.join(rawpath, os.pardir, 'data'))
path = data_dir + "/"
print("path is", path)

path is /notebooks/manning/data/


In [45]:
# constants for the streetcar problem: file names of the pickled dataframes
# same values saved in data_preparation notebook: pickled_input_dataframe, pickled_output_dataframe
pickled_data_file = '2014_2018.pkl'
# alternative input kept for reference (presumably the dataframe before cleaning - confirm)
#pickled_dataframe = '2014_2018_df.pkl'
# input to this notebook: loaded into df below
pickled_dataframe = '2014_2018_df_cleaned_keep_bad.pkl'
# output of this notebook: file name used when the dataframe is pickled near the end
pickled_output_dataframe = '2014_2018_df_cleaned_keep_bad_loc_geocoded.pkl'

In [11]:
# load the dataframe saved by the previous data preparation step
# NOTE: unpickling can execute arbitrary code - only load pickle files that you produced yourself
file_name = path + pickled_dataframe
df = pd.read_pickle(file_name)
df.head()

Unnamed: 0_level_0,Report Date,Route,Time,Day,Location,Incident,Min Delay,Min Gap,Direction,Vehicle,Report Date Time
Report Date Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2015-01-01 01:25:00,2015-01-01,504,01:25:00,Thursday,broadview and gerrard,Mechanical,9.0,18.0,s,4092,2015-01-01 01:25:00
2015-01-01 01:44:00,2015-01-01,504,01:44:00,Thursday,galley and roncesvalles,Held By,14.0,23.0,s,4030,2015-01-01 01:44:00
2015-01-01 02:04:00,2015-01-01,504,02:04:00,Thursday,king and sherborne,Mechanical,9.0,18.0,e,4147,2015-01-01 02:04:00
2015-01-01 02:12:00,2015-01-01,306,02:12:00,Thursday,main st. and upper gerard,Investigation,29.0,39.0,s,4049,2015-01-01 02:12:00
2015-01-01 05:05:00,2015-01-01,306,05:05:00,Thursday,gerrard and sumach,Mechanical,30.0,60.0,w,4114,2015-01-01 05:05:00


In [43]:
# (rows, columns) of the dataframe
df.shape

(83365, 11)

# Set up geocode

In [12]:
# install / upgrade the Google Maps client library used for geocoding below
# NOTE(review): the version is not pinned, so a later re-run may pick up a different release - consider pinning
! pip install -U googlemaps

Requirement already up-to-date: googlemaps in /opt/conda/envs/fastai/lib/python3.6/site-packages (3.0.2)


In [13]:
import googlemaps
# NOTE(review): this rebinds the name `datetime` from the module imported at the top of the
# notebook to the datetime class
from datetime import datetime

# Create the Google Maps client.
# The API key is read from the GOOGLE_MAPS_API_KEY environment variable so that it is never
# stored in the notebook. Keys are managed at
# https://console.developers.google.com/google/maps-apis/apis/geocoding-backend.googleapis.com/credentials?project=streetcardec2018&duration=PT1H
# NOTE: a key was previously hardcoded in this cell; because it was published with the
# notebook it should be treated as compromised and revoked.
google_maps_api_key = os.environ.get('GOOGLE_MAPS_API_KEY')
if not google_maps_api_key:
    raise RuntimeError("Set the GOOGLE_MAPS_API_KEY environment variable to a Geocoding API key before running this cell")
gmaps = googlemaps.Client(key=google_maps_api_key)

# Geocoding an address
geocode_result = gmaps.geocode('queen and broadview, Toronto')

print("geocode result",geocode_result)

geocode result [{'address_components': [{'long_name': 'Riverside', 'short_name': 'Riverside', 'types': ['neighborhood', 'political']}, {'long_name': 'Old Toronto', 'short_name': 'Old Toronto', 'types': ['political', 'sublocality', 'sublocality_level_1']}, {'long_name': 'Toronto', 'short_name': 'Toronto', 'types': ['locality', 'political']}, {'long_name': 'Toronto Division', 'short_name': 'Toronto Division', 'types': ['administrative_area_level_2', 'political']}, {'long_name': 'Ontario', 'short_name': 'ON', 'types': ['administrative_area_level_1', 'political']}, {'long_name': 'Canada', 'short_name': 'CA', 'types': ['country', 'political']}, {'long_name': 'M4M', 'short_name': 'M4M', 'types': ['postal_code', 'postal_code_prefix']}], 'formatted_address': 'Riverside, Toronto, ON M4M, Canada', 'geometry': {'bounds': {'northeast': {'lat': 43.6670025, 'lng': -79.3414164}, 'southwest': {'lat': 43.6557326, 'lng': -79.3560076}}, 'location': {'lat': 43.6593626, 'lng': -79.34769709999999}, 'locatio

In [18]:
# the coordinates of a match live under result["geometry"]["location"];
# take them from the first match returned for the sample address
first_match = geocode_result[0]
locs = first_match["geometry"]["location"]
lats = locs["lat"]
print("locs",locs)
print("lats",lats)

locs {'lat': 43.6593626, 'lng': -79.34769709999999}
lats 43.6593626


In [19]:
def get_geocode_result(junction, client=None, city=None):
    """Return the (latitude, longitude) of a free-form location description.

    Args:
        junction: location text, e.g. "queen and bathurst".
        client: object exposing a ``geocode(address)`` method that returns a
            list of matches; defaults to the notebook-level ``gmaps`` client.
        city: city name appended to ``junction`` before the lookup; defaults
            to the notebook-level ``city_name``.

    Returns:
        A ``(lat, lng)`` tuple taken from the first match, or ``(nan, nan)``
        when the geocoder returns no match for the location.
    """
    if client is None:
        client = gmaps
    if city is None:
        city = city_name
    geo_string = junction+", "+city
    geocode_result = client.geocode(geo_string)
    # an unrecognised location yields an empty result list; report it as missing
    # coordinates instead of raising IndexError and aborting a whole batch run
    if not geocode_result:
        return float("nan"), float("nan")
    locs = geocode_result[0]["geometry"]["location"]
    return locs["lat"], locs["lng"]



In [23]:
# sanity check: geocode one intersection and unpack the returned (latitude, longitude) pair
latf, lngf = get_geocode_result("queen and bathurst")
print("latf ",latf)
print("lngf ",lngf)

latf  43.6471969
lngf  -79.4039809


In [27]:
# element 0 of the returned pair is the latitude (note: this call issues another geocoding request)
get_geocode_result("queen and bathurst")[0]

43.6471969

In [24]:
# (rows, columns) of the full dataframe before a sample is taken
df.shape

(83365, 11)

In [25]:
# keep only the first 100 rows as a small sample of the full dataframe
df_cut = df.head(100)
df_cut.shape

(100, 11)

In [26]:
# preview the first rows of the sample
df_cut.head()

Unnamed: 0_level_0,Report Date,Route,Time,Day,Location,Incident,Min Delay,Min Gap,Direction,Vehicle,Report Date Time
Report Date Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2015-01-01 01:25:00,2015-01-01,504,01:25:00,Thursday,broadview and gerrard,Mechanical,9.0,18.0,s,4092,2015-01-01 01:25:00
2015-01-01 01:44:00,2015-01-01,504,01:44:00,Thursday,galley and roncesvalles,Held By,14.0,23.0,s,4030,2015-01-01 01:44:00
2015-01-01 02:04:00,2015-01-01,504,02:04:00,Thursday,king and sherborne,Mechanical,9.0,18.0,e,4147,2015-01-01 02:04:00
2015-01-01 02:12:00,2015-01-01,306,02:12:00,Thursday,main st. and upper gerard,Investigation,29.0,39.0,s,4049,2015-01-01 02:12:00
2015-01-01 05:05:00,2015-01-01,306,05:05:00,Thursday,gerrard and sumach,Mechanical,30.0,60.0,w,4114,2015-01-01 05:05:00


In [38]:

# Add Latitude and Longitude columns to df by geocoding the Location values.
#
# Each distinct location is geocoded exactly once and the result is mapped back onto the rows:
# - the earlier approach called get_geocode_result twice for every row (once per coordinate),
#   i.e. far more API requests than there are distinct locations
# - the earlier approach merged on the index, which multiplies rows whenever index values repeat
# - assigning the columns is idempotent, so re-running this cell does not create suffixed duplicates
#
# A small-sample run on the first 100 rows (df_cut) was previously saved as
# 2014_2018_df_cleaned_keep_bad_loc_geocoded_first100.pkl
location_coords = {loc: get_geocode_result(loc) for loc in df['Location'].dropna().unique()}
latitude_by_location = {loc: coords[0] for loc, coords in location_coords.items()}
longitude_by_location = {loc: coords[1] for loc, coords in location_coords.items()}

# Series.map leaves NaN for any Location that has no entry in the lookup (e.g. a missing Location)
df = df.assign(
    Latitude=df['Location'].map(latitude_by_location),
    Longitude=df['Location'].map(longitude_by_location),
)

In [44]:
# (rows, columns) after geocoding; Latitude and Longitude add two columns
df.shape

(83365, 11)

In [39]:
# preview the dataframe including the new Latitude / Longitude columns
df.head()


Unnamed: 0_level_0,Report Date,Route,Time,Day,Location,Incident,Min Delay,Min Gap,Direction,Vehicle,Report Date Time,Latitude,Longitude
Report Date Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2015-01-01 01:25:00,2015-01-01,504,01:25:00,Thursday,broadview and gerrard,Mechanical,9.0,18.0,s,4092,2015-01-01 01:25:00,43.665483,-79.352634
2015-01-01 01:44:00,2015-01-01,504,01:44:00,Thursday,galley and roncesvalles,Held By,14.0,23.0,s,4030,2015-01-01 01:44:00,43.642825,-79.447703
2015-01-01 02:04:00,2015-01-01,504,02:04:00,Thursday,king and sherborne,Mechanical,9.0,18.0,e,4147,2015-01-01 02:04:00,43.658005,-79.37101
2015-01-01 02:12:00,2015-01-01,306,02:12:00,Thursday,main st. and upper gerard,Investigation,29.0,39.0,s,4049,2015-01-01 02:12:00,43.684192,-79.300463
2015-01-01 05:05:00,2015-01-01,306,05:05:00,Thursday,gerrard and sumach,Mechanical,30.0,60.0,w,4114,2015-01-01 05:05:00,43.663155,-79.361489


# Remove bad rows

In [54]:
# number of distinct values in each categorical column
distinct_value_reports = [
    ("Location count post cleanup:", 'Location'),
    ("Route count post cleanup:", 'Route'),
    ("Direction count post cleanup:", 'Direction'),
    ("Vehicle count post cleanup:", 'Vehicle'),
]
for label, column in distinct_value_reports:
    print(label, df[column].nunique())

# number of rows that carry the placeholder value marking an invalid entry
placeholder_reports = [
    ("Bad route count:", 'Route', 'bad route'),
    ("Bad direction count:", 'Direction', 'bad direction'),
    ("Bad vehicle count:", 'Vehicle', 'bad vehicle'),
]
for label, column, placeholder in placeholder_reports:
    print(label, df[df[column] == placeholder].shape[0])

Location count post cleanup: 10074
Route count post cleanup: 15
Direction count post cleanup: 6
Vehicle count post cleanup: 1017
Bad route count: 3091
Bad direction count: 334
Bad vehicle count: 14480


In [55]:
# when remove_bad_values is set, drop every row that carries a placeholder value
# in the Vehicle, Direction or Route column
if remove_bad_values:
    has_valid_vehicle = df.Vehicle != 'bad vehicle'
    has_valid_direction = df.Direction != 'bad direction'
    has_valid_route = df.Route != 'bad route'
    df = df[has_valid_vehicle & has_valid_direction & has_valid_route]

In [56]:
# (rows, columns) after the optional removal of rows with placeholder values
df.shape

(66095, 11)

In [40]:
# pickle the cleansed dataframe
file_name = path + pickled_output_dataframe
df_cut.to_pickle(file_name)

In [36]:
# NOTE(review): dfn is only assigned in the following cell (pd.read_pickle), so on a fresh
# top-to-bottom run this cell raises NameError - it should run after that cell
dfn.shape

(100, 11)

In [41]:
# read the pickle back from file_name to confirm that the saved file loads, then preview it
dfn = pd.read_pickle(file_name)
dfn.head()

Unnamed: 0_level_0,Report Date,Route,Time,Day,Location,Incident,Min Delay,Min Gap,Direction,Vehicle,Report Date Time,Latitude,Longitude
Report Date Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2015-01-01 01:25:00,2015-01-01,504,01:25:00,Thursday,broadview and gerrard,Mechanical,9.0,18.0,s,4092,2015-01-01 01:25:00,43.665483,-79.352634
2015-01-01 01:44:00,2015-01-01,504,01:44:00,Thursday,galley and roncesvalles,Held By,14.0,23.0,s,4030,2015-01-01 01:44:00,43.642825,-79.447703
2015-01-01 02:04:00,2015-01-01,504,02:04:00,Thursday,king and sherborne,Mechanical,9.0,18.0,e,4147,2015-01-01 02:04:00,43.658005,-79.37101
2015-01-01 02:12:00,2015-01-01,306,02:12:00,Thursday,main st. and upper gerard,Investigation,29.0,39.0,s,4049,2015-01-01 02:12:00,43.684192,-79.300463
2015-01-01 05:05:00,2015-01-01,306,05:05:00,Thursday,gerrard and sumach,Mechanical,30.0,60.0,w,4114,2015-01-01 05:05:00,43.663155,-79.361489


# Visualize cleaned data

In [None]:
# install pixiedust, which is imported in the next cell
# NOTE(review): the version is not pinned - consider pinning for reproducible runs
!pip install pixiedust

In [None]:
import pixiedust

In [None]:
# NOTE(review): presumably this is pixiedust's display(), made available by the pixiedust import
# in the previous cell - confirm; it is handed the full dataframe
display(df)