# Streetcar Delay Prediction - Data Preparation Geocode Specific

Use dataset covering Toronto Transit Commission (TTC) streetcar delays 2014 - present to predict future delays and come up with recommendations for avoiding delays.

Source dataset: https://www.toronto.ca/city-government/data-research-maps/open-data/open-data-catalogue/#e8f359f0-2f47-3058-bf64-6ec488de52da

This notebook contains the data preparation steps specific to mapping free-form location descriptions to latitude and longitude:

- use the Python client for the Google Maps API Web Services: https://github.com/googlemaps/google-maps-services-python
- generate the latitude and longitude values for locations and create new columns in the output dataset

# Streetcar routes

From https://www.ttc.ca/Routes/Streetcars.jsp

<table style="border: none" align="left">
   <tr style="border: none">
       <th style="border: none"><img src="https://raw.githubusercontent.com/ryanmark1867/streetcarnov3/master/streetcar%20routes.jpg" width="600" alt="Icon"> </th>
   </tr>
</table>

In [1]:
! pwd

/storage/manning/notebooks


# Get path and load dataframe saved from previous data preparation step

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# import seaborn as sns
import datetime
import os

# when True, the "Remove bad rows" cell drops rows that carry a 'bad vehicle', 'bad direction' or 'bad route' placeholder
remove_bad_values = False
# city name appended to every location string before it is sent to the geocoding API
city_name = 'Toronto'


In [3]:
# get the current working directory (expected to be the directory that this notebook is in)
rawpath = os.getcwd()
print("raw path is",rawpath)

raw path is /storage/manning/notebooks


In [4]:
# the datasets live in a "data" directory that sits next to the directory holding this notebook;
# the trailing "/" lets later cells build file names by plain string concatenation
path = "{}/".format(os.path.abspath(os.path.join(rawpath, os.pardir, 'data')))
print("path is", path)

path is /storage/manning/data/


In [5]:
# constants for the streetcar problem
# same values saved in data_preparation notebook: pickled_input_dataframe, pickled_output_dataframe
# NOTE(review): pickled_data_file is not referenced by any other cell shown in this notebook
pickled_data_file = '2014_2018.pkl'
#pickled_dataframe = '2014_2018_df.pkl'
# input: file name of the pickled dataframe that is loaded from the data directory
pickled_dataframe = '2014_2018_df_cleaned_keep_bad_apr23.pkl'
# output: file name of the pickle that receives the dataframe with the geocoded columns added
pickled_output_dataframe = '2014_2018_df_cleaned_keep_bad_loc_geocoded_apr23.pkl'

In [6]:
# load the dataframe saved by the previous data preparation step
# NOTE: unpickling can execute arbitrary code, so only load pickle files that you produced yourself
file_name = path + pickled_dataframe
df = pd.read_pickle(file_name)
df.head()

Unnamed: 0_level_0,Report Date,Route,Time,Day,Location,Incident,Min Delay,Min Gap,Direction,Vehicle,Report Date Time
Report Date Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2016-01-01 00:00:00,2016-01-01,505,00:00:00,Friday,dundas west stationt to broadview station,General Delay,7.0,14.0,w,4028,2016-01-01 00:00:00
2016-01-01 02:14:00,2016-01-01,511,02:14:00,Friday,fleet st. and strachan,Mechanical,10.0,20.0,e,4018,2016-01-01 02:14:00
2016-01-01 02:22:00,2016-01-01,301,02:22:00,Friday,queen st. west and roncesvalles,Mechanical,9.0,18.0,w,4201,2016-01-01 02:22:00
2016-01-01 03:28:00,2016-01-01,301,03:28:00,Friday,lake shore blvd. and superior st.,Mechanical,20.0,40.0,e,4251,2016-01-01 03:28:00
2016-01-01 14:28:00,2016-01-01,501,14:28:00,Friday,roncesvalles to neville park,Mechanical,6.0,12.0,e,4242,2016-01-01 14:28:00


In [7]:
df.shape

(69603, 11)

In [8]:
# build a one-column dataframe that holds every distinct Location value exactly once,
# so that each location only has to be geocoded a single time
loc_unique = df['Location'].unique().tolist()
print("loc_unique", loc_unique[0])
df_unique = pd.DataFrame({'Location': loc_unique})
df_unique.head()

loc_unique dundas west stationt to broadview station


Unnamed: 0,Location
0,dundas west stationt to broadview station
1,fleet st. and strachan
2,queen st. west and roncesvalles
3,lake shore blvd. and superior st.
4,roncesvalles to neville park


In [9]:
df_unique.shape

(10074, 1)

# Set up geocode

In [10]:
! pip install -U googlemaps

Requirement already up-to-date: googlemaps in /opt/conda/envs/fastai/lib/python3.6/site-packages (3.0.2)


In [17]:
import googlemaps

# The Geocoding API key is read from the environment so that the credential is never stored in the notebook.
# Create a key in the Google Cloud console (APIs & Services > Credentials, Geocoding API enabled) and
# export it before starting Jupyter, e.g.:  export GOOGLE_MAPS_API_KEY=<your key>
# NOTE: any key that was previously committed with this notebook should be revoked.
google_maps_api_key = os.environ.get('GOOGLE_MAPS_API_KEY')
if not google_maps_api_key:
    raise RuntimeError("Set the GOOGLE_MAPS_API_KEY environment variable to a valid Geocoding API key")
gmaps = googlemaps.Client(key=google_maps_api_key)

# Geocoding an address: smoke test that the client and the key work
geocode_result = gmaps.geocode('lake shore blvd. and superior st., Toronto')

# the first match carries the coordinates under geometry -> location as {'lat': ..., 'lng': ...}
print("geocode result",geocode_result[0]["geometry"]["location"])

geocode result {'lat': 43.61496169999999, 'lng': -79.4886581}


In [12]:
# given an address / junction, return a list containing the latitude and longitude values returned by geocode api

def get_geocode_result(junction):
    """Geocode a free-form location description within ``city_name``.

    Args:
        junction: location text such as "queen and bathurst". The module-level
            ``city_name`` is appended before the geocoding API is called.

    Returns:
        A two-element list ``[latitude, longitude]`` taken from the first match
        returned by the API, or ``[0.0, 0.0]`` when the location cannot be
        geocoded (missing / non-text / blank input, or no match from the API).
    """
    # missing values (None, NaN) and blank strings cannot be geocoded; without this guard a
    # non-string would raise on the concatenation below and a blank string would be
    # geocoded as just the city name, producing a misleading match
    if not isinstance(junction, str) or not junction.strip():
        return [0.0, 0.0]

    geo_string = junction + ", " + city_name
    geocode_result = gmaps.geocode(geo_string)
    # an empty result means the API could not parse the junction value
    if not geocode_result:
        return [0.0, 0.0]

    locs = geocode_result[0]["geometry"]["location"]
    return [locs["lat"], locs["lng"]]



In [13]:
# test geocode api with value that will return empty result

locs = get_geocode_result("roncesvalles to longbranch")
print("locs ",locs)

locs  [0.0, 0.0]


In [14]:
# test geocode api with value that will return non-empty result
get_geocode_result("queen and bathurst")[0]

43.6471969

In [15]:
df.shape

(69603, 11)

In [16]:

# geocode each distinct location exactly once: the API is called per row of df_unique rather than
# per row of the full dataframe, and the returned [latitude, longitude] list is kept in a single
# lat_long column; later cells split that column and join it back onto the overall dataframe
df_unique['lat_long'] = df_unique['Location'].apply(get_geocode_result)



In [17]:
df_unique.head()

Unnamed: 0,Location,lat_long
0,dundas west stationt to broadview station,"[0.0, 0.0]"
1,fleet st. and strachan,"[43.6362976, -79.4096351]"
2,queen st. west and roncesvalles,"[43.64533489999999, -79.4131843]"
3,lake shore blvd. and superior st.,"[43.61496169999999, -79.4886581]"
4,roncesvalles to neville park,"[0.0, 0.0]"


In [18]:
df_unique.shape

(10074, 2)

In [22]:
# split the [latitude, longitude] list column into two scalar columns;
# the .str accessor indexes into each list element-wise
df_unique["latitude"] = df_unique["lat_long"].str.get(0)
df_unique["longitude"] = df_unique["lat_long"].str.get(1)
df_unique.head()

Unnamed: 0,Location,lat_long,latitude,longitude
0,dundas west stationt to broadview station,"[0.0, 0.0]",0.0,0.0
1,fleet st. and strachan,"[43.6362976, -79.4096351]",43.636298,-79.409635
2,queen st. west and roncesvalles,"[43.64533489999999, -79.4131843]",43.645335,-79.413184
3,lake shore blvd. and superior st.,"[43.61496169999999, -79.4886581]",43.614962,-79.488658
4,roncesvalles to neville park,"[0.0, 0.0]",0.0,0.0


In [23]:
df_unique.shape

(10074, 4)

In [24]:
# left-join the per-location lookup table onto the full dataframe so that every delay record
# gains the lat_long, latitude and longitude columns of its Location;
# the merged frame comes back with a fresh integer index
df_out = df.merge(df_unique, how='left', on="Location")
df_out.head()

Unnamed: 0,Report Date,Route,Time,Day,Location,Incident,Min Delay,Min Gap,Direction,Vehicle,Report Date Time,lat_long,latitude,longitude
0,2016-01-01,505,00:00:00,Friday,dundas west stationt to broadview station,General Delay,7.0,14.0,w,4028,2016-01-01 00:00:00,"[0.0, 0.0]",0.0,0.0
1,2016-01-01,511,02:14:00,Friday,fleet st. and strachan,Mechanical,10.0,20.0,e,4018,2016-01-01 02:14:00,"[43.6362976, -79.4096351]",43.636298,-79.409635
2,2016-01-01,301,02:22:00,Friday,queen st. west and roncesvalles,Mechanical,9.0,18.0,w,4201,2016-01-01 02:22:00,"[43.64533489999999, -79.4131843]",43.645335,-79.413184
3,2016-01-01,301,03:28:00,Friday,lake shore blvd. and superior st.,Mechanical,20.0,40.0,e,4251,2016-01-01 03:28:00,"[43.61496169999999, -79.4886581]",43.614962,-79.488658
4,2016-01-01,501,14:28:00,Friday,roncesvalles to neville park,Mechanical,6.0,12.0,e,4242,2016-01-01 14:28:00,"[0.0, 0.0]",0.0,0.0


In [30]:
df_out.head(30)

Unnamed: 0,Report Date,Route,Time,Day,Location,Incident,Min Delay,Min Gap,Direction,Vehicle,Report Date Time,lat_long,latitude,longitude
0,2016-01-01,505,00:00:00,Friday,dundas west stationt to broadview station,General Delay,7.0,14.0,w,4028,2016-01-01 00:00:00,"[0.0, 0.0]",0.0,0.0
1,2016-01-01,511,02:14:00,Friday,fleet st. and strachan,Mechanical,10.0,20.0,e,4018,2016-01-01 02:14:00,"[43.6362976, -79.4096351]",43.636298,-79.409635
2,2016-01-01,301,02:22:00,Friday,queen st. west and roncesvalles,Mechanical,9.0,18.0,w,4201,2016-01-01 02:22:00,"[43.64533489999999, -79.4131843]",43.645335,-79.413184
3,2016-01-01,301,03:28:00,Friday,lake shore blvd. and superior st.,Mechanical,20.0,40.0,e,4251,2016-01-01 03:28:00,"[43.61496169999999, -79.4886581]",43.614962,-79.488658
4,2016-01-01,501,14:28:00,Friday,roncesvalles to neville park,Mechanical,6.0,12.0,e,4242,2016-01-01 14:28:00,"[0.0, 0.0]",0.0,0.0
5,2016-01-01,505,15:42:00,Friday,broadview station loop,Investigation,4.0,10.0,w,4187,2016-01-01 15:42:00,"[43.677135, -79.35820799999999]",43.677135,-79.358208
6,2016-01-01,504,15:54:00,Friday,broadview and queen,Mechanical,6.0,12.0,e,4181,2016-01-01 15:54:00,"[43.6593626, -79.34769709999999]",43.659363,-79.347697
7,2016-01-01,501,16:05:00,Friday,roncesvalles to humber loop,Mechanical,6.0,12.0,w,4245,2016-01-01 16:05:00,"[0.0, 0.0]",0.0,0.0
8,2016-01-01,506,16:27:00,Friday,main station,Mechanical,8.0,16.0,w,4092,2016-01-01 16:27:00,"[43.6890219, -79.3016857]",43.689022,-79.301686
9,2016-01-01,510,16:34:00,Friday,richmond st. and spadina,Diversion,41.0,46.0,s,bad vehicle,2016-01-01 16:34:00,"[43.6478469, -79.39588049999999]",43.647847,-79.39588


In [25]:
df_out.shape

(69603, 14)

In [32]:
# count the rows whose latitude is the 0.0 placeholder returned for locations that could not be geocoded
print("Bad route latitude:", len(df_out[df_out["latitude"] == 0.0]))

Bad route latitude: 1675


# Remove bad rows

In [54]:
# number of distinct values per categorical column
for col in ['Location', 'Route', 'Direction', 'Vehicle']:
    print(col + " count post cleanup:", df[col].nunique())
# number of rows that carry the placeholder value ('bad route', 'bad direction', 'bad vehicle') in each column
for col in ['Route', 'Direction', 'Vehicle']:
    label = col.lower()
    print("Bad " + label + " count:", df[df[col] == 'bad ' + label].shape[0])

Location count post cleanup: 10074
Route count post cleanup: 15
Direction count post cleanup: 6
Vehicle count post cleanup: 1017
Bad route count: 3091
Bad direction count: 334
Bad vehicle count: 14480


In [55]:
# remove rows with a bad vehicle, direction or route value
# The filter is applied to df_out as well as df: df_out is the frame that is pickled below,
# so filtering df alone would leave the saved dataset unchanged when remove_bad_values is True.
if remove_bad_values:
    df = df[
        (df.Vehicle != 'bad vehicle')
        & (df.Direction != 'bad direction')
        & (df.Route != 'bad route')
    ]
    df_out = df_out[
        (df_out.Vehicle != 'bad vehicle')
        & (df_out.Direction != 'bad direction')
        & (df_out.Route != 'bad route')
    ]

In [56]:
df.shape

(66095, 11)

In [26]:
pickled_output_dataframe

'2014_2018_df_cleaned_keep_bad_loc_geocoded_apr23.pkl'

In [27]:
# save the geocoded dataframe as a pickle in the data directory
file_name = os.path.join(path, pickled_output_dataframe)
df_out.to_pickle(file_name)

In [28]:
# read the pickle back from file_name to check that the file just written can be loaded
dfn = pd.read_pickle(file_name)
dfn.head()

Unnamed: 0,Report Date,Route,Time,Day,Location,Incident,Min Delay,Min Gap,Direction,Vehicle,Report Date Time,lat_long,latitude,longitude
0,2016-01-01,505,00:00:00,Friday,dundas west stationt to broadview station,General Delay,7.0,14.0,w,4028,2016-01-01 00:00:00,"[0.0, 0.0]",0.0,0.0
1,2016-01-01,511,02:14:00,Friday,fleet st. and strachan,Mechanical,10.0,20.0,e,4018,2016-01-01 02:14:00,"[43.6362976, -79.4096351]",43.636298,-79.409635
2,2016-01-01,301,02:22:00,Friday,queen st. west and roncesvalles,Mechanical,9.0,18.0,w,4201,2016-01-01 02:22:00,"[43.64533489999999, -79.4131843]",43.645335,-79.413184
3,2016-01-01,301,03:28:00,Friday,lake shore blvd. and superior st.,Mechanical,20.0,40.0,e,4251,2016-01-01 03:28:00,"[43.61496169999999, -79.4886581]",43.614962,-79.488658
4,2016-01-01,501,14:28:00,Friday,roncesvalles to neville park,Mechanical,6.0,12.0,e,4242,2016-01-01 14:28:00,"[0.0, 0.0]",0.0,0.0


In [29]:
dfn.shape

(69603, 14)

In [None]:
# also export the reloaded geocoded dataframe as a CSV file in the data directory
file_outname = "2014_2018_df_cleaned_keep_bad_loc_geocoded_apr29.csv"
dfn.to_csv(os.path.join(path, file_outname))

# Visualize cleaned data

In [None]:
!pip install pixiedust

In [None]:
import pixiedust

In [None]:
# explore the dataframe with the display() call available after importing pixiedust
# NOTE(review): df does not contain the lat_long / latitude / longitude columns (those are in df_out and dfn) - confirm which frame is meant to be visualized
display(df)