# KDD 2020 
# Where really are the parking spots?
Using RAPIDS to find proper distances to parking spots in Seattle.

## Load the modules

In [1]:
import cudf
from cuspatial import haversine_distance
from collections import OrderedDict
import numpy as np
import datetime as dt
import cugraph

%load_ext autotime

## Read in the data

In [2]:
# Column schema of the parking CSV, in file order: column name -> dtype string
# handed to cudf.read_csv.
dtypes = OrderedDict()
dtypes['OccupancyDateTime']        = 'date'
dtypes['PaidOccupancy']            = 'int64'
dtypes['BlockfaceName']            = 'str'
dtypes['SideOfStreet']             = 'str'
dtypes['SourceElementKey']         = 'int64'
dtypes['ParkingTimeLimitCategory'] = 'int64'
dtypes['ParkingSpaceCount']        = 'int64'
dtypes['PaidParkingArea']          = 'str'
dtypes['PaidParkingSubArea']       = 'str'
dtypes['PaidParkingRate']          = 'int8'
dtypes['ParkingCategory']          = 'str'
dtypes['Location']                 = 'str'
dtypes['dow']                      = 'int8'

column_names = [name for name in dtypes]
column_types = [dtypes[name] for name in column_names]

# The first row of the file is skipped and our own column names are applied.
df = cudf.read_csv(
    '../data/parking_MayJun2019.csv',
    names=column_names,
    dtype=column_types,
    skiprows=1,
)

# Only the block-face key and its location string are needed from here on;
# rebinding `df` lets the full frame be released.
df = df[['SourceElementKey', 'Location']].dropna()

time: 3.5 s


Let's extract the geo-coordinates for the parking locations.

In [3]:
def extractLon(location):
    """Return the longitude substring of every location string.

    Parameters
    ----------
    location : string Series (anything exposing ``.str.extract``)
        Strings containing ``"<lon> <lat>"`` separated by a single space.

    Returns
    -------
    Series of strings holding the first captured group (the longitude).
    The caller is responsible for casting to a numeric dtype.

    Notes
    -----
    The latitude group does not accept a minus sign, so a string with a
    negative latitude does not match this pattern.
    """
    # Raw string: the pattern is unchanged, but `\.` and `\-` are no longer
    # invalid escape sequences in a normal string literal.
    lon = location.str.extract(r'([0-9\.\-]+) ([0-9\.]+)')[0]
    return lon

def extractLat(location):
    """Return the latitude substring of every location string.

    Parameters
    ----------
    location : string Series (anything exposing ``.str.extract``)
        Strings containing ``"<lon> <lat>"`` separated by a single space.

    Returns
    -------
    Series of strings holding the second captured group (the latitude).
    The caller is responsible for casting to a numeric dtype.

    Notes
    -----
    The latitude group does not accept a minus sign, so a string with a
    negative latitude does not match this pattern.
    """
    # Raw string: the pattern is unchanged, but `\.` and `\-` are no longer
    # invalid escape sequences in a normal string literal.
    lat = location.str.extract(r'([0-9\.\-]+) ([0-9\.]+)')[1]
    return lat
    

time: 611 µs


In [4]:
# Collapse the occupancy records to one row per distinct
# (SourceElementKey, Location) pair.
locations = df.drop_duplicates()
# The original frame is no longer needed; drop the name so it can be freed.
del df

time: 1.39 s


In [5]:
# Parse the textual location into numeric coordinate columns, then keep only
# the key and the two coordinates.
for coord_name, extractor in (('longitude', extractLon), ('latitude', extractLat)):
    locations[coord_name] = extractor(locations['Location']).astype('float')

locations = locations[['SourceElementKey', 'longitude', 'latitude']]

time: 7.96 ms


As before, we'll use the Nominatim geocoder to find the coordinates of the Space Needle.

In [6]:
from geopy.geocoders import Nominatim

# Look up the coordinates of the destination by its street address.
geolocator = Nominatim(user_agent="todrabas_test")
location = geolocator.geocode("400 Broad St, Seattle, WA 98109") # SPACE NEEDLE

# geocode() yields None when the service finds no match; fail here with a
# clear message instead of an AttributeError on `location.latitude` in a
# later cell.
if location is None:
    raise RuntimeError('Nominatim returned no result for the Space Needle address')

location

Location(Space Needle, 400, Broad Street, South Lake Union, Belltown, Seattle, King County, Washington, 98109, United States of America, (47.6205131, -122.34930359883187, 0.0))

time: 629 ms


## As the crow flies vs. as people walk

### Read in the graph data
Thanks to John Murray for analyzing the map of King County roads and producing the data we will now use.

#### Download and unpack the data

In [7]:
import os

# Snapshot of what is already on disk; each step below runs only if its
# product is missing.
directory  = os.path.exists('../data')
archive    = os.path.exists('../data/king_county_road_graph_20190909.tar.gz')
file_graph = os.path.exists('../data/king_county_road_graph_20190909.csv')
file_nodes = os.path.exists('../data/king_county_road_nodes_20190909.csv')

if not directory:
    os.mkdir('../data')

if not archive:
    import wget, shutil

    # wget saves into the current working directory, so the archive is moved
    # into the data folder afterwards.
    wget.download('http://tomdrabas.com/data/seattle_parking/king_county_road_graph_20190909.tar.gz')
    shutil.move('king_county_road_graph_20190909.tar.gz', '../data/king_county_road_graph_20190909.tar.gz')

if not file_graph or not file_nodes:
    import tarfile

    # The context manager closes the archive handle even if extraction fails.
    # NOTE(review): extractall() trusts the member paths inside the archive,
    # and the archive is fetched over plain HTTP -- only use with a trusted source.
    with tarfile.open('../data/king_county_road_graph_20190909.tar.gz') as tf:
        tf.extractall(path='../data/')

time: 1.23 ms


#### Let's read the King County road data

In [8]:
# Edge list of the King County road graph: node1, node2 and the edge LENGTH.
road_graph_data = cudf.read_csv('../data/king_county_road_graph_20190909.csv')

# Both endpoint columns are stored as 32-bit integers.
for node_col in ('node1', 'node2'):
    road_graph_data[node_col] = road_graph_data[node_col].astype('int32')

# LENGTH was given in yards; multiply by 3 to express it in feet.
road_graph_data['LENGTH'] = road_graph_data['LENGTH'] * 3

time: 16.2 ms


In [9]:
# Road-graph nodes; later cells read the NodeID, Lat and Lon columns.
road_nodes = cudf.read_csv('../data/king_county_road_nodes_20190909.csv')
# Cast to int32 so the IDs match the dtype of node1/node2 in the edge list.
road_nodes['NodeID'] = road_nodes['NodeID'].astype('int32')

time: 4.73 ms


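Store the maximum `NodeID` so that we can number the nodes we are about to add: for every parking location we add its perpendicular projection onto the nearest road edge, plus a node for the parking location itself. We also specify an offset -- a parking node's ID is the ID of its projection node plus this offset, which later lets us filter the results down to parking locations only.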

In [10]:
# Gap between the ID of a projection node and the ID of its parking node.
offset = 100000

# Highest existing road node ID: new projection nodes are numbered upwards from here.
nodeId = road_nodes['NodeID'].max()

# Every parking node gets an ID above this value, so it doubles as the filter
# threshold for "parking locations only" later on.
parking_nodes_idx = nodeId + offset

nodeId

127380

time: 1.99 ms


Move all the parking locations to host (via `.to_pandas()` method) so we can loop through all the ~1500 parking locations. Here, we also create an empty DataFrame that will hold the parking location nodes.

In [11]:
# Host-side list of dicts (one per parking location) so it can be looped over in Python.
parking_locations = locations.to_pandas().to_dict('records')
# Empty accumulators: the nodes and the edges added for the parking locations.
parking_locations_nodes = cudf.DataFrame(columns=['NodeID', 'Lon', 'Lat', 'SourceElementKey'])
added_location_edges    = cudf.DataFrame(columns=['node1', 'node2', 'LENGTH'])

time: 10.9 ms


Let's process the parking data. The kernel below finds equations of two lines:

1. Line that goes through road intersections
2. Line that is perpendicular to (1) and goes through the parking location.

Ultimately, we are finding the intersection of these two lines -- we call it the `PROJ` point below.

In [12]:
def kernel_find_projection(Lon_x, Lat_x, Lon_y, Lat_y, Lon_PROJ, Lat_PROJ, Lon_REF, Lat_REF):
    """Project a reference point perpendicularly onto the line through each edge.

    For every row, the line through (Lon_x, Lat_x) and (Lon_y, Lat_y) is
    intersected with the perpendicular line that passes through the reference
    point. The geometry is planar, done directly in lon/lat coordinates.

    Parameters
    ----------
    Lon_x, Lat_x, Lon_y, Lat_y : row-aligned sequences
        Coordinates of the two end points of each edge.
    Lon_PROJ, Lat_PROJ : writable row-aligned sequences
        Outputs; row ``i`` receives the coordinates of the projected point.
    Lon_REF, Lat_REF : scalars
        Coordinates of the reference point.

    Nothing is returned; results are written into the output sequences.
    """
    for row, (x_lon, x_lat, y_lon, y_lat) in enumerate(zip(Lon_x, Lat_x, Lon_y, Lat_y)):
        # Vertical edge: both ends share the longitude, so the projection keeps
        # that longitude and takes the latitude of the reference point.
        if x_lon == y_lon:
            Lon_PROJ[row] = x_lon
            Lat_PROJ[row] = Lat_REF
            continue

        edge_slope = (x_lat - y_lat) / float(x_lon - y_lon)

        # Horizontal edge: the projection keeps the edge latitude and takes the
        # longitude of the reference point.
        if edge_slope == 0:
            Lon_PROJ[row] = Lon_REF
            Lat_PROJ[row] = x_lat
            continue

        # General case: intersect the edge line with its perpendicular through
        # the reference point.
        perp_slope = -1 / edge_slope
        edge_intercept = x_lat - edge_slope * x_lon
        perp_intercept = Lat_REF - perp_slope * Lon_REF

        Lon_PROJ[row] = (perp_intercept - edge_intercept) / (edge_slope - perp_slope)
        Lat_PROJ[row] = perp_slope * Lon_PROJ[row] + perp_intercept

time: 1.34 ms


In [13]:
parking_locations_cnt = len(parking_locations)
print('Number of parking locations: {0:,}'.format(parking_locations_cnt))

#### APPEND GEO COORDINATES OF BOTH END NODES TO EVERY ROAD EDGE
#### This join does not depend on the parking location, so it is built once
#### here instead of once per loop iteration.
#### After the second merge node1's coordinates are Lat_x/Lon_x and node2's are Lat_y/Lon_y.
road_edges_with_coords = (
    road_graph_data
    .rename(columns={'node1': 'NodeID'})
    .merge(road_nodes[['NodeID', 'Lat', 'Lon']], on='NodeID', how='left')
    .rename(columns={'NodeID': 'node1', 'node2': 'NodeID'})
    .merge(road_nodes[['NodeID', 'Lat', 'Lon']], on='NodeID', how='left')
    .rename(columns={'NodeID': 'node2'})
)

#### Per-location frames are collected and concatenated once after the loop
#### (growing a DataFrame with concat inside the loop is quadratic).
new_node_frames = []
new_edge_frames = []

#### NOTE: `nodeId` is advanced by this loop, so re-running this cell without
#### re-running the cell that initialises `nodeId` keeps increasing the IDs.
for i, loc in enumerate(parking_locations):
    if i % 100 == 0:
        print('Processed: {0:,} ({1:.2%}) nodes'.format(i, i/float(parking_locations_cnt)))

    #### INCREASE THE COUNTER AND GET THE REFERENCE POINT
    nodeId = nodeId + 1
    lat_r = loc['latitude']
    lon_r = loc['longitude']

    #### SUBSET DOWN THE DATASET TO EDGES WHOSE BOTH ENDS ARE
    #### WITHIN ~2000ft (0.0075 degrees) FROM THE PARKING SPOT
    paths = (
        road_edges_with_coords
        .query('Lat_x >= (@lat_r - 0.0075) and Lat_x <= (@lat_r + 0.0075)')
        .query('Lon_x >= (@lon_r - 0.0075) and Lon_x <= (@lon_r + 0.0075)')
        .query('Lat_y >= (@lat_r - 0.0075) and Lat_y <= (@lat_r + 0.0075)')
        .query('Lon_y >= (@lon_r - 0.0075) and Lon_y <= (@lon_r + 0.0075)')
    )

    #### APPEND THE PARKING LOCATION SO WE CAN CALCULATE DISTANCES
    paths['Lon_REF'] = loc['longitude']
    paths['Lat_REF'] = loc['latitude']

    #### FIND THE PROJECTION OF THE PARKING LOCATION ON EVERY CANDIDATE EDGE
    #### NOTE(review): Lon_REF / Lat_REF are listed in `incols` and also passed
    #### as scalar `kwargs`; the kernel uses them as scalars. Confirm how
    #### apply_rows resolves the overlap before removing either.
    paths = paths.apply_rows(
        kernel_find_projection
        , incols  = ['Lon_x', 'Lat_x', 'Lon_y', 'Lat_y', 'Lon_REF', 'Lat_REF']
        , outcols = {'Lon_PROJ': np.float64, 'Lat_PROJ': np.float64}
        , kwargs  = {'Lon_REF': loc['longitude'], 'Lat_REF': loc['latitude']}
    )

    #### CALCULATE THE DISTANCES SO WE CAN CHECK IF THE PROJ POINT IS BETWEEN ROAD NODES
    paths['Length_x_PROJ'] = haversine_distance(
              paths['Lon_x']
            , paths['Lat_x']
            , paths['Lon_PROJ']
            , paths['Lat_PROJ'])

    paths['Length_y_PROJ'] = haversine_distance(
              paths['Lon_y']
            , paths['Lat_y']
            , paths['Lon_PROJ']
            , paths['Lat_PROJ'])

    paths['Length_REF_PROJ'] = haversine_distance(
              paths['Lon_REF']
            , paths['Lat_REF']
            , paths['Lon_PROJ']
            , paths['Lat_PROJ'])

    #### CONVERT TO FEET: 0.621371 miles per km, 5280 feet per mile
    #### (i.e. the haversine result is treated as kilometres)
    for length_col in ('Length_x_PROJ', 'Length_y_PROJ', 'Length_REF_PROJ'):
        paths[length_col] = paths[length_col] * 0.621371 * 5280

    #### PROJ LIES ON THE EDGE WHEN ITS DISTANCES TO BOTH ENDS ADD UP TO
    #### NO MORE THAN THE EDGE LENGTH (WITH A 4 ft TOLERANCE)
    paths['PROJ_between'] = (paths['Length_x_PROJ'] + paths['Length_y_PROJ']) <= (paths['LENGTH'] + 4)

    #### SELECT THE CLOSEST
    candidates = (
        paths
        .query('PROJ_between')
        .nsmallest(1, 'Length_REF_PROJ')
        .to_pandas()
        .to_dict('records')
    )
    if not candidates:
        # Same exception type as the bare `[0]` lookup, but says which location failed.
        raise IndexError(
            'No road edge with a valid projection found for SourceElementKey {0}'
            .format(loc['SourceElementKey'])
        )
    closest = candidates[0]

    # add nodes: the parking location itself (nodeId + offset) and its projection (nodeId)
    nodes =    cudf.DataFrame({
          'NodeID': [nodeId + offset, nodeId]
        , 'Lon':    [closest['Lon_REF'], closest['Lon_PROJ']]
        , 'Lat':    [closest['Lat_REF'], closest['Lat_PROJ']]
        , 'SourceElementKey': [loc['SourceElementKey'], None]
    })
    new_node_frames.append(nodes)

    # add edges (bi-directional): PROJ <-> node1, PROJ <-> node2, PROJ <-> parking location
    edges = cudf.DataFrame({
          'node1':  [nodeId, nodeId, nodeId, closest['node1'], closest['node2'], nodeId + offset]
        , 'node2':  [closest['node1'], closest['node2'], nodeId + offset, nodeId, nodeId, nodeId]
        , 'LENGTH': [
              closest['Length_x_PROJ'], closest['Length_y_PROJ'], closest['Length_REF_PROJ']
            , closest['Length_x_PROJ'], closest['Length_y_PROJ'], closest['Length_REF_PROJ']
        ]
    })
    new_edge_frames.append(edges)

#### APPEND EVERYTHING TO THE ACCUMULATOR FRAMES IN ONE GO
parking_locations_nodes = cudf.concat([parking_locations_nodes] + new_node_frames)
added_location_edges    = cudf.concat([added_location_edges] + new_edge_frames)

print('Finished processing...')

Number of parking locations: 1,473
Processed: 0 (0.000000%) nodes
Processed: 100 (6.788866%) nodes
Processed: 200 (13.577733%) nodes
Processed: 300 (20.366599%) nodes
Processed: 400 (27.155465%) nodes
Processed: 500 (33.944331%) nodes
Processed: 600 (40.733198%) nodes
Processed: 700 (47.522064%) nodes
Processed: 800 (54.310930%) nodes
Processed: 900 (61.099796%) nodes
Processed: 1,000 (67.888663%) nodes
Processed: 1,100 (74.677529%) nodes
Processed: 1,200 (81.466395%) nodes
Processed: 1,300 (88.255261%) nodes
Processed: 1,400 (95.044128%) nodes
Finished processing...
time: 1min 30s


In [18]:
# Stack the original road nodes and the newly created parking / projection
# nodes into one node table with a fresh 0..n-1 index.
# NOTE: this rebinds `road_nodes`, so running the cell twice appends the
# parking nodes twice.
original_road_nodes = road_nodes[['NodeID', 'Lon', 'Lat']]
road_nodes = cudf.concat([original_road_nodes, parking_locations_nodes])
road_nodes = road_nodes.reset_index(drop=True)

time: 9.1 ms


Now we can find the nearest intersections from the Space Needle!

In [19]:
# Broadcast the Space Needle coordinates so every node can be compared with it.
road_nodes['Lon_REF'] = location.longitude
road_nodes['Lat_REF'] = location.latitude

# Distance from every node to the Space Needle, converted to feet
# (0.621371 miles per km, 5280 feet per mile).
distance_to_needle = haversine_distance(
    road_nodes['Lon'],
    road_nodes['Lat'],
    road_nodes['Lon_REF'],
    road_nodes['Lat_REF'],
)
road_nodes['Distance'] = distance_to_needle * 0.621371 * 5280

# Space Needle is surrounded by around 5 road intersections, hence the 5 closest nodes.
nearest_nodes = road_nodes.nsmallest(5, 'Distance')
space_needle_to_nearest_intersection_dist = nearest_nodes['Distance'].to_array()[0]

# The Space Needle itself becomes node `nodeId + 2`, with one edge to each of
# the selected nodes.
nearest_nodes['node1'] = nodeId + 2
space_needle_to_nearest_intersection = (
    nearest_nodes
    .rename(columns={'NodeID': 'node2', 'Distance': 'LENGTH'})
    [['node1', 'node2', 'LENGTH']]
)

# Full edge list: Space Needle edges, parking-location edges, original road edges.
all_edge_frames = [space_needle_to_nearest_intersection, added_location_edges, road_graph_data]
road_graph_data = cudf.concat(all_edge_frames)

space_needle_to_nearest_intersection ### SHOW THE EDGES

Unnamed: 0,node1,node2,LENGTH
47756,128855,47757,175.906391
80448,128855,80449,200.062128
96739,128855,96740,261.056715
108797,128855,108798,277.221141
47827,128855,47828,301.71549


time: 287 ms


### The road graph

In [20]:
# Give the combined edge list a clean index and 32-bit integer endpoints.
road_graph_data = road_graph_data.reset_index(drop=True)
for endpoint in ('node1', 'node2'):
    road_graph_data[endpoint] = road_graph_data[endpoint].astype('int32')

# Build the weighted graph; the edge weight is the LENGTH column.
g = cugraph.Graph()
g.from_cudf_edgelist(
    road_graph_data,
    source='node1',
    destination='node2',
    edge_attr='LENGTH',
)

time: 56.8 ms


Now we can use the `.sssp(...)` method from `cugraph` to find the shortest distances to parking spots from the Space Needle!

In [21]:
# Single-source shortest paths from the Space Needle node (`nodeId + 2`).
all_distances = cugraph.sssp(g, nodeId + 2)
# Keep only parking nodes (their IDs are above `parking_nodes_idx`) that are
# closer than 1000 along the graph; edge lengths were built in feet in the
# earlier cells, so this is presumably 1000 ft.
distances = all_distances.query('vertex > @parking_nodes_idx and distance < 1000')
distances

Unnamed: 0,distance,vertex,predecessor
93958,978.340665,227822,127822
93959,954.882271,227823,127823
103850,978.714937,227666,127666
103851,986.632352,227667,127667
105152,796.521771,227912,127912
105153,793.130113,227913,127913
109108,494.541097,227585,127585
109109,471.843338,227586,127586
111643,783.243667,228506,128506
128846,982.444003,228224,128224


time: 320 ms


`cugraph` returns a DataFrame with the vertex, the total distance traveled to that vertex from the source node (`nodeId + 2` -- the Space Needle), and the predecessor of that vertex on the shortest path. Here, we follow the predecessors back to the source to unfold the full path.

In [22]:
# unfold -- create the whole path
closest_node = nodeId + 2
parking_cnt = distances['vertex'].count()

for i in range(parking_cnt):
    print('Processing record: {0}'.format(i))
    parking_node = distances.iloc[i].to_pandas()
    vertex = int(parking_node[1])
    predecessor = int(parking_node[2])
    
    if i == 0:
        paths = all_distances.query('vertex == @vertex')
    else:
        paths = cudf.concat([all_distances.query('vertex == @vertex'), paths])

    while vertex != closest_node:
        temp = all_distances.query('vertex == @predecessor')
        paths = cudf.concat([temp, paths])
        predecessor = temp['predecessor'].to_array()[0]
        vertex = temp['vertex'].to_array()[0]

Processing record: 0
Processing record: 1
Processing record: 2
Processing record: 3
Processing record: 4
Processing record: 5
Processing record: 6
Processing record: 7
Processing record: 8
Processing record: 9
Processing record: 10
Processing record: 11
time: 622 ms


### Charting the paths

In [23]:
# Use 64-bit IDs on both ends of every hop and drop hops shared by several paths.
for id_col in ('vertex', 'predecessor'):
    paths[id_col] = paths[id_col].astype('int64')
paths = paths.drop_duplicates()

### process the data so we get the Lat/Lon back for both src and dest
### then move to host
node_coords = road_nodes[['NodeID', 'Lat', 'Lon']]

# First join: coordinates of the hop's vertex.
hops_with_vertex_coords = (
    paths
    .rename(columns={'vertex': 'NodeID'})
    .merge(node_coords, on='NodeID', how='left')
)

# Second join: coordinates of the hop's predecessor. The clashing Lat/Lon
# columns get the _x (vertex) and _y (predecessor) suffixes.
hops_with_both_coords = (
    hops_with_vertex_coords
    .rename(columns={'NodeID': 'vertex', 'predecessor': 'NodeID'})
    .merge(node_coords, on='NodeID', how='left')
)

# Predecessors with no match in `road_nodes` get the Space Needle coordinates;
# the row of the source node itself is dropped.
paths_host = (
    hops_with_both_coords
    .fillna({'Lat_y': location.latitude, 'Lon_y': location.longitude})
    [['vertex', 'Lat_x', 'Lon_x', 'Lat_y', 'Lon_y']]
    .query('vertex != @nodeId + 2')
    .to_pandas()
)

time: 126 ms


Get the information about the parking spots so we can create info boxes.

In [24]:
# Match the ID dtype used for the join below.
distances['vertex'] = distances['vertex'].astype('int64')

# Attach coordinates and the SourceElementKey of every selected parking node,
# then move the result to host.
parking_node_info = road_nodes[['NodeID', 'Lat', 'Lon', 'SourceElementKey']]
distances_with_info = (
    distances
    .rename(columns={'vertex': 'NodeID'})
    .merge(parking_node_info, on='NodeID')
)
distances_host = distances_with_info[['SourceElementKey', 'Lat', 'Lon', 'distance']].to_pandas()

time: 7.53 ms


In [25]:
# HTML shown in the info box of every parking marker. In a description list
# each <dt> (term) is followed by its <dd> (value); a <dd> must not be nested
# inside a <dt>.
info_box_template = """
<dl>
<dt>SourceElementKey</dt><dd>{SourceElementKey}</dd>
<dt>Distance        </dt><dd>{distance:.0f} ft.</dd>
</dl>
"""

# One rendered info box per parking spot, in the row order of `distances_host`.
parking_info = [info_box_template.format(**parking) for parking in distances_host.to_dict('records')]

time: 962 µs


And... plot!

In [26]:
import os

import gmaps
from ipywidgets.embed import embed_minimal_html

####################################################
##                                                ##
## CHANGE THE API CREDS IN THE GoogleMapsAPI.cred ##
##                                                ##
####################################################
with open('config/GoogleMapsAPI.cred', 'r') as f:
    # Strip surrounding whitespace: a trailing newline in the file would
    # otherwise become part of the API key.
    cred = f.read().strip()

gmaps.configure(api_key=cred) # Your Google API key, go to https://console.developers.google.com

# Green markers: the selected parking spots, each with its info box.
parking_layer = gmaps.symbol_layer(
    distances_host[['Lat', 'Lon']], fill_color="green", stroke_color="green", scale=3, info_box_content=parking_info
)

# Red marker: the destination (Space Needle).
destinations_layer = gmaps.symbol_layer(
    [[location.latitude, location.longitude]]
    , info_box_content=['DESTINATION']
    , scale=5
    , fill_color="red"
    , stroke_color="red"
)

# Red lines: one segment per hop of the unfolded shortest paths.
lines_layer = gmaps.drawing_layer(features=[
    gmaps.Line(
          start = (path['Lat_x'], path['Lon_x'])
        , end   = (path['Lat_y'], path['Lon_y'])
        , stroke_weight=2
        , stroke_color="red"
    )
    for path in paths_host.to_dict('records')]
)

fig = gmaps.figure(layout={'height': '500px'})
fig.add_layer(parking_layer)
fig.add_layer(destinations_layer)
fig.add_layer(lines_layer)

# Make sure the output folder exists before writing the rendered map.
os.makedirs('maps_rendered', exist_ok=True)
embed_minimal_html('maps_rendered/map_walk_final.html', views=[fig])

time: 197 ms
