# cuSpatial API Demo - Reverse Geocoding
GTC April 2023 Michael Wang and Thomson Comer

Demo System: Intel Xeon Gold 3.4 GHz, 48 GB RAM, 32 GB GV100 GPU

The following notebook demonstrates the use of cuSpatial to perform analytics using large datasets.

The structure of the notebook is as follows:
1. Imports
1. Read datasets: National Address Database (NAD), NYC Taxi Zones Polygons, 2015 NYC Taxi pickup/dropoff information with lon/lat. Also convert the taxi zone polygons from EPSG:2263 (New York Long Island) to WGS 84 (EPSG:4326).
1. Convert separate lon/lat columns in DataFrames into cuspatial.GeoSeries
1. Compute number of addresses and pickups in each zone
1. Compute addresses for each pickup in one zone

## Data

[National Address Database](https://nationaladdressdata.s3.amazonaws.com/NAD_r12_TXT.zip)

[NYC Taxi Zones](https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip)

[taxi2015.csv](https://rapidsai-data.s3.us-east-2.amazonaws.com/viz-data/nyc_taxi.tar.gz)

<style>
table {
    background-color:transparent;
}
</style>
<table align="center" style="background-color: transparent">
    <!--
    <td>
        <img src="https://www.transportation.gov/sites/dot.gov/files/images/NAD_Partners_20221201_v12_Release_0.jpg" width="350" height="400" float="left">
    </td>
    -->
    <td>
        <img src="https://www.dropbox.com/s/rql9lo7we92k9wc/nad-addresses.png?dl=1" width="350">
    </td>
    <td>
        <img src="https://www.dropbox.com/s/bj4ad8iba5p6u77/nyc-taxi-zones.png?raw=1" width="350" height="400">
    </td>
    <td>
        <img src="https://www.dropbox.com/s/i516rqruz97fd2q/nyc-taxi-pickups.png?dl=1" float="left" width="330">
    </td>
</table>

I/O
 
- National Address Database (NAD)
- NYC Taxi Zones Shapefile (zones)
- NYC 2015 Taxi Pickups and Dropoffs with Lon/Lat Coords (taxi2015)

In [None]:
%%time
import cudf
import cuspatial
import geopandas

# Turn on cuDF's "spill" option before any data is loaded.
# NOTE(review): presumably this lets device buffers spill to host memory so the
# large NAD read fits on the GPU — confirm against the cuDF options documentation.
cudf.set_option("spill", True)

In [None]:
%%time
# I/O: 18GB NAD, 265 borough polygons, 13m taxi pickups and dropoffs.

# The NAD text file is scanned twice, each pass loading only the columns it
# needs; both results are then cut down to rows whose State is 'NY', so the two
# frames share the same filter.
nad_path = 'NAD_r11.txt'
nad_location_columns = ['State', 'Longitude', 'Latitude']
nad_street_columns = ['State', 'StN_PreDir', 'StreetName', 'StN_PosTyp', 'Add_Number']

NAD = cudf.read_csv(nad_path, usecols=nad_location_columns)
NAD = NAD[NAD['State'] == 'NY']

NAD_Street = cudf.read_csv(nad_path, usecols=nad_street_columns)
NAD_Street = NAD_Street[NAD_Street['State'] == 'NY']

# Taxi zones: read the zipped shapefile on the host with GeoPandas, reproject
# to epsg:4326 (lon/lat), then hand the result to cuSpatial.
host_zones = geopandas.read_file('taxi_zones.zip')
host_lonlat = host_zones.to_crs(epsg=4326)
zones = cuspatial.from_geopandas(host_lonlat)
# Index the zones by their OBJECTID values.
zones.set_index(zones['OBJECTID'], inplace=True)

# Taxi trips, including the pickup longitude/latitude columns used below.
taxi2015 = cudf.read_csv('taxi2015.csv')

<center><img src="https://www.dropbox.com/s/pp75u59z5uxwrlz/table-to-geoseries.png?dl=1" width=500></center>

In [None]:
%%time
# Convert DataFrames to GeoSeries

def _points_from_lonlat(lon, lat):
    """Build a point GeoSeries from matching longitude and latitude columns.

    The two columns are placed side by side as ``x`` and ``y``, interleaved
    into a single column, and passed to ``cuspatial.GeoSeries.from_points_xy``.
    """
    xy = cudf.DataFrame({'x': lon, 'y': lat}).interleave_columns()
    return cuspatial.GeoSeries.from_points_xy(xy)


# One point per taxi pickup and one point per New York address.
pickups = _points_from_lonlat(taxi2015['pickup_longitude'], taxi2015['pickup_latitude'])
addresses = _points_from_lonlat(NAD['Longitude'], NAD['Latitude'])

In [None]:
%%time
# Test the address points against the taxi zone polygons with allpairs=True.
# Later cells read the 'polygon_index' and 'point_index' columns of the result.
zone_polygons = zones['geometry']
zone_addresses = zone_polygons.contains_properly(addresses, allpairs=True)
display(zone_addresses)

In [None]:
%%time
# Only the first PICKUP_ZONE_LIMIT zone polygons are tested against the pickups.
PICKUP_ZONE_LIMIT = 120
zone_pickups = zones['geometry'].iloc[:PICKUP_ZONE_LIMIT].contains_properly(pickups, allpairs=True)
display(zone_pickups)

# You can do it one of two ways: .contains_properly, or write the pip yourself.

In [None]:
%%time
# Add pickup and address counts to zones dataframe

# Count the matched rows per polygon_index and store the counts as new columns
# on `zones`.
# NOTE(review): `zones` is indexed by OBJECTID (set when it was loaded) while
# the counts are keyed by polygon_index — confirm the two line up on assignment.
# NOTE(review): zone_pickups was computed for the first 120 polygons only, so
# pickup_count is presumably missing for the remaining zones — verify.
zones["pickup_count"] = zone_pickups.groupby('polygon_index').count()
zones["address_count"] = zone_addresses.groupby('polygon_index').count()
# Show the first 12 zones together with the new count columns.
zones.head(12)

# Computing distances
## Cartesian product via tiling

<center><img src="https://www.dropbox.com/s/wlcr9fugq79nyut/tiled-cartesian-product.png?dl=1" width=650></center>

In [None]:
%%time

NEIGHBORHOOD_ID = 12

# Build every (address, pickup) pair for one zone as two equally long,
# row-aligned GeoSeries, so that a single pairwise distance call covers the full
# Cartesian product. With N addresses and P pickups in the zone:
#   * the addresses are TILED: the whole block a0..a(N-1) is emitted P times;
#   * the pickups are REPEATED: each pickup row is emitted N times in a row.
# Row i therefore pairs address (i mod N) with pickup (i // N), which enumerates
# all N * P combinations exactly once.
# Tiling BOTH sides would instead pair address (i mod N) with pickup (i mod P);
# that only reaches every combination when N and P share no common factor, and
# otherwise silently duplicates some pairs while never evaluating others.

# Rows of the point-in-polygon results that belong to the chosen zone.
in_zone_addresses = zone_addresses['polygon_index'] == NEIGHBORHOOD_ID
in_zone_pickups = zone_pickups['polygon_index'] == NEIGHBORHOOD_ID

# Point ids of the addresses / pickups inside the zone, used positionally below.
zone_address_point_ids = zone_addresses['point_index'][in_zone_addresses]
zone_pickup_point_ids = zone_pickups['point_index'][in_zone_pickups]
addresses_count = len(zone_address_point_ids)
pickups_count = len(zone_pickup_point_ids)

# addresses tiled: a0..a(N-1), a0..a(N-1), ... (P times)
addresses_tiled = NAD.iloc[
    zone_address_point_ids
].tile(pickups_count)

# pickups repeated: p0 x N, p1 x N, ...
# The name `pickups_tiled` is kept because later cells refer to it.
pickups_tiled = taxi2015[[
    'pickup_longitude',
    'pickup_latitude'
]].iloc[
    zone_pickup_point_ids
].repeat(addresses_count)

# Turn both row-aligned frames into point GeoSeries.
pickup_points = cuspatial.GeoSeries.from_points_xy(
    cudf.DataFrame({
        'x': pickups_tiled['pickup_longitude'],
        'y': pickups_tiled['pickup_latitude']
    }).interleave_columns()
)
address_points = cuspatial.GeoSeries.from_points_xy(
    cudf.DataFrame({
        'x': addresses_tiled['Longitude'],
        'y': addresses_tiled['Latitude']
    }).interleave_columns()
)
# Number of (address, pickup) pairs that will be measured.
len(address_points)

<center><img src="https://www.dropbox.com/s/30rntm6p67mw96c/pairwise_point_distance.png?dl=1" width=550></center>

In [None]:
%%time
# For every address find its closest pickup, and for every pickup its closest address.

# One haversine distance per row-aligned (pickup, address) pair.
haversines = cuspatial.haversine_distance(pickup_points, address_points)

# Label each distance with the index value of the address and of the pickup
# it was computed from.
pair_columns = {
    'address': addresses_tiled.index,
    'pickup': pickups_tiled.index,
    'distance': haversines
}
gb_df = cudf.DataFrame(pair_columns)

# Per address: the gb_df row holding the smallest distance.
address_indices_of_nearest = gb_df[['address', 'distance']].groupby('address').idxmin()
address_nearest_pickups = gb_df.loc[address_indices_of_nearest['distance']]

# Per pickup: the gb_df row holding the smallest distance.
pickup_indices_of_nearest = gb_df[['pickup', 'distance']].groupby('pickup').idxmin()
pickups_nearest_address = gb_df.loc[pickup_indices_of_nearest['distance']]

# We're almost there

### We have the index of the addresses and their pickups

In [None]:
# Original data nearest pickups and addresses

# The 'pickup' and 'address' columns hold index values carried over from the
# tiled taxi2015 and NAD frames; use them to fetch the original rows.
matched_pickup_ids = pickups_nearest_address['pickup']
matched_address_ids = pickups_nearest_address['address']

nearest_pickups = taxi2015.iloc[matched_pickup_ids]
nearest_addresses_lonlat = NAD.loc[matched_address_ids]

In [None]:
%%time
# Concatenate address fields

def build_address_string(NAD_Street):
    """Join NAD address components into one street-address string per row.

    Parameters
    ----------
    NAD_Street : DataFrame
        Must provide the columns ``Add_Number``, ``StN_PreDir``,
        ``StreetName`` and ``StN_PosTyp``. The frame is only read; it is not
        modified.

    Returns
    -------
    Series
        Strings of the form ``"<number> <pre-direction> <street> <type>"``.
        Missing ``StN_PreDir`` / ``StN_PosTyp`` values are treated as empty, and
        the result contains no repeated, leading or trailing spaces.
    """
    # Fill the optional components into new Series instead of writing them back
    # into the caller's DataFrame.
    pre_direction = NAD_Street['StN_PreDir'].fillna('')
    street_type = NAD_Street['StN_PosTyp'].fillna('')

    street_names = NAD_Street['Add_Number'].astype('str').str.cat(
        pre_direction, sep=' '
    ).str.cat(
        NAD_Street['StreetName'], sep=' '
    ).str.cat(
        street_type, sep=' '
    )
    # An empty component leaves two or more adjacent separators (or one at the
    # end); collapse every run of spaces and trim the ends.
    return street_names.str.replace(' +', ' ', regex=True).str.strip()

# Look up the street components of the address matched to each pickup, then
# format them as one address string per pickup.
nearest_address_labels = pickups_nearest_address['address']
nearest_addresses_street_name = NAD_Street.loc[nearest_address_labels]
street_names = build_address_string(nearest_addresses_street_name)

# Last Step

In [None]:
%%time
# Attach the street names to the original pickups dataframe

# Move the taxi2015 index into an 'index' column so the new columns can be
# attached purely by row position.
no_index = nearest_pickups.reset_index()
no_index['pickup_address'] = street_names.reset_index(drop=True)
no_index['distance'] = pickups_nearest_address['distance'].reset_index(drop=True)
# Put the saved index back and remove the helper column.
taxi_pickups_with_address = no_index.set_index(no_index['index']).drop('index', axis=1)

# Trip details together with the matched address.
trip_columns = [
    'tpep_pickup_datetime',
    'passenger_count',
    'trip_distance',
    'distance',
    'pickup_longitude',
    'pickup_latitude',
    'fare_amount',
    'tip_amount',
    'pickup_address'
]
display(taxi_pickups_with_address[trip_columns])

# Pickup coordinates and matched address, ordered by distance.
match_columns = [
    'pickup_latitude',
    'pickup_longitude',
    'pickup_address',
    'distance'
]
display(taxi_pickups_with_address[match_columns].sort_values('distance'))

# Use cuXfilter to display these coordinates

In [None]:
import cuxfilter
from bokeh import palettes
from cuxfilter.layouts import feature_and_double_base
import cupy as cp

from pyproj import Proj, Transformer

# Original rows for each address and the pickup found nearest to it.
display_pickups = taxi2015.iloc[address_nearest_pickups['pickup']]
display_addresses = NAD.loc[address_nearest_pickups['address']]

# Stack the pickups on top of the addresses under shared column names.
pickup_lonlat = display_pickups[['pickup_longitude', 'pickup_latitude']].rename(
    columns={
        'pickup_longitude': 'Longitude',
        'pickup_latitude': 'Latitude'
    }
)
address_lonlat = display_addresses[['Longitude', 'Latitude']]
combined_pickups_and_addresses = cudf.concat([pickup_lonlat, address_lonlat], axis=0)

# First half of the rows (the pickups) gets color 1, second half (the
# addresses) gets color 2.
half_length = len(combined_pickups_and_addresses) // 2
combined_pickups_and_addresses['color'] = cp.repeat(cp.array([1, 2]), half_length)

# Project the epsg:4326 coordinates to epsg:3857 for display. The transformer
# is given latitude first, then longitude.
transform_4326_to_3857 = Transformer.from_crs('epsg:4326', 'epsg:3857')
projected_x, projected_y = transform_4326_to_3857.transform(
    combined_pickups_and_addresses['Latitude'].values_host,
    combined_pickups_and_addresses['Longitude'].values_host
)
combined_pickups_and_addresses['location_x'] = projected_x
combined_pickups_and_addresses['location_y'] = projected_y

In [None]:
# Wrap the combined pickup/address points for cuxfilter.
cux_df = cuxfilter.DataFrame.from_dataframe(combined_pickups_and_addresses)

# Scatter plot over the projected coordinates, colored by the mean of the
# 'color' column (1 = pickup rows, 2 = address rows).
# NOTE(review): x_range / y_range are presumably in the same projected
# coordinates as location_x / location_y — confirm.
chart1 = cuxfilter.charts.scatter(
    title="Matched address pickup pairs",
    x='location_x',
    y='location_y',
    color_palette=["Green", "Red"],
    aggregate_col="color", aggregate_fn="mean",
    unselected_alpha=0.0,
    tile_provider="CartoLight", x_range=(-8239910.23,-8229529.24), y_range=(4968481.34,4983152.92),
)
# Build the dashboard that hosts the chart (title spelling corrected).
d = cux_df.dashboard([chart1], theme=cuxfilter.themes.dark, title='NYC TAXI DATASET')

In [None]:
# Display the scatter chart as this cell's output.
chart1.view()