In [1]:
import glob
import json
import os

import numpy as np
import pandas as pd
import reverse_geocoder as rg
import shapely.wkt
from shapely import geometry
from shapely.geometry import shape, Point
from shapely.prepared import prep
from tqdm import tqdm_notebook as tqdm

# Silence pandas' chained-assignment check (SettingWithCopyWarning) for the whole notebook
pd.options.mode.chained_assignment = None

In [2]:
# Build the paths for every file that will be used in the notebook.
# os.path.join is used instead of hand-written '\' separators: literals such as
# '\Taxi_Data' contain invalid escape sequences ('\T', '\C', '\R', ...), which
# Python warns about and will eventually reject, and they only work on Windows.
org_path = os.getcwd()

# The trailing '' keeps a trailing path separator, because file names are
# appended to taxi_path / save_path directly with `+` elsewhere in the notebook.
taxi_path = os.path.join(org_path, 'Taxi_Data', '')

borough_outlines_path = os.path.join(taxi_path, 'Borough_Information')
custom_polygons_path = os.path.join(borough_outlines_path, 'Custom_Polygons')
rough_outlines_path = os.path.join(custom_polygons_path, 'Rough_Outlines')
airport_path = os.path.join(custom_polygons_path, 'Airports')
manhattan_path = os.path.join(custom_polygons_path, 'Manhattan_Areas')

save_path = os.path.join(org_path, 'Notebook_Data', '')

# Get a list of the coordinate files located in every folder.
# glob returns files in an arbitrary, OS-dependent order; sorting makes the
# polygon order (and therefore which polygon wins when outlines overlap)
# reproducible between runs and machines.
rough_outlines_files = sorted(glob.glob(os.path.join(rough_outlines_path, '*.csv')))
airport_files = sorted(glob.glob(os.path.join(airport_path, '*.csv')))
manhattan_files = sorted(glob.glob(os.path.join(manhattan_path, '*.csv')))

# Read in the data; 'round_trip' parses floats without losing precision,
# which matters for the longitude/latitude coordinates
df = pd.read_csv(os.path.join(taxi_path, 'train.csv'), float_precision='round_trip')

In [3]:
# Extract pickup and dropoff locations and align in a single dataframe
locations_pickup = df[['id', 'pickup_longitude', 'pickup_latitude']].copy()
locations_pickup['is_pickup'] = 1

locations_dropoff = df[['id', 'dropoff_longitude', 'dropoff_latitude']].copy()
locations_dropoff['is_pickup'] = 0

locations_pickup.columns = ['id', 'longitude', 'latitude', 'is_pickup']
locations_dropoff.columns = ['id', 'longitude', 'latitude', 'is_pickup']

locations =  locations_pickup.append(locations_dropoff, ignore_index=True)

In [4]:
%%time
# Load the JSON file containing the borough boundaries
# (os.path.join avoids the invalid '\B' escape sequence of a hand-written path)
with open(os.path.join(borough_outlines_path, 'Borough Boundaries.json')) as f:
    js = json.load(f)

polygon_list = list()

# Loop through the JSON rows to extract the important parts
for feature in js['data']:
    # Element 8 holds the boundary geometry as WKT and element 10 the name of
    # the borough corresponding to that polygon
    polygon = shapely.wkt.loads(feature[8])
    polygon_list.append([polygon, feature[10]])

# Every boundary is tested against every location, so prepare each geometry
# once: a prepared geometry returns the same `contains` result as the raw one
# but is built for repeated predicate calls on the same shape.
prepared_polygon_list = [(prep(polygon), name) for polygon, name in polygon_list]

setlist = list()
borough_list = list()

# Loop over every location coordinate; `total` lets the progress bar show
# real progress instead of an indeterminate counter
for row in tqdm(locations.itertuples(), total=len(locations)):
    # Collect (latitude, longitude) tuples that will be used later in the notebook
    setlist.append((row.latitude, row.longitude))

    # Get a single geometric point from the longitude and latitude coordinates
    point = Point(row.longitude, row.latitude)

    # Check each polygon to see if it contains the point
    for prepared_polygon, borough_name in prepared_polygon_list:
        if prepared_polygon.contains(point):
            # Input borough name into the list of boroughs
            borough_list.append(borough_name)
            break
    else:
        # No polygon contained the point: fill with NaN so the list keeps
        # exactly one entry per location row
        borough_list.append(np.nan)
        

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Wall time: 5h 7min 47s


In [5]:
# Run the reverse geocoder on every (latitude, longitude) tuple collected in
# setlist to get detailed location information for each coordinate
geocode_info = rg.search(setlist)

Loading formatted geocoded file...


In [6]:
# Convert the reverse geocoder results into a dataframe (one row per coordinate)
geocodedf = pd.DataFrame(geocode_info)

In [7]:
# Combine column-wise (axis=1) with the locations dataframe so every geocoder
# result sits next to the id and coordinates it was computed from; this will
# enable remapping of the IDs later
geocode_df = pd.concat([locations, geocodedf], axis=1, sort=False)

In [8]:
# Attach the borough list (one entry per location row, NaN where no borough
# polygon contained the point) to the new geocode dataframe
geocode_df['borough'] = borough_list

To clarify: the reverse geocoder also returns borough information. However, it relies on centroid clustering, which turned out to be extremely inaccurate because of all the straight borough boundaries. This is why the polygon method was used instead.

The next few cells rely on manually created coordinate files that segment New York City in more detail. These include rough outlines around the boroughs to pick up stray data points; outlines around the major airports serving New York (John F. Kennedy, LaGuardia and Newark Liberty); and a more precise segmentation of Manhattan into Lower, Midtown and Upper Manhattan.

In [9]:
# Define a function that creates polygons for files that include coordinate data
def create_polygons(file):
    """Build a shapely polygon from a semicolon-delimited coordinate file.

    Parameters
    ----------
    file : str
        Path to a CSV file (';' delimited) with `longitude` and `latitude`
        columns; the rows are used, in file order, as the polygon's vertices.

    Returns
    -------
    list
        ``[polygon, file_name]`` where ``polygon`` is a
        ``shapely.geometry.Polygon`` and ``file_name`` is the file's base name
        without its extension (used as the label of the polygon).
    """
    # Derive the label from the file name. Backslashes are normalised first so
    # Windows-style paths are handled on every platform, and splitext removes
    # the extension whatever its length (the old `[:-4]` assumed 4 characters).
    base_name = os.path.basename(file.replace('\\', '/'))
    file_name = os.path.splitext(base_name)[0]

    # Load in the file in question
    temp_load = pd.read_csv(file, delimiter=';')

    # Combine the (longitude, latitude) pairs directly into a polygon; building
    # intermediate Point objects only to read x/y back adds nothing
    vertices = list(zip(temp_load['longitude'], temp_load['latitude']))
    polygon = geometry.Polygon(vertices)

    return [polygon, file_name]

In [10]:
# Build one [polygon, name] entry per coordinate file, for each family of
# hand-drawn outlines.

# Rough outlines around the boroughs
rough_outlines_polygon_list = [create_polygons(path) for path in rough_outlines_files]

# Outlines around the airports
airport_polygon_list = [create_polygons(path) for path in airport_files]

# Outlines of the Manhattan segments
manhattan_polygon_list = [create_polygons(path) for path in manhattan_files]

In [11]:
# Define a function that checks if data is included in a list of polygons
def find_polygon(data, polygon_list, column_name, target=None):
    """Label rows with the name of the first polygon that contains them.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to classify; must have `longitude` and `latitude` columns. Its
        index labels are used to address the rows of the frame being written.
    polygon_list : list
        Entries of the form ``[polygon, name]``. Polygons are tested in list
        order and the first one containing the point wins.
    column_name : str
        Column that receives the matching polygon's name.
    target : pandas.DataFrame, optional
        Frame to write the labels into. Defaults to the notebook-level
        ``geocode_df``, which is what the function always wrote to before
        this parameter existed.

    Returns
    -------
    None
        The target frame is modified in place; rows that fall in no polygon
        are left untouched.
    """
    if target is None:
        target = geocode_df

    # `total` lets the progress bar show real progress for the subset
    for row in tqdm(data.itertuples(), total=len(data)):
        # Get a single geometric point from the longitude and latitude coordinates
        point = Point(row.longitude, row.latitude)

        # Check each polygon to see if it contains the point
        for entry in polygon_list:
            if entry[0].contains(point):
                target.at[row.Index, column_name] = entry[1]
                break

The original borough outlines were very precise, which caused locations such as bridges and docks to be missed by the original classification. To fix this, the rough outlines are used to identify which borough the NaN values most likely belong to.

In [12]:
# Select the rows whose borough could not be determined (still NaN)
nan_df = geocode_df.loc[geocode_df['borough'].isnull()]

# Try to place those rows using the rough borough outlines
find_polygon(data=nan_df,
             polygon_list=rough_outlines_polygon_list,
             column_name='borough')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [13]:
# Start with every row marked as not being at an airport
geocode_df['airport'] = 'No Airport'

# No airport lies in Manhattan, so only rows outside Manhattan need checking;
# this keeps the subset that is looped over smaller
poss_airport_df = geocode_df.loc[geocode_df['borough'].ne('Manhattan')]

# Mark the data points that fall inside one of the major airport outlines
find_polygon(data=poss_airport_df,
             polygon_list=airport_polygon_list,
             column_name='airport')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [14]:
# The detailed segmentation starts out as a copy of the borough column
geocode_df['detailed_borough'] = geocode_df['borough']

# Only Manhattan data points are refined in this run
manhattan_df = geocode_df.loc[geocode_df['borough'].eq('Manhattan')]

# Assign each of them to its Manhattan segment (lower, mid or upper)
find_polygon(data=manhattan_df,
             polygon_list=manhattan_polygon_list,
             column_name='detailed_borough')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [15]:
# Preview the first rows of the combined dataframe
geocode_df.head()

Unnamed: 0,id,longitude,latitude,is_pickup,lat,lon,name,admin1,admin2,cc,borough,airport,detailed_borough
0,id2875421,-73.982155,40.767937,1,40.78343,-73.96625,Manhattan,New York,New York County,US,Manhattan,No Airport,Midtown Manhattan
1,id2377394,-73.980415,40.738564,1,40.74482,-73.94875,Long Island City,New York,Queens County,US,Manhattan,No Airport,Midtown Manhattan
2,id3858529,-73.979027,40.763939,1,40.78343,-73.96625,Manhattan,New York,New York County,US,Manhattan,No Airport,Midtown Manhattan
3,id3504673,-74.01004,40.719971,1,40.71427,-74.00597,New York City,New York,,US,Manhattan,No Airport,Lower Manhattan
4,id2181028,-73.973053,40.793209,1,40.78343,-73.96625,Manhattan,New York,New York County,US,Manhattan,No Airport,Upper Manhattan


In [16]:
# Fill NaN values in the detailed borough column to indicate that the points
# are located well outside New York City.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form raises a FutureWarning in
# recent pandas and has no effect on the dataframe under Copy-on-Write.
geocode_df['detailed_borough'] = geocode_df['detailed_borough'].fillna('Well Outside NYC')

In [17]:
# Replace the 'No Airport' placeholder with NaN.
# Only the airport column is touched and only on an exact match: the previous
# whole-frame regex replace scanned every string column and would have nulled
# any cell that merely contained the text 'No Airport'.
geocode_df = geocode_df.assign(
    airport=geocode_df['airport'].mask(geocode_df['airport'] == 'No Airport')
)

In [18]:
# Save the newly created dataframe.
# Make sure the output folder exists first, so hours of computation are not
# lost to a missing directory at the very last step.
os.makedirs(save_path, exist_ok=True)
geocode_df.to_csv(os.path.join(save_path, 'geocode_info_mapping_with_pol.csv'), index=False)