In [97]:
import itertools
import os
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
%matplotlib inline
from geopy.distance import geodesic
from geopy.geocoders import Nominatim
import time

In [98]:
DATA_ROOT = '../data/'
DATA_DIR = os.path.join(DATA_ROOT, 'raw/boston')

In [99]:
readfile_liquor = os.path.join(DATA_DIR, 'liquor-licenses.csv')

In [100]:
df_liquor = pd.read_csv(readfile_liquor)
df_liquor.head(3)

Unnamed: 0,LICENSENO,BUSINESSNAME,DBANAME,COMMENTS,LOCATIONCOMMENTS,ISSDTTM,EXPDTTM,LICSTATUS,LICCAT,LICCATDESC,...,CAPACITY,PRIMAPPLICANT,PHONE,STNO,STNOHI,Address,CITY,STATE,ZIP,Location
0,LB-101572,NEW ENGLAND AQUARIUM CORP.,,Outdoor Patio opening hours are 12:00 P.M. to ...,IN WHOLE OF MAIN EXHIBIT BUILDING IN WHOLE OF...,11/27/2013 12:00:00 AM,12/31/2016 12:00:00 AM,Active,GOPAL,GOP All Alc.,...,2400,NEW ENGLAND AQUARIUM CORP.,,,,Central Wharf,Boston,MA,2110,"(0.0, 0.0)"
1,LB-101576,"SPEAK EASY CONCEPTS, LLC",,PATIO HOURS: 12 NOON - 2:00 A.M. PATIO CONDITI...,ENTRANCE AND EXIT AT 120-124 BOYLSTON PLACE; I...,11/27/2013 12:00:00 AM,12/31/2016 12:00:00 AM,Active,GOPAL,GOP All Alc.,...,520,"SPEAKEASY CONCEPTS, LLC",,25.0,,Boylston PL,Boston,MA,2116,"(42.351899, -71.065829)"
2,LB-101577,"FROST GROUP, LLC",,NONE,In two rooms on third level with main entrance...,11/20/2013 12:00:00 AM,12/31/2016 12:00:00 AM,Active,GOPAL,GOP All Alc.,...,240,"FROST GROUP, LLC",,200.0,,State ST,Boston,MA,2109,"(42.36003, -71.05319)"


In [101]:
#row count
count_row = df_liquor.shape[0]
print("Total Records: ",count_row)

df_liquor['Location'].value_counts().nlargest(10)

#Seeing that 423 of our 1109 records are missing lat/lon coordinates 
#need to transform address into lat/lon for impacted records

Total Records:  1109


(0.0, 0.0)                 423
(42.347296, -71.081457)      5
(42.35927, -71.05643)        3
(42.36629, -71.0622)         3
(42.346219, -71.043099)      3
(42.348545, -71.077314)      2
(42.348075, -71.039263)      2
(42.352166, -71.11768)       2
(42.314637, -71.104489)      2
(42.34716, -71.08251)        2
Name: Location, dtype: int64

In [102]:
#Example of geolocating an address

geolocator = Nominatim(user_agent="predictCrime")
locationTest = geolocator.geocode("Central Wharf Boston, MA")

print(locationTest.address)
print((locationTest.latitude, locationTest.longitude))

Central Wharf (New England Aquarium), Boston HarborWalk, Waterfront, Financial District, Boston, Suffolk County, Massachusetts, MA 02109, United States
(42.35909945, -71.0495668203563)


In [103]:
#Adding a helper column "Address_Complete" to make finding our missing coordinates easier
df_liquor['Address_Complete'] = df_liquor["Address"].str.strip() + ' ' + df_liquor["CITY"].str.strip() + ", " + df_liquor["STATE"].str.strip()
df_liquor.head(3)

Unnamed: 0,LICENSENO,BUSINESSNAME,DBANAME,COMMENTS,LOCATIONCOMMENTS,ISSDTTM,EXPDTTM,LICSTATUS,LICCAT,LICCATDESC,...,PRIMAPPLICANT,PHONE,STNO,STNOHI,Address,CITY,STATE,ZIP,Location,Address_Complete
0,LB-101572,NEW ENGLAND AQUARIUM CORP.,,Outdoor Patio opening hours are 12:00 P.M. to ...,IN WHOLE OF MAIN EXHIBIT BUILDING IN WHOLE OF...,11/27/2013 12:00:00 AM,12/31/2016 12:00:00 AM,Active,GOPAL,GOP All Alc.,...,NEW ENGLAND AQUARIUM CORP.,,,,Central Wharf,Boston,MA,2110,"(0.0, 0.0)","Central Wharf Boston, MA"
1,LB-101576,"SPEAK EASY CONCEPTS, LLC",,PATIO HOURS: 12 NOON - 2:00 A.M. PATIO CONDITI...,ENTRANCE AND EXIT AT 120-124 BOYLSTON PLACE; I...,11/27/2013 12:00:00 AM,12/31/2016 12:00:00 AM,Active,GOPAL,GOP All Alc.,...,"SPEAKEASY CONCEPTS, LLC",,25.0,,Boylston PL,Boston,MA,2116,"(42.351899, -71.065829)","Boylston PL Boston, MA"
2,LB-101577,"FROST GROUP, LLC",,NONE,In two rooms on third level with main entrance...,11/20/2013 12:00:00 AM,12/31/2016 12:00:00 AM,Active,GOPAL,GOP All Alc.,...,"FROST GROUP, LLC",,200.0,,State ST,Boston,MA,2109,"(42.36003, -71.05319)","State ST Boston, MA"


In [104]:
#Testing new address field with geolocator function
#Found the following issues with our geolocator API calls and the current format of our addresses: 
    #1.) Remove "," from "Address" field (only impacts certain Logan Airport)
    #2.) Convert "BL" to "Blvd"
    #3.) Convert "CI" to "Cir"
    #4.) Convert "WH" to "Wharf"
    #5-End.) Specific to each entry, update address to be more specific
    
#Fix # 1
df_liquor['Address_Complete'] = df_liquor['Address_Complete'].str.replace(',', '')

#Fix #2
df_liquor['Address_Complete'] = df_liquor['Address_Complete'].str.replace(' BL ', ' Blvd ')

#Fix #3
df_liquor['Address_Complete'] = df_liquor['Address_Complete'].str.replace(' CI ', ' Cir ')

#Fix #4
df_liquor['Address_Complete'] = df_liquor['Address_Complete'].str.replace(' Wh ', ' Wharf ')

#Fix #4.1
df_liquor['Address_Complete'] = df_liquor['Address_Complete'].str.replace(' Pz ', ' Plaza ')
df_liquor['Address_Complete'] = df_liquor['Address_Complete'].str.replace(' PZ ', ' Plaza ')

#Fix #5
df_liquor['Address_Complete'] = df_liquor['Address_Complete'].str.replace('Allstate RD Dorchester MA', '11A Allstate Rd Boston MA')
df_liquor['Address_Complete'] = df_liquor['Address_Complete'].str.replace('Allstate Rd Dorchester MA', '11A Allstate Rd Boston MA')


#Fix #6
df_liquor['Address_Complete'] = df_liquor['Address_Complete'].str.replace('Battery WH Boston MA', '3 Battery Wharf Boston MA')

#Fix #7
df_liquor['Address_Complete'] = df_liquor['Address_Complete'].str.replace('VFW PW Boston MA', '683 Vfw Pkwy Boston MA')
df_liquor['Address_Complete'] = df_liquor['Address_Complete'].str.replace('VFW PW West Roxbury MA', '1430 VFW Pkwy West Roxbury MA')
df_liquor['Address_Complete'] = df_liquor['Address_Complete'].str.replace('VFW Pw West Roxbury MA', '1430 VFW Pkwy West Roxbury MA')



#Fix #8
df_liquor['Address_Complete'] = df_liquor['Address_Complete'].str.replace('Merchants Ro Boston MA', 'Merchants Row Boston MA')
df_liquor['Address_Complete'] = df_liquor['Address_Complete'].str.replace('Merchants RO Boston MA', 'Merchants Row Boston MA')

#Fix #9
df_liquor['Address_Complete'] = df_liquor['Address_Complete'].str.replace('International PL Boston MA', 'One International Place Boston MA')

#Fix #10
df_liquor['Address_Complete'] = df_liquor['Address_Complete'].str.replace('Norfolk AV Roxbury MA', '257 Norfolk Avenue Boston MA')

#Fix #11
df_liquor['Address_Complete'] = df_liquor['Address_Complete'].str.replace('W Howell ST Dorchester MA', '33 W Howell ST Boston MA')

#Fix #12
df_liquor['Address_Complete'] = df_liquor['Address_Complete'].str.replace('Logan Airport F-2 Terminal A East Boston MA', 'Logan Airport Terminal A East Boston MA')

#Fix #13
df_liquor['Address_Complete'] = df_liquor['Address_Complete'].str.replace('Mechanic CT Boston MA', '44 Mechanic Street Boston MA')

#Fix #14
df_liquor['Address_Complete'] = df_liquor['Address_Complete'].str.replace('William F McClellan Hw East Boston MA', 'Courtyard by Marriott Boston Logan Airport Boston MA')

#Fix #15
df_liquor['Address_Complete'] = df_liquor['Address_Complete'].str.replace('Logan Airport 200 Terminal B East Boston MA', 'Logan Airport Terminal B East Boston MA')

#Fix #16
df_liquor['Address_Complete'] = df_liquor['Address_Complete'].str.replace('Block C Seaport SQ Boston MA', '60 Seaport Blvd #315 Boston MA')

#Fix #17
df_liquor['Address_Complete'] = df_liquor['Address_Complete'].str.replace('W Dedham St Roxbury MA', '57 W Dedham St Boston MA')

#Fix #18
df_liquor['Address_Complete'] = df_liquor['Address_Complete'].str.replace('Dry Dock Av Boston MA', 'Dry Dock Av, Boston MA')

#Fix #19
df_liquor['Address_Complete'] = df_liquor['Address_Complete'].str.replace('W Broadway ST South Boston MA', '80 W Broadway Boston MA')

In [127]:
#Due to API limitations, 50-75 calls appear to be my current max. We're doing 45 at a time to be safe with a 45 second
#wait time between calls. Even with the wait time I would not recommend rushing through these cells. 
#Output data file and plots are available separately so running this notebook is not necessary. 
#See /processed/liquor_processed.csv for final .csv output

newCoordinates = []
addresses = []
for val in df_liquor['Address_Complete'][0:45]: 
    geo = geolocator.geocode(val)
    addresses.append(val)
    newCoordinates.append((geo.latitude,geo.longitude))
    #print(val,(geo.latitude, geo.longitude))
    
print(len(newCoordinates))
#print(addresses)

45


In [128]:
time.sleep(45)#Need to briefly wait to avoid timeout
for val in df_liquor['Address_Complete'][45:90]:
    geo = geolocator.geocode(val)
    addresses.append(val)
    newCoordinates.append((geo.latitude,geo.longitude))
    #print(val,(geo.latitude, geo.longitude))
print(len(newCoordinates))

90


In [129]:
time.sleep(45)
for val in df_liquor['Address_Complete'][90:135]:
    geo = geolocator.geocode(val)
    addresses.append(val)
    newCoordinates.append((geo.latitude,geo.longitude))
    #print(val,(geo.latitude, geo.longitude)) 
print(len(newCoordinates))

135


In [130]:
time.sleep(45)
for val in df_liquor['Address_Complete'][135:180]:
    geo = geolocator.geocode(val)
    addresses.append(val)
    newCoordinates.append((geo.latitude,geo.longitude))
    #print(val,(geo.latitude, geo.longitude)) 
print(len(newCoordinates))

180


In [131]:
time.sleep(45)
for val in df_liquor['Address_Complete'][180:225]:
    geo = geolocator.geocode(val)
    addresses.append(val)
    newCoordinates.append((geo.latitude,geo.longitude))
    #print(val,(geo.latitude, geo.longitude))     
    
print(len(newCoordinates))

225


In [132]:
time.sleep(45)
for val in df_liquor['Address_Complete'][225:270]:
    geo = geolocator.geocode(val)
    addresses.append(val)
    newCoordinates.append((geo.latitude,geo.longitude))
    #print(val,(geo.latitude, geo.longitude))

print(len(newCoordinates))

270


In [133]:
time.sleep(45)
for val in df_liquor['Address_Complete'][270:315]:
    geo = geolocator.geocode(val)
    addresses.append(val)
    newCoordinates.append((geo.latitude,geo.longitude))
    #print(val,(geo.latitude, geo.longitude)) 

print(len(newCoordinates))

315


In [134]:
time.sleep(45)
for val in df_liquor['Address_Complete'][315:360]:
    geo = geolocator.geocode(val)
    addresses.append(val)
    newCoordinates.append((geo.latitude,geo.longitude))
    #print(val,(geo.latitude, geo.longitude)) 

print(len(newCoordinates))

360


In [135]:
time.sleep(45)
for val in df_liquor['Address_Complete'][360:405]:
    geo = geolocator.geocode(val)
    addresses.append(val)
    newCoordinates.append((geo.latitude,geo.longitude))
    #print(val,(geo.latitude, geo.longitude)) 

print(len(newCoordinates))

405


In [136]:
time.sleep(45)
for val in df_liquor['Address_Complete'][405:450]:
    geo = geolocator.geocode(val)
    addresses.append(val)
    newCoordinates.append((geo.latitude,geo.longitude))
    #print(val,(geo.latitude, geo.longitude)) 
    
print(len(newCoordinates))

450


In [137]:
time.sleep(45)
for val in df_liquor['Address_Complete'][450:495]:
    geo = geolocator.geocode(val)
    addresses.append(val)
    newCoordinates.append((geo.latitude,geo.longitude))
    #print(val,(geo.latitude, geo.longitude)) 

print(len(newCoordinates))

495


In [138]:
time.sleep(45)
for val in df_liquor['Address_Complete'][495:540]:
    geo = geolocator.geocode(val)
    addresses.append(val)
    newCoordinates.append((geo.latitude,geo.longitude))
    #print(val,(geo.latitude, geo.longitude))

print(len(newCoordinates))

540


In [141]:
time.sleep(45)
for val in df_liquor['Address_Complete'][540:585]:
    geo = geolocator.geocode(val)
    addresses.append(val)
    newCoordinates.append((geo.latitude,geo.longitude))
    #print(val,(geo.latitude, geo.longitude))

print(len(newCoordinates))

585


In [142]:
time.sleep(45)
for val in df_liquor['Address_Complete'][585:630]:
    geo = geolocator.geocode(val)
    addresses.append(val)
    newCoordinates.append((geo.latitude,geo.longitude))
    #print(val,(geo.latitude, geo.longitude))

print(len(newCoordinates))

630


In [143]:
time.sleep(45)
for val in df_liquor['Address_Complete'][630:675]:
    geo = geolocator.geocode(val)
    addresses.append(val)
    newCoordinates.append((geo.latitude,geo.longitude))
    #print(val,(geo.latitude, geo.longitude))

print(len(newCoordinates))

675


In [146]:
time.sleep(45)
for val in df_liquor['Address_Complete'][675:720]:
    geo = geolocator.geocode(val)
    addresses.append(val)
    newCoordinates.append((geo.latitude,geo.longitude))
    #print(val,(geo.latitude, geo.longitude))

print(len(newCoordinates))

720


In [147]:
time.sleep(45)
for val in df_liquor['Address_Complete'][720:765]:
    geo = geolocator.geocode(val)
    addresses.append(val)
    newCoordinates.append((geo.latitude,geo.longitude))
    #print(val,(geo.latitude, geo.longitude))

print(len(newCoordinates))

765


In [150]:
time.sleep(45)
for val in df_liquor['Address_Complete'][765:810]:
    geo = geolocator.geocode(val)
    addresses.append(val)
    newCoordinates.append((geo.latitude,geo.longitude))
    #print(val,(geo.latitude, geo.longitude))

print(len(newCoordinates))

810


In [151]:
time.sleep(45)
for val in df_liquor['Address_Complete'][810:855]:
    geo = geolocator.geocode(val)
    addresses.append(val)
    newCoordinates.append((geo.latitude,geo.longitude))
    #print(val,(geo.latitude, geo.longitude))

print(len(newCoordinates))

855


In [152]:
time.sleep(45)
for val in df_liquor['Address_Complete'][855:900]:
    geo = geolocator.geocode(val)
    addresses.append(val)
    newCoordinates.append((geo.latitude,geo.longitude))
    #print(val,(geo.latitude, geo.longitude))

print(len(newCoordinates))

900


In [153]:
time.sleep(45)
for val in df_liquor['Address_Complete'][900:945]:
    geo = geolocator.geocode(val)
    addresses.append(val)
    newCoordinates.append((geo.latitude,geo.longitude))
    #print(val,(geo.latitude, geo.longitude))

print(len(newCoordinates))

945


In [154]:
time.sleep(45)
for val in df_liquor['Address_Complete'][945:990]:
    geo = geolocator.geocode(val)
    addresses.append(val)
    newCoordinates.append((geo.latitude,geo.longitude))
    #print(val,(geo.latitude, geo.longitude))

print(len(newCoordinates))

990


In [155]:
time.sleep(45)
for val in df_liquor['Address_Complete'][990:1035]:
    geo = geolocator.geocode(val)
    addresses.append(val)
    newCoordinates.append((geo.latitude,geo.longitude))
    #print(val,(geo.latitude, geo.longitude))

print(len(newCoordinates))

In [158]:
time.sleep(45)
for val in df_liquor['Address_Complete'][1035:1080]:
    geo = geolocator.geocode(val)
    addresses.append(val)
    newCoordinates.append((geo.latitude,geo.longitude))
    #print(val,(geo.latitude, geo.longitude))
print(len(newCoordinates))

1080


In [159]:
time.sleep(45)
for val in df_liquor['Address_Complete'][1080:1125]:
    geo = geolocator.geocode(val)
    addresses.append(val)
    newCoordinates.append((geo.latitude,geo.longitude))
    #print(val,(geo.latitude, geo.longitude))

print(len(newCoordinates))

1109


In [437]:
#=============Testing Problematic Addresses=========================
#locationTest3 = geolocator.geocode("80 W Broadway Boston MA")
#print(locationTest3.address)

#df_liquor['Address_Complete'][0:10]
#df_liquor['Address_Complete'][100:150]

In [160]:
#Making sure our new lists match in length
print(len(addresses),len(newCoordinates))

1109 1109


In [161]:
#Adding in "Location_Complete" for a more robust lat/lon column
df_liquor['Location_Complete'] = newCoordinates
df_liquor.head(3)
#df_liquor.tail(3)

Unnamed: 0,LICENSENO,BUSINESSNAME,DBANAME,COMMENTS,LOCATIONCOMMENTS,ISSDTTM,EXPDTTM,LICSTATUS,LICCAT,LICCATDESC,...,PHONE,STNO,STNOHI,Address,CITY,STATE,ZIP,Location,Address_Complete,Location_Complete
0,LB-101572,NEW ENGLAND AQUARIUM CORP.,,Outdoor Patio opening hours are 12:00 P.M. to ...,IN WHOLE OF MAIN EXHIBIT BUILDING IN WHOLE OF...,11/27/2013 12:00:00 AM,12/31/2016 12:00:00 AM,Active,GOPAL,GOP All Alc.,...,,,,Central Wharf,Boston,MA,2110,"(0.0, 0.0)",Central Wharf Boston MA,"(42.35909945, -71.0495668203563)"
1,LB-101576,"SPEAK EASY CONCEPTS, LLC",,PATIO HOURS: 12 NOON - 2:00 A.M. PATIO CONDITI...,ENTRANCE AND EXIT AT 120-124 BOYLSTON PLACE; I...,11/27/2013 12:00:00 AM,12/31/2016 12:00:00 AM,Active,GOPAL,GOP All Alc.,...,,25.0,,Boylston PL,Boston,MA,2116,"(42.351899, -71.065829)",Boylston PL Boston MA,"(42.316157, -71.102592)"
2,LB-101577,"FROST GROUP, LLC",,NONE,In two rooms on third level with main entrance...,11/20/2013 12:00:00 AM,12/31/2016 12:00:00 AM,Active,GOPAL,GOP All Alc.,...,,200.0,,State ST,Boston,MA,2109,"(42.36003, -71.05319)",State ST Boston MA,"(42.3588444, -71.0572152)"


In [163]:
#Creating processed dataframe including only 'potentially' valuable data points
df_liquor_processed = df_liquor[['LICENSENO','LICSTATUS','LICCAT','LICCATDESC','Address_Complete','Location_Complete']]

In [164]:
df_liquor_processed.head(3) #Inspect for missing data

Unnamed: 0,LICENSENO,LICSTATUS,LICCAT,LICCATDESC,Address_Complete,Location_Complete
0,LB-101572,Active,GOPAL,GOP All Alc.,Central Wharf Boston MA,"(42.35909945, -71.0495668203563)"
1,LB-101576,Active,GOPAL,GOP All Alc.,Boylston PL Boston MA,"(42.316157, -71.102592)"
2,LB-101577,Active,GOPAL,GOP All Alc.,State ST Boston MA,"(42.3588444, -71.0572152)"


In [165]:
len(df_liquor_processed) #Should match 1109 records.

1109

In [167]:
#Saving processed dataframe to file
df_liquor_processed.to_csv('../data/processed/liquor_processed.csv', sep=',')