# Final Project - Analyze Boston Data

## Boston data is available from the data.boston.gov site. Load the liquor-license data, clean it up, and keep only the columns needed

In [1]:
import csv
import re
import pandas as pd
# Load the liquor-license export and reduce it to one row per ZIP code with
# the licence count and a representative coordinate pair.
filename = "liquor-licenses.csv"
bos_liq_df = pd.read_csv(filename)

# Not referenced anywhere else in the visible notebook; kept so that any
# cell relying on the name keeps working.
cols = ['LICENSENO', 'BUSINESSNAME', 'CAPACITY', 'STNO', 'Address', 'CITY', 'STATE', 'ZIP', 'Location']

# Columns that are not needed for the per-ZIP analysis.
unused_columns = [
    'LICENSENO', 'BUSINESSNAME', 'CAPACITY', 'STNO', 'Address', 'STATE',
    'DBANAME', 'COMMENTS', 'LOCATIONCOMMENTS', 'ISSDTTM', 'EXPDTTM',
    'LICSTATUS', 'LICCAT', 'LICCATDESC', 'OPENING', 'CLOSING', 'PATRONSOUT',
    'PRIMAPPLICANT', 'PHONE', 'STNOHI',
]
bos_liq_df = bos_liq_df.drop(columns=unused_columns)

# Give every row of a ZIP code the same (maximum) Location string, then count
# the rows per ZIP. The count must be taken AFTER the line above because it
# counts the already-filled Location values.
bos_liq_df['Location'] = bos_liq_df.groupby('ZIP')['Location'].transform('max')
bos_liq_df['NumOfLics'] = bos_liq_df.groupby(['ZIP'])['Location'].transform('count')

# Keep the first row of each ZIP code. reset_index() deliberately keeps the
# old row labels as an 'index' column, as the original cell did.
bos_liq_df = bos_liq_df.drop_duplicates(subset='ZIP', keep='first').reset_index()

# Restore the leading zero that is lost when ZIP codes are read as numbers.
# zfill(5) pads to five characters instead of blindly prepending '0'.
bos_liq_df['ZIP'] = bos_liq_df['ZIP'].astype(str).str.zfill(5)

# Location is treated as a "(lat, lon)" string: strip the parentheses
# literally (no regex involved) and split on the first comma only, so exactly
# two columns are produced.
coordinates = bos_liq_df['Location'].str.strip('() ').str.split(',', n=1, expand=True)
bos_liq_df['Latitude'] = coordinates[0].astype(float)
bos_liq_df['Longitude'] = coordinates[1].astype(float)
bos_liq_df = bos_liq_df.drop(columns=['Location'])

bos_liq_df

Unnamed: 0,index,CITY,ZIP,NumOfLics,Latitude,Longitude
0,0,Boston,2110,37,42.362516,-71.050719
1,1,Boston,2116,147,42.35283,-71.0716
2,2,Boston,2109,52,42.3654,-71.05142
3,3,East Boston,2128,93,42.39443,-71.00029
4,4,Boston,2114,48,42.366963,-71.05818
5,7,Hyde Park,2136,13,42.25783,-71.12278
6,9,Dorchester,2122,18,42.30591,-71.0586
7,10,Roxbury,2119,13,42.33006,-71.08405
8,14,Boston,2115,52,42.350586,-71.08976
9,17,Boston,2215,67,42.352561,-71.118483


In [2]:
from geopy.geocoders import Nominatim

# Look up the coordinates of the city centre; they are used to centre the maps.
address = 'Boston, MA'

geolocator = Nominatim(user_agent="boston_explorer")
location = geolocator.geocode(address)
# geocode() yields None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Boston are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Boston are 42.3602534, -71.0582912.


## Show it on the map for visualization

In [5]:
import folium
# Create a map of Boston centred on the geocoded latitude and longitude.
map_boston = folium.Map(location=[latitude, longitude], zoom_start=12)

# Add one circle marker per ZIP code; the popup shows "<city>, <zip>".
for lat, lng, zipcode, city in zip(bos_liq_df['Latitude'], bos_liq_df['Longitude'], bos_liq_df['ZIP'], bos_liq_df['CITY']):
    label = '{}, {}'.format(city, zipcode)
    # parse_html belongs to the Popup; it is not passed to the marker.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_boston)

map_boston

## Use Foursquare API to get the top venues

In [8]:
import requests
import os

# Foursquare credentials are taken from the environment so that secrets do not
# have to be typed into the notebook. When the variables are unset the values
# fall back to the empty strings used before.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
VERSION = '20180605'  # Foursquare API version
LIMIT = 100  # value sent as the `limit` query parameter of each request

In [9]:
# function that extracts the category of the venue
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category of a venue record.

    Parameters
    ----------
    row : mapping-like
        Object indexable by key. The category list is read from
        ``row['categories']``; if that key is missing, from
        ``row['venue.categories']``.

    Returns
    -------
    The ``'name'`` entry of the first category, or ``None`` when the
    category list is empty.

    Raises
    ------
    KeyError
        If neither key is present. Other errors raised while reading
        ``row['categories']`` are no longer swallowed.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback to the flattened name.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']
    
def getNearbyVenues(names, latitudes, longitudes, radius=500, timeout=10):
    """Query the Foursquare ``venues/explore`` endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel iterables; one request is made per (name, lat, lng) triple.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.
    timeout : float, default 10
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns ``ZipCode``,
        ``ZipCode Latitude``, ``ZipCode Longitude``, ``Venue``,
        ``Venue Latitude``, ``Venue Longitude`` and ``Venue Category``.
        ``Venue Category`` is ``None`` for a venue without categories. An
        empty frame with these columns is returned when nothing was found.

    Raises
    ------
    requests.HTTPError
        If a request returns an HTTP error status.
    """
    columns = ['ZipCode',
               'ZipCode Latitude',
               'ZipCode Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and encode the query string.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=timeout,
        )
        # Surface HTTP errors here rather than as a KeyError on the payload.
        response.raise_for_status()
        items = response.json()['response']['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may carry no category at all
                categories[0]['name'] if categories else None,
            ))

    # Passing the column names to the constructor also works for zero rows.
    return pd.DataFrame(rows, columns=columns)

In [12]:
# Label each ZIP code as "<zip>,<city>" and fetch the venues around its coordinates.
zip_city_labels = bos_liq_df['ZIP'] + ',' + bos_liq_df['CITY']
boston_venues = getNearbyVenues(
    names=zip_city_labels,
    latitudes=bos_liq_df['Latitude'],
    longitudes=bos_liq_df['Longitude'],
)

In [13]:
# Size of the venue table, followed by the number of venues returned per ZIP code.
print(boston_venues.shape)
venues_per_zip = boston_venues.groupby('ZipCode').count()
venues_per_zip

(1472, 7)


Unnamed: 0_level_0,ZipCode Latitude,ZipCode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
ZipCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"02108,Boston",100,100,100,100,100,100
"02109,Boston",97,97,97,97,97,97
"02110,Boston",100,100,100,100,100,100
"02111,Boston",100,100,100,100,100,100
"02113,Boston",100,100,100,100,100,100
"02114,Boston",100,100,100,100,100,100
"02115,Boston",83,83,83,83,83,83
"02116,Boston",100,100,100,100,100,100
"02118,Roxbury",50,50,50,50,50,50
"02119,Roxbury",18,18,18,18,18,18


In [14]:
# One-hot encode the venue categories (one indicator column per category).
category_dummies = pd.get_dummies(boston_venues[['Venue Category']], prefix="", prefix_sep="")

# Attach the ZIP-code label of every venue as the last column ...
category_dummies['ZipCode'] = boston_venues['ZipCode']

# ... and then move that last column to the front.
last_column = category_dummies.columns[-1:].tolist()
other_columns = category_dummies.columns[:-1].tolist()
boston_one = category_dummies[last_column + other_columns]

boston_one.head()

Unnamed: 0,ZipCode,Accessories Store,Afghan Restaurant,African Restaurant,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,...,Tunnel,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"02110,Boston",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"02110,Boston",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"02110,Boston",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"02110,Boston",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"02110,Boston",0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Average the indicator columns per ZIP code, i.e. the share of each venue
# category among the venues returned for that ZIP code.
category_share_by_zip = boston_one.groupby('ZipCode').mean()
boston_grouped = category_share_by_zip.reset_index()
boston_grouped.head()

Unnamed: 0,ZipCode,Accessories Store,Afghan Restaurant,African Restaurant,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,...,Tunnel,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"02108,Boston",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.01
1,"02109,Boston",0.0,0.0,0.0,0.010309,0.0,0.0,0.0,0.0,0.0,...,0.010309,0.0,0.0,0.0,0.0,0.0,0.030928,0.0,0.0,0.010309
2,"02110,Boston",0.0,0.0,0.0,0.03,0.0,0.03,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01
3,"02111,Boston",0.0,0.0,0.0,0.03,0.0,0.03,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01
4,"02113,Boston",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01


In [16]:
import numpy as np
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    The first element of ``row`` is skipped (it holds the ZIP-code label in
    this notebook); the remaining values are sorted in descending order and
    the index labels of the first ``num_top_venues`` of them are returned as
    an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

# Create the new dataframe and display the top 10 venues for each ZIP code.
num_top_venues = 10

# Ordinal suffixes for the first three ranks; every later rank gets 'th'.
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['ZipCode']
for ind in range(num_top_venues):
    # Explicit bounds check instead of catching every exception.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# create a new dataframe with one row per ZIP code
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['ZipCode'] = boston_grouped['ZipCode']

# fill every row with the names of its most frequent venue categories
for ind in range(boston_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(boston_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.columns


Index(['ZipCode', '1st Most Common Venue', '2nd Most Common Venue',
       '3rd Most Common Venue', '4th Most Common Venue',
       '5th Most Common Venue', '6th Most Common Venue',
       '7th Most Common Venue', '8th Most Common Venue',
       '9th Most Common Venue', '10th Most Common Venue'],
      dtype='object')

In [20]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

# Feature matrix: the per-ZIP category shares without the label column.
# `columns=` is used because the positional axis argument is not accepted by
# current pandas releases.
boston_grouped_clustering = boston_grouped.drop(columns='ZipCode')

# run k-means clustering; n_init is pinned because its default differs
# between scikit-learn versions
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(boston_grouped_clustering)

# Add the clustering labels to a copy of the top-venues table. Any existing
# 'Cluster Labels' column is dropped first, so the cell can be re-run and
# does not depend on an insert executed in an earlier kernel session.
venues_with_clusters = neighborhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
venues_with_clusters.insert(0, 'Cluster Labels', kmeans.labels_)

# Key used by the venue tables ("<zip>,<city>"). As before, the column is
# also added to bos_liq_df itself.
bos_liq_df["CityZip"] = bos_liq_df['ZIP'] + ',' + bos_liq_df['CITY']

# join the cluster label and top venues onto the per-ZIP licence data,
# which carries the latitude/longitude of each ZIP code
boston_merged = bos_liq_df.join(venues_with_clusters.set_index('ZipCode'), on='CityZip')

boston_merged # check the last columns!

Unnamed: 0,index,CITY,ZIP,NumOfLics,Latitude,Longitude,CityZip,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0,Boston,2110,37,42.362516,-71.050719,"02110,Boston",1,Italian Restaurant,Seafood Restaurant,Park,Bakery,Café,Aquarium,Coffee Shop,Pizza Place,American Restaurant,Hotel
1,1,Boston,2116,147,42.35283,-71.0716,"02116,Boston",0,Spa,Hotel,American Restaurant,Gym / Fitness Center,Sandwich Place,Steakhouse,Seafood Restaurant,Jewelry Store,Coffee Shop,Italian Restaurant
2,2,Boston,2109,52,42.3654,-71.05142,"02109,Boston",1,Italian Restaurant,Park,Seafood Restaurant,Bakery,Pizza Place,Café,Wine Shop,Harbor / Marina,Coffee Shop,Grocery Store
3,3,East Boston,2128,93,42.39443,-71.00029,"02128,East Boston",0,Food,Circus,Shoe Store,Chinese Restaurant,Supplement Shop,Athletics & Sports,Health & Beauty Service,Discount Store,Buffet,Food Truck
4,4,Boston,2114,48,42.366963,-71.05818,"02114,Boston",1,Italian Restaurant,Pizza Place,Park,Bakery,Coffee Shop,Hotel,Café,Brewery,Seafood Restaurant,Sandwich Place
5,7,Hyde Park,2136,13,42.25783,-71.12278,"02136,Hyde Park",0,Pizza Place,American Restaurant,Ice Cream Shop,Fried Chicken Joint,Donut Shop,Discount Store,Bar,Gym,Thai Restaurant,Caribbean Restaurant
6,9,Dorchester,2122,18,42.30591,-71.0586,"02122,Dorchester",4,Vietnamese Restaurant,Pizza Place,Grocery Store,Rental Car Location,Dive Bar,New American Restaurant,Metro Station,Park,Furniture / Home Store,Plaza
7,10,Roxbury,2119,13,42.33006,-71.08405,"02119,Roxbury",0,Café,American Restaurant,State / Provincial Park,Diner,Baseball Field,Donut Shop,Grocery Store,Pharmacy,Thrift / Vintage Store,Bike Rental / Bike Share
8,14,Boston,2115,52,42.350586,-71.08976,"02115,Boston",0,Clothing Store,Coffee Shop,Grocery Store,Bookstore,Sushi Restaurant,Accessories Store,Sandwich Place,New American Restaurant,Sporting Goods Shop,Women's Store
9,17,Boston,2215,67,42.352561,-71.118483,"02215,Boston",0,Pizza Place,Coffee Shop,Gym / Fitness Center,Mexican Restaurant,Donut Shop,Yoga Studio,Café,Bike Rental / Bike Share,Massage Studio,Bubble Tea Shop


In [22]:
import matplotlib.cm as cm
import matplotlib.colors as colors
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour each
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Rows without a cluster label cannot be coloured, so they are left off the map.
clustered = boston_merged.dropna(subset=['Cluster Labels'])

# add markers to the map
for lat, lon, poi, cluster in zip(clustered['Latitude'], clustered['Longitude'], clustered['CityZip'], clustered['Cluster Labels']):
    # The label column turns float when the join leaves gaps; cast it back
    # so it can be used as a list index.
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Index the palette with the label itself: cluster 0 takes the first
    # colour rather than wrapping around to the last one.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters