# The Battle of the Neighborhoods - The Final Showdown!

In [27]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (repodata.json): done
Solving environment: failed

PackagesNotFoundError: The following packages are not available from current channels:

  - anaconda/linux-64::grpcio==1.16.1=py36hf8bcb03_1 -> openssl[version='>=1.1.1,<1.1.2.0a0']

Current channels:

  - https://conda.anaconda.org/conda-forge/linux-64
  - https://conda.anaconda.org/conda-forge/noarch
  - https://repo.anaconda.com/pkgs/main/linux-64
  - https://repo.anaconda.com/pkgs/main/noarch
  - https://repo.anaconda.com/pkgs/r/linux-64
  - https://repo.anaconda.com/pkgs/r/noarch

To search for alternate channels that may provide the conda package you're
looking for, navigate to

    https://anaconda.org

and use the search bar at the top of the page.


Collecting package metadata (repodata.json): done
Solving environment: failed

PackagesNotFoundError: The following packages are not available from current channels:

  - anaconda/linux-64::grpcio==1.16.1=py36hf8bcb03_1 -> openssl[version='>=1.1.1,<1.1.2

## Utility Functions

In [28]:
# add markers to map
def map_markers(processed_map, latitude, longitude, bor, neigh):
    for lat, lng, borough, neighborhood in zip(latitude, longitude, bor, neigh):
        label = '{}, {}'.format(neighborhood, borough)
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            color='blue',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            parse_html=False).add_to(processed_map)  

    return processed_map

def map_markers_neigh(processed_map, latitude, longitude, neigh):
    for lat, lng, label in zip(latitude,longitude, neigh):
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            color='blue',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            parse_html=False).add_to(processed_map)  

    return processed_map

# Foursquare API Credentials & Version
CLIENT_ID = 'CFQW5PO1QNCEZDJGARV2O0RJZYB33ND1PVMUAXAI41O1TO5H' 
CLIENT_SECRET = 'WRSPCUWVB3PCBQLMCR2F4T3PWCCI3TUVQOWCNYFJPFBJBAME' 
VERSION = '20180605' 

# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

# function to repeat the same process to all the neighborhoods
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    LIMIT = 100
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

# First City - New York

## Download New York data and format it as a pandas dataframe

In [29]:
# NEW YORK
!wget -q -O 'newyork_data.json' https://cocl.us/new_york_dataset
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

neighborhoods_data = newyork_data['features']
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 
df_newyork = pd.DataFrame(columns=column_names)

for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    df_newyork = df_newyork.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

print ("New York DF Rows: {}".format(df_newyork.shape[0]))
df_newyork.head()

New York DF Rows: 306


Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


## Pick Brooklyn as the sample district, to compare and focus on its neighborhoods

In [30]:
brooklyn_data = df_newyork[df_newyork['Borough'] == 'Brooklyn'].reset_index(drop=True)

geolocator = Nominatim(user_agent="bl_explorer")
location = geolocator.geocode('Brooklyn, NY')
latitude = location.latitude
longitude = location.longitude

map_brooklyn = folium.Map(location=[latitude, longitude], zoom_start=12) # create map of Brooklyn using latitude and longitude values
map_brooklyn = map_markers_neigh(map_brooklyn, brooklyn_data['Latitude'], brooklyn_data['Longitude'], brooklyn_data['Neighborhood']) # add markers to map
map_brooklyn

In [31]:
a_shape = brooklyn_data.shape
print ("Brooklyn has {} neighborhoods".format(a_shape[0], a_shape[1]))

Brooklyn has 70 neighborhoods


## Lets explore neighborhoods in Brooklyn

In [32]:
brooklyn_venues = getNearbyVenues(names=brooklyn_data['Neighborhood'], latitudes=brooklyn_data['Latitude'], longitudes=brooklyn_data['Longitude'])
brooklyn_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Bay Ridge,40.625801,-74.030621,Pilo Arts Day Spa and Salon,40.624748,-74.030591,Spa
1,Bay Ridge,40.625801,-74.030621,Bagel Boy,40.627896,-74.029335,Bagel Shop
2,Bay Ridge,40.625801,-74.030621,Cocoa Grinder,40.623967,-74.030863,Juice Bar
3,Bay Ridge,40.625801,-74.030621,Pegasus Cafe,40.623168,-74.031186,Breakfast Spot
4,Bay Ridge,40.625801,-74.030621,Leo's Casa Calamari,40.623348,-74.031082,Pizza Place


## Lets filter venue categories that we are interested in

In [33]:
venue_cat = ['Arts & Crafts Store', 'Athletics & Sports', 'Bakery', 'Bank', 'Bike Shop', 'Boutique', 'Bus Line', 'Bus Station', 'Butcher', 'Café', 'Clothing Store', 'Coffee Shop', 'Department Store', 'Discount Store', 'Electronics Store', 'Food Court', 'Frozen Yogurt Shop', 'Furniture / Home Store', 'General Entertainment', 'Golf Course', 'Grocery Store', 'Gym / Fitness Center', 'Italian Restaurant', 'Japanese Restaurant', 'Juice Bar', 'Kids Store', 'Metro Station', 'Movie Theater', 'Park', 'Pharmacy', 'Pool', 'Restaurant', 'Salon / Barbershop', 'Shopping Mall', 'Spa', 'Sporting Goods Shop', 'Supermarket', 'Tea Room', 'Toy / Game Store', 'Video Store', 'Women\'s Store']
brooklyn_venues_filtered = brooklyn_venues[brooklyn_venues["Venue Category"].isin(venue_cat)]
brooklyn_venues_filtered.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Bay Ridge,40.625801,-74.030621,Pilo Arts Day Spa and Salon,40.624748,-74.030591,Spa
2,Bay Ridge,40.625801,-74.030621,Cocoa Grinder,40.623967,-74.030863,Juice Bar
6,Bay Ridge,40.625801,-74.030621,Brooklyn Market,40.626939,-74.029948,Grocery Store
9,Bay Ridge,40.625801,-74.030621,Mimi Nails,40.622571,-74.031477,Spa
12,Bay Ridge,40.625801,-74.030621,A.L.C. Italian Grocery,40.623051,-74.031224,Grocery Store


In [34]:
print ("{} Neighborhoods, {} Venues & {} Venue Categories".format(brooklyn_venues_filtered["Neighborhood"].unique().shape[0], brooklyn_venues_filtered["Venue Category"].shape[0], brooklyn_venues_filtered["Venue Category"].unique().shape[0]))

70 Neighborhoods, 784 Venues & 41 Venue Categories


## Lets analyse each venue

In [35]:
# one hot encoding
brooklyn_onehot = pd.get_dummies(brooklyn_venues_filtered[['Venue Category']], prefix="", prefix_sep="")
#brooklyn_onehot.drop('Neighborhood', axis=1, inplace=True)
brooklyn_onehot['Neighborhood'] = brooklyn_venues_filtered['Neighborhood'] # add neighborhood column back to dataframe

fixed_columns = [brooklyn_onehot.columns[-1]] + list(brooklyn_onehot.columns[:-1]) # move neighborhood column to the first column
brooklyn_onehot = brooklyn_onehot[fixed_columns]
brooklyn_onehot.head()

Unnamed: 0,Neighborhood,Arts & Crafts Store,Athletics & Sports,Bakery,Bank,Bike Shop,Boutique,Bus Line,Bus Station,Butcher,Café,Clothing Store,Coffee Shop,Department Store,Discount Store,Electronics Store,Food Court,Frozen Yogurt Shop,Furniture / Home Store,General Entertainment,Golf Course,Grocery Store,Gym / Fitness Center,Italian Restaurant,Japanese Restaurant,Juice Bar,Kids Store,Metro Station,Movie Theater,Park,Pharmacy,Pool,Restaurant,Salon / Barbershop,Shopping Mall,Spa,Sporting Goods Shop,Supermarket,Tea Room,Toy / Game Store,Video Store,Women's Store
0,Bay Ridge,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,Bay Ridge,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,Bay Ridge,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,Bay Ridge,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
12,Bay Ridge,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [36]:
print ("We have {} venue categories in Brooklyn".format(brooklyn_onehot.shape))

We have (784, 42) venue categories in Brooklyn


## Let's group rows by neighborhood and take the mean of the frequency of occurrence of each category

In [37]:
brooklyn_grouped = brooklyn_onehot.groupby('Neighborhood').mean().reset_index()
brooklyn_grouped.head()

Unnamed: 0,Neighborhood,Arts & Crafts Store,Athletics & Sports,Bakery,Bank,Bike Shop,Boutique,Bus Line,Bus Station,Butcher,Café,Clothing Store,Coffee Shop,Department Store,Discount Store,Electronics Store,Food Court,Frozen Yogurt Shop,Furniture / Home Store,General Entertainment,Golf Course,Grocery Store,Gym / Fitness Center,Italian Restaurant,Japanese Restaurant,Juice Bar,Kids Store,Metro Station,Movie Theater,Park,Pharmacy,Pool,Restaurant,Salon / Barbershop,Shopping Mall,Spa,Sporting Goods Shop,Supermarket,Tea Room,Toy / Game Store,Video Store,Women's Store
0,Bath Beach,0.0,0.0,0.066667,0.066667,0.0,0.0,0.0,0.0,0.0,0.066667,0.066667,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133333,0.0,0.0,0.133333,0.0,0.0,0.066667,0.2,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667
1,Bay Ridge,0.038462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038462,0.038462,0.038462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.038462,0.192308,0.0,0.038462,0.038462,0.0,0.0,0.038462,0.076923,0.0,0.0,0.0,0.0,0.230769,0.0,0.038462,0.038462,0.038462,0.0,0.0
2,Bedford Stuyvesant,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.111111,0.0,0.222222,0.0,0.222222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.111111,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bensonhurst,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.125,0.0,0.0,0.0,0.0
4,Bergen Beach,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Establish top 10 venue categories for Brooklyn's neighborhoods

In [39]:
brooklyn_grouped.shape

(70, 42)

In [40]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
bn_venues_sorted = pd.DataFrame(columns=columns)
bn_venues_sorted['Neighborhood'] = brooklyn_grouped['Neighborhood']

for ind in np.arange(brooklyn_grouped.shape[0]):
    bn_venues_sorted.iloc[ind, 1:] = return_most_common_venues(brooklyn_grouped.iloc[ind, :], num_top_venues)

bn_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Bath Beach,Pharmacy,Italian Restaurant,Kids Store,Restaurant,Coffee Shop,Clothing Store,Café,Park,Women's Store,Bank
1,Bay Ridge,Spa,Italian Restaurant,Grocery Store,Pharmacy,Clothing Store,Coffee Shop,Department Store,Gym / Fitness Center,Juice Bar,Kids Store
2,Bedford Stuyvesant,Coffee Shop,Café,Japanese Restaurant,Boutique,Juice Bar,Bus Station,Park,Women's Store,Frozen Yogurt Shop,Food Court
3,Bensonhurst,Grocery Store,Spa,Italian Restaurant,Coffee Shop,Park,Butcher,Supermarket,Bakery,Bank,Bike Shop
4,Bergen Beach,Athletics & Sports,Women's Store,Clothing Store,Furniture / Home Store,Frozen Yogurt Shop,Food Court,Electronics Store,Discount Store,Department Store,Coffee Shop


## Cluster Brooklyn's neighborhoods

In [41]:
# set number of clusters
kclusters = 5

bn_grouped_clustering = brooklyn_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(bn_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:100] 

array([1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 3, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1], dtype=int32)

In [42]:
# add clustering labels
bn_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
bn_merged = brooklyn_data
bn_merged = bn_merged.join(bn_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

bn_merged.head() # check the last columns!

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Brooklyn,Bay Ridge,40.625801,-74.030621,1,Spa,Italian Restaurant,Grocery Store,Pharmacy,Clothing Store,Coffee Shop,Department Store,Gym / Fitness Center,Juice Bar,Kids Store
1,Brooklyn,Bensonhurst,40.611009,-73.99518,1,Grocery Store,Spa,Italian Restaurant,Coffee Shop,Park,Butcher,Supermarket,Bakery,Bank,Bike Shop
2,Brooklyn,Sunset Park,40.645103,-74.010316,1,Bakery,Bank,Pharmacy,Women's Store,Italian Restaurant,Grocery Store,Spa,Butcher,Electronics Store,Discount Store
3,Brooklyn,Greenpoint,40.730201,-73.954241,1,Coffee Shop,Bakery,Café,Furniture / Home Store,Grocery Store,Boutique,Italian Restaurant,Restaurant,Juice Bar,Arts & Crafts Store
4,Brooklyn,Gravesend,40.59526,-73.973471,1,Italian Restaurant,Bakery,Bus Station,Pharmacy,Sporting Goods Shop,Furniture / Home Store,Bike Shop,Boutique,Bus Line,Bank


In [43]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(bn_merged['Latitude'], bn_merged['Longitude'], bn_merged['Neighborhood'], bn_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

# Second City - North York, Toronto

In [46]:
# TORONTO

!wget -q -O 'toronto_data.csv' https://www.dropbox.com/s/yxodhl9t7z8s2z5/toronto.csv?dl=0
dfToronto = pd.read_csv('toronto_data.csv')
dfToronto = dfToronto.rename(columns = {'Neighbourhood' : 'Neighborhood'})
dfToronto = dfToronto.drop(dfToronto[dfToronto.Borough == 'Not assigned'].index)
dfToronto.loc[dfToronto['Neighborhood'] == 'Not assigned', 'Neighborhood'] = dfToronto['Borough']
df_toronto = dfToronto.groupby(['Postcode', 'Borough'])['Neighborhood'].apply(','.join).reset_index()
!wget -q -O 'toronto_geo_data.csv' https://www.dropbox.com/s/rd9rtsnx3nz9bfh/Geospatial_Coordinates.csv?dl=0
dfGeoData = pd.read_csv('toronto_geo_data.csv')
dfGeoData.set_index('Postal Code', inplace=True)

geolocator = Nominatim(user_agent="toronto_explorer")
aLatitude = []
aLongitude = []

for postcode, borough in zip(df_toronto['Postcode'], df_toronto['Borough']):
    aLatitude.append(dfGeoData.loc[postcode,:]['Latitude'])
    aLongitude.append(dfGeoData.loc[postcode,:]['Longitude'])

df_toronto['Latitude'] = aLatitude
df_toronto['Longitude'] = aLongitude

## Pick North York as the sample district to compare and focus on its neighborhoods

In [47]:
northyork_data = df_toronto[df_toronto['Borough'] == 'North York'].reset_index(drop=True)

address = 'North York, Toronto, ON'

geolocator = Nominatim(user_agent="north_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

# create map of North York using latitude and longitude values
map_northyork = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
map_northyork = map_markers_neigh(map_northyork, northyork_data['Latitude'], northyork_data['Longitude'], northyork_data['Neighborhood'])

map_northyork

## Similarly, lets repeat the same steps for North York, as we did for Brooklyn

In [48]:
a_shape = northyork_data.shape
print ("North York has {} neighborhoods".format(a_shape[0]))

northyork_data.head()

North York has 24 neighborhoods


Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M2H,North York,Hillcrest Village,43.803762,-79.363452
1,M2J,North York,"Fairview,Henry Farm,Oriole",43.778517,-79.346556
2,M2K,North York,Bayview Village,43.786947,-79.385975
3,M2L,North York,"Silver Hills,York Mills",43.75749,-79.374714
4,M2M,North York,"Newtonbrook,Willowdale",43.789053,-79.408493


In [49]:
northyork_venues = getNearbyVenues(names=northyork_data['Neighborhood'],
                                   latitudes=northyork_data['Latitude'],
                                   longitudes=northyork_data['Longitude']
                                  )

# print the size of the resulting dataframe along with the dataframe itself
print ("We have {} venues in North York".format(northyork_venues.shape[0]))
northyork_venues.head()

We have 251 venues in North York


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Hillcrest Village,43.803762,-79.363452,Eagle's Nest Golf Club,43.805455,-79.364186,Golf Course
1,Hillcrest Village,43.803762,-79.363452,AY Jackson Pool,43.804515,-79.366138,Pool
2,Hillcrest Village,43.803762,-79.363452,Villa Madina,43.801685,-79.363938,Mediterranean Restaurant
3,Hillcrest Village,43.803762,-79.363452,Duncan Creek Park,43.805539,-79.360695,Dog Run
4,"Fairview,Henry Farm,Oriole",43.778517,-79.346556,The LEGO Store,43.778207,-79.343483,Toy / Game Store


## We need to merge the two datafames (Brooklyn & North York) to run K-means clustering against them to establish similarities

In [51]:
# Lets filter venue categories
venue_cat = ['Arts & Crafts Store', 'Athletics & Sports', 'Bakery', 'Bank', 'Bike Shop', 'Boutique', 'Bus Line', 'Bus Station', 'Butcher', 'Café', 'Clothing Store', 'Coffee Shop', 'Department Store', 'Discount Store', 'Electronics Store', 'Food Court', 'Frozen Yogurt Shop', 'Furniture / Home Store', 'General Entertainment', 'Golf Course', 'Grocery Store', 'Gym / Fitness Center', 'Italian Restaurant', 'Japanese Restaurant', 'Juice Bar', 'Kids Store', 'Metro Station', 'Movie Theater', 'Park', 'Pharmacy', 'Pool', 'Restaurant', 'Salon / Barbershop', 'Shopping Mall', 'Spa', 'Sporting Goods Shop', 'Supermarket', 'Tea Room', 'Toy / Game Store', 'Video Store', 'Women\'s Store']
northyork_venues_filtered = northyork_venues[northyork_venues["Venue Category"].isin(venue_cat)]
northyork_venues_filtered.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Hillcrest Village,43.803762,-79.363452,Eagle's Nest Golf Club,43.805455,-79.364186,Golf Course
1,Hillcrest Village,43.803762,-79.363452,AY Jackson Pool,43.804515,-79.366138,Pool
4,"Fairview,Henry Farm,Oriole",43.778517,-79.346556,The LEGO Store,43.778207,-79.343483,Toy / Game Store
5,"Fairview,Henry Farm,Oriole",43.778517,-79.346556,SilverCity Fairview Mall Cinemas,43.778681,-79.344085,Movie Theater
6,"Fairview,Henry Farm,Oriole",43.778517,-79.346556,CF Fairview Mall,43.77775,-79.344105,Shopping Mall


In [52]:
dfm = pd.concat([brooklyn_venues_filtered, northyork_venues_filtered])
dfm['Venue Category'].unique().shape

(41,)

## Let us merge Brooklyn and North York

In [54]:
# one hot encoding
onehot = pd.get_dummies(dfm[['Venue Category']], prefix="", prefix_sep="")
#brooklyn_onehot.drop('Neighborhood', axis=1, inplace=True)
onehot['Neighborhood'] = dfm['Neighborhood'] # add neighborhood column back to dataframe

fixed_columns = [onehot.columns[-1]] + list(onehot.columns[:-1]) # move neighborhood column to the first column
onehot = onehot[fixed_columns]
onehot.head()

Unnamed: 0,Neighborhood,Arts & Crafts Store,Athletics & Sports,Bakery,Bank,Bike Shop,Boutique,Bus Line,Bus Station,Butcher,Café,Clothing Store,Coffee Shop,Department Store,Discount Store,Electronics Store,Food Court,Frozen Yogurt Shop,Furniture / Home Store,General Entertainment,Golf Course,Grocery Store,Gym / Fitness Center,Italian Restaurant,Japanese Restaurant,Juice Bar,Kids Store,Metro Station,Movie Theater,Park,Pharmacy,Pool,Restaurant,Salon / Barbershop,Shopping Mall,Spa,Sporting Goods Shop,Supermarket,Tea Room,Toy / Game Store,Video Store,Women's Store
0,Bay Ridge,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,Bay Ridge,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,Bay Ridge,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,Bay Ridge,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
12,Bay Ridge,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [55]:
grp = onehot.groupby('Neighborhood').mean().reset_index()
grp.head()

Unnamed: 0,Neighborhood,Arts & Crafts Store,Athletics & Sports,Bakery,Bank,Bike Shop,Boutique,Bus Line,Bus Station,Butcher,Café,Clothing Store,Coffee Shop,Department Store,Discount Store,Electronics Store,Food Court,Frozen Yogurt Shop,Furniture / Home Store,General Entertainment,Golf Course,Grocery Store,Gym / Fitness Center,Italian Restaurant,Japanese Restaurant,Juice Bar,Kids Store,Metro Station,Movie Theater,Park,Pharmacy,Pool,Restaurant,Salon / Barbershop,Shopping Mall,Spa,Sporting Goods Shop,Supermarket,Tea Room,Toy / Game Store,Video Store,Women's Store
0,Bath Beach,0.0,0.0,0.066667,0.066667,0.0,0.0,0.0,0.0,0.0,0.066667,0.066667,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133333,0.0,0.0,0.133333,0.0,0.0,0.066667,0.2,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667
1,"Bathurst Manor,Downsview North,Wilson Heights",0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.222222,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.111111,0.0,0.111111,0.0,0.0,0.111111,0.0,0.0,0.111111,0.0
2,Bay Ridge,0.038462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038462,0.038462,0.038462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.038462,0.192308,0.0,0.038462,0.038462,0.0,0.0,0.038462,0.076923,0.0,0.0,0.0,0.0,0.230769,0.0,0.038462,0.038462,0.038462,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park,Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.090909,0.0,0.181818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.181818,0.090909,0.090909,0.0,0.0,0.0,0.0,0.090909,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
venues_sorted = pd.DataFrame(columns=columns)
venues_sorted['Neighborhood'] = grp['Neighborhood']

for ind in np.arange(grp.shape[0]):
    venues_sorted.iloc[ind, 1:] = return_most_common_venues(grp.iloc[ind, :], num_top_venues)

venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Bath Beach,Pharmacy,Italian Restaurant,Kids Store,Restaurant,Coffee Shop,Clothing Store,Café,Park,Women's Store,Bank
1,"Bathurst Manor,Downsview North,Wilson Heights",Coffee Shop,Pharmacy,Shopping Mall,Video Store,Restaurant,Frozen Yogurt Shop,Supermarket,Bank,Bike Shop,Bakery
2,Bay Ridge,Spa,Italian Restaurant,Grocery Store,Pharmacy,Clothing Store,Coffee Shop,Department Store,Gym / Fitness Center,Juice Bar,Kids Store
3,Bayview Village,Café,Bank,Japanese Restaurant,Clothing Store,Furniture / Home Store,Frozen Yogurt Shop,Food Court,Electronics Store,Discount Store,Department Store
4,"Bedford Park,Lawrence Manor East",Italian Restaurant,Coffee Shop,Grocery Store,Restaurant,Japanese Restaurant,Juice Bar,Café,Butcher,Pharmacy,Bus Station


In [57]:
# set number of clusters
kclusters = 5

grouped_clustering = grp.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:100] 

array([0, 0, 2, 2, 2, 2, 2, 3, 2, 0, 0, 2, 2, 2, 2, 1, 2, 2, 0, 2, 2, 0,
       2, 0, 0, 2, 2, 0, 1, 2, 2, 4, 0, 0, 2, 0, 2, 0, 0, 2, 2, 0, 2, 0,
       2, 1, 2, 2, 2, 0, 4, 0, 0, 0, 2, 2, 2, 0, 3, 0, 0, 2, 0, 2, 2, 0,
       0, 2, 2, 1, 2, 2, 0, 2, 0, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2,
       0, 1], dtype=int32)

In [59]:
# merge data for brooklyn and northyork

data = pd.concat([brooklyn_data, northyork_data])

# add clustering labels
#del venues_sorted['Cluster Labels']
#del venues_sorted['Postcode']

venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)


mrg = data
mrg = mrg.join(venues_sorted.set_index('Neighborhood'), on='Neighborhood')
mrg = mrg[np.isfinite(mrg['Cluster Labels'])]
mrg['Cluster Labels'] = mrg['Cluster Labels'].astype(int)

mrg.head() # check the last columns!

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Borough,Latitude,Longitude,Neighborhood,Postcode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Brooklyn,40.625801,-74.030621,Bay Ridge,,2,Spa,Italian Restaurant,Grocery Store,Pharmacy,Clothing Store,Coffee Shop,Department Store,Gym / Fitness Center,Juice Bar,Kids Store
1,Brooklyn,40.611009,-73.99518,Bensonhurst,,2,Grocery Store,Spa,Italian Restaurant,Coffee Shop,Park,Butcher,Supermarket,Bakery,Bank,Bike Shop
2,Brooklyn,40.645103,-74.010316,Sunset Park,,0,Bakery,Bank,Pharmacy,Women's Store,Italian Restaurant,Grocery Store,Spa,Butcher,Electronics Store,Discount Store
3,Brooklyn,40.730201,-73.954241,Greenpoint,,2,Coffee Shop,Bakery,Café,Furniture / Home Store,Grocery Store,Boutique,Italian Restaurant,Restaurant,Juice Bar,Arts & Crafts Store
4,Brooklyn,40.59526,-73.973471,Gravesend,,2,Italian Restaurant,Bakery,Bus Station,Pharmacy,Sporting Goods Shop,Furniture / Home Store,Bike Shop,Boutique,Bus Line,Bank


In [60]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(mrg['Latitude'], mrg['Longitude'], mrg['Neighborhood'], mrg['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters