# Clustering Neighborhoods in Toronto
By: Ruben Vecino

In [1]:
#!pip install beautifulsoup4
#!pip install html5lib
#!pip install requests
#!pip install geocoder
#!pip install geopy
#!pip install Folium
#!pip install sklearn

In [2]:
#Import libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup as bs

In [3]:
#Get html content of Wikipedia page
wiki = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
src = wiki.content

#Create BeautifulSoup parser
soup = bs(src,'html5lib')

In [4]:
#Find headers and rows for the table
headers = soup.find_all('table')[0].find_all('th')
rows = soup.find_all('table')[0].find_all('tr')

#Append header names
names = []
for h in headers:
    names.append(h.getText().replace('\n',''))

#Append row values
content = []
for r in rows:
    cols = r.find_all('td')
    
    colVals = []
    for c in cols:
        colVals.append(c.getText().replace('\n',''))
        
    content.append(colVals)

In [5]:
#Initialize dataframe with given column names and row values
df = pd.DataFrame(columns=names, data=content)
#Rename column for familiarity
df.rename({'Neighbourhood':'Neighborhood'}, axis=1, inplace=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [6]:
#Clean dataframe from null or empty entries ('Not Assigned')
df.dropna(inplace=True)
df = df[(df['Borough'] != 'Not assigned') | (df['Neighborhood'] != 'Not assigned')]

#Sort by postal code to keep consistency
df.sort_values(by='Postal Code', inplace=True)
df.reset_index(inplace=True, drop=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [7]:
#Get latitude and longitude values from the .csv file
geo_coords = pd.read_csv('Geospatial_Coordinates.csv')
geo_coords.head()

df = df.merge(geo_coords, on='Postal Code')
#keep only boroughs that contain 'Toronto on it'
df = df[df['Borough'].str.contains('Toronto')]
df.reset_index(drop=True, inplace=True)

del(geo_coords)

df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [8]:
#Get coordinates from toronto using Nominatim geocoder

from geopy.geocoders import Nominatim
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto is {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto is 43.6534817, -79.3839347.


In [9]:
#Import keys from personal file that stores my API keys. Not added to repository
import keys 

CLIENT_ID = keys.CLIENT_ID # your Foursquare ID
CLIENT_SECRET = keys.CLIENT_SECRET # your Foursquare Secret
ACCESS_TOKEN = keys.ACCESS_TOKEN # your FourSquare Access Token
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

In [10]:
#define function for API call for each neighborhood
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

        columns = ['Neighborhood', 
              'Neighborhood Latitude', 
              'Neighborhood Longitude', 
              'Venue', 
              'Venue Latitude', 
              'Venue Longitude', 
              'Venue Category']
        
        nearby_venues = pd.DataFrame(data=[item for venue_list in venues_list for item in venue_list], columns=columns)
    
    return(nearby_venues)

In [11]:
toronto_venues = getNearbyVenues(df['Neighborhood'],
                                 df['Latitude'],
                                 df['Longitude'])

The Beaches
The Danforth West, Riverdale
India Bazaar, The Beaches West
Studio District
Lawrence Park
Davisville North
North Toronto West,  Lawrence Park
Davisville
Moore Park, Summerhill East
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
Rosedale
St. James Town, Cabbagetown
Church and Wellesley
Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North & West, Forest Hill Road Park
The Annex, North Midtown, Yorkville
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Stn A PO Boxes
First Canadian Place, Underground city
Christie
Dufferin, Dovercourt Village
Little Portugal, Trinity
Brockton, Parkdale Village, Exhibition Place
High

In [12]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,"The Danforth West, Riverdale",43.679557,-79.352188,MenEssentials,43.67782,-79.351265,Cosmetics Shop


In [13]:
#Perform one hot encoding on toronto_venues
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

#Some venues had 'neighborhood' for category. Remove Neighborhood column from toronto_onehot
toronto_onehot.drop('Neighborhood', axis=1, inplace=True)

#Append bacnk Neighborhood column and move it to the first position
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']
new_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[new_columns]

toronto_onehot.head()

Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"The Danforth West, Riverdale",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
#Get frequency values for venues per neighborhood
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.017241,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.058824,0.058824,0.058824,0.117647,0.117647,0.117647,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.015625,0.0,0.0,0.015625,0.015625
5,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.0,0.0,0.0,0.0,0.0,0.0,0.012987,0.0,0.0,...,0.012987,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025974
7,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0
8,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.030303,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
#Create new dataframe with the top 10 most frequent venue categories per nieghborhood
num = 10
temp=[]
columns = [i for i in range(1,11)]
top_10 = pd.DataFrame(columns=columns)

for n in toronto_grouped['Neighborhood']:
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == n].transpose().reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:].sort_values(by='freq', ascending=False).head(num)
    temp = temp[['venue']].transpose()
    temp.columns = columns    
    
    top_10 = top_10.append(temp)

top_10.reset_index(drop=True, inplace=True)
top_10['Neighborhood'] = toronto_grouped['Neighborhood']
top_10 = top_10[[top_10.columns[-1]] + list(top_10.columns[:-1])]
top_10

Unnamed: 0,Neighborhood,1,2,3,4,5,6,7,8,9,10
0,Berczy Park,Coffee Shop,Cocktail Bar,Beer Bar,Restaurant,Seafood Restaurant,Cheese Shop,Bakery,Farmers Market,Nightclub,Steakhouse
1,"Brockton, Parkdale Village, Exhibition Place",Café,Breakfast Spot,Coffee Shop,Bakery,Stadium,Restaurant,Bar,Climbing Gym,Nightclub,Italian Restaurant
2,"Business reply mail Processing Centre, South C...",Park,Auto Workshop,Smoke Shop,Burrito Place,Fast Food Restaurant,Butcher,Farmers Market,Light Rail Station,Garden,Garden Center
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Airport Terminal,Airport,Bar,Coffee Shop,Rental Car Location,Sculpture Garden,Boutique,Boat or Ferry
4,Central Bay Street,Coffee Shop,Café,Sandwich Place,Italian Restaurant,Bubble Tea Shop,Salad Place,Burger Joint,Furniture / Home Store,Mediterranean Restaurant,Diner
5,Christie,Grocery Store,Café,Park,Baby Store,Coffee Shop,Italian Restaurant,Restaurant,Nightclub,Athletics & Sports,Candy Store
6,Church and Wellesley,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Restaurant,Fast Food Restaurant,Gay Bar,Yoga Studio,Mediterranean Restaurant,Pub,Men's Store
7,"Commerce Court, Victoria Hotel",Coffee Shop,Restaurant,Café,Hotel,Gym,American Restaurant,Deli / Bodega,Seafood Restaurant,Japanese Restaurant,Italian Restaurant
8,Davisville,Sandwich Place,Dessert Shop,Sushi Restaurant,Café,Pizza Place,Gym,Coffee Shop,Italian Restaurant,Discount Store,Brewery
9,Davisville North,Breakfast Spot,Gym / Fitness Center,Park,Food & Drink Shop,Sandwich Place,Department Store,Hotel,Airport,Miscellaneous Shop,Molecular Gastronomy Restaurant


In [16]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
set(kmeans.labels_)

{0, 1, 2, 3, 4}

In [17]:
#Append cluster labels to top_10
top_10['Cluster Label'] = kmeans.labels_
top_10

Unnamed: 0,Neighborhood,1,2,3,4,5,6,7,8,9,10,Cluster Label
0,Berczy Park,Coffee Shop,Cocktail Bar,Beer Bar,Restaurant,Seafood Restaurant,Cheese Shop,Bakery,Farmers Market,Nightclub,Steakhouse,0
1,"Brockton, Parkdale Village, Exhibition Place",Café,Breakfast Spot,Coffee Shop,Bakery,Stadium,Restaurant,Bar,Climbing Gym,Nightclub,Italian Restaurant,0
2,"Business reply mail Processing Centre, South C...",Park,Auto Workshop,Smoke Shop,Burrito Place,Fast Food Restaurant,Butcher,Farmers Market,Light Rail Station,Garden,Garden Center,0
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Airport Terminal,Airport,Bar,Coffee Shop,Rental Car Location,Sculpture Garden,Boutique,Boat or Ferry,0
4,Central Bay Street,Coffee Shop,Café,Sandwich Place,Italian Restaurant,Bubble Tea Shop,Salad Place,Burger Joint,Furniture / Home Store,Mediterranean Restaurant,Diner,0
5,Christie,Grocery Store,Café,Park,Baby Store,Coffee Shop,Italian Restaurant,Restaurant,Nightclub,Athletics & Sports,Candy Store,0
6,Church and Wellesley,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Restaurant,Fast Food Restaurant,Gay Bar,Yoga Studio,Mediterranean Restaurant,Pub,Men's Store,0
7,"Commerce Court, Victoria Hotel",Coffee Shop,Restaurant,Café,Hotel,Gym,American Restaurant,Deli / Bodega,Seafood Restaurant,Japanese Restaurant,Italian Restaurant,0
8,Davisville,Sandwich Place,Dessert Shop,Sushi Restaurant,Café,Pizza Place,Gym,Coffee Shop,Italian Restaurant,Discount Store,Brewery,0
9,Davisville North,Breakfast Spot,Gym / Fitness Center,Park,Food & Drink Shop,Sandwich Place,Department Store,Hotel,Airport,Miscellaneous Shop,Molecular Gastronomy Restaurant,0


In [18]:
#Merge neighborhood and clustering data
toronto_merged = df.merge(top_10, on='Neighborhood')
toronto_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,1,2,3,4,5,6,7,8,9,10,Cluster Label
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,Health Food Store,Trail,Pub,Airport,Movie Theater,Martial Arts School,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,0
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Furniture / Home Store,Yoga Studio,Spa,Liquor Store,Fruit & Vegetable Store,Bookstore,0
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572,Fast Food Restaurant,Park,Italian Restaurant,Brewery,Fish & Chips Shop,Food & Drink Shop,Board Shop,Liquor Store,Sandwich Place,Light Rail Station,0
3,M4M,East Toronto,Studio District,43.659526,-79.340923,Coffee Shop,Brewery,Gastropub,Bakery,American Restaurant,Café,Fish Market,Stationery Store,Bookstore,Middle Eastern Restaurant,0
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,Park,Bus Line,Swim School,Airport,Moroccan Restaurant,Martial Arts School,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,4


In [19]:
#Plot cluster points

import folium
from matplotlib import cm
import matplotlib.colors as colors

cmap = list(cm.rainbow(np.linspace(0, 1, len(set(kmeans.labels_)))))
cmap = [colors.rgb2hex(i) for i in cmap]
map_toronto_clustered = folium.Map(location=[latitude, longitude], zoom_start=11.2)


for neighborhood, cluster, lat, lon in zip(toronto_merged['Neighborhood'],
                                           toronto_merged['Cluster Label'],
                                           toronto_merged['Latitude'],
                                           toronto_merged['Longitude']):
   
    label = folium.Popup('{}, {}'.format(neighborhood, cluster), parse_html = False)
    folium.CircleMarker(
            [lat, lon],
            radius=5,
            popup=label,
            color=None,
            fill=True,
            fill_color=cmap[cluster],
            fill_opacity=0.7,
            parse_html=False).add_to(map_toronto_clustered)
map_toronto_clustered