# Clustering Analysis of Toronto Neighborhoods

This notebook contains the code to obtain the Toronto neighborhoods table from https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M and to cluster the neighborhoods it lists.

In [1]:
import os # for reading API credentials from the environment

import requests # for handling URL requests

import numpy as np
import pandas as pd # for reading and processing tabular data
import matplotlib.cm as cm
import matplotlib.colors as colors

import folium
from sklearn.cluster import KMeans

## **Task 1:** Exploring the neighborhoods of Toronto

Read the Toronto neighborhoods table from Wikipedia

In [2]:
# Wikipedia page that lists the Toronto ("M") postal codes
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns a list of the tables it parsed; the first one is used here
wiki_tables = pd.read_html(url, header=0)
df_nb = wiki_tables[0]

# Replace the scraped headers with the column names used in the rest of the notebook
df_nb.columns = ['Postal Code', 'Borough', 'Neighborhood']
df_nb.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [3]:
# Keep only the rows whose 'Borough' has been assigned
has_borough = df_nb['Borough'] != 'Not assigned'
df_nb = df_nb[has_borough]
df_nb.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


Merge neighborhoods of the same postal code into a single row

In [4]:
# Collect the neighborhoods of each (postal code, borough) pair into a list,
# then collapse every list into a single comma-separated string
nb_lists = df_nb.groupby(by=['Postal Code', 'Borough']).agg(list)
nb_lists['Neighborhood'] = nb_lists['Neighborhood'].str.join(', ')

# Turn the group keys back into ordinary columns
df_nb = nb_lists.reset_index()
df_nb.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


Find the rows with no assigned neighborhood and fill them in

Neighborhoods with a 'Not assigned' value are replaced with their borough's name

In [5]:
# Rows whose neighborhood is 'Not assigned' take the name of their borough.
# Selecting by boolean mask and column name with .loc keeps this correct for any
# index and column order, instead of mixing index labels with positional .iloc
# lookups and hard-coded column numbers.
na_mask = df_nb['Neighborhood'].str.contains('Not assigned', regex=False, na=False)
na_indices = df_nb.index[na_mask].tolist()  # labels of the affected rows, kept for display
df_nb.loc[na_mask, 'Neighborhood'] = df_nb.loc[na_mask, 'Borough']

# Show the rows that were filled in
df_nb.loc[na_indices]

Unnamed: 0,Postal Code,Borough,Neighborhood
85,M7A,Queen's Park,Queen's Park


In [6]:
# Report how many rows the cleaned neighborhoods table has
n_rows = len(df_nb)
print('The Toronto neighborhoods table consists of %i rows' % n_rows)

The Toronto neighborhoods table consists of 103 rows


## **Task 2:** Obtain the geospatial coordinates of Toronto neighborhoods

Read the geospatial coordinates data of Toronto neighborhoods by postal codes

In [7]:
# Load the per-postal-code coordinates from the CSV file next to the notebook
geo_csv = 'Geospatial_Coordinates.csv'
df_geo = pd.read_csv(geo_csv)
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merge the geospatial data with the names of the neighborhoods

In [8]:
# Attach the coordinates to each neighborhood row by matching postal codes
df_merged = df_nb.merge(df_geo, on='Postal Code')
df_merged.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


## **Task 3:** Cluster the neighborhoods in Toronto

Search nearby venues around each neighborhood with the Foursquare API

In [9]:
# Define the necessary fields for the Foursquare API URL.
# The credentials are read from the environment so that they are never stored in
# the notebook: set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel.
# NOTE: the key pair that used to be hard-coded in this cell has been published
# with the notebook and should be revoked and replaced.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare API requests are expected to fail.')

VERSION = '20190809'  # sent as the `v` query parameter
radius = 500  # sent as the `radius` query parameter
LIMIT = 40  # sent as the `limit` query parameter

In [10]:
# Query the Foursquare "explore" endpoint around every neighborhood and collect
# one record per venue returned.
EXPLORE_URL = 'https://api.foursquare.com/v2/venues/explore'
REQUEST_TIMEOUT = 30  # seconds; avoids waiting forever on a stalled connection
VENUE_COLUMNS = [
    'Postal Code',
    'Neighborhood',
    'Latitude',
    'Longitude',
    'Venue',
    'Venue Category'
]

venues = [] # store venues in a list
neighborhood_rows = df_merged[['Postal Code', 'Neighborhood', 'Latitude', 'Longitude']]
for post_code, name, lat, lng in neighborhood_rows.itertuples(index=False, name=None):
    # let requests build and escape the query string from the parameters
    response = requests.get(
        EXPLORE_URL,
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        },
        timeout=REQUEST_TIMEOUT)
    # surface a failed request as an HTTP error instead of a KeyError on the payload
    response.raise_for_status()

    # a neighborhood with no recommendation group simply contributes no venues
    groups = response.json()['response'].get('groups') or []
    results = groups[0]['items'] if groups else []

    # keep only the relevant fields for each venue; venues without any category
    # are skipped because the category name is the field the analysis is built on
    venues.extend(
        (post_code, name, lat, lng, v['venue']['name'], v['venue']['categories'][0]['name'])
        for v in results
        if v['venue'].get('categories'))

# Store the information of all venues in a DataFrame. Passing the column names to
# the constructor also works when no venue was returned at all.
df_venues = pd.DataFrame(venues, columns=VENUE_COLUMNS)

In [11]:
# Print the (rows, columns) size of the venue table, then preview its first rows
venue_table_shape = df_venues.shape
print(venue_table_shape)
df_venues.head()

(1548, 6)


Unnamed: 0,Postal Code,Neighborhood,Latitude,Longitude,Venue,Venue Category
0,M1B,"Rouge, Malvern",43.806686,-79.194353,Wendy's,Fast Food Restaurant
1,M1B,"Rouge, Malvern",43.806686,-79.194353,Interprovincial Group,Print Shop
2,M1C,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,RIGHT WAY TO GOLF,Golf Course
3,M1C,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Royal Canadian Legion,Bar
4,M1E,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,Pizza Place


Convert the venue categories into features with the one-hot encoding scheme

In [12]:
# One indicator column per venue category, one row per venue
category_dummies = pd.get_dummies(df_venues['Venue Category'])

# Put the postal code in front so the indicator rows can be grouped later
postal_codes = df_venues[['Postal Code']]
df_onehot = pd.concat([postal_codes, category_dummies], axis=1)
df_onehot.head()

Unnamed: 0,Postal Code,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Aquarium,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M1C,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M1C,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M1E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Average the indicator columns within each postal code; these per-category
# means are the features used for clustering
per_code_means = df_onehot.groupby('Postal Code').mean()
df_grouped = per_code_means.reset_index()

print(df_grouped.shape)
df_grouped.head()

(101, 250)


Unnamed: 0,Postal Code,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Aquarium,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Perform *k*-means clustering with the mean frequency of each venue category as features

In [14]:
k = 7

# `n_init` is pinned to the value this notebook was originally run with, so that
# a change of scikit-learn's default does not alter the clustering
km = KMeans(n_clusters=k, n_init=10, random_state=1)

# Cluster on the venue-category features only. `columns=` replaces the positional
# axis argument of `drop`, which pandas 2.0 and later no longer accept.
km.fit(df_grouped.drop(columns='Postal Code'))

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=7, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=1, tol=0.0001, verbose=0)

In [15]:
# Pair every postal code with the cluster label fitted for its feature row
cluster_labels = pd.Series(km.labels_, name='Cluster')
df_cluster = pd.concat([df_grouped['Postal Code'], cluster_labels], axis=1)
df_cluster.head()

Unnamed: 0,Postal Code,Cluster
0,M1B,1
1,M1C,1
2,M1E,1
3,M1G,0
4,M1H,1


Add cluster labels to the merged neighborhood DataFrame (i.e. `df_merged`)

In [16]:
# Drop any 'Cluster' column left over from a previous run of this cell first;
# otherwise re-running it would add duplicate 'Cluster_x'/'Cluster_y' columns.
# The default inner join keeps only the postal codes that have a cluster label.
df_merged = pd.merge(
    df_merged.drop(columns='Cluster', errors='ignore'),
    df_cluster,
    on='Postal Code')
df_merged.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,1
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,1
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,1
3,M1G,Scarborough,Woburn,43.770992,-79.216917,0
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,1
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476,1
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029,0
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577,1
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476,1
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848,1


Visualize the resulting clusters on a map of Toronto with `folium`

In [17]:
# Create a map centered on Toronto
toronto_map = folium.Map(location=[43.6532, -79.3832], zoom_start=10)

# One distinct color per cluster, evenly spaced along the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one marker per postal code. The columns are selected by name so that the
# loop does not depend on the column order or column count of df_merged.
marker_rows = df_merged[['Postal Code', 'Latitude', 'Longitude', 'Cluster']]
for post_code, lat, lng, cluster in marker_rows.itertuples(index=False, name=None):
    label = folium.Popup(str(post_code) + '\nCluster ' + str(cluster), parse_html=True)
    # cluster labels run from 0 to k-1, so they index the color list directly
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(toronto_map)

# Display map
toronto_map