# Segmenting and Clustering Neighborhoods in Toronto

Set up dependencies

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

import requests # library to fetch URLs

from bs4 import BeautifulSoup # library to scrape data from a webpage

print('Libraries imported.')

Libraries imported.


# Load Data


## Scrape Postal Codes from Wikipedia
Scrape the postal codes table from this Wikipedia page:
https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

<i>"This is a list of postal codes in Canada where the first letter is M. Postal codes beginning with M are located within the city of Toronto in the province of Ontario. Only the first three characters are listed, corresponding to the Forward Sortation Area."</i>


In [2]:
# Wikipedia page that lists the Toronto ("M") forward sortation areas
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Bound the wait so a stalled connection cannot hang the notebook.
response = requests.get(url, timeout=30)
if response.status_code != 200:
    print('Failed to get data:', response.status_code)
    # Stop here: the parsing cell below would otherwise scrape an error page.
    raise RuntimeError(f'Could not download {url} (HTTP {response.status_code})')

print(f'Data downloaded from: {url}')


Data downloaded from: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M


In [3]:
from io import StringIO  # local import: this cell is the only user

# Locate the postal-code table in the downloaded page
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find("table", {"class":"wikitable sortable"})
if table is None:
    # Without this guard read_html would be handed the string 'None'.
    raise ValueError('No "wikitable sortable" table found - has the page layout changed?')

# Wrap the markup in a file-like object: passing a literal HTML string to
# read_html is deprecated in recent pandas releases.
df = pd.read_html(StringIO(str(table)), header=0)[0]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Clean up the postal code dataframe

In [4]:
# Step 1 - discard postcodes that were never assigned to a borough
rows_before = df.shape[0]
df = df.loc[df['Borough'] != 'Not assigned', :]
print(f'Removed {rows_before - df.shape[0]:,} rows with unassigned boroughs.')

# Step 2 - sanity check: every postcode should map to exactly one borough
if df.groupby('Postcode')['Borough'].nunique().max() == 1:
    print('As expected, there is only one borough in each postcode. Yay!')
else:
    print('Watch out! There is at least one postcode with multiple distinct boroughs...')

# Step 3 - a neighbourhood without a name inherits the name of its borough
needs_name = df['Neighbourhood'] == 'Not assigned'
print(f'{needs_name.sum():,} rows with unassigned neighbourhoods (now assigned to borough).')
df.loc[needs_name, 'Neighbourhood'] = df['Borough']

# Step 4 - one row per postcode, neighbourhoods joined into a comma-separated list
rows_before = df.shape[0]
df = (
    df.groupby(['Postcode', 'Borough'])['Neighbourhood']
    .apply(', '.join)
    .reset_index()
)
print(f'Collapsed {rows_before - df.shape[0]:,} duplicated postcodes into single rows.')
del rows_before, needs_name

# Blank separator lines followed by a plain-text preview of the cleaned frame
print('\n', df.head(), sep='\n')

Removed 77 rows with unassigned boroughs.
As expected, there is only one borough in each postcode. Yay!
1 rows with unassigned neighbourhoods (now assigned to borough).
Collapsed 108 duplicated postcodes into single rows.


  Postcode      Borough                           Neighbourhood
0      M1B  Scarborough                          Rouge, Malvern
1      M1C  Scarborough  Highland Creek, Rouge Hill, Port Union
2      M1E  Scarborough       Guildwood, Morningside, West Hill
3      M1G  Scarborough                                  Woburn
4      M1H  Scarborough                               Cedarbrae


In [5]:
# Headline row count followed by the number of postcodes in each borough
print(
    f'The cleaned dataframe has {len(df):,} total rows, making up the following buroughs:',
    df['Borough'].value_counts(),
    sep='\n',
)

The cleaned dataframe has 103 total rows, making up the following buroughs:
North York          24
Downtown Toronto    18
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
York                 5
East Toronto         5
East York            5
Queen's Park         1
Mississauga          1
Name: Borough, dtype: int64


<b>In the last cell of the notebook, use the .shape attribute to print the number of rows in the dataframe:</b>

In [6]:
# df.shape is a (rows, columns) tuple; element 0 is the number of rows
print(df.shape[0])

103


## Get Latitude and Longitude Coordinates

In [7]:
# Latitude/longitude per postal code, published as a CSV on the course website
url = 'https://cocl.us/Geospatial_data'
lat_lng_coords = pd.read_csv(url)
print(f'Data downloaded from: {url}')

lat_lng_coords.head()

Data downloaded from: https://cocl.us/Geospatial_data


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [8]:
# Attach coordinates to every postcode. validate='1:1' makes pandas raise if a
# key is duplicated on either side; indicator=True adds the '_merge' audit column.
toronto_df = df.merge(
    lat_lng_coords,
    how='left',
    left_on='Postcode',
    right_on='Postal Code',
    validate='1:1',
    indicator=True,
)

# Every row should report 'both', i.e. each postcode found its coordinates
print('Merge results:')
print(toronto_df['_merge'].value_counts())

# Drop the helper columns and switch to the column names the assignment expects
toronto_df = (
    toronto_df
    .drop(columns=['Postal Code', '_merge'])
    .rename(columns={'Postcode': 'PostalCode', 'Neighbourhood': 'Neighborhood'})
)

Merge results:
both          103
right_only      0
left_only       0
Name: _merge, dtype: int64


<b>Display the first 15 lines of the dataframe.</b>

In [9]:
# First 15 rows of the merged postcode / coordinates frame
toronto_df.iloc[:15]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


# Explore and Cluster Toronto Neighborhoods

In [10]:
# Distinct boroughs and neighborhoods (missing values, if any, count as one value)
print(f"""The dataframe has {toronto_df['Borough'].nunique(dropna=False):,} boroughs and {toronto_df['Neighborhood'].nunique(dropna=False):,} neighborhoods.""")

The dataframe has 11 boroughs and 103 neighborhoods.


Let's get the geographical coordinates of Toronto.

In [11]:
# Look up the city centre; `latitude` / `longitude` are used to centre the maps below
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when nothing matches; fail with a readable message
    # instead of an AttributeError on the next line.
    raise ValueError(f'Nominatim returned no result for {address!r}')

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


As we did in the New York City lab, let's visualize Toronto and mark the neighborhoods on a map.

In [12]:
# Base map centred on the geocoded Toronto coordinates
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One blue circle per row; its popup reads "<neighborhood>, <borough>"
for lat, lng, borough, neighborhood in toronto_df[
    ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
].itertuples(index=False, name=None):
    label = folium.Popup(f"{neighborhood}, {borough}", parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

# Last expression of the cell, so the notebook renders the map
map_toronto

In [13]:
# Number of postal-code rows per borough, largest first
toronto_df.loc[:, 'Borough'].value_counts()

North York          24
Downtown Toronto    18
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
York                 5
East Toronto         5
East York            5
Queen's Park         1
Mississauga          1
Name: Borough, dtype: int64

<b>For simplicity, let's subset to downtown Toronto</b>

In [14]:
# Keep only the Downtown Toronto rows and renumber them from 0.
# Built as one chained expression (no inplace mutation of a filtered slice); the
# former mid-cell `toronto_df.head()` was removed because a non-final expression
# is never displayed by the notebook.
toronto_df = (
    toronto_df
    .loc[toronto_df['Borough'].isin(['Downtown Toronto']), :]
    .reset_index(drop=True)
)
print(f"""Now the dataframe has {len(toronto_df['Borough'].unique()):,} boroughs and {len(toronto_df['Neighborhood'].unique()):,} neighborhoods.""")

Now the dataframe has 1 boroughs and 18 neighborhoods.


In [15]:
# Base map centred on the geocoded Toronto coordinates, zoomed in further than before
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# One blue circle per remaining row; its popup reads "<neighborhood>, <borough>"
for lat, lng, borough, neighborhood in toronto_df[
    ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
].itertuples(index=False, name=None):
    label = folium.Popup(f"{neighborhood}, {borough}", parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

# Last expression of the cell, so the notebook renders the map
map_toronto

## Let's follow the lab and have fun with the Foursquare API.

<b>Define Foursquare Credentials and Version</b>

In [16]:
# @hidden_cell
import os  # local import: only this cell reads the environment

# Credentials are read from the environment so they are never saved with the
# notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel.
# NOTE(review): the key pair that used to be hard-coded here has been exposed
# wherever this notebook was shared and should be revoked/rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180604' # Foursquare API version
LIMIT = 100 # value sent as the `limit` query parameter of each venue request

if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will be rejected.')

Borrow and tweak the getNearbyVenues function from the Exploring NY lab.

In [17]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Request venues around each input location from the Foursquare "explore" endpoint.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences describing the locations to query; they are walked
        together with ``zip``, so iteration stops at the shortest one.
    radius : int, default 500
        Value forwarded as the API's ``radius`` query parameter
        (presumably metres - confirm against the Foursquare documentation).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the neighborhood name and coordinates,
        the venue name, id, distance, coordinates and a broad category.
        The columns are present even when no venue was found.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an HTTP error status (e.g. bad credentials).

    Notes
    -----
    Reads the notebook-level constants CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    columns = [
        'Neighborhood',
        'Neighborhood Latitude',
        'Neighborhood Longitude',
        'Venue',
        'Venue ID',
        'Distance from Neighborhood',
        'Venue Latitude',
        'Venue Longitude',
        'Venue Category',
    ]

    records = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(f'Getting venues near {name}...')

        # Let requests build and escape the query string instead of formatting
        # the credentials into the URL by hand.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': f'{lat},{lng}',
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,  # never hang the notebook on a stalled connection
        )
        # Surface quota/credential problems here rather than as a KeyError below.
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        results = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            categories = venue.get('categories') or []
            # The broad category bin is the second-to-last path segment of the
            # icon prefix; venues without any category get None.
            category = categories[0]['icon']['prefix'].split('/')[-2] if categories else None
            records.append({
                'Neighborhood': name,
                'Neighborhood Latitude': lat,
                'Neighborhood Longitude': lng,
                'Venue': venue['name'],
                'Venue ID': venue['id'],
                'Distance from Neighborhood': venue['location']['distance'],
                'Venue Latitude': venue['location']['lat'],
                'Venue Longitude': venue['location']['lng'],
                'Venue Category': category,
            })

        print(f'    found {len(results):,} results')

    # Build the frame once from the list of row dictionaries
    nearby_venues = pd.DataFrame(records, columns=columns)

    print('---------------------------------------------------------')
    print(f'{nearby_venues.shape[0]:,} total venues collected.')

    return nearby_venues


<b>Get a dataframe with all of the venues in our chosen neighborhoods</b>

In [18]:
# Query Foursquare once per downtown neighborhood (default 500 search radius)
toronto_venues = getNearbyVenues(
    toronto_df['Neighborhood'],
    toronto_df['Latitude'],
    toronto_df['Longitude'],
)

Getting venues near Rosedale...
    found 4 results
Getting venues near Cabbagetown, St. James Town...
    found 47 results
Getting venues near Church and Wellesley...
    found 87 results
Getting venues near Harbourfront, Regent Park...
    found 47 results
Getting venues near Ryerson, Garden District...
    found 100 results
Getting venues near St. James Town...
    found 100 results
Getting venues near Berczy Park...
    found 55 results
Getting venues near Central Bay Street...
    found 88 results
Getting venues near Adelaide, King, Richmond...
    found 100 results
Getting venues near Harbourfront East, Toronto Islands, Union Station...
    found 100 results
Getting venues near Design Exchange, Toronto Dominion Centre...
    found 100 results
Getting venues near Commerce Court, Victoria Hotel...
    found 100 results
Getting venues near Harbord, University of Toronto...
    found 34 results
Getting venues near Chinatown, Grange Park, Kensington Market...
    found 100 results
Get

<b>De-duplicate: make sure each venue is assigned only to its closest neighborhood</b>


In [20]:
# For every venue id keep only the row with the smallest distance, i.e. the
# row belonging to the closest neighborhood
toronto_venues = toronto_venues.loc[
    toronto_venues
    .groupby('Venue ID')['Distance from Neighborhood']
    .idxmin()
]


<b>Group the results and apply one-hot encoding to prepare a neighborhood-level dataframe for k-means clustering</b>

In [21]:
# One indicator column per venue category (empty prefix keeps the bare category names)
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Put the neighborhood back so the indicators can be grouped by it
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']

# Share of each category among the venues of a neighborhood (index = Neighborhood)
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean()

# Venue count relative to the neighborhood with the most venues
venue_counts = toronto_venues.groupby('Neighborhood').size()
toronto_grouped['n_scaled'] = venue_counts / venue_counts.max()

toronto_grouped.head()

## Cluster Neighborhoods

Run k-means to cluster Toronto neighborhoods into 5 clusters

In [55]:
# set number of clusters
kclusters = 5

# Fit on the feature columns only. If this cell is re-run, 'Cluster Label' from
# the previous run is already in the frame and must not be used as a feature.
cluster_features = toronto_grouped.drop(columns='Cluster Label', errors='ignore')

# run k-means clustering; n_init is pinned because its default differs between
# scikit-learn releases, which would change the labels for the same seed
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(cluster_features)

# check cluster labels generated for each row in the dataframe
print(kmeans.labels_)

# add cluster label to dataframe
toronto_grouped['Cluster Label'] = kmeans.labels_

[0 2 4 0 3 3 2 3 0 0 0 0 3 0 1 3 0 0]


In [64]:
# Attach the per-neighborhood features and cluster label to the postcode rows;
# the right-hand frame is keyed by its Neighborhood index
toronto_merged = toronto_df.merge(
    toronto_grouped,
    how='left',
    left_on='Neighborhood',
    right_index=True,
)
print(toronto_merged.head())

  PostalCode           Borough                 Neighborhood   Latitude  \
0        M4W  Downtown Toronto                     Rosedale  43.679563   
1        M4X  Downtown Toronto  Cabbagetown, St. James Town  43.667967   
2        M4Y  Downtown Toronto         Church and Wellesley  43.665860   
3        M5A  Downtown Toronto    Harbourfront, Regent Park  43.654260   
4        M5B  Downtown Toronto     Ryerson, Garden District  43.657162   

   Longitude  arts_entertainment  building  education      food  nightlife  \
0 -79.377529            0.000000  0.000000   0.000000  0.000000   0.000000   
1 -79.367675            0.021277  0.000000   0.000000  0.574468   0.063830   
2 -79.383160            0.034483  0.022989   0.000000  0.620690   0.114943   
3 -79.360636            0.106383  0.042553   0.000000  0.489362   0.085106   
4 -79.378937            0.045977  0.022989   0.011494  0.528736   0.045977   

   parks_outdoors     shops    travel  n_scaled  Cluster Label  
0        1.000000  0.

## Let's look at the clusters and see if we can describe them

In [80]:
# Neighborhoods assigned to cluster 0 with their scaled venue count and category shares
toronto_merged.loc[
    toronto_merged['Cluster Label'].eq(0),
    ['Neighborhood', 'n_scaled', 'arts_entertainment', 'building', 'education',
     'food', 'nightlife', 'parks_outdoors', 'shops', 'travel'],
]

Unnamed: 0,Neighborhood,n_scaled,arts_entertainment,building,education,food,nightlife,parks_outdoors,shops,travel
1,"Cabbagetown, St. James Town",0.47,0.021277,0.0,0.0,0.574468,0.06383,0.12766,0.212766,0.0
3,"Harbourfront, Regent Park",0.47,0.106383,0.042553,0.0,0.489362,0.085106,0.06383,0.191489,0.021277
5,St. James Town,0.5,0.08,0.06,0.0,0.62,0.02,0.02,0.14,0.06
8,"Adelaide, King, Richmond",0.57,0.070175,0.017544,0.0,0.578947,0.105263,0.035088,0.140351,0.052632
10,"Design Exchange, Toronto Dominion Centre",0.3,0.033333,0.0,0.0,0.7,0.1,0.0,0.033333,0.133333
11,"Commerce Court, Victoria Hotel",0.35,0.028571,0.028571,0.0,0.685714,0.085714,0.0,0.085714,0.085714
12,"Harbord, University of Toronto",0.34,0.058824,0.058824,0.0,0.617647,0.176471,0.0,0.088235,0.0
15,Stn A PO Boxes 25 The Esplanade,0.48,0.0625,0.020833,0.0,0.5625,0.125,0.041667,0.1875,0.0
16,"First Canadian Place, Underground city",0.24,0.041667,0.041667,0.0,0.875,0.041667,0.0,0.0,0.0


In [73]:
# Column means over the neighborhoods of cluster 0
toronto_merged.loc[
    toronto_merged['Cluster Label'].eq(0),
    ['n_scaled', 'arts_entertainment', 'building', 'education',
     'food', 'nightlife', 'parks_outdoors', 'shops', 'travel'],
].mean()

n_scaled              0.413333
arts_entertainment    0.055859
building              0.029999
education             0.000000
food                  0.633738
nightlife             0.089228
parks_outdoors        0.032027
shops                 0.119932
travel                0.039217
dtype: float64

Cluster 0 is made up of medium-sized neighborhoods that have a lot of restaurants

In [81]:
# Neighborhoods assigned to cluster 1 with their scaled venue count and category shares
toronto_merged.loc[
    toronto_merged['Cluster Label'].eq(1),
    ['Neighborhood', 'n_scaled', 'arts_entertainment', 'building', 'education',
     'food', 'nightlife', 'parks_outdoors', 'shops', 'travel'],
]

Unnamed: 0,Neighborhood,n_scaled,arts_entertainment,building,education,food,nightlife,parks_outdoors,shops,travel
0,Rosedale,0.04,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


Cluster 1 is a single neighborhood made up solely of a few parks/outdoors venues.

In [79]:
# Neighborhoods assigned to cluster 2 with their scaled venue count and category shares
toronto_merged.loc[
    toronto_merged['Cluster Label'].eq(2),
    ['Neighborhood', 'n_scaled', 'arts_entertainment', 'building', 'education',
     'food', 'nightlife', 'parks_outdoors', 'shops', 'travel'],
]

Unnamed: 0,Neighborhood,n_scaled,arts_entertainment,building,education,food,nightlife,parks_outdoors,shops,travel
6,Berczy Park,0.06,0.0,0.0,0.0,0.333333,0.166667,0.166667,0.166667,0.166667
17,Christie,0.16,0.0,0.0,0.0,0.4375,0.0625,0.125,0.375,0.0


In [75]:
# Column means over the neighborhoods of cluster 2
toronto_merged.loc[
    toronto_merged['Cluster Label'].eq(2),
    ['n_scaled', 'arts_entertainment', 'building', 'education',
     'food', 'nightlife', 'parks_outdoors', 'shops', 'travel'],
].mean()

n_scaled              0.110000
arts_entertainment    0.000000
building              0.000000
education             0.000000
food                  0.385417
nightlife             0.114583
parks_outdoors        0.145833
shops                 0.270833
travel                0.083333
dtype: float64

Cluster 2 is made up of small neighborhoods with just a few venues, mostly restaurants and shops.

In [82]:
# Neighborhoods assigned to cluster 3 with their scaled venue count and category shares
toronto_merged.loc[
    toronto_merged['Cluster Label'].eq(3),
    ['Neighborhood', 'n_scaled', 'arts_entertainment', 'building', 'education',
     'food', 'nightlife', 'parks_outdoors', 'shops', 'travel'],
]

Unnamed: 0,Neighborhood,n_scaled,arts_entertainment,building,education,food,nightlife,parks_outdoors,shops,travel
2,Church and Wellesley,0.87,0.034483,0.022989,0.0,0.62069,0.114943,0.034483,0.149425,0.022989
4,"Ryerson, Garden District",0.87,0.045977,0.022989,0.011494,0.528736,0.045977,0.034483,0.298851,0.011494
7,Central Bay Street,0.7,0.014286,0.028571,0.0,0.828571,0.028571,0.014286,0.085714,0.0
9,"Harbourfront East, Toronto Islands, Union Station",0.89,0.179775,0.033708,0.0,0.505618,0.05618,0.101124,0.067416,0.05618
13,"Chinatown, Grange Park, Kensington Market",1.0,0.01,0.01,0.0,0.68,0.07,0.01,0.2,0.02


In [83]:
# Column means over the neighborhoods of cluster 3.
# Only numeric columns are selected: the text column 'Neighborhood' cannot be
# averaged (recent pandas raises a TypeError), and the cluster 0 and cluster 2
# mean cells leave it out as well.
toronto_merged.loc[toronto_merged['Cluster Label']==3,['n_scaled','arts_entertainment','building','education','food','nightlife','parks_outdoors','shops','travel']].mean()

n_scaled              0.866000
arts_entertainment    0.056904
building              0.023651
education             0.002299
food                  0.632723
nightlife             0.063134
parks_outdoors        0.038875
shops                 0.160281
travel                0.022133
dtype: float64

Cluster 3 is made up of large neighborhoods with lots of restaurants and shops

In [84]:
# Neighborhoods assigned to cluster 4 with their scaled venue count and category shares
toronto_merged.loc[
    toronto_merged['Cluster Label'].eq(4),
    ['Neighborhood', 'n_scaled', 'arts_entertainment', 'building', 'education',
     'food', 'nightlife', 'parks_outdoors', 'shops', 'travel'],
]

Unnamed: 0,Neighborhood,n_scaled,arts_entertainment,building,education,food,nightlife,parks_outdoors,shops,travel
14,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.15,0.0,0.0,0.0,0.0,0.133333,0.133333,0.066667,0.666667


Cluster 4 is just one neighborhood - looks like we successfully identified the airport!

In [87]:
# Human-readable name for each k-means label; the list position is the label.
# NOTE(review): the names describe the labels of the recorded run - re-check
# them if the clustering cell produces a different labelling.
cluster_descriptions = dict(enumerate([
    'Medium neighborhood',
    'Parks and outdoors',
    'Small neighborhood',
    'Large neighborhood',
    'Airport',
]))
toronto_merged['Cluster Description'] = toronto_merged['Cluster Label'].map(cluster_descriptions)

In [92]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# one evenly spaced rainbow colour per cluster label (hex strings, index = label)
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add markers to the map
for lat, lon, poi, cluster, cluster_desc in zip(
        toronto_merged['Latitude'],
        toronto_merged['Longitude'],
        toronto_merged['PostalCode'],
        toronto_merged['Cluster Label'],
        toronto_merged['Cluster Description']):
    if pd.isna(cluster):
        # A row without a cluster label has no colour to draw with.
        continue

    # Labels are 0-based, so they index the palette directly. int() also covers
    # labels stored as floats (a column that holds NaN is upcast to float).
    cluster_color = rainbow[int(cluster)]

    label = folium.Popup(str(poi) + '\n' + str(cluster_desc), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters