# Segmenting and Clustering Neighborhoods in Toronto

## 1 - Scraping the Wikipedia table

In [87]:
import pandas as pd
import numpy as np

! pip install beautifulsoup4
from bs4 import BeautifulSoup

import requests
import json 
from pandas.io.json import json_normalize 

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans



### Set up Dataframe

In [88]:
postalCodes = pd.DataFrame(columns = ['Postal Code', 'Borough', 'Neighborhood'])

print(postalCodes)

Empty DataFrame
Columns: [Postal Code, Borough, Neighborhood]
Index: []


### Get data from URL, chop into table cells, extract data

In [89]:
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

res = requests.get(URL).text
soup = BeautifulSoup(res,'html.parser')

for items in soup.find('table', class_='wikitable').find_all('tr')[1::1]:
    data = items.find_all(['th','td'])
    try:
        postal_code = data[0].text.replace('\n', '')
        borough = data[1].text.replace('\n', '')
        neighborhood = data[1].find_next_sibling().text.replace('\n', '')
    except IndexError:pass
    #print("{} - {} - {}".format(postal_code, burough, neighborhood))
    postalCodes = postalCodes.append({'Postal Code' : postal_code, 'Borough' : borough, 'Neighborhood' : neighborhood}, ignore_index=True)

postalCodes.head(12)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [90]:
postalCodes.shape

(180, 3)

### Clean up 'Not Assigned' postal codes

In [91]:
postalCodes.drop(postalCodes[postalCodes['Borough'] == 'Not assigned'].index, inplace = True)
postalCodes.reset_index(inplace = True, drop = True)

postalCodes.shape

(103, 3)

## 2 - Finding and consolidating geospatial data

### Read coordinates from csv file

In [92]:
torontoCoords = pd.read_csv('https://cocl.us/Geospatial_data')
torontoCoords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merging coordinates dataframe and neighborhoods dataframe, based on postal code

In [93]:
torontoFull = postalCodes.merge(torontoCoords, on="Postal Code", how="left")
torontoFull.head(12)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


## 3 - Exploring neighborhoods

### Set up Foursquare API + credentials

In [94]:
!pip install geopy
from geopy.geocoders import Nominatim 

CLIENT_ID = '4FQ5LPXDTUKPCS5EMVRID3M5LUEE2UD1POWUCJ25PMZJVXUG' # your Foursquare ID
CLIENT_SECRET = 'CIMCNJJPESOSEBN4STKISR4YHAWVJTO4Y30IZ53VYSTSJECW' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30



### Look up the coordinates of Toronto

In [95]:
address = 'Toronto'
geolocator = Nominatim(user_agent=CLIENT_ID)
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The coordinates of Toronto are 43.6534817, -79.3839347.


In [96]:
#!pip install folium==0.11.0
import folium # plotting library

### Create visual map of neighborhoods

In [97]:
torontoMap = folium.Map(location = [latitude, longitude], zoom_start = 10)

for lat, long, borough, neighborhood in zip(torontoFull['Latitude'], torontoFull['Longitude'], torontoFull['Borough'], torontoFull['Neighborhood']):
    label = '{},{}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    label = folium.CircleMarker(
        [lat, long], 
        radius = 5, 
        popup = label, 
        color = 'blue', 
        fill = True,
        fill_color = 'gray',
        fill_opacity = 0.7,
        parse_html = False).add_to(torontoMap)
    
torontoMap

### Filtering boroughs that contain the word 'Toronto'

In [98]:
boroughs = list(torontoFull.Borough.unique())
torontoBoro = []
for x in boroughs:
    if "toronto" in x.lower():
        torontoBoro.append(x)
torontoBoro

['Downtown Toronto', 'East Toronto', 'West Toronto', 'Central Toronto']

In [99]:
torontoFull = torontoFull[torontoFull['Borough'].isin(torontoBoro)].reset_index(drop=True)
torontoFull.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


### Reading in top 100 Toronto Venues within 600 meters

In [100]:
radius = 600
limit = 100
venues = []

for lat, long, post, borough, neighborhood in zip(torontoFull['Latitude'], torontoFull['Longitude'], torontoFull['Postal Code'], torontoFull['Borough'], torontoFull['Neighborhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        limit)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    for venue in results:
        venues.append((
            post, 
            borough,
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))
        
venues = pd.DataFrame(venues)
venues.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Corktown Common,43.655618,-79.356211,Park


In [101]:
venues.columns = ['PostalCode', 'Borough', 'Neighborhood', 'Boro_Lat', 'Boro_Long', 'Ven_Name', 'Ven_Lat', 'Ven_Long', 'Ven_Category']
venues.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Boro_Lat,Boro_Long,Ven_Name,Ven_Lat,Ven_Long,Ven_Category
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Corktown Common,43.655618,-79.356211,Park


In [102]:
venues.shape

(2094, 9)

In [103]:
venues.groupby(["PostalCode", "Borough", "Neighborhood"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Boro_Lat,Boro_Long,Ven_Name,Ven_Lat,Ven_Long,Ven_Category
PostalCode,Borough,Neighborhood,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
M4E,East Toronto,The Beaches,21,21,21,21,21,21
M4K,East Toronto,"The Danforth West, Riverdale",67,67,67,67,67,67
M4L,East Toronto,"India Bazaar, The Beaches West",29,29,29,29,29,29
M4M,East Toronto,Studio District,66,66,66,66,66,66
M4N,Central Toronto,Lawrence Park,3,3,3,3,3,3
M4P,Central Toronto,Davisville North,14,14,14,14,14,14
M4R,Central Toronto,"North Toronto West, Lawrence Park",29,29,29,29,29,29
M4S,Central Toronto,Davisville,44,44,44,44,44,44
M4T,Central Toronto,"Moore Park, Summerhill East",5,5,5,5,5,5
M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park",50,50,50,50,50,50


In [104]:
print('There are {} uniques categories.'.format(len(venues['Ven_Category'].unique())))

There are 252 uniques categories.


In [105]:
torontoEncoding = pd.get_dummies(venues[['Ven_Category']], prefix="", prefix_sep="")

torontoEncoding['Neighborhood'] = venues['Neighborhood'] 

fixed_columns = [torontoEncoding .columns[-1]] + list(torontoEncoding.columns[:-1])
torontoEncoding = torontoEncoding[fixed_columns]

print(torontoEncoding.shape)
torontoEncoding.head()

(2094, 252)


Unnamed: 0,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Amphitheater,...,Train Station,Tram Station,Udon Restaurant,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [106]:
torontoEngrouped = torontoEncoding.groupby('Neighborhood').mean().reset_index()
torontoEngrouped

Unnamed: 0,Neighborhood,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Tram Station,Udon Restaurant,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Women's Store
0,Berczy Park,0.011236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011236,...,0.0,0.0,0.0,0.0,0.022472,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.058824,0.058824,0.058824,0.117647,0.176471,0.117647,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0
5,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.02,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0
7,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,...,0.01,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.01,0.0
8,Davisville,0.022727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Frequency of top venues in Toronto, grouped by neighborhood

In [107]:
num_top_venues = 3

for hood in torontoEngrouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_engrouped[torontoEngrouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
                 venue  freq
0          Coffee Shop  0.10
1  Japanese Restaurant  0.03
2                Hotel  0.03


----Brockton, Parkdale Village, Exhibition Place----
         venue  freq
0         Café  0.08
1  Coffee Shop  0.08
2       Bakery  0.05


----Business reply mail Processing Centre, South Central Letter Processing Plant Toronto----
                venue  freq
0       Burrito Place  0.09
1  Light Rail Station  0.09
2             Brewery  0.05


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
             venue  freq
0  Airport Service  0.18
1      Coffee Shop  0.12
2   Airport Lounge  0.12


----Central Bay Street----
             venue  freq
0      Coffee Shop  0.18
1             Café  0.07
2  Bubble Tea Shop  0.04


----Christie----
           venue  freq
0  Grocery Store  0.24
1           Café  0.18
2           Park  0.12


----Church and Wellesley----
                 venue  freq
0

In [108]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [109]:
num_top_venues = 3

indicators = ['st', 'nd', 'rd']

columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

venuesSorted = pd.DataFrame(columns=columns)
venuesSorted['Neighborhood'] = torontoEngrouped['Neighborhood']

for ind in np.arange(torontoEngrouped.shape[0]):
    venuesSorted.iloc[ind, 1:] = return_most_common_venues(torontoEngrouped.iloc[ind, :], num_top_venues)
venuesSorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,Berczy Park,Coffee Shop,Pub,Seafood Restaurant
1,"Brockton, Parkdale Village, Exhibition Place",Café,Coffee Shop,Gift Shop
2,"Business reply mail Processing Centre, South C...",Burrito Place,Light Rail Station,Pizza Place
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Coffee Shop,Airport Terminal
4,Central Bay Street,Coffee Shop,Café,Bubble Tea Shop


### Clustering Neighborhoods

In [110]:
kclusters = 5

torontoClustering = toronto_engrouped.drop('Neighborhood', 1)

#k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(torontoClustering)

kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [111]:
# clustering labels
venuesSorted.insert(0, 'Cluster Labels', kmeans.labels_)

torontoMerged = torontoFull 

torontoMerged = torontoMerged.join(venuesSorted.set_index('Neighborhood'), on='Neighborhood')

torontoMerged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Coffee Shop,Theater,Bakery
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0,Coffee Shop,Sandwich Place,Bookstore
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Coffee Shop,Clothing Store,Bubble Tea Shop
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Coffee Shop,Café,Clothing Store
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Pub,Women's Store,Health Food Store


### Create map and add markers


In [112]:
clusteredMap = folium.Map(location=[latitude, longitude], zoom_start=10)

x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, poi, cluster in zip(torontoMerged['Latitude'], torontoMerged['Longitude'], torontoMerged['Neighborhood'], torontoMerged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(clusteredMap)
       
clusteredMap

### Examining Clusters

In [113]:
torontoMerged.loc[torontoMerged['Cluster Labels'] == 0, torontoMerged.columns[[1] + list(range(5, torontoMerged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,Downtown Toronto,0,Coffee Shop,Theater,Bakery
1,Downtown Toronto,0,Coffee Shop,Sandwich Place,Bookstore
2,Downtown Toronto,0,Coffee Shop,Clothing Store,Bubble Tea Shop
3,Downtown Toronto,0,Coffee Shop,Café,Clothing Store
4,East Toronto,0,Pub,Women's Store,Health Food Store
5,Downtown Toronto,0,Coffee Shop,Pub,Seafood Restaurant
6,Downtown Toronto,0,Coffee Shop,Café,Bubble Tea Shop
7,Downtown Toronto,0,Grocery Store,Café,Park
8,Downtown Toronto,0,Coffee Shop,Café,Gym
9,West Toronto,0,Park,Bakery,Pharmacy


In [114]:
torontoMerged.loc[torontoMerged['Cluster Labels'] == 1, torontoMerged.columns[[1] + list(range(5, torontoMerged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
19,Central Toronto,1,Music Venue,Garden,Spa


In [115]:
torontoMerged.loc[torontoMerged['Cluster Labels'] == 2, torontoMerged.columns[[1] + list(range(5, torontoMerged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
29,Central Toronto,2,Park,Gym,Tennis Court
33,Downtown Toronto,2,Park,Playground,Trail


In [116]:
torontoMerged.loc[torontoMerged['Cluster Labels'] == 3, torontoMerged.columns[[1] + list(range(5, torontoMerged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
21,Central Toronto,3,Trail,Park,Jewelry Store


In [117]:
torontoMerged.loc[torontoMerged['Cluster Labels'] == 4, torontoMerged.columns[[1] + list(range(5, torontoMerged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
18,Central Toronto,4,Park,Swim School,Bus Line
