# Toronto Neighborhood Clustering
---

In [1]:
import pandas as pd 
from bs4 import BeautifulSoup as bs4
import requests

### Data source from wikipedia:
[List of postal codes of Canada: M](https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M)

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
r = requests.get(url)
soup = bs4(r.text)
table = soup.find_all('table')
df_list = pd.read_html(str(table))

In [3]:
len(df_list)

4

In [4]:
df = df_list[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Data Preprocessing
---

### Remove data where boroughs are not assigned

In [5]:
df = df[df['Borough'] != 'Not assigned']
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### If Neighbourhood is not assigned a name, use borough name

In [6]:
df.apply(lambda row: row['Borough'] if (row['Neighbourhood'] == 'Not Assigned') else row['Neighbourhood'], axis=1)
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Merge cells with similar postal codes by joining neighbourhoods

In [7]:
by_neighbourhoods = df.groupby(['Postal Code'])['Neighbourhood'].apply(', '.join)

### Merge and drop duplicates

In [8]:
df.merge(by_neighbourhoods, on='Postal Code')
df.drop_duplicates()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [9]:
df.shape

(103, 3)

## Get postal code location data
---

In [10]:
!pip install geocoder



In [11]:
import geocoder

In [12]:
def get_coord(postal_code):
    g = geocoder.arcgis('{}, Toronto, Ontario coordinates'.format(postal_code))
    loc = g.latlng
    return loc[0], loc[1]

In [13]:
df = df.merge(df.apply(lambda row: pd.Series(get_coord(row['Postal Code'])), axis=1), left_index=True, right_index=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,0,1
2,M3A,North York,Parkwoods,43.75245,-79.32991
3,M4A,North York,Victoria Village,43.73057,-79.31306
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188


In [14]:
df.rename({0: 'Latitude', 1: 'Longitude'}, axis=1, inplace=True)
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.75245,-79.32991
3,M4A,North York,Victoria Village,43.73057,-79.31306
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.66263,-79.52831
9,M1B,Scarborough,"Malvern, Rouge",43.81139,-79.19662
11,M3B,North York,Don Mills,43.74923,-79.36186
12,M4B,East York,"Parkview Hill, Woodbine Gardens",43.70718,-79.31192
13,M5B,Downtown Toronto,"Garden District, Ryerson",43.65739,-79.37804


## Filter for the Toronto area and visualize
---

In [15]:
df_toronto = df[df['Borough'].str.contains('Toronto')].reset_index(drop=True)
print(df_toronto['Borough'].unique())
df_toronto.head()

['Downtown Toronto' 'East Toronto' 'West Toronto' 'Central Toronto']


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.65739,-79.37804
3,M5C,Downtown Toronto,St. James Town,43.65215,-79.37587
4,M4E,East Toronto,The Beaches,43.67709,-79.29547


In [16]:
!pip install folium



In [17]:
import folium

In [18]:
toronto_location = geocoder.arcgis('Toronto, CA').latlng
map_toronto = folium.Map(location=toronto_location, zoom_start=12)

# add markers to map
for borough, neighbourhood, lat, lng in zip(df_toronto['Borough'], df['Neighbourhood'], df_toronto['Latitude'], df_toronto['Longitude']):
    label = folium.Popup("{0}: {1}".format(borough, neighbourhood), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
map_toronto

## Cluster neighborhoods

In [19]:
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

In [20]:
k = 4
kmeans = KMeans(n_clusters=k, random_state=0)
kmeans.fit(df_toronto[['Latitude', 'Longitude']])
kmeans.labels_[0:10]

array([2, 2, 2, 2, 0, 2, 2, 3, 2, 3], dtype=int32)

In [21]:
toronto_clustered = df_toronto
toronto_clustered.insert(0, 'Cluster', kmeans.labels_)
toronto_clustered.head()

Unnamed: 0,Cluster,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
1,2,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188
2,2,M5B,Downtown Toronto,"Garden District, Ryerson",43.65739,-79.37804
3,2,M5C,Downtown Toronto,St. James Town,43.65215,-79.37587
4,0,M4E,East Toronto,The Beaches,43.67709,-79.29547


In [22]:
# create map
map_clusters = folium.Map(location=toronto_location, zoom_start=12)

# set color scheme for the clusters
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for row in toronto_clustered.itertuples():
    label = folium.Popup('Cluster {0}, {1} borough.\nNeighbourhood(s): {2}'.format(row.Cluster, row.Borough, row.Neighbourhood))
    folium.CircleMarker(
        [row.Latitude, row.Longitude],
        radius=5,
        popup=label,
        color=rainbow[row.Cluster-1],
        fill=True,
        fill_color=rainbow[row.Cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

With the exception of the neighbourhoods of Christie and Rosedale, the KMeans algorithm is able to separate the Toronto boroughs via location data only.