# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
# Uncomment the next line if lxml is not installed (pandas.read_html uses it to parse HTML).
# %pip install lxml

Import the relevant libraries

In [2]:
import pandas as pd
import lxml

In [3]:
# Parse every HTML table found on the Wikipedia page for postal codes starting
# with "M"; read_html returns them as a list of DataFrames.
df1 = pd.read_html(
    io="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
)
# The first table on the page is used as the working frame.
df = df1[0]

Cells without boroughs have been ignored.

Unassigned neighborhoods have been assigned the borough name.

In [4]:
# Drop postal codes that have no borough. The explicit .copy() makes `df` an
# independent frame, so the assignment below modifies it directly rather than
# writing into a temporary slice of the scraped table.
df = df[df["Borough"] != "Not assigned"].copy()

# Where the neighborhood is "Not assigned", reuse the borough name. A single
# .loc assignment replaces the row-by-row chained `df.Neighborhood.iloc[i] = ...`
# writes, which trigger SettingWithCopyWarning and are not guaranteed to
# reach `df`.
unassigned = df["Neighborhood"] == "Not assigned"
df.loc[unassigned, "Neighborhood"] = df.loc[unassigned, "Borough"]


In [5]:
# Display the cleaned table as this cell's output.
df

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [6]:
# (rows, columns) of the cleaned table.
df.shape

(103, 3)

Retrieve coordinates and merge latitude and longitude into dataframe

In [7]:
# Load the coordinates CSV; a later cell merges it into df on "Postal Code".
df_coords = pd.read_csv("http://cocl.us/Geospatial_data")

In [8]:
# (rows, columns) of the coordinates table, for comparison with df.shape.
df_coords.shape

(103, 3)

In [9]:
# Join the coordinates onto the postal-code table using the shared
# "Postal Code" column (inner join, the pandas default).
df = pd.merge(left=df, right=df_coords, how="inner", on="Postal Code")
df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


Import libraries for clustering and plotting.

In [10]:
import numpy as np
# Show every column and every row when a DataFrame is displayed (no truncation).
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import json

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim

import requests
# json_normalize is exposed at the top level of pandas since 1.0; the old
# pandas.io.json path is deprecated, so it is only used as a fallback for
# older installations.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium

print("Libraries imported")



Libraries imported


In [11]:
# Report the number of distinct boroughs and the number of rows in df
# (the message labels the row count as neighborhoods).
borough_count = df['Borough'].nunique(dropna=False)
row_count = len(df)
print('The dataframe has {} boroughs and {} neighborhoods.'.format(borough_count, row_count))

The dataframe has 10 boroughs and 103 neighborhoods.


Get coordinates for Toronto and create map.

In [12]:
address = "Toronto, ON"

# Look up the city's coordinates through the Nominatim geocoding service.
geolocator = Nominatim(user_agent = "on_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on `location.latitude`.
if location is None:
    raise ValueError("Could not geocode address: {!r}".format(address))
latitude = location.latitude
longitude = location.longitude
print("The geographic coordinate of Toronto are {}, {}." .format(latitude, longitude))

The geographic coordinate of Toronto are 43.6534817, -79.3839347.


In [13]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Draw one blue circle per row of df, with a "neighborhood, borough" popup.
marker_columns = ["Latitude", "Longitude", "Borough", "Neighborhood"]
for lat, lng, borough, neighborhood in df[marker_columns].itertuples(index=False, name=None):
    label = folium.Popup("{}, {}".format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=label,
        color="blue",
        fill=True,
        fill_color="#3186cc",
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

# Last expression: render the map as the cell output.
map_toronto

Focus on just the neighborhoods whose borough name contains the word "Toronto".

In [14]:
# Keep only the rows whose borough name contains "Toronto", then renumber
# the index from zero.
in_toronto = df["Borough"].str.contains("Toronto")
toronto_data = df.loc[in_toronto].reset_index(drop=True)
toronto_data.head()


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


Create new map

In [15]:
address = "Toronto, ON"

# Geocode the same address as the earlier lookup cell (different user_agent)
# so this map section has its own latitude/longitude.
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on `location.latitude`.
if location is None:
    raise ValueError("Could not geocode address: {!r}".format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [16]:
# Fresh map, slightly more zoomed in, for the Toronto-only subset.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Walk the four columns in parallel and drop one circle marker per row.
marker_fields = ("Latitude", "Longitude", "Borough", "Neighborhood")
for lat, lng, borough, neighborhood in zip(*(toronto_data[field] for field in marker_fields)):
    label = "{}, {}".format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    circle_options = dict(
        radius=5,
        popup=label,
        color="blue",
        fill=True,
        fill_color="#3186cc",
        fill_opacity=0.7,
        parse_html=False,
    )
    folium.CircleMarker([lat, lng], **circle_options).add_to(map_toronto)

# Last expression: render the map as the cell output.
map_toronto

For neighborhood names that appear in more than one row, use the average latitude and longitude of the group.

In [17]:
# Average the coordinates of rows that share a Neighborhood value. Only the
# numeric Latitude/Longitude columns are aggregated: calling .mean() on the
# whole group would also include the text columns (Postal Code, Borough),
# which pandas >= 2.0 rejects with a TypeError instead of silently dropping.
toronto_grouped = (
    toronto_data
    .groupby("Neighborhood")[["Latitude", "Longitude"]]
    .mean()
    .reset_index()
)
toronto_grouped


Unnamed: 0,Neighborhood,Latitude,Longitude
0,Berczy Park,43.644771,-79.373306
1,"Brockton, Parkdale Village, Exhibition Place",43.636847,-79.428191
2,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
3,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442
4,Central Bay Street,43.657952,-79.387383
5,Christie,43.669542,-79.422564
6,Church and Wellesley,43.66586,-79.38316
7,"Commerce Court, Victoria Hotel",43.648198,-79.379817
8,Davisville,43.704324,-79.38879
9,Davisville North,43.712751,-79.390197


Cluster the data and plot

In [18]:
# Number of clusters to fit.
kclusters = 5

# Clustering features: every column except the neighborhood label.
# `columns=` replaces the positional axis argument (`drop("Neighborhood", 1)`),
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
toronto_grouped_clustering = toronto_grouped.drop(columns="Neighborhood")

# n_init is pinned to 10 (the long-standing default) so the fit does not
# change with the scikit-learn version, whose default later became "auto".
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# Peek at the cluster label assigned to the first ten rows.
kmeans.labels_[0:10]

array([0, 1, 4, 0, 0, 1, 0, 0, 2, 2], dtype=int32)

In [19]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One evenly spaced rainbow colour per cluster, converted to hex strings.
# (The unused helper variables x, ys and markers_colors were removed; ys was
# only ever used for its length, which equals kclusters.)
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add one circle per neighborhood, coloured by its k-means cluster label.
for lat, lon, poi, cluster in zip(toronto_grouped['Latitude'], toronto_grouped['Longitude'], toronto_grouped['Neighborhood'], kmeans.labels_):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels are 0-based, so index the palette directly; the previous
    # `cluster-1` only worked for label 0 through negative-index wraparound.
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

# Last expression: render the map as the cell output.
map_clusters