# Imports

In [0]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# None lifts pandas' display limit, so frames are shown with all columns and rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

import requests # library to handle HTTP requests
# NOTE(review): this import path was deprecated in pandas 1.0 in favour of
# `pandas.json_normalize` -- confirm it still resolves on the installed pandas.
from pandas.io.json import json_normalize # transform JSON data into a pandas dataframe

# Matplotlib colour-map and colour-conversion modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# k-means estimator, used in the clustering stage (Part 3)
from sklearn.cluster import KMeans

print('Libraries imported.')

Libraries imported.


In [0]:
import folium # map rendering library
print('Imported')

Imported


We scrape an older revision of the Wikipedia page with pandas `read_html`, because that revision has the table in the format we want.

In [0]:
# The `oldid` query parameter pins one specific revision of the article.
postal_codes_url = "https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=945633050"
# read_html returns every table found on the page; the postcode table is the first one.
scraped_tables = pd.read_html(postal_codes_url)
df = scraped_tables[0]

In [0]:
# Preview the first rows of the scraped table to check its columns.
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


# Part 1: Data Cleaning

In [0]:
# Keep only the postcodes that are assigned to a borough. The explicit .copy()
# makes `df` an independent frame, so the column assignment below does not
# write into a slice of the scraped table (avoids SettingWithCopyWarning).
df = df[df['Borough'] != 'Not assigned'].copy()

# Where a neighbourhood is unassigned, fall back to the borough of the same
# row. `Series.mask` substitutes row by row (index-aligned); `Series.replace`
# maps values to values and is not meant to take a Series as the replacement.
unassigned = df['Neighbourhood'] == 'Not assigned'
df['Neighbourhood'] = df['Neighbourhood'].mask(unassigned, df['Borough'])
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


Checking `(df.Neighbourhood == 'Not assigned').any()` shows that there are no unassigned neighbourhoods left.

In [0]:
# One group per (Postcode, Borough) pair. sort=False keeps the groups in order
# of first appearance; as_index=False keeps the keys as ordinary columns.
postcode_groups = df.groupby(['Postcode', 'Borough'], as_index=False, sort=False)

# Collapse each group to a single row by joining its neighbourhood names.
df_grouped = postcode_groups.agg(', '.join)
df_grouped.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


In [0]:
# (rows, columns) of the grouped table: one row per postcode/borough pair.
df_grouped.shape

(103, 3)

# Part 2: Geospatial Dataframe

Download the geospatial data and save it as `geospatial_data.csv`.

In [0]:
# Fetch the coordinates file with `requests` (imported in the first cell)
# rather than shelling out to `wget -q`: the quiet shell call hid download
# failures, so the success message was printed even when nothing was saved,
# and it only works where wget is installed.
geo_response = requests.get('http://cocl.us/Geospatial_data', timeout=30)
# Stop here with an HTTPError on a 4xx/5xx answer instead of saving an error page.
geo_response.raise_for_status()

# Write the raw bytes unchanged; the file is parsed by pandas in a later cell.
with open('geospatial_data.csv', 'wb') as geo_file:
    geo_file.write(geo_response.content)
print('Data downloaded!')

Data downloaded!


In [0]:
# Load the downloaded coordinates file into a DataFrame.
df_geo = pd.read_csv('geospatial_data.csv')
# Show the column names so the join key can be compared with df_grouped's.
df_geo.columns

Index(['Postal Code', 'Latitude', 'Longitude'], dtype='object')

Rename the `Postal Code` column to `Postcode` so that it matches the key column of the neighbourhood table in the pandas merge.

In [0]:
# Give the key column the same name it has in df_grouped.
df_geo = df_geo.rename(columns={'Postal Code': 'Postcode'})

In [0]:
# Attach latitude/longitude to every postcode row. The key and join type are
# spelled out so the merge cannot silently switch to a different set of shared
# columns if either table gains a column. how='inner' (the pandas default)
# keeps only postcodes present in both tables.
df2 = pd.merge(df_grouped, df_geo, on='Postcode', how='inner')

In [0]:
# Preview the merged table: neighbourhood columns plus Latitude and Longitude.
df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494


In [0]:
# (rows, columns) of the merged table; the row count should match df_grouped.
df2.shape

(103, 5)

# Part 3: Clustering

Overview of Toronto Map

In [0]:
# Centre point of the Toronto maps drawn below (used as location=[latitude, longitude]).
latitude, longitude = 43.6532, -79.3832

In [0]:
# Base map centred on the coordinates defined in the previous cell.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Draw one circle marker per postcode row; its popup shows the neighbourhood name(s).
for lat, lng, neighbourhood in zip(df2['Latitude'], df2['Longitude'], df2['Neighbourhood']):
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        # HTML handling of the label is configured on the Popup itself. The
        # `parse_html=False` that used to be passed to CircleMarker was dropped:
        # extra CircleMarker keywords are Leaflet path options, and
        # `parse_html` is not one of them.
        popup=folium.Popup(neighbourhood, parse_html=True),
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

# Last expression of the cell, so the notebook renders the map inline.
map_toronto

![Map of Toronto with one circle marker per postcode area](https://i.imgur.com/1bHZsO2.png)

For this clustering exercise, I will attempt to draw borders between groups of nearby neighbourhoods and identify which group each point belongs to. There appear to be 5 distinct groupings of neighbourhoods on the Folium map, so our K will be 5.

In [0]:
# Feature matrix for clustering: one row per postcode, columns = (latitude, longitude).
coordinate_columns = ['Latitude', 'Longitude']
X = df2[coordinate_columns].to_numpy()
X

array([[ 43.7532586, -79.3296565],
       [ 43.7258823, -79.3155716],
       [ 43.6542599, -79.3606359],
       [ 43.718518 , -79.4647633],
       [ 43.6623015, -79.3894938],
       [ 43.6678556, -79.5322424],
       [ 43.8066863, -79.1943534],
       [ 43.7459058, -79.352188 ],
       [ 43.7063972, -79.309937 ],
       [ 43.6571618, -79.3789371],
       [ 43.709577 , -79.4450726],
       [ 43.6509432, -79.5547244],
       [ 43.7845351, -79.1604971],
       [ 43.7258997, -79.340923 ],
       [ 43.6953439, -79.3183887],
       [ 43.6514939, -79.3754179],
       [ 43.6937813, -79.4281914],
       [ 43.6435152, -79.5772008],
       [ 43.7635726, -79.1887115],
       [ 43.6763574, -79.2930312],
       [ 43.6447708, -79.3733064],
       [ 43.6890256, -79.453512 ],
       [ 43.7709921, -79.2169174],
       [ 43.7090604, -79.3634517],
       [ 43.6579524, -79.3873826],
       [ 43.669542 , -79.4225637],
       [ 43.773136 , -79.2394761],
       [ 43.8037622, -79.3634517],
       [ 43.7543283,

In [0]:
# Fit k-means with K=5 on the (latitude, longitude) matrix.
# n_init is pinned because scikit-learn changed its default (10 -> 'auto'):
# leaving it implicit emits a FutureWarning on some releases and can produce
# different clusters on others. random_state fixes the centroid seeding so
# the labels are reproducible from run to run.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42).fit(X)

In [0]:
# Cluster index (0-4) assigned to each row of X, in row order.
kmeans.labels_

array([3, 4, 1, 0, 1, 0, 2, 3, 4, 1, 1, 0, 2, 4, 4, 1, 1, 0, 2, 4, 1, 1,
       2, 4, 1, 1, 2, 3, 3, 4, 1, 1, 2, 3, 0, 4, 1, 1, 2, 3, 0, 4, 1, 1,
       4, 3, 0, 4, 1, 0, 0, 2, 3, 0, 4, 3, 0, 0, 4, 3, 0, 3, 1, 0, 0, 2,
       3, 1, 1, 0, 0, 4, 3, 1, 1, 1, 0, 0, 2, 1, 1, 0, 2, 1, 1, 2, 1, 1,
       0, 0, 3, 1, 1, 0, 0, 2, 1, 1, 0, 1, 4, 0, 0], dtype=int32)

In [0]:
# One (latitude, longitude) centre per cluster; row i belongs to label i.
kmeans.cluster_centers_

array([[ 43.68851024, -79.5214847 ],
       [ 43.66830196, -79.39913405],
       [ 43.77480968, -79.2362143 ],
       [ 43.76683612, -79.38435879],
       [ 43.6969548 , -79.32028644]])

Now that we know the cluster label of each point, we plot the points on a Folium map, coloured by cluster.

In [0]:
# Fresh base map, so markers from the overview map are not carried over.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# One colour per cluster label; the list is indexed by label, so it needs at
# least as many entries as the number of clusters fitted by k-means (5).
colours = ['red', 'yellow', 'green', 'blue', 'purple']

# Draw one circle marker per postcode row, coloured by its k-means cluster.
for lat, lng, neighbourhood, cluster in zip(df2['Latitude'], df2['Longitude'], df2['Neighbourhood'], kmeans.labels_):
    cluster_colour = colours[cluster]
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        # HTML handling of the label is configured on the Popup itself; the
        # stray `parse_html=False` formerly passed to CircleMarker is not a
        # Leaflet path option and was dropped.
        popup=folium.Popup(neighbourhood, parse_html=True),
        color=cluster_colour,
        fill=True,
        # Fill with the cluster colour as well: with the previous fixed blue
        # fill, the cluster was only visible in the thin marker outline.
        fill_color=cluster_colour,
        fill_opacity=0.7,
    ).add_to(map_toronto)

# Last expression of the cell, so the notebook renders the map inline.
map_toronto

![Map of Toronto with postcode markers coloured by k-means cluster](https://i.imgur.com/ahQ2043.png)