## Task 1 : Data Preparation

Let's import all the dependencies that we will need to preprocess the data

In [1]:
import pandas as pd

Build the code to scrape the Wikipedia page : "List of postal codes of Canada: M"

In [2]:
# Parse the HTML tables found on the Wikipedia page (first row used as the header)
# and keep the first table.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_tables = pd.read_html(wiki_url, header=0)
df = wiki_tables[0]
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


Remove the rows whose Borough is "Not assigned"

In [3]:
# Keep only the rows whose Borough is assigned, then renumber the index from 0.
borough_assigned = df['Borough'].ne('Not assigned')
df2 = df[borough_assigned].reset_index(drop=True)
df2

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Not assigned
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


Combine the neighbourhoods that share a postal code into a single row, with their names separated by commas.

In [4]:
# One group per (Postcode, Borough) pair; the neighbourhood names inside each
# group are joined into a single comma-separated string.
neighbourhoods_by_postcode = df2.groupby(['Postcode', 'Borough'])['Neighbourhood']
df3 = neighbourhoods_by_postcode.agg(', '.join).reset_index()
df3

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


For rows whose Neighbourhood is "Not assigned", use the Borough name as the Neighbourhood

In [5]:
# Where the neighbourhood is 'Not assigned', fall back to that row's borough name.
unnamed = df3['Neighbourhood'] == 'Not assigned'
df3.loc[unnamed, 'Neighbourhood'] = df3.loc[unnamed, 'Borough']
df3

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


Examine the dimensions of the resulting dataframe

In [6]:
# (number of rows, number of columns) of the cleaned dataframe
df3.shape

(103, 3)

## Task 2 : Adding Coordinates to the Dataframe 

Download the geographical coordinates of each postal code

In [7]:
# Read the CSV of coordinates per postal code straight from its URL.
url = "https://cocl.us/Geospatial_data"
c = pd.read_csv(url)
c

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


Join the downloaded coordinates with the existing dataframe

In [8]:
# Left-join the coordinates onto the postal-code table, keep the columns of
# interest (dropping the duplicate 'Postal Code' key) and rename the key column.
filtered_columns = ['Postcode', 'Borough', 'Neighbourhood', 'Latitude', 'Longitude']
df4 = (
    df3.merge(c, how='left', left_on='Postcode', right_on='Postal Code')
    .loc[:, filtered_columns]
    .rename(columns={"Postcode": "PostalCode"})
)
df4

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [21]:
# Keep only the boroughs whose name contains 'Toronto', then renumber the index from 0.
in_toronto = df4['Borough'].str.contains('Toronto')
df5 = df4[in_toronto].reset_index(drop=True)
df5

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


## Task 3 : Explore and cluster the neighborhoods in Toronto

Import necessary modules to map each location points

In [10]:
import folium
import numpy as np

Determine the central point of our map. We assume that the centre of the initial map is the mean latitude and mean longitude of all the postal codes.

In [11]:
# Centre of the initial map: the average latitude and longitude over the rows of df5.
latitude_init = df5['Latitude'].mean()
longitude_init = df5['Longitude'].mean()

Map all existing Toronto postal code coordinates

In [12]:
# create the map, centred on the initial latitude/longitude
map_initial = folium.Map(location=[latitude_init, longitude_init], zoom_start=11)

# add one circle marker per row of df5; the popup text is "<Borough> | <Neighbourhood>"
for lat, lon, bor, nei in zip(df5['Latitude'], df5['Longitude'], df5['Borough'], df5['Neighbourhood']):
    label = folium.Popup(str(bor) + ' | ' + str(nei), parse_html=True)
    # parse_html is an option of the Popup above, so it is not repeated on the marker
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_initial)

# last expression of the cell, so the notebook renders the map
map_initial

Import necessary modules to do the K-Means Clustering

In [13]:
from sklearn.cluster import KMeans

We want the cluster labels to be consistent with the Borough names, so the value of k is the number of distinct boroughs:

In [14]:
# Number of distinct borough values in df5 (a missing value, if any, is counted too)
df5['Borough'].nunique(dropna=False)

4

Clustering with k = 4

In [15]:
# set number of clusters
kclusters = 4

# Cluster on the coordinates only. The two feature columns are selected by name:
# the previous `df5.drop([...], 1)` passed `axis` positionally, which pandas 2.0
# rejects, and it would also have kept a 'Cluster Labels' column in the features
# if this cell were re-run after the labels had been added to df5.
df5_clustering = df5[['Latitude', 'Longitude']]

# run k-means clustering
# n_init is pinned to 10 (the historical scikit-learn default) so that newer
# releases, whose default is 'auto', neither warn nor change the result.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(df5_clustering)

# check cluster labels generated for the first ten rows in the dataframe
kmeans.labels_[0:10]

array([3, 3, 3, 3, 2, 2, 2, 2, 2, 2])

Insert the cluster labels into the most recent dataframe (df5)

In [22]:
# Attach each row's k-means label as the 'Cluster Labels' column.
# Plain column assignment appends the column on the first run and overwrites it
# on a re-run, whereas DataFrame.insert raises ValueError once the column exists.
df5['Cluster Labels'] = kmeans.labels_

Import necessary modules to generate specific color for each cluster

In [17]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

Final map, with each point coloured according to its cluster

In [23]:
# create map
map_clusters = folium.Map(location=[latitude_init, longitude_init], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster,
# converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map; the popup text is "<Borough> | Cluster: <label>"
for lat, lon, bor, cluster in zip(df5['Latitude'], df5['Longitude'], df5['Borough'], df5['Cluster Labels']):
    label = folium.Popup(str(bor) + ' | Cluster: ' + str(cluster), parse_html=True)
    # k-means labels run from 0 to kclusters - 1, so they index the palette
    # directly (no reliance on rainbow[-1] wrapping around for label 0)
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
    ).add_to(map_clusters)

# last expression of the cell, so the notebook renders the map
map_clusters

Let's check whether the clustering does what we expected it to do (that each cluster label contains only one borough)

In [24]:
# Count the rows for every (cluster label, borough) pair to see how the clusters line up with the boroughs
df5.groupby(['Cluster Labels','Borough']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,PostalCode,Neighbourhood,Latitude,Longitude
Cluster Labels,Borough,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Downtown Toronto,1,1,1,1
0,West Toronto,6,6,6,6
1,Central Toronto,1,1,1,1
1,Downtown Toronto,18,18,18,18
2,Central Toronto,8,8,8,8
3,East Toronto,5,5,5,5


The clustering result is almost perfect. Cluster 0 is mostly West Toronto, with 1 data point from Downtown Toronto. Cluster 1 is mostly Downtown Toronto, with 1 data point from Central Toronto. Cluster 2 is all Central Toronto. Cluster 3 is all East Toronto. So only 2 points were placed in a cluster dominated by a different borough.