In [1]:
import pandas as pd

In [3]:
dfs = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', header=0)
dfs[0]

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [33]:
df = dfs[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


*To remove "Not assigned" row*

In [9]:
borough_not_to_keep = ['Not assigned']

In [10]:
df.Borough.isin(borough_not_to_keep)

0       True
1       True
2      False
3      False
4      False
       ...  
175     True
176     True
177     True
178    False
179     True
Name: Borough, Length: 180, dtype: bool

In [28]:
df = df[~df.Borough.isin(borough_not_to_keep)]
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [24]:
df.loc[df['Postal Code'] == 'M5A']

Unnamed: 0,Postal Code,Borough,Neighbourhood
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


*the row of postal code with M5A looks fine*

In [32]:
df['Postal Code'].value_counts()

M1V    1
M2L    1
M5V    1
M9V    1
M2P    1
      ..
M4J    1
M1X    1
M4B    1
M6C    1
M4G    1
Name: Postal Code, Length: 103, dtype: int64

*no duplicated postal code*

In [35]:
type(df)

pandas.core.frame.DataFrame

In [37]:
df.shape

(180, 3)

In [57]:
geo_data = pd.read_csv('Geospatial_Coordinates.csv')

In [58]:
geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [61]:
geo_data.shape

(103, 3)

In [59]:
merged_df = df.merge(geo_data, how = 'inner', on = ['Postal Code'])

In [60]:
merged_df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [91]:
df_toronto = merged_df[merged_df['Borough'].str.contains('Toronto')]
df_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [93]:
df_toronto = df_toronto.reset_index(drop=True)
df_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


**Begin to carry out simple cluster analysis on Toronto** 

In [94]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [100]:
address = 'Toronto'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


*Lets first begin by visualising the Neigbourhood of Toronto*

In [101]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighbourhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [107]:
df_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


*To clean up data before apply ML - Kmeans*

In [109]:
df_toronto1 = df_toronto.drop(['Borough','Neighbourhood'],axis=1)

In [111]:
df_toronto1.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M5A,43.65426,-79.360636
1,M7A,43.662301,-79.389494
2,M5B,43.657162,-79.378937
3,M5C,43.651494,-79.375418
4,M4E,43.676357,-79.293031


*nomalise data with Standardscaler*

In [112]:
from sklearn.preprocessing import StandardScaler
X = df_toronto1.values[:,1:]
X = np.nan_to_num(X)
Clus_dataSet = StandardScaler().fit_transform(X)
Clus_dataSet

array([[-0.55554658,  0.79089146],
       [-0.20855993,  0.01026436],
       [-0.43033262,  0.29583074],
       [-0.67489659,  0.39102765],
       [ 0.39793749,  2.61964756],
       [-0.96499134,  0.44814526],
       [-0.39621906,  0.06737385],
       [ 0.10386009, -0.88430039],
       [-0.71471013,  0.14352435],
       [ 0.08069342, -1.41708063],
       [-1.13564978,  0.21967755],
       [-0.82881755, -0.80817965],
       [ 0.53600121,  1.01941327],
       [-0.86117495,  0.22443577],
       [-1.30688641, -1.03653375],
       [ 0.08040863,  2.00991342],
       [-0.81708966,  0.27203152],
       [-0.32834143,  1.32413968],
       [ 2.62714635,  0.02929996],
       [ 1.92270943, -0.7320562 ],
       [ 1.96828767, -0.00877123],
       [ 1.28638314, -0.57980391],
       [-0.23847079, -2.02583013],
       [ 2.08186867, -0.42754079],
       [ 0.24054304, -0.42754079],
       [-0.78424468, -1.79756801],
       [ 1.60468436,  0.02929996],
       [-0.19155492, -0.27526956],
       [-0.67158707,

*Lets assume 4 clusters to be categorised*

In [113]:
clusterNum = 4
k_means = KMeans(init = "k-means++", n_clusters = clusterNum, n_init = 12)
k_means.fit(X)
labels = k_means.labels_
print(labels)

[1 1 1 1 2 1 1 3 1 3 1 3 2 1 3 2 1 2 0 0 0 0 3 0 1 3 0 1 3 0 1 0 1 1 1 1 1
 1 2]


In [120]:
df_toronto1["Clus_km"] = labels
df_toronto1.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude,Clus_km
0,M5A,43.65426,-79.360636,1
1,M7A,43.662301,-79.389494,1
2,M5B,43.657162,-79.378937,1
3,M5C,43.651494,-79.375418,1
4,M4E,43.676357,-79.293031,2


*To merge with original toronto dataset to get back columns of Borough and Neighbourhood*

In [121]:
df_toronto2 = df_toronto1.merge(df_toronto, how = 'inner', on = ['Postal Code'])
df_toronto2.head()

Unnamed: 0,Postal Code,Latitude_x,Longitude_x,Clus_km,Borough,Neighbourhood,Latitude_y,Longitude_y
0,M5A,43.65426,-79.360636,1,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,43.662301,-79.389494,1,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,43.657162,-79.378937,1,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,43.651494,-79.375418,1,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,43.676357,-79.293031,2,East Toronto,The Beaches,43.676357,-79.293031


*Final clean up before visualisation on map*

In [126]:
df_toronto2 = df_toronto2.drop(['Latitude_y','Longitude_y'],axis=1)

In [127]:
df_toronto2.head()

Unnamed: 0,Postal Code,Latitude_x,Longitude_x,Clus_km,Borough,Neighbourhood
0,M5A,43.65426,-79.360636,1,Downtown Toronto,"Regent Park, Harbourfront"
1,M7A,43.662301,-79.389494,1,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
2,M5B,43.657162,-79.378937,1,Downtown Toronto,"Garden District, Ryerson"
3,M5C,43.651494,-79.375418,1,Downtown Toronto,St. James Town
4,M4E,43.676357,-79.293031,2,East Toronto,The Beaches


In [129]:
df_toronto2 = df_toronto2.rename(columns={'Latitude_x':'Latitude', 'Longitude_x':'Longitude'})
df_toronto2.head()

Unnamed: 0,Postal Code,Latitude,Longitude,Clus_km,Borough,Neighbourhood
0,M5A,43.65426,-79.360636,1,Downtown Toronto,"Regent Park, Harbourfront"
1,M7A,43.662301,-79.389494,1,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
2,M5B,43.657162,-79.378937,1,Downtown Toronto,"Garden District, Ryerson"
3,M5C,43.651494,-79.375418,1,Downtown Toronto,St. James Town
4,M4E,43.676357,-79.293031,2,East Toronto,The Beaches


*Lets visualise the result of kmeans on map*

In [132]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(clusterNum)
ys = [i + x + (i*x)**2 for i in range(clusterNum)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_toronto2['Latitude'], df_toronto2['Longitude'], df_toronto2['Neighbourhood'], df_toronto2['Clus_km']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters