In [1]:
import pandas as pd 
from bs4 import BeautifulSoup
import requests

import folium # map rendering library
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import geocoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

In [2]:
# Download the Wikipedia page that lists the "M" postal codes and parse it.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
html_text = response.text
soup = BeautifulSoup(html_text,'lxml')

In [3]:
# Walk every <td> of the first table on the page and collect one record
# (postal code, borough, neighbourhood) for each cell that is assigned.
table = soup.find('table')
table_contents = []
for row in table.find_all('td'):
    span_text = row.span.text
    if span_text == 'Not assigned':
        continue  # unassigned postal code: nothing to record

    # The postal code is the first three characters of the cell's <p> text.
    postal_code = row.p.text[:3]
    # The span text is split on '(' — borough before it, neighbourhoods after it.
    pieces = span_text.split('(')
    borough = pieces[0]
    neighborhood = pieces[1].strip(')').replace(' /', ',').replace(')', ' ').strip(' ')
    table_contents.append({
        'PostalCode': postal_code,
        'Borough': borough,
        'Neighborhood': neighborhood,
    })

In [4]:
# Borough strings where the scrape ran extra text into the name, mapped to
# the cleaned-up name that should replace them.
borough_fixes = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}

# Build the frame from the scraped records, then apply the name fixes.
toronto_df = pd.DataFrame(table_contents)
toronto_df['Borough'] = toronto_df['Borough'].replace(borough_fixes)

In [5]:
# Show the (rows, columns) size of the scraped table.
toronto_df.shape

(103, 3)

In [6]:
# Display the scraped postal-code table.
toronto_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto Business,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


## Adding coordinates to our data frame by postal code

In [7]:
# Load the coordinate table and rename its key column so that it matches the
# 'PostalCode' column name used by toronto_df.
coord = (
    pd.read_csv('./output/Geospatial_Coordinates.csv')
    .rename(columns={'Postal Code': 'PostalCode'})
)

In [8]:
# Join the coordinate table onto the postal-code table using the shared
# 'PostalCode' column, then display the result.
toronto_df2 = toronto_df.merge(coord, on='PostalCode')
toronto_df2

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto Business,Enclave of M4L,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [9]:
# Count the distinct postal codes present in the merged frame.
toronto_df2.PostalCode.nunique()

103

## Making clusters and defining labels for the unlabeled data set

In [10]:
# Base map at a fixed centre point, with one blue circle marker per row of
# toronto_df2. Each marker's popup reads "<neighbourhood>, <borough>".
map_toronto = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

marker_fields = zip(
    toronto_df2['Latitude'],
    toronto_df2['Longitude'],
    toronto_df2['Borough'],
    toronto_df2['Neighborhood'],
)
for lat, lng, borough, neighbourhood in marker_fields:
    label = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

In [11]:
# One-hot encode the borough of every row and place the indicator columns
# next to the original columns; the index is renumbered from zero.
borough_dummies = pd.get_dummies(toronto_df2['Borough'])
toronto_df_enc = pd.concat([toronto_df2, borough_dummies], axis=1).reset_index(drop=True)

In [12]:
# Remove the postal-code and coordinate columns from the encoded frame.
toronto_df_enc = toronto_df_enc.drop(columns=['PostalCode', 'Latitude', 'Longitude'])


In [13]:
# Display the encoded frame (borough indicator columns next to Borough and Neighborhood).
toronto_df_enc

Unnamed: 0,Borough,Neighborhood,Central Toronto,Downtown Toronto,Downtown Toronto Stn A,East Toronto,East Toronto Business,East York,East York/East Toronto,Etobicoke,Etobicoke Northwest,Mississauga,North York,Queen's Park,Scarborough,West Toronto,York
0,North York,Parkwoods,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,North York,Victoria Village,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,Downtown Toronto,"Regent Park, Harbourfront",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,North York,"Lawrence Manor, Lawrence Heights",0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,Queen's Park,Ontario Provincial Government,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
99,Downtown Toronto,Church and Wellesley,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
100,East Toronto Business,Enclave of M4L,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
101,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [14]:
# Average the one-hot borough indicators within each borough.
#
# The text column 'Neighborhood' is dropped before aggregating, and the group
# key is given by column name so 'Borough' itself is not aggregated either.
# pandas 2.0+ raises a TypeError when asked to average a non-numeric column
# instead of silently skipping it, so the original call fails there.
borough_freq = (
    toronto_df_enc
    .drop(columns='Neighborhood')
    .groupby('Borough')
    .mean()
    .round(4)
    .reset_index()
)
borough_freq.head()

Unnamed: 0,Borough,Central Toronto,Downtown Toronto,Downtown Toronto Stn A,East Toronto,East Toronto Business,East York,East York/East Toronto,Etobicoke,Etobicoke Northwest,Mississauga,North York,Queen's Park,Scarborough,West Toronto,York
0,Central Toronto,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Downtown Toronto,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Downtown Toronto Stn A,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,East Toronto,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,East Toronto Business,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [15]:
# Cluster the boroughs on their indicator-frequency vectors.
k = 5

# Feature matrix: every column except the borough name, and except a 'label'
# column left behind by a previous run of this cell. The keyword form of drop
# is required because the positional axis argument was removed in pandas 2.0.
borough_features = borough_freq.drop(columns=['Borough', 'label'], errors='ignore')

# The fitted model gets its own name: assigning it to `KMeans` would replace
# the class with an instance and break every later `KMeans(...)` call.
borough_kmeans = KMeans(n_clusters=k, random_state=0).fit(borough_features)

# Store the cluster id as the first column. DataFrame.insert refuses to add a
# column that already exists, so any earlier 'label' is removed first, which
# makes the cell safe to re-run.
borough_freq = borough_freq.drop(columns='label', errors='ignore')
borough_freq.insert(0, 'label', borough_kmeans.labels_)

In [16]:

# Bring each borough's row of borough_freq (cluster label plus indicator
# columns) onto the postal-code rows by joining on 'Borough'.
toronto_labeled = toronto_df2.merge(borough_freq, on='Borough')
toronto_labeled

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,label,Central Toronto,Downtown Toronto,Downtown Toronto Stn A,East Toronto,...,East York,East York/East Toronto,Etobicoke,Etobicoke Northwest,Mississauga,North York,Queen's Park,Scarborough,West Toronto,York
0,M3A,North York,Parkwoods,43.753259,-79.329656,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,M4A,North York,Victoria Village,43.725882,-79.315572,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,M3B,North York,Don Mills North,43.745906,-79.352188,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,M6B,North York,Glencairn,43.709577,-79.445073,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99,M7R,Mississauga,Enclave of L4W,43.636966,-79.615819,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
100,M5W,Downtown Toronto Stn A,Enclave of M5E,43.646435,-79.374846,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
101,M9W,Etobicoke Northwest,"Clairville, Humberwood, Woodbine Downs, West H...",43.706748,-79.594054,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [17]:

# Map of the postal codes, coloured by the borough cluster label.
map_clusters = folium.Map(
    location=[toronto_labeled.Latitude.mean(), toronto_labeled.Longitude.mean()],
    zoom_start=11,
)

# One evenly spaced rainbow colour per cluster id (0 .. k-1).
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, k))]

# Add one circle marker per row; the popup shows the neighbourhood and its cluster.
for lat, lon, poi, cluster in zip(toronto_labeled['Latitude'],
                                  toronto_labeled['Longitude'],
                                  toronto_labeled['Neighborhood'],
                                  toronto_labeled['label']):
    # Index the palette directly by cluster id. The previous `cluster-1`
    # lookup only worked for cluster 0 because -1 wraps around to the last entry.
    cluster_color = rainbow[cluster]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

## Trying another way of making our clusters

In [10]:
# Feature matrix for the geographic clustering: the two coordinate columns only.
#
# The columns are selected by name rather than by dropping the text columns.
# A later cell inserts a 'label' column into toronto_df2; with the drop-based
# version, re-running this cell afterwards would silently feed that label to
# KMeans as a feature and inflate the silhouette scores.
df = toronto_df2[['Latitude', 'Longitude']]
df


Unnamed: 0,Latitude,Longitude
0,43.753259,-79.329656
1,43.725882,-79.315572
2,43.654260,-79.360636
3,43.718518,-79.464763
4,43.662301,-79.389494
...,...,...
98,43.653654,-79.506944
99,43.665860,-79.383160
100,43.662744,-79.321558
101,43.636258,-79.498509


### Checking silhouette scores

In [21]:
# Re-import so this cell still works if the name `KMeans` was rebound earlier
# in the kernel session.
from sklearn.cluster import KMeans

# Print the silhouette score for each candidate number of clusters (2 .. 9).
range_n_clusters = np.arange(2,10)
for n_clusters in range_n_clusters:
    # Fixed seed so the printed scores are the same on every run.
    clust_model = KMeans(n_clusters=n_clusters, random_state=0)
    preds = clust_model.fit_predict(df)

    # Convert to a percentage and let the format spec do the rounding; this
    # avoids artefacts such as 56.99999999999999 from `.round(2)*100`.
    score = silhouette_score(df, preds) * 100
    print("For k = {},  silhouette score is {:.0f} %".format(n_clusters, score))


For k = 2,  silhouette score is 66.0 %)
For k = 3,  silhouette score is 72.0 %)
For k = 4,  silhouette score is 85.0 %)
For k = 5,  silhouette score is 94.0 %)
For k = 6,  silhouette score is 81.0 %)
For k = 7,  silhouette score is 70.0 %)
For k = 8,  silhouette score is 54.0 %)
For k = 9,  silhouette score is 44.0 %)


In [20]:
# Re-import so this cell still works if the name `KMeans` was rebound earlier
# in the kernel session.
from sklearn.cluster import KMeans

# Coordinates only, selected by name so that a 'label' column present in
# toronto_df2 can never end up among the clustering features.
df = toronto_df2[['Latitude', 'Longitude']]

Ks = 10
# Silhouette score for every candidate k in 2 .. Ks-1, keyed by k itself.
# (A zero-initialised array indexed by k-1 has an unused slot 0 that would be
# reported as the "best" K=1 whenever all real scores are negative.)
silhouette_by_k = {}
for n_clusters in range(2,Ks):
    # Fixed seed so the selected k is reproducible between runs.
    clusterer = KMeans(n_clusters=n_clusters, random_state=0)
    preds = clusterer.fit_predict(df)
    silhouette_by_k[n_clusters] = silhouette_score(df, preds)

# On ties the smallest k wins, matching what argmax did.
best_k = max(silhouette_by_k, key=silhouette_by_k.get)
print(f'The best silhouette percentage is {silhouette_by_k[best_k]*100} % for  K={best_k}')

The best silhouette percentage is 94.08340621335208 % for  K=5


In [25]:
# Fit the final k-means model on the feature frame `df`.
from sklearn.cluster import KMeans

k = 5
kmeans_options = {'n_clusters': k, 'random_state': 0}
model = KMeans(**kmeans_options).fit(df)

In [26]:
# Refit the model on `df`, keep the resulting cluster assignments, and show
# their silhouette score.
pre = model.fit(df).labels_
silhouette_score(df, labels=pre)

0.9408340621335208

In [17]:
# Attach the k-means cluster id as the first column of toronto_df2.
# DataFrame.insert raises ValueError when the column already exists, so a
# 'label' column left by a previous run is removed first; this makes the cell
# safe to re-run.
toronto_df2 = toronto_df2.drop(columns='label', errors='ignore')
toronto_df2.insert(0, 'label', model.labels_)

In [18]:
# Display toronto_df2, which now has the cluster label as its first column.
toronto_df2

Unnamed: 0,label,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,4,M3A,North York,Parkwoods,43.753259,-79.329656
1,4,M4A,North York,Victoria Village,43.725882,-79.315572
2,2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,0,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,2,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
...,...,...,...,...,...,...
98,1,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,2,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,4,M7Y,East Toronto Business,Enclave of M4L,43.662744,-79.321558
101,1,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [19]:
# Count how many rows fall under each cluster label.
toronto_df2.label.value_counts()

2    32
1    24
4    20
0    17
3    10
Name: label, dtype: int64

In [24]:
# Rebuild the base map: a fixed centre point and one blue circle marker per
# row of toronto_df2, with a "<neighbourhood>, <borough>" popup on each.
map_toronto = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

latitudes = toronto_df2['Latitude']
longitudes = toronto_df2['Longitude']
boroughs = toronto_df2['Borough']
neighbourhoods = toronto_df2['Neighborhood']

for lat, lng, borough, neighbourhood in zip(latitudes, longitudes, boroughs, neighbourhoods):
    popup_text = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(popup_text, parse_html=True)
    circle = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    circle.add_to(map_toronto)

map_toronto

In [25]:

# Map of the postal codes, coloured by the geographic k-means cluster label.
map_clusters = folium.Map(
    location=[toronto_df2.Latitude.mean(), toronto_df2.Longitude.mean()],
    zoom_start=11,
)

# One evenly spaced rainbow colour per cluster id (0 .. k-1).
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, k))]

# Add one circle marker per row; the popup shows the neighbourhood and its cluster.
for lat, lon, poi, cluster in zip(toronto_df2['Latitude'],
                                  toronto_df2['Longitude'],
                                  toronto_df2['Neighborhood'],
                                  toronto_df2['label']):
    # Index the palette directly by cluster id. The previous `cluster-1`
    # lookup only worked for cluster 0 because -1 wraps around to the last entry.
    cluster_color = rainbow[cluster]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

## Thank you