In [28]:
import pandas as pd
import numpy as np
import requests
import folium
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

import matplotlib.cm as cm
import matplotlib.colors as colors

#!pip install Beautifulsoup4
#!pip install lxml
from bs4 import BeautifulSoup

## Using Beautiful Soup to get the HTML page
I used Inspect Element on the Wikipedia page and found that the whole table is contained in a `tbody` element, so I used the `find` function to locate the first `tbody` tag and extract everything inside it.

In [2]:
# Download the Wikipedia page that lists Canadian postal codes starting with "M".
# A timeout keeps the cell from hanging forever, and raise_for_status() makes an
# HTTP error fail here instead of silently parsing an error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30)
response.raise_for_status()
source = response.text

soup = BeautifulSoup(source, 'lxml')

# find() returns the first <tbody> in the document, or None when there is none.
summary = soup.find("tbody")
if summary is None:
    raise ValueError("No <tbody> element found in the downloaded page")


### Creating a Dataframe

In [3]:
# Column layout used for the scraped table and for the de-duplicated table.
column_name = [
    'PostalCode',
    'Borough',
    'Neighbourhood',
]

# Start from an empty frame that only carries the column headers.
df = pd.DataFrame(data=None, columns=column_name)
df

Unnamed: 0,PostalCode,Borough,Neighbourhood


### Finding the table
When I opened the developer console on the page, I found that the table cells use the `td` tag, so I extracted every `td` element from the table body.

In [4]:
# Collect every <td> element found inside the extracted table body.
rows = summary.find_all(name='td')

In [5]:
# Number of <td> cells collected from the table body.
n = len(rows)
n

540

### Extracting the data and putting it in the dataframe.

In [6]:
# The cells are read as consecutive triples: postal code, borough, neighbourhood.
# Step through them three at a time, using the real cell count rather than a
# hard-coded bound, and stop before any incomplete trailing triple.
# Records are collected first and the frame is built once, because
# DataFrame.append was removed in pandas 2.0 and was quadratic inside a loop.
records = [
    {'PostalCode': rows[a].text.rstrip(),
     'Borough': rows[a + 1].text.rstrip(),
     'Neighbourhood': rows[a + 2].text.rstrip()}
    for a in range(0, len(rows) - 2, 3)
]
df = pd.DataFrame(records, columns=column_name)

In [7]:
# Preview the first five scraped rows.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Removing rows with an unassigned borough

In [8]:
# Drop, in place, every row whose borough is 'Not assigned'.
# The surviving rows keep their original index labels.
unassigned_labels = df.index[df['Borough'] == 'Not assigned']
df.drop(index=unassigned_labels, inplace=True)

In [9]:
# Empty frame with the shared column layout; it receives one row per postal code.
newdf = pd.DataFrame(data=None, columns=column_name)

# `x` is a second name for the cleaned frame (same object, not a copy).
x = df
x.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Merging repeated postal codes and combining their neighbourhoods

In [10]:
# Collapse the cleaned table to one row per postal code.
#   * sort=False keeps the postal codes in first-seen order.
#   * Borough is taken from the first row of each postal code.
#   * Neighbourhoods of a repeated postal code are joined with "," onto that
#     postal code's own row.
# Building the frame in one step also makes the cell safe to re-run (it no
# longer appends to whatever `newdf` already holds) and avoids
# DataFrame.append, which was removed in pandas 2.0.
newdf = (
    df.groupby('PostalCode', sort=False, as_index=False)
      .agg({'Borough': 'first', 'Neighbourhood': ','.join})
)
# Pin the column order explicitly.
newdf = newdf[['PostalCode', 'Borough', 'Neighbourhood']]

In [11]:
# Preview the first five rows of the per-postal-code table.
newdf.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [12]:
# Display the (rows, columns) dimensions of the per-postal-code table.
newdf.shape

(103, 3)

## Getting the latitudes and longitudes

In [13]:
# Load the postal-code -> latitude/longitude table.
# Prefer a copy of the CSV in the working directory so the cell runs on any
# machine; fall back to the original absolute path when no such copy exists.
try:
    coordinates = pd.read_csv('Geospatial_Coordinates.csv')
except FileNotFoundError:
    coordinates = pd.read_csv('C:/Users/shasw/Documents/Jupyter/Geospatial_Coordinates.csv')
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
# Add two placeholder columns filled with empty strings; a later cell fills them.
# NOTE: 'Latitiude' (sic) is the spelling the later cells read, so it is kept.
for placeholder_column in ("Latitiude", "Longitude"):
    newdf[placeholder_column] = ""

In [15]:
# Attach coordinates to each postal code with a keyed lookup instead of a nested
# loop. Assigning whole columns avoids chained-indexing writes
# (newdf[col][b] = ...), which pandas does not guarantee to reach the frame.
# If a postal code appears more than once in `coordinates`, the last entry wins,
# as it did in the loop version.
# Postal codes without a match end up as NaN.
# NOTE: 'Latitiude' (sic) is the column name the later cells read, so it is kept.
lookup = coordinates.drop_duplicates('Postal Code', keep='last').set_index('Postal Code')
newdf['Latitiude'] = newdf['PostalCode'].map(lookup['Latitude'])
newdf['Longitude'] = newdf['PostalCode'].map(lookup['Longitude'])

In [16]:
# Preview the first five rows, now including the coordinate columns.
newdf.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitiude,Longitude
0,M3A,North York,Parkwoods,43.7533,-79.3297
1,M4A,North York,Victoria Village,43.7259,-79.3156
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6543,-79.3606
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7185,-79.4648
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6623,-79.3895


In [17]:
# Geocode the city centre; it is used as the centre point of the maps below.
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent='toronto_agent')
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the next line.
if location is None:
    raise ValueError("No geocoding result for address: {}".format(address))
latitude = location.latitude
longitude = location.longitude
print("The coordinates of Toronto are- {}, {}.".format(latitude,longitude))

The coordinates of Toronto are- 43.6534817, -79.3839347.


In [18]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per postal code, with a "neighbourhood, borough" popup.
for lat, lng, borough_name, neighbourhood in zip(newdf['Latitiude'],
                                                 newdf['Longitude'],
                                                 newdf['Borough'],
                                                 newdf['Neighbourhood']):
    # parse_html=True makes the popup treat the label as text rather than markup.
    popup = folium.Popup('{}, {}'.format(neighbourhood, borough_name), parse_html=True)
    # parse_html is a Popup option, not a marker option, so it is not passed here.
    folium.CircleMarker([lat, lng],
                        radius=5,
                        popup=popup,
                        color="orange",
                        fill=True,
                        fill_color='#3186cc',
                        fill_opacity=0.7
                       ).add_to(map_toronto)
map_toronto

## K-means Clustering

In [19]:
# One-hot encode the borough of each row; these indicator columns are the
# features handed to the clustering step.
dum_df = pd.get_dummies(data=newdf['Borough'])
dum_df.head(n=5)

Unnamed: 0,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,Mississauga,North York,Scarborough,West Toronto,York
0,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,1,0,0,0
2,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0
4,0,1,0,0,0,0,0,0,0,0


In [23]:
# Number of clusters to form.
kclusters = 5

# n_init is pinned to 10 (the long-standing default) so the result does not
# change with scikit-learn versions whose default is 'auto'; random_state
# keeps the run reproducible.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(dum_df)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([1, 1, 3, 1, 3, 0, 2, 1, 0, 3, 1, 0, 2, 1, 0, 3, 0, 0, 2, 0, 3, 0,
       2, 0, 3, 3, 2, 1, 1, 0, 3, 0, 2, 1, 1, 0, 3, 0, 2, 1, 1, 0, 3, 0,
       2, 1, 1, 0, 3, 1, 1, 2, 1, 1, 0, 1, 0, 1, 2, 1, 1, 4, 4, 0, 0, 2,
       1, 4, 4, 0, 0, 2, 1, 4, 4, 0, 0, 0, 2, 4, 3, 0, 2, 4, 3, 2, 4, 3,
       0, 0, 2, 3, 3, 0, 0, 2, 3, 3, 0, 3, 0, 0, 0])

In [24]:
# Display the shape of the fitted label array.
kmeans.labels_.shape

(103,)

In [25]:
# Put the cluster labels in the first column of newdf.
# DataFrame.insert raises ValueError if the column already exists, so on a
# re-run the existing column is overwritten instead.
if 'Cluster Labels' in newdf.columns:
    newdf['Cluster Labels'] = kmeans.labels_
else:
    newdf.insert(0, 'Cluster Labels', kmeans.labels_)

In [26]:
# Preview the first five rows, now including the cluster labels.
newdf.head(n=5)

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighbourhood,Latitiude,Longitude
0,1,M3A,North York,Parkwoods,43.7533,-79.3297
1,1,M4A,North York,Victoria Village,43.7259,-79.3156
2,3,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6543,-79.3606
3,1,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7185,-79.4648
4,3,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6623,-79.3895


In [31]:
# Base map centred on the geocoded Toronto coordinates.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One evenly spaced rainbow colour per cluster, as hex strings.
# The palette is sized directly from kclusters, so the notebook-level name `x`
# (the alias of the cleaned frame) is no longer overwritten here.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(newdf['Latitiude'], newdf['Longitude'], newdf['Neighbourhood'], newdf['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Index cluster-1 is kept from the original colour assignment: cluster 0
    # wraps around to the last palette entry, so every cluster still gets a
    # distinct colour.
    marker_color = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters