Extract table data from the Wikipedia page

In [1]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Fail fast on network problems: the timeout stops the cell from hanging
# indefinitely, and raise_for_status() keeps an HTTP error page from being
# parsed as if it were the data table.
res = requests.get(wiki_url, timeout=30)
res.raise_for_status()

# Name the parser explicitly so the parse does not depend on which optional
# parser libraries happen to be installed.
soup = BeautifulSoup(res.content, "html.parser")

# The code takes the first <table> element on the page as the postcode table.
table = soup.find_all('table')[0]

# Wrap the markup in StringIO: passing a literal HTML string to read_html is
# deprecated in recent pandas releases, while file-like input works everywhere.
df = pd.read_html(StringIO(str(table)))[0]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Remove rows with Borough "Not assigned" from dataset

In [2]:
# Collect the index labels of every row whose Borough is the placeholder
# "Not assigned", then remove those rows from df in place.
unassigned_rows = df.index[df['Borough'] == "Not assigned"]
df.drop(index=unassigned_rows, inplace=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


Group the dataset by Postcode and combine the Neighbourhoods into a comma-separated list.

In [3]:
# Collapse the frame to one row per Postcode: keep the first Borough seen for
# the postcode and join all of its Neighbourhood names with ", ".
per_column_rules = {'Borough': 'first', 'Neighbourhood': ', '.join}
by_postcode = df.groupby("Postcode")
df_combine = by_postcode.agg(per_column_rules).reset_index()
df_combine.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


If Neighbourhood is Not assigned, set Neighbourhood to Borough

In [4]:
# Where the Neighbourhood is the placeholder "Not assigned", use the Borough
# name instead; every other row keeps its existing Neighbourhood value.
neighbourhood_missing = df_combine['Neighbourhood'] == "Not assigned"
df_combine['Neighbourhood'] = np.where(
    neighbourhood_missing,
    df_combine['Borough'],
    df_combine['Neighbourhood'],
)

# Display the Queen's Park rows as a spot check of the substitution.
is_queens_park = df_combine['Borough'] == "Queen's Park"
df_combine[is_queens_park]

Unnamed: 0,Postcode,Borough,Neighbourhood
93,M9A,Queen's Park,Queen's Park


In [5]:
# Show (rows, columns) of the grouped frame as a quick size check.
df_combine.shape

(103, 3)

## Get Geolocation Data

In [6]:
# URL of the CSV file that is loaded into df_geo below.
geourl = "https://cocl.us/Geospatial_data"

# pandas fetches and parses the CSV directly from the URL (needs network access).
df_geo = pd.read_csv(geourl)
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Rename Postal Code column to Postcode and merge dataframes on Postcode.

In [8]:
# Give the geo frame the same key column name as df_combine, then attach the
# coordinates to each postcode row (default inner join on Postcode).
column_renames = {'Postal Code': 'Postcode'}
df_geo.rename(columns=column_renames, inplace=True)
df_all = df_combine.merge(df_geo, on='Postcode')
df_all.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [15]:
from geopy.geocoders import Nominatim
import folium

address = 'Toronto, Canada'

# Look up the coordinates of the address; they are used to centre the maps.
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Toronto are 43.653963, -79.387207.


Map all postcode locations around Toronto

In [63]:
# Base map centred on the geocoded Toronto coordinates.
map_t = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per postcode; clicking a marker shows its borough.
for lat, lng, borough in zip(df_all['Latitude'], df_all['Longitude'], df_all['Borough']):
    # parse_html is a Popup option (not a CircleMarker one); enabling it has
    # folium handle the label as text, which matters for names that contain
    # quotes such as "Queen's Park".
    label = folium.Popup(str(borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,  # the popup was previously built but never attached
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_t)

map_t

Using the individual postcode locations, fit a KMeans clustering model; 10 clusters were chosen for this exercise.

In [77]:
from sklearn.cluster import KMeans

cluster_num = 10

# Feature matrix: one (latitude, longitude) pair per postcode.
X = list(zip(df_all['Latitude'], df_all['Longitude']))

# Fix random_state so that centroid initialisation, and therefore the cluster
# labels and map colours, are the same on every Restart & Run All.
k_means = KMeans(init="k-means++", n_clusters=cluster_num, n_init=12, random_state=0)
k_means.fit(X)

# One integer cluster label per row of df_all, in row order.
k_labels = k_means.labels_
k_labels

array([2, 2, 2, 2, 4, 4, 4, 0, 4, 0, 4, 4, 4, 4, 4, 4, 2, 5, 5, 5, 5, 5,
       5, 5, 5, 0, 5, 0, 8, 8, 8, 8, 8, 1, 0, 0, 0, 0, 3, 0, 0, 7, 0, 7,
       3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 7,
       7, 7, 7, 7, 7, 8, 8, 3, 9, 9, 9, 9, 9, 8, 8, 9, 9, 9, 9, 7, 6, 0,
       6, 6, 6, 6, 6, 6, 6, 6, 1, 1, 1, 1, 1, 1, 1])

Create rainbow color list to show different clusters on map.

In [1]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# Sample the rainbow colormap at cluster_num evenly spaced points in [0, 1]
# and convert each sampled colour to a hex string, one per cluster label.
# Relies on np and cluster_num being defined by earlier cells.
sample_positions = np.linspace(0, 1, cluster_num)
colors_array = cm.rainbow(sample_positions)
rainbow = list(map(colors.rgb2hex, colors_array))


NameError: name 'np' is not defined

Produce a map displaying the postcode clusters created by the KMeans method.

In [79]:
# Fresh base map centred on the geocoded Toronto coordinates.
map_t = folium.Map(location=[latitude, longitude], zoom_start=10)

# Draw every postcode as a circle coloured by its KMeans cluster label.
for point_lat, point_lng, cluster_id in zip(df_all['Latitude'], df_all['Longitude'], k_labels):
    cluster_colour = rainbow[cluster_id]
    marker = folium.CircleMarker(
        [point_lat, point_lng],
        radius=5,
        color=cluster_colour,
        fill=True,
        fill_color=cluster_colour,
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_t)

map_t