In [54]:
# Download the Wikipedia page listing the "M" (Toronto) postal codes and parse its HTML
import requests
from lxml.html import fromstring
from bs4 import BeautifulSoup

# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of silently parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()

# NOTE: despite its name, `wikipedia_url` holds the page's HTML text, not a URL.
wikipedia_url = response.text

soup=BeautifulSoup(wikipedia_url,'lxml')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className=document.documentElement.className.replace(/(^|\s)client-nojs(\s|$)/,"$1client-js$2");RLCONF={"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":890001695,"wgRevisionId":890001695,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario","Postal codes in Canada","Toronto","Ontario-related lists"],"wgBreakFrames":!1,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June",

In [55]:
# Locate the first <table> whose class attribute is 'wikitable sortable'
table_attrs = {'class': 'wikitable sortable'}
My_table = soup.find('table', attrs=table_attrs)
My_table

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>
<tr>
<td>M6A</td>

In [56]:
# Collect every table that carries the 'wikitable' class
tables = soup.find_all('table', class_="wikitable")

# Pull the text of each <td> cell out of the first matching table, in document order
contents = []
for item in tables[0].find_all('td'):
    contents.append(item.get_text())
contents

['M1A',
 'Not assigned',
 'Not assigned\n',
 'M2A',
 'Not assigned',
 'Not assigned\n',
 'M3A',
 'North York',
 'Parkwoods\n',
 'M4A',
 'North York',
 'Victoria Village\n',
 'M5A',
 'Downtown Toronto',
 'Harbourfront\n',
 'M5A',
 'Downtown Toronto',
 'Regent Park\n',
 'M6A',
 'North York',
 'Lawrence Heights\n',
 'M6A',
 'North York',
 'Lawrence Manor\n',
 'M7A',
 "Queen's Park",
 'Not assigned\n',
 'M8A',
 'Not assigned',
 'Not assigned\n',
 'M9A',
 'Etobicoke',
 'Islington Avenue\n',
 'M1B',
 'Scarborough',
 'Rouge\n',
 'M1B',
 'Scarborough',
 'Malvern\n',
 'M2B',
 'Not assigned',
 'Not assigned\n',
 'M3B',
 'North York',
 'Don Mills North\n',
 'M4B',
 'East York',
 'Woodbine Gardens\n',
 'M4B',
 'East York',
 'Parkview Hill\n',
 'M5B',
 'Downtown Toronto',
 'Ryerson\n',
 'M5B',
 'Downtown Toronto',
 'Garden District\n',
 'M6B',
 'North York',
 'Glencairn\n',
 'M7B',
 'Not assigned',
 'Not assigned\n',
 'M8B',
 'Not assigned',
 'Not assigned\n',
 'M9B',
 'Etobicoke',
 'Cloverdale\n',

In [57]:
# Reshape the flat list of cell texts into three-column rows and load them into a dataframe
import pandas as pd
data_head = ['PostalCode', 'Borough', 'Neighbourhood']

# Slice the flat list into consecutive rows of len(data_head) items;
# a trailing incomplete row (if any) is discarded.
row_width = len(data_head)
usable = len(contents) - len(contents) % row_width
data = [tuple(contents[start:start + row_width]) for start in range(0, usable, row_width)]

# The last cell of each table row carries a trailing newline; strip it from Neighbourhood
Toronto_neighbourhoods = pd.DataFrame(data, columns=data_head).assign(
    Neighbourhood=lambda frame: frame['Neighbourhood'].str.rstrip('\n')
)
Toronto_neighbourhoods

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [58]:
# Rows where neither a borough nor a neighbourhood has been assigned;
# these are the rows that need to be dropped from the dataframe.
borough_unassigned = Toronto_neighbourhoods['Borough'] == 'Not assigned'
neighbourhood_unassigned = Toronto_neighbourhoods['Neighbourhood'] == 'Not assigned'
data1 = Toronto_neighbourhoods.loc[borough_unassigned & neighbourhood_unassigned]

In [59]:
# Case 2 - Rows that have a borough but a 'Not assigned' neighbourhood:
# the neighbourhood takes the same name as its borough.
has_borough = Toronto_neighbourhoods['Borough'] != 'Not assigned'
lacks_neighbourhood = Toronto_neighbourhoods['Neighbourhood'] == 'Not assigned'
data2 = Toronto_neighbourhoods.loc[has_borough & lacks_neighbourhood]

# Copy each row's own borough into Neighbourhood instead of hard-coding
# "Queen's Park", so any borough with an unassigned neighbourhood is handled correctly.
data2 = data2.assign(Neighbourhood=data2['Borough'])
data2

Unnamed: 0,PostalCode,Borough,Neighbourhood
8,M7A,Queen's Park,Queen's Park


In [60]:
# Case 1 - Rows whose borough is 'Not assigned' ---- such rows are ignored
data4 = Toronto_neighbourhoods[Toronto_neighbourhoods['Borough'].eq('Not assigned')]

In [61]:
# Case 3 - Rows with both a borough and a neighbourhood assigned.
# Postal codes can repeat here; the duplicates are combined into one row later on.
borough_missing = Toronto_neighbourhoods['Borough'] == 'Not assigned'
neighbourhood_missing = Toronto_neighbourhoods['Neighbourhood'] == 'Not assigned'
data3 = Toronto_neighbourhoods.loc[~(borough_missing | neighbourhood_missing)]
data3

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern
14,M3B,North York,Don Mills North


In [62]:
# Stack the Case 2 rows on top of the Case 3 rows to form one combined dataframe
frames_to_stack = [data2, data3]
Combined_dataframe = pd.concat(frames_to_stack, sort=False)
Combined_dataframe

Unnamed: 0,PostalCode,Borough,Neighbourhood
8,M7A,Queen's Park,Queen's Park
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [63]:
# Foursquare location data needs the latitude and longitude of each neighbourhood.
# Step 1 - Load the coordinates from the csv file (the Geocoder package is the alternative)
#          and name the key column 'PostalCode' so it matches the scraped dataframe.
path = 'http://cocl.us/Geospatial_data'
Location_coordinates = pd.read_csv(path).rename(columns={'Postal Code': 'PostalCode'})
Location_coordinates

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [82]:
# Attach latitude/longitude to each row by joining on PostalCode.
# Dropping any coordinate columns left over from an earlier run keeps this cell
# idempotent: without it, re-running merges onto an already-merged frame and
# yields suffixed Latitude_x/Latitude_y columns that the later groupby cannot find.
# how='right' keeps every postal code present in Location_coordinates.
Combined_dataframe = (
    Combined_dataframe
    .drop(columns=['Latitude', 'Longitude'], errors='ignore')
    .merge(Location_coordinates, on='PostalCode', how='right')
)

In [83]:
# Collapse rows that share a postal code into a single row, joining the
# neighbourhood names with a comma
group_keys = ['PostalCode', 'Borough', 'Latitude', 'Longitude']
column_order = ['PostalCode', 'Borough', 'Neighbourhood', 'Latitude', 'Longitude']

# sort=False keeps groups in first-appearance order; as_index=False keeps the keys as columns
grouped = Combined_dataframe.groupby(group_keys, sort=False, as_index=False)

# Aggregate, then arrange the columns in their presentation order
Combined_TorontoNeighbourhoods = grouped.agg(','.join)[column_order]
Combined_TorontoNeighbourhoods

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
1,M3A,North York,Parkwoods,43.753259,-79.329656
2,M4A,North York,Victoria Village,43.725882,-79.315572
3,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.654260,-79.360636
4,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens,Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937


In [84]:
# Show the (rows, columns) dimensions of the grouped dataframe as a quick size check
Combined_TorontoNeighbourhoods.shape

(103, 5)

In [85]:
## Explore and cluster the neighborhoods in Toronto
# Matplotlib colormap and colour-handling modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# k-means clustering estimator from scikit-learn
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Confirmation message so the cell shows visible output once the imports succeed
print('Libraries imported.')

Libraries imported.


In [86]:
# Use geopy's Nominatim geocoder to look up the latitude and longitude of Toronto
address = 'Toronto, Ontario'

geolocator=Nominatim(user_agent='tor_explorer')
location=geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear
# message here rather than an AttributeError on `location.latitude` below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude=location.latitude
longitude=location.longitude
print('The geographical coordinates of Toronto are {},{}.'.format(latitude,longitude))

The geographical coordinates of Toronto are 43.653963,-79.387207.


In [87]:
# Build a folium map centred on Toronto and overlay one circle marker per dataframe row
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Iterate the four relevant columns in lockstep and add a marker for each row
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighbourhood']
row_values = zip(*(Combined_TorontoNeighbourhoods[column] for column in marker_columns))
for lat, lng, borough, neighbourhood in row_values:
    # Popup text is "<neighbourhood>,<borough>"
    label = folium.Popup('{},{}'.format(neighbourhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto