In [100]:
import requests 
from bs4 import BeautifulSoup 
import re
import pandas as pd
import numpy as np


In [None]:
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# A timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
r = requests.get(URL, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, 'lxml')
# Preview only the start of the document; the full prettified page is very long.
print(soup.prettify()[:1000])

In [101]:
# Collect every <tr> element found anywhere in the parsed document.
rows = soup.find_all(name='tr')

In [102]:
# Sanity check of the parse: only the <td> cells of the last <tr> are kept, so fetch
# them directly rather than re-running find_all on every row. Falls back to an empty
# list when no rows were found, so row_td is always defined.
row_td = rows[-1].find_all('td') if rows else []
row_td

In [103]:
# Matches any HTML tag (non-greedy); compiled once because it is the same for every row.
clean = re.compile('<.*?>')

# str() of the list of <td> tags renders as "[<td>..</td>, <td>..</td>, ...]"; removing
# the tags leaves one "[text, text, text]" string per table row. Rows without <td>
# cells (e.g. header rows) become "[]".
list_rows = [clean.sub('', str(row.find_all('td'))) for row in rows]

In [43]:
df = pd.DataFrame(list_rows)
# Select all the rows which have the table data. Rows are chosen by content (strings
# that start with a postal code such as "[M1A,") rather than by a hard-coded position
# range, which would silently pick the wrong rows if the page gained or lost a row.
# NOTE(review): assumes every data row starts with "[M<digit><letter>," as in the
# rows previewed below - confirm against the live page.
df = df[df[0].str.match(r'\[M\d[A-Z],')]
df.tail(10)

Unnamed: 0,0
278,"[M4Z, Not assigned, Not assigned\n]"
279,"[M5Z, Not assigned, Not assigned\n]"
280,"[M6Z, Not assigned, Not assigned\n]"
281,"[M7Z, Not assigned, Not assigned\n]"
282,"[M8Z, Etobicoke, Kingsway Park South West\n]"
283,"[M8Z, Etobicoke, Mimico NW\n]"
284,"[M8Z, Etobicoke, The Queensway West\n]"
285,"[M8Z, Etobicoke, Royal York South West\n]"
286,"[M8Z, Etobicoke, South of Bloor\n]"
287,"[M9Z, Not assigned, Not assigned\n]"


In [122]:
# Cleaning the data
#
# Each raw value looks like "[M3A, North York, Parkwoods\n]". Remove the surrounding
# brackets/newline, split into exactly three fields, then trim each field.
Canada_Post = (
    df[0]
    .str.strip('[')
    .str.strip('\n]')
    # n=2 caps the result at three columns, so a comma inside a neighborhood name
    # stays in the last field instead of spilling into a fourth column.
    .str.split(',', n=2, expand=True)
    # The fields are separated by ", ", so a plain split leaves a leading space on the
    # 2nd and 3rd fields; trimming makes exact comparisons such as
    # == 'Not assigned' behave as intended.
    .apply(lambda column: column.str.strip())
)

In [123]:
# Give the three positional columns produced by the split their descriptive names.
Canada_Post.columns = [
    'Postcode',
    'Borough',
    'Neighborhood',
]
Canada_Post.head(n=5)

Unnamed: 0,Postcode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [124]:
# Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
# Compare trimmed values exactly instead of a regex substring search, and treat a
# missing borough as unassigned too.
borough = Canada_Post['Borough'].str.strip()
has_borough = borough.notna() & (borough != 'Not assigned')
# Chaining reset_index returns a fresh frame rather than mutating a filtered slice
# in place, which can otherwise lead to SettingWithCopyWarning on later assignments.
Canada_Post = Canada_Post[has_borough].reset_index(drop=True)
Canada_Post.head()

In [126]:
# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough
# The comparison is made on trimmed values: the fields come from splitting on ','
# text that is separated by ', ', so an untrimmed value is ' Not assigned' (leading
# space) and an exact match against 'Not assigned' would never succeed.
neighborhood_missing = Canada_Post['Neighborhood'].str.strip() == 'Not assigned'
# assign() builds a new frame instead of writing a column into a possibly filtered one.
Canada_Post = Canada_Post.assign(
    Neighborhood=np.where(
        neighborhood_missing,
        Canada_Post['Borough'],
        Canada_Post['Neighborhood'],
    )
)
Canada_Post

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
...,...,...,...
205,M8Z,Etobicoke,Kingsway Park South West
206,M8Z,Etobicoke,Mimico NW
207,M8Z,Etobicoke,The Queensway West
208,M8Z,Etobicoke,Royal York South West


In [127]:
# More than one neighborhood can exist in one postal code area: collapse the rows of
# each (Postcode, Borough) pair into a single row whose Neighborhood is a
# comma-separated list of the individual names.
Canada_Post = (
    Canada_Post
    .groupby(['Postcode', 'Borough'])['Neighborhood']
    # Trim each name and join with an explicit ", " so the separator does not depend
    # on stray leading spaces in the values (and the result has none of its own).
    .apply(lambda names: ', '.join(name.strip() for name in names))
    .reset_index()
)
Canada_Post.head()


Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [129]:

# Dimensions of the grouped table, shown as (rows, columns).
n_rows, n_cols = Canada_Post.shape
(n_rows, n_cols)

(103, 3)

In [130]:
# Preview the first 20 rows of the grouped table.
Canada_Post.iloc[:20]

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [131]:
# Write the grouped table to a CSV file in the current working directory.
# index=True is the pandas default, spelled out here: the row index is written as
# the leading, unnamed column of the file.
Canada_Post.to_csv(path_or_buf='Canada_Postal_Data.csv', index=True)