In [14]:
# download and install the beautifulsoup4 library which is used for web scraping
#!conda install -c conda-forge beautifulsoup4 --yes

# download and install the lxml library which is used parsing the webcontent
#!conda install -c conda-forge lxml --yes

In [1]:
# import all the necessary library
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd

In [2]:
source_code = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source_code, 'lxml')
table = soup.find('table', {'class':'wikitable sortable'})

postcode = []
borough = []
neigh = []
for contents in table.find_all('tr')[1:]:
    postcode.append((contents.find_all('td')[0].text))
    borough.append((contents.find_all('td')[1].text))
    neigh.append((contents.find_all('td')[2].text).strip())

In [3]:
pc_ca = {'Borough':borough, 'Neighbourhood':neigh, 'PostCode':postcode}
labels = ['PostCode', 'Borough', 'Neighbourhood']
df = pd.DataFrame(pc_ca, columns=labels)
df.head()

Unnamed: 0,PostCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [4]:
# Ignore cells with a borough that is Not assigned
df.loc[:, 'Borough'].replace('Not assigned', np.nan, inplace=True)
df.dropna(subset=['Borough'], inplace=True)
df.reset_index(drop=True, inplace=True)
df.head(10)

Unnamed: 0,PostCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [20]:
# combine into one row with the neighborhoods having same postcode
df_grouped = pd.DataFrame({'Neighbourhood': df.groupby(['PostCode', 'Borough'])['Neighbourhood'].apply(','.join)}).reset_index()
df_grouped.head(10)

Unnamed: 0,PostCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [26]:
# Not assigned neighborhood will be the same as the borough
df_grouped.loc[:, 'Neighbourhood'].replace('Not assigned', df['Borough'], inplace=True)
df_grouped

0                                             Parkwoods
1                                      Victoria Village
2                                          Harbourfront
3                                           Regent Park
4                                      Lawrence Heights
5                                        Lawrence Manor
6                                          Queen's Park
7                                      Islington Avenue
8                                                 Rouge
9                                               Malvern
10                                      Don Mills North
11                                     Woodbine Gardens
12                                        Parkview Hill
13                                              Ryerson
14                                      Garden District
15                                            Glencairn
16                                           Cloverdale
17                                            Is

In [7]:
df_grouped.shape

(103, 3)

In [8]:
df_grouped.to_csv('Postcode.csv')