In [2]:
from bs4 import BeautifulSoup
import numpy as np # library to handle data in a vectorized manner
import pandas as pd
import requests

In [3]:
# Download the Wikipedia page that lists the postal codes starting with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Without a timeout, requests waits indefinitely if the server never answers.
rs = requests.get(url, timeout=30)
# Fail here on an HTTP error status instead of parsing an error page later on.
rs.raise_for_status()
print(rs.status_code)

200


In [4]:
# Parse the raw response bytes into a navigable tree using the lxml parser.
soup = BeautifulSoup(markup=rs.content, features="lxml")

In [8]:
# Locate the table with class "wikitable sortable" and collect its <tr> rows.
table = soup.find('table', {"class":"wikitable sortable"})
trs = table.find_all('tr')

# For every row that contains <td> cells, keep the stripped text of its first
# three cells; rows with no <td> cells are skipped.
records = [
    (cells[0].text.strip(), cells[1].text.strip(), cells[2].text.strip())
    for cells in (row.find_all('td') for row in trs)
    if cells
]

# Split the per-row records into three parallel lists, one per column.
postcode = [record[0] for record in records]
borough = [record[1] for record in records]
neighbourhood = [record[2] for record in records]


<b>The dataframe will consist of three columns: postcode, borough, and neighbourhood.</b>

In [9]:
# Build a 3-row frame whose rows are labelled with the final column names,
# then transpose it so each list becomes one named column.
dataFrame = pd.DataFrame(
    [postcode, borough, neighbourhood],
    index=['postcode', 'borough', 'neighbourhood'],
).T
dataFrame.head()

Unnamed: 0,postcode,borough,neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


<b>Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.</b>

In [11]:
# Drop every row whose borough is the literal "Not assigned" and renumber
# the remaining rows from 0.
dataFrame = (
    dataFrame
    .loc[lambda frame: frame["borough"].ne("Not assigned")]
    .reset_index(drop=True)
)
dataFrame.head()

Unnamed: 0,postcode,borough,neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


<b>Group rows that share the same postal code, joining their neighbourhoods into one comma-separated value.</b>

In [15]:
# Collapse rows that share a (postcode, borough) pair into a single row whose
# neighbourhood is the ", "-joined list of the group's neighbourhoods.
dataFrame = (
    dataFrame
    .groupby(["postcode", "borough"], as_index=False)
    .agg({"neighbourhood": ", ".join})
)
dataFrame.head()

Unnamed: 0,postcode,borough,neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


<b>If a neighbourhood is "Not assigned", set it to the same value as its borough.</b>

In [16]:
# Rows yielded by iterrows() may be copies, so assigning to them is not
# guaranteed to change dataFrame. Select the affected rows with a boolean
# mask and write through .loc so the update always lands in the frame.
unassigned = dataFrame["neighbourhood"] == "Not assigned"
dataFrame.loc[unassigned, "neighbourhood"] = dataFrame.loc[unassigned, "borough"]

dataFrame.head()

Unnamed: 0,postcode,borough,neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


<b>Number of rows and columns</b>

In [19]:
# Dimensions of dataFrame as a (rows, columns) tuple.
dataFrame.shape

(103, 3)