In [1]:
#Import the necessary libraries 
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# Scrape the wiki page content by using BeautifulSoup.

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever on a stalled connection
# (requests waits indefinitely when no timeout is given).
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of parsing an error page as if it
# were the postal code table.
response.raise_for_status()
wiki_html = response.text
soup = BeautifulSoup(wiki_html, 'html.parser')


In [3]:
# Convert the post code HTML table into a list of rows: one list of stripped
# cell texts per <tr>. A row that has no <td> cells yields an empty list.
data = [
    [cell.get_text().strip() for cell in row.find_all('td')]
    for row in soup.tbody.find_all('tr')
]

In [6]:
# Build the dataframe from the scraped rows and preview it; it consists of
# 3 columns: postal code, borough, neighbourhood.
column_names = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(data=data, columns=column_names)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [7]:
# Drop rows containing None, then drop every row in which any of the three
# columns holds the placeholder text 'Not assigned'.
# NOTE(review): a row with a real borough but a 'Not assigned' neighbourhood is
# discarded as well — confirm that this is intended.
df = df.dropna()
empty = 'Not assigned'
has_placeholder = (
    (df.PostalCode == empty)
    | (df.Borough == empty)
    | (df.Neighborhood == empty)
)
df = df[~has_placeholder]
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


In [8]:
#Group the dataframe by 'PostalCode' and 'Borough', join each group's neighbourhoods into one comma-separated string, and collect the result in a new dataframe
def neighborhood_list(grouped):
    """Return the group's neighbourhood names, sorted and joined with ', '.

    grouped: one group of the dataframe; only its 'Neighborhood' column is read.
    """
    names = grouped['Neighborhood'].tolist()
    names.sort()
    return ', '.join(names)
                    
# One group per (PostalCode, Borough) pair; each group is collapsed to a single
# comma-separated string and the result is turned back into a dataframe.
group_keys = ['PostalCode', 'Borough']
grp = df.groupby(group_keys)
joined_neighborhoods = grp.apply(neighborhood_list)
dfgroup = joined_neighborhoods.reset_index(name='Neighborhood')
dfgroup.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [9]:
# Write the grouped dataframe to a CSV file, leaving out the index column.
dfgroup.to_csv(path_or_buf='Toronto.csv', index=False)

In [10]:
# Display the shape of the final dataframe as a (rows, columns) tuple; the first entry is the number of rows.
dfgroup.shape

(102, 3)