In [1]:
import pandas as pd
import numpy as np
import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents 

#mapping tools
#!pip install geopy 
#from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

#!pip install folium
#import folium # map rendering library

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

## step 1 use BeautifulSoup to scrape data from website: 

In [2]:
data = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text


In [3]:
# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(data, 'html.parser')

## step 2 based on the table in the wikipedia (search for table and extract to dict & create pd DataFrame)

In [4]:
#search for table
table = soup.find('table',{'class':'wikitable sortable'})
table; #remove ';' to view output

# step 3 extract table data and create pd DataFrame


In [5]:
#extract rows to dict
rows = []
for row in table.find_all("tr"):
    cols = row.find_all("td")
    cols = [ele.text.strip() for ele in cols]
    rows.append(cols)

rows; #remove ';' to view output

## Create dataframe with 103 Postcodes ('Postcode', 'Borough', 'Neighborhood')

In [6]:
#create initial pd DataFrame
df = pd.DataFrame(rows)
df = df.rename(columns={0:"Postcode",1:"Borough",2:"Neighborhood"})
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [7]:
df.shape

(289, 3)

In [8]:
df_bkp = df.copy()

# drop columns where Borough is Not assigned or None

In [9]:
df = df.drop([0])

In [10]:
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [11]:
df.drop(df[df['Borough']=='Not assigned'].index,inplace=True)

In [12]:
df.reset_index(drop=True)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


## step 4 - data transform - if 'Neighborhood' = 'Not Assigned', then use 'Borough'

In [13]:
df[df.Neighborhood=='Not assigned']

Unnamed: 0,Postcode,Borough,Neighborhood
9,M7A,Queen's Park,Not assigned


In [14]:
df.Neighborhood[df.Neighborhood == 'Not assigned'] = df.Borough

#### More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.

In [15]:
df = df.groupby(['Postcode', 'Borough'])['Neighborhood'].apply(list).apply(lambda x:', '.join(x)).to_frame().reset_index()

In [16]:
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [17]:
df.shape

(103, 3)