In [1]:
import requests
import pandas as pd
# Passing None removes pandas' display limits, so DataFrames are rendered
# with all of their columns and rows instead of being truncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import numpy as np
from bs4 import BeautifulSoup
# !conda install -c conda-forge beautifulsoup4 --yes

## Download and Explore Dataset

In [2]:
# Download the Wikipedia page and parse its HTML.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error (4xx/5xx) instead of
# silently parsing an error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
data = response.text
soup = BeautifulSoup(data, 'html.parser')

In [3]:
# Take the first <table> of the parsed page and collect every <td> inside it.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found in the downloaded page')
td = table.find_all('td')

# The cells are consumed as consecutive (postal code, borough, neighborhood)
# triples. Fail early with a clear message if the cell count does not fit that
# pattern, rather than hitting an IndexError halfway through with partly
# filled lists.
if len(td) % 3 != 0:
    raise ValueError(
        'Expected the number of table cells to be a multiple of 3, got %d' % len(td)
    )

# Every third cell, starting at offsets 0, 1 and 2, feeds one of the lists.
Postalcode = [cell.text.strip() for cell in td[0::3]]
Borough = [cell.text.strip() for cell in td[1::3]]
Neighborhood = [cell.text.strip() for cell in td[2::3]]

In [4]:
# Stack the three parallel lists as labelled rows, then transpose so that each
# list becomes a column carrying its label.
column_labels = ['Postal Code', 'Borough', 'Neighborhood']
stacked = pd.DataFrame([Postalcode, Borough, Neighborhood], index=column_labels)
df_canada = stacked.T
df_canada.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Ignore cells with a borough that is Not assigned

In [5]:
# Keep only rows whose borough is present and is not the 'Not assigned'
# placeholder. A boolean filter that rebinds the frame replaces the chained
# `df['Borough'].replace(..., inplace=True)`, which operates on a temporary
# column object and leaves the frame unchanged under pandas Copy-on-Write.
# The original row index labels are preserved.
has_borough = df_canada['Borough'].notna() & (df_canada['Borough'] != 'Not assigned')
df_canada = df_canada.loc[has_borough].copy()

df_canada.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


### If more than one neighborhood exists in one postal code area, these neighborhoods will be combined into one row, with the neighborhoods separated by a comma

First, any neighborhood that is 'Not assigned' is given the name of its borough.

In [6]:
def impute_neirghborhood(row):
    """Fill a 'Not assigned' neighborhood with the row's borough.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must support item access for the keys 'Neighborhood' and 'Borough'.

    Returns
    -------
    The same ``row`` object. It is modified in place when its 'Neighborhood'
    value equals 'Not assigned'; otherwise it is returned untouched.
    """
    is_unassigned = row['Neighborhood'] == 'Not assigned'
    if not is_unassigned:
        return row

    row['Neighborhood'] = row['Borough']
    return row

# Vectorized equivalent of running impute_neirghborhood on every row: keep the
# neighborhood where it is not 'Not assigned', otherwise take the borough from
# the same row. This avoids row-wise apply() with a function that mutates the
# row it receives, which pandas does not support and which is slow.
df_canada = df_canada.assign(
    Neighborhood=df_canada['Neighborhood'].where(
        df_canada['Neighborhood'] != 'Not assigned',
        df_canada['Borough'],
    )
)
df_canada.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [7]:
df_canada.shape

(210, 3)