## Build the required environment

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

#Load the package of beautiful soup
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium

print('Libraries imported.')

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Libraries imported.


## Fetch and parse the HTML

In [2]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Use a context manager so the HTTP response (and its socket) is closed as
# soon as the body has been read, instead of waiting for garbage collection.
with urlopen(url) as response:
    page = response.read().decode('utf-8')
soup = BeautifulSoup(page, 'html.parser')

# The first <table> in the body is the one scraped by the cells below.
# 'html.parser' does not insert a <tbody> that is missing from the markup, so
# fall back to the <table> element itself rather than leaving wiki_table as None.
wiki_table = soup.body.table.tbody
if wiki_table is None:
    wiki_table = soup.body.table

In [3]:
def get_cell(element):
    """Return the text of every <td> cell found under ``element``.

    For each cell, the text of its first <a> link is used when that link has
    non-empty text; otherwise the cell's own text is used with surrounding
    whitespace stripped.

    Parameters
    ----------
    element : object exposing ``find_all('td')`` (e.g. a BeautifulSoup <tr> tag)

    Returns
    -------
    list of str
        One entry per <td>, in document order. Empty when there are no <td> cells.
    """
    row = []
    for cell in element.find_all('td'):
        link = cell.a
        if link and link.text:
            row.append(link.text)
            continue
        # ``cell.string`` is None whenever the cell has more than one child node
        # (or only an empty <a>), which previously raised AttributeError on
        # ``.strip()``. Fall back to the concatenated text in that case.
        text = cell.string
        if text is None:
            text = cell.get_text()
        row.append(text.strip())

    return row

In [4]:
def get_row():
    """Collect the three-cell rows of the global ``wiki_table``.

    Every <tr> of ``wiki_table`` is converted with ``get_cell``; rows that do
    not yield exactly three values are skipped.

    Returns
    -------
    list of list
        The kept rows, in table order.
    """
    parsed_rows = (get_cell(tr) for tr in wiki_table.find_all('tr'))
    return [cells for cells in parsed_rows if len(cells) == 3]

In [5]:
# Column labels for the three values kept from each table row.
columns = ['Postcode', 'Borough', 'Neighbourhood']

# Scrape the rows and load them into a DataFrame.
data = get_row()
df = pd.DataFrame(data=data, columns=columns)

# Preview the first ten rows.
df.head(n=10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
7,M8A,Not assigned,
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,Malvern / Rouge


## Clean the data

In [6]:
# Keep only rows whose Borough is assigned, order them by postcode (then
# borough), and renumber the index from 0 without keeping the old one.
has_borough = df['Borough'] != 'Not assigned'
df_NoB = (
    df[has_borough]
    .sort_values(by=['Postcode', 'Borough'])
    .reset_index(drop=True)
)
df_NoB.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [7]:
# Unique postcodes, in order of first appearance in df_NoB. Kept as its own
# name in case later cells use it. drop_duplicates() returns a new Series, so
# the column of df_NoB is no longer modified in place through a view.
df_postcode = df_NoB['Postcode'].drop_duplicates()

# Collapse df_NoB to one row per postcode in a single grouped pass instead of
# comparing every postcode against every row with nested iloc loops:
#   * Borough       -> value from the last row of the group (the earlier loop
#                      overwrote it on every match, so the last match won);
#   * Neighbourhood -> all values of the group joined with ','.
# sort=False keeps the groups in the order the postcodes first appear.
df_post = (
    df_NoB
    .groupby('Postcode', sort=False, as_index=False)
    .agg({'Borough': 'last', 'Neighbourhood': ','.join})
)

# Sanity check: exactly one row per distinct postcode.
assert len(df_post) == len(df_postcode)

In [8]:
df_post.shape

(103, 3)