# Import BeautifulSoup and the other required libraries

In [3]:
import pandas as pd
import numpy as mp
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import requests

# Parse data from the HTML page

In [4]:
# Download the Wikipedia page listing Canadian postal codes starting with "M".
# A timeout keeps the kernel from hanging on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of silently
# parsing an error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text

# Parse the HTML with the lxml parser so the table can be searched below.
soup = BeautifulSoup(source, 'lxml')

In [5]:
# Return the index of an item if it exists in a list, otherwise None.
def is_in_list(element, list_element):
    """Look up the position of ``element`` inside ``list_element``.

    Parameters
    ----------
    element : object
        Value to search for.
    list_element : list
        Sequence searched via its ``index`` method.

    Returns
    -------
    int or None
        Index of the first occurrence of ``element``, or ``None`` when
        ``index`` raises ``ValueError`` (i.e. the value is absent).
        Note that a valid result may be ``0``, which is falsy, so callers
        should compare the result against ``None`` explicitly.
    """
    try:
        position = list_element.index(element)
    except ValueError:
        position = None
    return position

In [6]:
# Parse the first <table> on the page into three parallel column lists.
table = soup.find('table')
if table is None:
    # Fail with a clear message instead of an AttributeError on the next line.
    raise ValueError('No <table> element found in the downloaded page')

data = {'PostalCode': [], 'Borough': [], 'Neighborhood': []}
rawdata = table.find_all('tr')
for row in rawdata[1:]:  # the first row is skipped (presumably the header row)
    cell = row.find_all('td')
    if len(cell) < 3:
        # Rows without three <td> cells cannot be read as
        # (postal code, borough, neighborhood); skip them.
        continue

    # Strip surrounding whitespace from every field so that comparisons with
    # 'Not assigned' and the later merge on PostalCode see clean values.
    postal_code = cell[0].text.strip()
    borough = cell[1].text.strip()
    neighborhood = cell[2].text.strip()

    # Rows whose borough is not assigned are dropped entirely.
    if borough == 'Not assigned':
        continue
    # An unassigned neighborhood falls back to the borough name.
    if neighborhood == 'Not assigned':
        neighborhood = borough

    # Look the postal code up once. The result must be compared against None:
    # index 0 is a valid match but is falsy, so a plain truthiness test would
    # treat the first postal code as "not seen yet" and add a duplicate row.
    postal_code_index = is_in_list(postal_code, data['PostalCode'])
    if postal_code_index is not None:
        # Postal code already recorded: append this neighborhood to its entry.
        data['Neighborhood'][postal_code_index] += ', ' + neighborhood
    else:
        data['PostalCode'].append(postal_code)
        data['Borough'].append(borough)
        data['Neighborhood'].append(neighborhood)

In [11]:
# Build a pandas DataFrame from the parsed column lists, stating the
# column order explicitly, then preview the first rows.
df_ca = pd.DataFrame(
    data,
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)
df_ca.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [12]:
# Dimensions of the parsed table as (rows, columns).
df_ca.shape

(103, 3)

# Read geospatial coordinates and merge them into the main DataFrame

In [24]:
# Load the postal-code coordinates and rename the "Postal Code" column to
# "PostalCode" so it matches the key used in df_ca.
# index=str is kept from the original call: it converts the row labels to
# strings. NOTE(review): this looks unintentional - confirm whether it is needed.
geo_coordinate = (
    pd.read_csv('https://cocl.us/Geospatial_data')
    .rename(index=str, columns={"Postal Code": "PostalCode"})
)
geo_coordinate.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [25]:
# Attach latitude/longitude to each postal code. The join uses pandas' default
# (inner) strategy on the shared PostalCode column, so only postal codes
# present in both frames are kept.
df = df_ca.merge(geo_coordinate, on='PostalCode')
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
