Importing Libraries

In [16]:
import numpy as np
import pandas as pd
import json
from bs4 import BeautifulSoup
import requests

Scraping Wikipedia Page

In [17]:
# Download the HTML of the Wikipedia "List of postal codes of Canada: M" page.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() aborts on an HTTP error instead of handing an error page
# to the parser.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
response.raise_for_status()
data = response.text

In [18]:
# Parse the downloaded HTML and collect every <tr> of the first <table> on the page.
soup = BeautifulSoup(data, 'html.parser')
table = soup.find('table')
if table is None:
    # find() returns None when nothing matches; fail here with a clear message
    # rather than with an AttributeError on the next line.
    raise ValueError("No <table> element found in the downloaded page")
rows = table.find_all('tr')

Creating Dataframe

In [19]:
# Three parallel lists, one entry per table row that contains <td> cells.
PostalCodes, Boroughs, Neighborhoods = [], [], []

# (target list, cell position) pairs, in the order the values are appended.
column_targets = ((PostalCodes, 0), (Boroughs, 1), (Neighborhoods, 2))

for table_row in rows:
    tds = table_row.find_all('td')
    if not tds:
        # Rows with no <td> cells carry no data for these lists.
        continue
    for target, position in column_targets:
        # Strip only the trailing newline(s) from the cell text.
        target.append(tds[position].text.rstrip('\n'))

In [20]:
# Combine the three parallel lists into a single frame, one column per list.
df = pd.DataFrame(
    dict(
        PostalCode=PostalCodes,
        Borough=Boroughs,
        Neighborhood=Neighborhoods,
    )
)

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


Dropping entries whose borough is "Not assigned"

In [21]:
# Keep only the rows whose Borough value differs from the 'Not assigned' placeholder.
df = df.loc[df['Borough'].ne('Not assigned')]

Grouping neighborhoods together

In [22]:
# Give every row the comma-separated list of all neighborhoods that share its
# postal code. transform() returns one value per original row, so the number
# of rows does not change.
# assign() builds a new frame rather than writing a column into the filtered
# frame, which avoids pandas' SettingWithCopyWarning.
df = df.assign(
    Neighborhood=df.groupby('PostalCode')['Neighborhood'].transform(
        lambda names: ', '.join(names)
    )
)


Getting the shape of the dataframe

In [23]:
# Show the (rows, columns) tuple of the frame after filtering and joining.
df.shape

(103, 3)

In [24]:
# Preview the first rows; the index keeps its original labels because the frame was filtered without a reset.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


Reading the latitude and longitude coordinates from a CSV file

In [25]:
# Load the per-postal-code latitude/longitude table from the CSV file located
# in the notebook's working directory.
coordinates = pd.read_csv(filepath_or_buffer="Geospatial_Coordinates.csv")
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [26]:
# Rename the key column so it matches the "PostalCode" column of the scraped frame.
column_renames = {"Postal Code": "PostalCode"}
coordinates = coordinates.rename(columns=column_renames)
coordinates.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Dataframe with the coordinates

In [27]:
# Attach Latitude/Longitude to each scraped row by matching on PostalCode.
# how="inner" (the pandas default, spelled out here) keeps only postal codes
# present in both frames.
# validate="many_to_one" makes pandas raise a MergeError if `coordinates` has
# more than one row for a postal code, instead of silently multiplying rows.
df_longlat = pd.merge(
    df,
    coordinates,
    on='PostalCode',
    how='inner',
    validate='many_to_one',
)
df_longlat.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494
