In [30]:
# import all the libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [31]:
# Download the raw HTML of the Wikipedia page
# "List of postal codes of Canada: M".
# The timeout keeps this cell from hanging forever on a stalled connection,
# and raise_for_status() stops the notebook on an HTTP error instead of
# letting an error page flow into the parser in the following cells.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
response.raise_for_status()
source = response.text

In [32]:
# parse the downloaded HTML with the lxml parser so it can be searched by tag
soup = BeautifulSoup(source, features="lxml")

In [33]:
# Locate the table body: descend body -> div.mw-body -> div#bodyContent,
# then take the first <table> found in there and its <tbody>.
table = (
    soup.find("body")
    .find("div", class_="mw-body")
    .find("div", class_="mw-body-content", id="bodyContent")
    .table
    .tbody
)

In [34]:
# Build one list per <tr>, holding the text of each of its <td> cells.
# A row without any <td> (e.g. a header row made of <th>) gives an empty list.
all_postcodes = [
    [cell.text for cell in row.find_all("td")]
    for row in table.find_all("tr")
]

In [35]:
# column names for the scraped rows, applied positionally when the dataframe is built
labels = [
    "Postcode",
    "Borough",
    "Neighbourhood",
]

In [36]:
# Turn the scraped rows into a dataframe with the three named columns, then
# discard every row that has a missing value.
# NOTE(review): presumably the rows dropped here are the ones that had no
# <td> cells — confirm against the scraped table.
df = (
    pd.DataFrame
    .from_records(all_postcodes, columns=labels)
    .dropna()
)

In [37]:
# Keep only the rows whose Borough is known (drop the "Not assigned" ones).
# .copy() makes the result an independent frame, so assigning to its columns
# in later cells does not raise SettingWithCopyWarning on pandas versions
# without copy-on-write.
df = df[df["Borough"] != "Not assigned"].copy()

In [38]:
# Remove the trailing "\n" that the scraped cell text carries in the
# Neighbourhood column.
# rstrip only removes newline characters, so a value without one is left
# intact and re-running this cell cannot eat into the names (cutting the
# last character unconditionally would).
df["Neighbourhood"] = df["Neighbourhood"].str.rstrip("\n")

In [39]:
# where the neighbourhood is "Not assigned", fall back to the borough name;
# every other neighbourhood is kept as it is
df["Neighbourhood"] = np.where(
    df["Neighbourhood"] != "Not assigned",
    df["Neighbourhood"],
    df["Borough"],
)

In [40]:
# dropping any duplicates
# (no subset is given, so a row is removed only when all of its columns
# repeat an earlier row; the first occurrence is the one kept)
df = df.drop_duplicates()

In [41]:
# collapse the rows to one per (Postcode, Borough) pair, joining that pair's
# neighbourhood names into a single comma-separated string
grouped_df = (
    df.groupby(["Postcode", "Borough"])["Neighbourhood"]
    .apply(", ".join)
    .reset_index()
)

In [42]:
# shape of grouped neighbourhood dataframe, shown as (rows, columns);
# the row count is the number of distinct (Postcode, Borough) pairs
grouped_df.shape

(103, 3)

In [43]:
# reading Geospatial_Coordinates into pandas dataframe
# (relative path: the CSV is looked up in the notebook's working directory)
# NOTE(review): the merge further down joins on a 'Postal Code' column of
# this frame — confirm the file's header matches
lat_long_coords = pd.read_csv("Geospatial_Coordinates.csv")

In [44]:
# attach the coordinates to each grouped row: inner-join Postcode against the
# coordinates frame's 'Postal Code', then drop that now-redundant key column
new_data = grouped_df.merge(
    lat_long_coords,
    how="inner",
    left_on="Postcode",
    right_on="Postal Code",
).drop(columns=["Postal Code"])

In [54]:
# keep only the rows whose Borough contains "Toronto" and renumber them from 0,
# discarding the old row labels
toronto_data = (
    new_data[new_data["Borough"].str.contains("Toronto")]
    .reset_index(drop=True)
)

In [56]:
# display the Toronto-only dataframe (bare last expression -> rich notebook output)
toronto_data

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049
