In [1]:
! pip install geocoder html5lib lxml pandas numpy bs4



In [2]:
import numpy as np
import pandas as pd
import geocoder
from bs4 import BeautifulSoup
from urllib.request import urlopen
import csv

In [3]:
# Address of the Wikipedia page that is downloaded and scraped below.
postal_codes_url = (
    "https://en.wikipedia.org/wiki/"
    "List_of_postal_codes_of_Canada:_M"
)

In [4]:
# Download the page and parse it with the lxml parser. The timeout (in seconds)
# keeps the cell from hanging indefinitely if the server never responds.
with urlopen(postal_codes_url, timeout=30) as html_doc:
    soup = BeautifulSoup(html_doc.read(), "lxml")

In [5]:
# Column names are the stripped <th> texts of the first row of the first table.
table = soup.table
header_row = table.find('tr')
df_array = {th.text.strip(): [] for th in header_row.find_all('th')}

# Fill the columns row by row. A row with fewer <td> cells than there are
# columns is skipped: appending it would leave the column lists with unequal
# lengths and make pd.DataFrame raise. Cells beyond the last column are
# ignored, because zip stops at the shorter input.
for row in table.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) < len(df_array):
        continue
    for key, td in zip(df_array, cells):
        df_array[key].append(td.text.strip())
df = pd.DataFrame(df_array)

In [6]:
# Preview the first rows of the scraped table.
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Replacing "Not assigned" with '' (empty string)

In [7]:
# Blank out every cell whose value is exactly 'Not assigned'. The result is
# rebound to `df` instead of mutating the frame with inplace=True.
df = df.replace(to_replace='Not assigned', value='')

### Dropping rows whose Borough is '' (empty string)

In [8]:
# Keep only the rows that have a borough. The explicit .copy() makes `df` an
# independent frame, so assigning a column to it afterwards does not trigger
# pandas' SettingWithCopyWarning.
df = df[df['Borough'] != ''].copy()

### Filling empty Neighbourhood values with the Borough name

In [9]:
def fill_neighbourhood(x):
    """Return the neighbourhood of a row, or its borough when the neighbourhood is ''.

    Parameters
    ----------
    x : object exposing ``Neighbourhood`` and ``Borough`` attributes
        (for example one row of a DataFrame passed by ``apply(..., axis=1)``).

    Returns
    -------
    ``x.Borough`` if ``x.Neighbourhood`` equals the empty string,
    otherwise ``x.Neighbourhood``.
    """
    return x.Borough if x.Neighbourhood == '' else x.Neighbourhood
# Wherever Neighbourhood is '', take the Borough of the same row instead.
# Series.mask performs the substitution on the whole column at once rather
# than calling a Python function for every row.
df["Neighbourhood"] = df["Neighbourhood"].mask(df["Neighbourhood"] == '', df["Borough"])

In [10]:
# Collapse to one row per (Postcode, Borough) pair; the neighbourhoods of each
# pair are joined into a single string separated by ", ".
df = (
    df.groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(", ".join)
    .reset_index()
)

In [11]:
# Display the frame after grouping by Postcode and Borough.
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [12]:
# (number of rows, number of columns) of the grouped frame.
df.shape

(103, 3)

In [13]:
# Read the coordinates file and index it by its "Postal Code" column so that
# rows can be looked up by postal code.
csv_file_name = "Geospatial_Coordinates.csv"
coordinates = pd.read_csv(csv_file_name).set_index("Postal Code")

# Spot-check the lookup for one postal code.
coordinates.loc["M1B", "Latitude"]
coordinates.loc["M1B", "Longitude"]

43.806686299999996

-79.19435340000001

### Add latitude and longitude from the coordinates DataFrame

In [14]:
# Look up every postcode in `coordinates` with one label-based selection per
# column instead of a Python-level .at call per row. A postcode missing from
# `coordinates` still raises KeyError. .to_numpy() drops the postcode index of
# the lookup result so the values are assigned by position, row for row.
df["Latitude"] = coordinates.loc[df["Postcode"], "Latitude"].to_numpy()
df["Longitude"] = coordinates.loc[df["Postcode"], "Longitude"].to_numpy()

In [15]:
# Display the frame with the Latitude and Longitude columns added.
df

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437
