In [1]:
import numpy as np
import requests
import pandas as pd
import matplotlib.pyplot as plt
import folium
from geopy.geocoders import Nominatim 
from bs4 import BeautifulSoup

The data set contains all the post codes in Canada starting with the letter M. Some of the post codes have been assigned to a borough and a neighbourhood. We need to clear the data by getting rid of all the not assigned data

# Import data from Wikipedia

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
text = requests.get(url).text
soup = BeautifulSoup(text,'lxml')
table = soup.find('table',{'class':'wikitable sortable'})

## Convert the data into lists and combining duplicate post codes (if any)

In [3]:
post_code = []
borough = []
neighbourhood = []
for row in table.findAll('tr'):
    cells=row.findAll('td')
    if len(cells)==3: # Because there are 3 columns
        if (cells[0].find(text=True).strip("\n")) not in post_code:
            post_code.append(cells[0].find(text=True).strip("\n"))
            borough.append(cells[1].find(text=True).strip("\n"))
            neighbourhood.append(cells[2].find(text=True).strip("\n"))
        else:
            index = post_code.index((cells[0].find(text=True).strip("\n")))
            neighbourhood[index] = neighbourhood[index]+str(",")+cells[2].find(text=True).strip("\n")

## Put the data in a a Pandas Dataframe

In [4]:
df = pd.DataFrame()
df["Post_code"] = post_code
df["Borough"] = borough
df["Neighbourhood"] = neighbourhood
df.head()

Unnamed: 0,Post_code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Remove rows where borough is not assigned

In [5]:
index = df[((df.Borough == 'Not assigned'))].index
df = df.drop(index).reset_index(drop=True)
df.head()

Unnamed: 0,Post_code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [6]:
df.shape

(103, 3)

## Obtain latitude and longitudes of the neighbourhoods using geopy

In [7]:
Coordinates = []
accuracy = 0
full_address = np.array(df['Post_code']+","+df['Borough'])
geolocator = Nominatim(user_agent="foursquare_agent")
for i in full_address:
    try:
        location = geolocator.geocode(i)
        latitude = location.latitude
        longitude = location.longitude
        Coordinates.append((location.latitude,location.longitude))
        accuracy += 1
    except:
        pass


Success
Failed
Failed
Failed
Success
Failed
Success
Failed
Failed
Failed
Failed
Success
Success
Success
Failed
Failed
Failed
Success
Failed
Failed
Failed
Failed
Success
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Success
Failed
Failed
Success
Failed
Failed
Failed
Success
Failed
Failed
Success
Failed
Failed
Success
Failed
Failed
Failed
Success
Failed
Failed
Failed
Failed
Success
Failed
Failed
Failed
Failed
Failed
Failed
Success
Failed
Failed
Failed
Success
Failed
Failed
Failed
Failed
Failed
Success
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Success
Failed
Failed
Failed
Success
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Success
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed


[(43.7543263, -79.44911696639593),
 (43.6563221, -79.3809161),
 (54.2820009, -0.4011868),
 (43.64074125, -79.5419018239487),
 (54.2820009, -0.4011868),
 (43.7328216, -79.3469614),
 (43.671459150000004, -79.55249206611668),
 (43.7626686, -79.2308605092575),
 (43.6511085, -79.38347444469197),
 (43.7543263, -79.44911696639593),
 (43.6522219, -79.40753862886237),
 (43.735823249999996, -79.47870883340411),
 (43.7170226, -79.41978303501344),
 (43.6727601, -79.30405834999999),
 (43.7859621, -79.4160307769213),
 (43.7543263, -79.44911696639593),
 (43.6896191, -79.479188),
 (43.7170226, -79.41978303501344),
 (43.671459150000004, -79.55249206611668),
 (43.7170226, -79.41978303501344),
 (43.773077, -79.257774)]

Due to the unreliability of the above package, I will use the provided CSV file which contains the geospatial locations of the suburbs

In [24]:
coord = pd.read_csv("Geospatial_Coordinates.csv")
coord.columns = ["Post_code","Latitude","Longitude"]
coord.head()

Unnamed: 0,Post_code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [31]:
loc_df = pd.merge(df,coord,on="Post_code")
loc_df

Unnamed: 0,Post_code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509
