# Part 1

Import the required modules.

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

Extract data from the table in the URL

In [2]:
# Download the Wikipedia page that lists the "M" postal codes of Canada.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops us from parsing an HTTP error page as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
extracting_data = response.text
wiki_data = BeautifulSoup(extracting_data, 'lxml')

# Use the first <table> found inside the article body.
content = wiki_data.find('div', class_='mw-parser-output')
table_body = content.table.tbody

rows = table_body.find_all('tr')
table_rows = []
for row in rows:
    cols = [ele.text.strip() for ele in row.find_all('td')]
    # Rows without any <td> cell (e.g. a header row built from <th> cells)
    # carry no data, so skip them by content rather than by position.
    if not cols:
        continue
    # Keep empty cells as '' so every row keeps the same number of columns;
    # dropping them would shift values into the wrong column and produce
    # ragged rows that cannot form a 2-D array.
    table_rows.append(cols)

# One array row per table row, one array column per table cell.
data = np.array(table_rows)
print(data.shape)
for entry in data:
    print(entry)

(180, 3)
['M1A' 'Not assigned' 'Not assigned']
['M2A' 'Not assigned' 'Not assigned']
['M3A' 'North York' 'Parkwoods']
['M4A' 'North York' 'Victoria Village']
['M5A' 'Downtown Toronto' 'Regent Park, Harbourfront']
['M6A' 'North York' 'Lawrence Manor, Lawrence Heights']
['M7A' 'Downtown Toronto' "Queen's Park, Ontario Provincial Government"]
['M8A' 'Not assigned' 'Not assigned']
['M9A' 'Etobicoke' 'Islington Avenue, Humber Valley Village']
['M1B' 'Scarborough' 'Malvern, Rouge']
['M2B' 'Not assigned' 'Not assigned']
['M3B' 'North York' 'Don Mills']
['M4B' 'East York' 'Parkview Hill, Woodbine Gardens']
['M5B' 'Downtown Toronto' 'Garden District, Ryerson']
['M6B' 'North York' 'Glencairn']
['M7B' 'Not assigned' 'Not assigned']
['M8B' 'Not assigned' 'Not assigned']
['M9B' 'Etobicoke'
 'West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale']
['M1C' 'Scarborough' 'Rouge Hill, Port Union, Highland Creek']
['M2C' 'Not assigned' 'Not assigned']
['M3C' 'North York' 'Don Mills']
['M

Create dataframe

In [3]:
# Wrap the scraped rows in a DataFrame, naming one column per table field.
column_names = ['Postalcode', 'Borough', 'Neighborhood']
df_original = pd.DataFrame(data, columns=column_names)
df_original

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


Drop the rows whose borough is "Not assigned".

In [4]:
# Keep only the rows whose Borough value is something other than 'Not assigned'.
has_borough = df_original['Borough'] != 'Not assigned'
df_clean = df_original[has_borough]
df_clean

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


Check for rows whose neighborhood is "Not assigned". The result is empty, so every remaining row has an assigned neighborhood.

In [5]:
# Show any remaining rows whose Neighborhood is still 'Not assigned'.
df_clean[df_clean['Neighborhood'].eq('Not assigned')]

Unnamed: 0,Postalcode,Borough,Neighborhood


Check for duplicate postal codes. None are found.

In [6]:
# Tally how many rows each postal code occupies.
vc = df_clean['Postalcode'].value_counts()
# Number of distinct postal codes that occur in more than one row.
len(vc.loc[vc > 1])

0

Check number of rows in the dataframe

In [7]:
# (number of rows, number of columns) of the cleaned frame.
df_clean.shape

(103, 3)

## Final Result-Part 1



In [8]:
# Renumber the rows from 0; drop=True discards the old index (which has gaps
# left by the borough filter) instead of keeping it as an extra column.
df_clean = df_clean.reset_index(drop=True)
df_clean

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


# Part 2

In [9]:
# Load the latitude/longitude of each postal code from the CSV file.
df_geo = pd.read_csv('/content/drive/My Drive/datasets/Coursera/Geospatial_Coordinates.csv')

# Build the postal-code lookup once so both coordinate columns share it.
geo_by_postalcode = df_geo.set_index('Postal Code')

# assign() returns a new frame, so df_clean (the Part 1 result) is left
# untouched; a plain `df_combined = df_clean` would only alias it and the
# new columns would silently appear on df_clean as well.
# Postal codes that are missing from the CSV get NaN coordinates.
df_combined = df_clean.assign(
    Latitude=df_clean['Postalcode'].map(geo_by_postalcode['Latitude']),
    Longitude=df_clean['Postalcode'].map(geo_by_postalcode['Longitude']),
)

df_combined

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509
