Import the necessary packages

In [1]:
import pandas
import requests
from bs4 import BeautifulSoup

Scrape the postal-code table from the Wikipedia page and give its columns appropriate names

In [2]:
# Download the Wikipedia page. A timeout prevents the cell from hanging
# indefinitely, and raise_for_status() stops us from parsing an HTTP error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
website_text = response.text

# The page is HTML, so parse it with an HTML parser. The 'xml' feature selects
# an XML parser (and needs lxml installed); 'html.parser' ships with Python.
soup = BeautifulSoup(website_text, 'html.parser')

table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError("Could not find a 'wikitable sortable' table on the page")
table_rows = table.find_all('tr')

# One list of stripped <td> texts per table row. Rows without <td> cells
# (e.g. a header row built from <th>) yield an empty list, which becomes an
# all-missing DataFrame row and is removed by the filter below.
data = []
for row in table_rows:
    data.append([t.text.strip() for t in row.find_all('td')])

df = pandas.DataFrame(data, columns=['PostalCode', 'Borough', 'Neighbourhood'])
# Keep only rows that actually carry a postal code; the original row labels
# are preserved on purpose.
df = df[df['PostalCode'].notna()]

In [3]:
# Preview the first five rows of the scraped table
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [4]:
# Keep only the rows whose Borough is something other than 'Not assigned'
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough]

In [5]:
# Preview the first five rows after filtering
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Check whether any of the columns contain problematic values.

In [6]:
# Distinct values in the Borough column, in order of first appearance
df['Borough'].unique()

array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

In [7]:
# Distinct values in the Neighbourhood column, in order of first appearance
df['Neighbourhood'].unique()

array(['Parkwoods', 'Victoria Village', 'Regent Park, Harbourfront',
       'Lawrence Manor, Lawrence Heights',
       "Queen's Park, Ontario Provincial Government",
       'Islington Avenue, Humber Valley Village', 'Malvern, Rouge',
       'Don Mills', 'Parkview Hill, Woodbine Gardens',
       'Garden District, Ryerson', 'Glencairn',
       'West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale',
       'Rouge Hill, Port Union, Highland Creek', 'Woodbine Heights',
       'St. James Town', 'Humewood-Cedarvale',
       'Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood',
       'Guildwood, Morningside, West Hill', 'The Beaches', 'Berczy Park',
       'Caledonia-Fairbanks', 'Woburn', 'Leaside', 'Central Bay Street',
       'Christie', 'Cedarbrae', 'Hillcrest Village',
       'Bathurst Manor, Wilson Heights, Downsview North',
       'Thorncliffe Park', 'Richmond, Adelaide, King',
       'Dufferin, Dovercourt Village', 'Scarborough Village',
       'Fairview, H

In [8]:
# Distinct values in the PostalCode column, in order of first appearance
df['PostalCode'].unique()

array(['M3A', 'M4A', 'M5A', 'M6A', 'M7A', 'M9A', 'M1B', 'M3B', 'M4B',
       'M5B', 'M6B', 'M9B', 'M1C', 'M3C', 'M4C', 'M5C', 'M6C', 'M9C',
       'M1E', 'M4E', 'M5E', 'M6E', 'M1G', 'M4G', 'M5G', 'M6G', 'M1H',
       'M2H', 'M3H', 'M4H', 'M5H', 'M6H', 'M1J', 'M2J', 'M3J', 'M4J',
       'M5J', 'M6J', 'M1K', 'M2K', 'M3K', 'M4K', 'M5K', 'M6K', 'M1L',
       'M2L', 'M3L', 'M4L', 'M5L', 'M6L', 'M9L', 'M1M', 'M2M', 'M3M',
       'M4M', 'M5M', 'M6M', 'M9M', 'M1N', 'M2N', 'M3N', 'M4N', 'M5N',
       'M6N', 'M9N', 'M1P', 'M2P', 'M4P', 'M5P', 'M6P', 'M9P', 'M1R',
       'M2R', 'M4R', 'M5R', 'M6R', 'M7R', 'M9R', 'M1S', 'M4S', 'M5S',
       'M6S', 'M1T', 'M4T', 'M5T', 'M1V', 'M4V', 'M5V', 'M8V', 'M9V',
       'M1W', 'M4W', 'M5W', 'M8W', 'M9W', 'M1X', 'M4X', 'M5X', 'M8X',
       'M4Y', 'M7Y', 'M8Y', 'M8Z'], dtype=object)

The DataFrame is sorted in ascending order by postal code so that its rows line up with the coordinates dataset loaded below.

In [9]:
# Order the rows by postal code (ascending) and show the first 20
df = df.sort_values('PostalCode', ascending=True)
df.head(n=20)

Unnamed: 0,PostalCode,Borough,Neighbourhood
10,M1B,Scarborough,"Malvern, Rouge"
19,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
28,M1E,Scarborough,"Guildwood, Morningside, West Hill"
37,M1G,Scarborough,Woburn
46,M1H,Scarborough,Cedarbrae
55,M1J,Scarborough,Scarborough Village
64,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
73,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
82,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
91,M1N,Scarborough,"Birch Cliff, Cliffside West"


## Importing the coordinates from a CSV file

In [10]:
import pandas as pd

# Read the geospatial coordinates CSV straight from its URL
GEOSPATIAL_CSV_URL = 'http://cocl.us/Geospatial_data/Geospatial_Coordinates.csv'
coordinates = pd.read_csv(GEOSPATIAL_CSV_URL)

## Checking whether the new dataset lines up with the DataFrame.

# Show the first 20 rows of the coordinates table
coordinates.head(n=20)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


# The two datasets are merged using the postal code as the matching variable.
##  It can be observed that the postal codes from the two datasets match perfectly.

In [21]:
# Inner-join the postal-code table with the coordinates on the postal code;
# the key columns are named differently in the two frames, so both are kept.
FullDataset = df.merge(
    coordinates,
    how='inner',
    left_on='PostalCode',
    right_on='Postal Code',
)
FullDataset.head(n=20)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",M1B,43.75741,-79.273304
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",M1C,43.786947,-79.385975
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.754328,-79.442259
3,M1G,Scarborough,Woburn,M1G,43.676357,-79.293031
4,M1H,Scarborough,Cedarbrae,M1H,43.715383,-79.405678
5,M1J,Scarborough,Scarborough Village,M1J,43.651494,-79.375418
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",M1K,43.696948,-79.411307
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",M1L,43.693781,-79.428191
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",M1M,43.661608,-79.464763
9,M1N,Scarborough,"Birch Cliff, Cliffside West",M1N,43.636258,-79.498509
