Let us start by installing BeautifulSoup, which is used to retrieve the required table from the Wikipedia page.

In [1]:
! pip install beautifulsoup4
print(" BeautifulSoup4 Successfully Installed!")

Collecting beautifulsoup4
[?25l  Downloading https://files.pythonhosted.org/packages/66/25/ff030e2437265616a1e9b25ccc864e0371a0bc3adb7c5a404fd661c6f4f6/beautifulsoup4-4.9.1-py3-none-any.whl (115kB)
[K     |████████████████████████████████| 122kB 6.0MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2 (from beautifulsoup4)
  Downloading https://files.pythonhosted.org/packages/6f/8f/457f4a5390eeae1cc3aeab89deb7724c965be841ffca6cfca9197482e470/soupsieve-2.0.1-py3-none-any.whl
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.9.1 soupsieve-2.0.1
 BeautifulSoup4 Successfully Installed!


In [2]:
# Address of the Wikipedia page that is downloaded and parsed for its table in the following cell.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

Importing the required libraries and extracting the table

In [3]:
import requests
from bs4 import BeautifulSoup

# Download the page. The timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() turns an HTTP error response into an
# exception instead of letting an error page be parsed as if it were the article.
wiki = url
response = requests.get(wiki, timeout=30)
response.raise_for_status()
website_url = response.text
soup = BeautifulSoup(website_url, 'html.parser')

# Look up the table by its class attribute and fail with a clear message when
# it is absent, rather than with an AttributeError on None further down.
my_table = soup.find('table', {'class': 'wikitable sortable'})
if my_table is None:
    raise ValueError("No table with class 'wikitable sortable' found at " + wiki)

# One list per table column: A = first cell, B = second cell, C = third cell.
A = []
B = []
C = []

for row in my_table.find_all('tr'):
    cells = row.find_all('td')
    # Rows that do not contain exactly three <td> cells (e.g. a header row)
    # are skipped.
    if len(cells) == 3:
        # find(string=True) returns the first text node of the cell exactly as
        # it appears in the page, trailing newline included; the values are
        # cleaned after the DataFrame has been built.
        A.append(cells[0].find(string=True))
        B.append(cells[1].find(string=True))
        C.append(cells[2].find(string=True))
       

Conversion of the table into a DataFrame

In [4]:
import pandas as pd

# Assemble the three scraped lists into one frame, keyed by column header.
scraped_columns = {
    'Postal_Code': A,
    'Borough': B,
    'Neighbourhood': C,
}
df = pd.DataFrame(data=scraped_columns)

In [5]:
df.head()

Unnamed: 0,Postal_Code,Borough,Neighbourhood
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


Data Cleansing and Pre-Processing

In [6]:
# Remove the surrounding whitespace carried by the scraped values (each one
# ends with a newline). strip() is used instead of slicing off the last
# character: a value that has no trailing newline is left intact, and
# re-running the cell does not eat further characters.
for column in ('Postal_Code', 'Borough', 'Neighbourhood'):
    # str(x) first, so every entry is handled as a plain Python string.
    df[column] = df[column].map(lambda x: str(x).strip())

In [7]:
df.head()

Unnamed: 0,Postal_Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [8]:
df.shape

(180, 3)

In [9]:
df['Borough'].unique()

array(['Not assigned', 'North York', 'Downtown Toronto', 'Etobicoke',
       'Scarborough', 'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

In [10]:
# Use the postal code as the row label; the column moves into the index.
df = df.set_index('Postal_Code')

Getting rid of rows with Borough as "Not assigned"

In [11]:
# Select the rows whose Borough holds the placeholder text "Not assigned".
is_unassigned = df['Borough'] == "Not assigned"
dfdrop = df[is_unassigned]

In [12]:
dfdrop.shape

(77, 2)

In [13]:
dfdrop.head()

Unnamed: 0_level_0,Borough,Neighbourhood
Postal_Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1A,Not assigned,Not assigned
M2A,Not assigned,Not assigned
M8A,Not assigned,Not assigned
M2B,Not assigned,Not assigned
M7B,Not assigned,Not assigned


In [14]:
# Delete from df, in place, every row whose index label appears in dfdrop.
unassigned_labels = dfdrop.index
df.drop(index=unassigned_labels, inplace=True)

In [15]:
df.shape

(103, 2)

Processed DataFrame

In [16]:
df.head()

Unnamed: 0_level_0,Borough,Neighbourhood
Postal_Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [17]:
! pip install geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 5.4MB/s eta 0:00:01
[?25hCollecting click (from geocoder)
[?25l  Downloading https://files.pythonhosted.org/packages/d2/3d/fa76db83bf75c4f8d338c2fd15c8d33fdd7ad23a9b5e57eb6c5de26b430e/click-7.1.2-py2.py3-none-any.whl (82kB)
[K     |████████████████████████████████| 92kB 6.6MB/s eta 0:00:011
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Collecting future (from geocoder)
[?25l  Downloading https://files.pythonhosted.org/packages/45/0b/38b06fd9b92dc2b68d58b75f900e97884c45bedd2ff83203d933cf5851c9/future-0.18.2.tar.gz (829kB)
[K     |████████████████████████████████| 829kB 6.1MB/s eta 0:00:01
Building wheels

In [18]:
import geocoder

In [20]:
# Load the coordinates table from a CSV file addressed relative to the current working directory.
# NOTE(review): the notebook does not record where this file comes from -- document its source.
geocord = pd.read_csv('Geospatial_Coordinates.csv')

In [21]:
geocord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [32]:
# Rename the key column to "Postal_Code", the spelling used for df's index.
geocord = geocord.rename(columns={'Postal Code': 'Postal_Code'})

In [44]:
# Label the coordinate rows by postal code; the column moves into the index.
geocord = geocord.set_index('Postal_Code')

In [46]:
df.index.values

array(['M3A', 'M4A', 'M5A', 'M6A', 'M7A', 'M9A', 'M1B', 'M3B', 'M4B',
       'M5B', 'M6B', 'M9B', 'M1C', 'M3C', 'M4C', 'M5C', 'M6C', 'M9C',
       'M1E', 'M4E', 'M5E', 'M6E', 'M1G', 'M4G', 'M5G', 'M6G', 'M1H',
       'M2H', 'M3H', 'M4H', 'M5H', 'M6H', 'M1J', 'M2J', 'M3J', 'M4J',
       'M5J', 'M6J', 'M1K', 'M2K', 'M3K', 'M4K', 'M5K', 'M6K', 'M1L',
       'M2L', 'M3L', 'M4L', 'M5L', 'M6L', 'M9L', 'M1M', 'M2M', 'M3M',
       'M4M', 'M5M', 'M6M', 'M9M', 'M1N', 'M2N', 'M3N', 'M4N', 'M5N',
       'M6N', 'M9N', 'M1P', 'M2P', 'M4P', 'M5P', 'M6P', 'M9P', 'M1R',
       'M2R', 'M4R', 'M5R', 'M6R', 'M7R', 'M9R', 'M1S', 'M4S', 'M5S',
       'M6S', 'M1T', 'M4T', 'M5T', 'M1V', 'M4V', 'M5V', 'M8V', 'M9V',
       'M1W', 'M4W', 'M5W', 'M8W', 'M9W', 'M1X', 'M4X', 'M5X', 'M8X',
       'M4Y', 'M7Y', 'M8Y', 'M8Z'], dtype=object)

In [48]:
# Keep only the coordinate rows whose postal code is present in df's index.
# One vectorized membership test replaces the row-by-row loop, which performed
# a separate in-place drop for every unwanted label. .copy() makes the result
# an independent frame, so the in-place operations that follow act on it directly.
geocord = geocord.loc[geocord.index.isin(df.index)].copy()

In [49]:
geocord.index

Index(['M1B', 'M1C', 'M1E', 'M1G', 'M1H', 'M1J', 'M1K', 'M1L', 'M1M', 'M1N',
       ...
       'M9A', 'M9B', 'M9C', 'M9L', 'M9M', 'M9N', 'M9P', 'M9R', 'M9V', 'M9W'],
      dtype='object', name='Postal_Code', length=103)

In [50]:
df.index

Index(['M3A', 'M4A', 'M5A', 'M6A', 'M7A', 'M9A', 'M1B', 'M3B', 'M4B', 'M5B',
       ...
       'M8W', 'M9W', 'M1X', 'M4X', 'M5X', 'M8X', 'M4Y', 'M7Y', 'M8Y', 'M8Z'],
      dtype='object', name='Postal_Code', length=103)

In [51]:
# Order the rows by postal code, ascending (the default sort direction).
df = df.sort_index()

In [52]:
# Order the coordinate rows by postal code, ascending (the default sort direction).
geocord = geocord.sort_index()

In [58]:
# Attach latitudes by matching Postal_Code index labels instead of by row
# position. Assigning the Series (rather than its raw .values) lets pandas pair
# each value with its own postal code, so a differing row order or a missing
# code cannot silently shift coordinates onto the wrong row; a code absent
# from geocord ends up as NaN.
df['Latitude'] = geocord['Latitude']

In [59]:
# Attach longitudes by matching Postal_Code index labels instead of by row
# position. Assigning the Series (rather than its raw .values) lets pandas pair
# each value with its own postal code, so a differing row order or a missing
# code cannot silently shift coordinates onto the wrong row; a code absent
# from geocord ends up as NaN.
df['Longitude'] = geocord['Longitude']

In [62]:
df.head()

Unnamed: 0_level_0,Borough,Neighbourhood,Latitude,Longitude
Postal_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
M1G,Scarborough,Woburn,43.770992,-79.216917
M1H,Scarborough,Cedarbrae,43.773136,-79.239476
