In [62]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import geocoder # import geocoder


## Data Gathering

In [5]:
# Download the Wikipedia page that lists Canadian postal codes starting with "M".
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
html_data=requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
html_data.raise_for_status()
# Parse the raw response body with BeautifulSoup's built-in 'html.parser'.
soup=BeautifulSoup(html_data.content,'html.parser')
# Locate the first <table> carrying the 'wikitable' class.
table=soup.find('table',attrs={'class':'wikitable'})
if table is None:
    # soup.find returns None when nothing matches; report that explicitly
    # rather than failing later with an AttributeError on None.
    raise ValueError("no 'wikitable' table found at " + url)
# Collect every <tr> of that table (header row and data rows alike).
rows=table.find_all('tr')

In [13]:
# Walk the table rows: a row containing <th> cells contributes column labels,
# any other row contributes one record built from its <td> cells.
columns = []
data = []
for row in rows:
    header_cells = row.find_all('th')
    if header_cells:
        # Header row: keep the stripped text of each <th> as a column name.
        columns.extend(cell.text.strip() for cell in header_cells)
        continue
    # Data row: one list of stripped <td> texts per row.
    data.append([cell.text.strip() for cell in row.find_all('td')])

In [14]:
# Assemble the scraped records into a DataFrame, using the scraped header
# labels as column names, then preview the first rows.
df = pd.DataFrame(data, columns=columns)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [16]:
# Inspect the (rows, columns) dimensions of the freshly scraped table.
df.shape

(180, 3)

## Clean Data

In [17]:
# Convert the 'Not assigned' placeholder into NaN so pandas treats those
# cells as missing values.
df = df.replace(to_replace='Not assigned', value=np.nan)

In [18]:
# Count the missing (NaN) values in each column.
df.isna().sum()

Postal Code      0
Borough         77
Neighborhood    77
dtype: int64

In [26]:
# Keep only the rows that have an assigned borough; rows whose Borough is NaN
# (formerly 'Not assigned') are dropped.
# `notna()` replaces the non-idiomatic `isna() == False` comparison; the rows
# selected and their original index labels are unchanged.
df=df[df.Borough.notna()]
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [29]:
# Verify that no Neighborhood value is still missing after the borough filter.
df['Neighborhood'].isna().sum()

0

In [57]:
# Re-check the (rows, columns) dimensions after removing unassigned boroughs.
df.shape

(103, 3)

### Get Coordinates for each Postal Code

In [83]:
# Load the postal-code coordinates from a CSV file; the relative path is
# resolved against the notebook's working directory.
df_geo=pd.read_csv('Geospatial_Coordinates.csv')
# Preview the first rows of the coordinates table.
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [92]:
# Index the coordinates by postal code so they can be looked up from df's
# 'Postal Code' column, then attach them to each row of df.
geo_by_code = df_geo.set_index('Postal Code')
df_new = df.join(geo_by_code, on='Postal Code')
df_new.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [94]:
# Inspect the (rows, columns) dimensions of the joined frame.
df_new.shape

(103, 5)