In [None]:
# IBM Data Science Applied Capstone Project: Week 3

# Part 1: Scraping Postal Codes from Wikipedia

In [2]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Download a pinned revision (oldid) of the Wikipedia page listing Canadian
# postal codes starting with "M". The stray trailing "." after the revision
# id has been removed from the URL.
res = requests.get(
    "https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=945633050",
    timeout=30,  # without a timeout a stalled connection would hang the kernel
)
# Stop here on an HTTP error instead of parsing an error page as if it were data.
res.raise_for_status()

# Parse the HTML and collect every <table> element on the page for inspection.
soup = BeautifulSoup(res.content, 'lxml')
tables = soup.find_all('table')
tables

[<table class="wikitable sortable">
 <tbody><tr>
 <th>Postcode</th>
 <th>Borough</th>
 <th>Neighbourhood
 </th></tr>
 <tr>
 <td>M1A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>
 <tr>
 <td>M2A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>
 <tr>
 <td>M3A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
 </td></tr>
 <tr>
 <td>M4A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
 </td></tr>
 <tr>
 <td>M5A</td>
 <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
 <td><a href="/wiki/Regent_Park" title="Regent Park">Harbourfront</a>
 </td></tr>
 <tr>
 <td>M6A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Lawrence_Heights" title="Lawrence Heights">Lawrence Heights</a>
 </td></tr>
 <tr>


In [62]:
# Select the postal-code table by its CSS classes.
table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # rather than letting read_html choke on the string "None".
    raise ValueError("No 'wikitable sortable' table found in the downloaded page")

# Parse the table with pandas, treating 'Not assigned' cells as missing values.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases.
frames = pd.read_html(StringIO(str(table)), na_values=['Not assigned'])

# read_html returns a list of DataFrames; take the first one.
df = frames[0]
print(df.shape)
df.head()

(287, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [63]:
# Keep every postal code that has been assigned to a borough and rename the
# key column to the required 'PostalCode'. Rows are filtered on Borough rather
# than Neighbourhood so that a code with a borough but no listed neighbourhood
# is kept instead of being silently discarded.
df = (
    df.dropna(subset=['Borough'])
      .rename(columns={'Postcode': 'PostalCode'})
      # A borough without a listed neighbourhood uses the borough name instead.
      .assign(Neighbourhood=lambda frame: frame['Neighbourhood'].fillna(frame['Borough']))
)
print(df.shape)
df.head()

(210, 3)


Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [64]:
# Every remaining row must name a neighbourhood: the grouping step joins these
# values as strings and would fail on a missing one. Check it automatically
# instead of relying on a manual "uncomment if needed" step.
assert not df['Neighbourhood'].isna().any(), (
    "Neighbourhood has missing values; fill them (e.g. from Borough) before grouping"
)

# Positions of rows with a missing Neighbourhood (expected: none).
np.where(df['Neighbourhood'].isna())

(array([], dtype=int64),)

In [65]:
# Collapse rows that share a postal code (and borough) into one row whose
# Neighbourhood is the comma-separated list of the original neighbourhoods.
dfg = (
    df.groupby(by=['PostalCode', 'Borough'])['Neighbourhood']
      .apply(', '.join)
      .reset_index()
)
dfg

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


Number of rows and columns of the final dataframe

In [66]:
# (rows, columns) of the grouped frame: one row per (PostalCode, Borough) pair.
dfg.shape

(103, 3)

# Part 2: Geocoding the data

In [86]:
# Load the postal-code coordinates CSV straight from its URL and preview it.
url = 'https://cocl.us/Geospatial_data'
geodata = pd.read_csv(filepath_or_buffer=url)
print(geodata.shape)
geodata.head()

(103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [89]:
# Align the key column name with the scraped table ('PostalCode') so the two
# frames can be merged on it.
geodata = geodata.rename(columns={'Postal Code': 'PostalCode'})

In [90]:
# Attach latitude/longitude to each postal code using PostalCode as the key.
# The join is an inner join, so a postal code absent from geodata would be
# dropped silently (and a duplicated key would multiply rows); the assertion
# below makes either case fail loudly.
df = pd.merge(dfg, geodata, on='PostalCode', how='inner')
assert len(df) == len(dfg), (
    "merge changed the row count: unmatched or duplicated postal codes"
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
