# Capstone Notebook
This notebook is used to complete the final (capstone) course of the IBM Data Science Professional Certificate program.

In [1]:
import pandas as pd
import numpy as np
import urllib.request
import bs4 as bs

### Scrape web page and decode contents

In [6]:

# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# The context manager closes the HTTP response as soon as the body has been read.
with urllib.request.urlopen(url) as req:
    # Decode with the charset declared in the response headers; fall back to
    # UTF-8 (the previous implicit default) when the server declares none.
    article = req.read().decode(req.headers.get_content_charset() or 'utf-8')

### Filter contents, save table contents to DataFrame

In [82]:
# Parse the downloaded HTML with the lxml parser.
soup = bs.BeautifulSoup(article, 'lxml')

# The first <table> on the page is taken to be the postal-code table.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found in the downloaded page')
table_rows = table.find_all('tr')

# Build one record per table row. Rows that yield no <td> text at all
# (e.g. a header row made of <th> cells) are skipped. Empty cells inside a
# data row are kept as '' so a blank cell cannot shift the values that
# follow it into the wrong column.
res = []
for tr in table_rows:
    row = [td.text.strip() for td in tr.find_all('td')]
    if any(row):
        res.append(row)

df = pd.DataFrame(res, columns=["Postcode", "Borough", "Neighbourhood"])
df.head()


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [83]:
df.shape  # (rows, columns) of the scraped table, before any filtering

(289, 3)

### Remove Not Assigned Boroughs

In [85]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [86]:
df.shape  # (rows, columns) after dropping the 'Not assigned' boroughs

(212, 3)

**77 rows removed (289 − 212 = 77).**

### Group rows by Postcode and Concatenate Neighbourhood

In [103]:
# Collapse to one row per (Postcode, Borough) pair; the neighbourhood names
# of each group are joined into a single comma-separated string.
neighbourhoods = df.groupby(['Postcode', 'Borough'])['Neighbourhood'].agg(', '.join)
# reset_index turns the group keys back into ordinary columns.
df2 = neighbourhoods.reset_index()
df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Copy Borough value to Neighbourhood for Not Assigned Neighbourhoods

In [88]:
# Rows whose Neighbourhood is still the placeholder take their Borough name instead.
unassigned = df2['Neighbourhood'] == 'Not assigned'
df2.loc[unassigned, 'Neighbourhood'] = df2.loc[unassigned, 'Borough']
# Show postcode M7A to inspect the result for that row.
df2[df2['Postcode'] == 'M7A']

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Queen's Park


In [89]:
df2.shape  # (rows, columns) of the grouped frame: one row per (Postcode, Borough) group

(103, 3)

### Import coordinates and join them onto the existing DataFrame

In [90]:
# Load the postal-code coordinate table from the local CSV file.
coords_csv = 'geo/Geospatial_Coordinates.csv'
coords = pd.read_csv(coords_csv)
coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [105]:
# Index the coordinate table by postal code so it can serve as a lookup,
# then attach its columns to each row of df2 by matching on 'Postcode'.
coords_by_postcode = coords.set_index('Postal Code')
df3 = df2.join(coords_by_postcode, on='Postcode')
df3.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [100]:
df3.shape  # (rows, columns) after joining the coordinate columns onto df2

(103, 5)