# Project: Scrape the Wikipedia page of Toronto postal codes using BeautifulSoup and pandas.

## Download BeautifulSoup
## To scrape the table of postal codes from Wikipedia, we first need to install BeautifulSoup and the packages it relies on.

In [31]:
pip install beautifulsoup4

Collecting beautifulsoup4
[?25l  Downloading https://files.pythonhosted.org/packages/e8/b5/7bb03a696f2c9b7af792a8f51b82974e51c268f15e925fc834876a4efa0b/beautifulsoup4-4.9.0-py3-none-any.whl (109kB)
[K     |████████████████████████████████| 112kB 8.6MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2 (from beautifulsoup4)
  Downloading https://files.pythonhosted.org/packages/05/cf/ea245e52f55823f19992447b008bcbb7f78efc5960d77f6c34b5b45b36dd/soupsieve-2.0-py2.py3-none-any.whl
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.9.0 soupsieve-2.0
Note: you may need to restart the kernel to use updated packages.


In [32]:
pip install lxml

Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/dd/ba/a0e6866057fc0bbd17192925c1d63a3b85cf522965de9bc02364d08e5b84/lxml-4.5.0-cp36-cp36m-manylinux1_x86_64.whl (5.8MB)
[K     |████████████████████████████████| 5.8MB 10.7MB/s eta 0:00:01     |██████▍                         | 1.2MB 10.7MB/s eta 0:00:010:01████████████████▍     | 4.7MB 10.7MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.5.0
Note: you may need to restart the kernel to use updated packages.


In [33]:
pip install html5lib

Note: you may need to restart the kernel to use updated packages.


In [34]:
pip install requests

Note: you may need to restart the kernel to use updated packages.


In [35]:
from bs4 import BeautifulSoup
import requests

# Import Wikipedia Page

## Use requests to download the Wikipedia page and BeautifulSoup to parse it, then access only the table that we need for the project.

In [36]:
# Download the Wikipedia article that lists the "M" (Toronto) postal codes.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout stops the cell from hanging indefinitely on a stalled connection.
page = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of silently parsing an error page.
page.raise_for_status()

In [37]:
# Parse the raw response bytes into a navigable tree using the lxml parser.
soup = BeautifulSoup(markup=page.content, features='lxml')

In [38]:
# print(soup.prettify())  # Uncomment to print the entire HTML text of the page.

In [39]:
# Sanity check: print the text of the page's <title> element.
match = soup.title.get_text()
print(match)

List of postal codes of Canada: M - Wikipedia


In [40]:
# Keep the pretty-printed HTML of the first <table> element for inspection.
match = soup.find('table').prettify()
# print(match)  # uncomment to view the table markup

In [41]:
# Grab the first <table> element in the document; limit=1 stops the search
# after the first match (an empty result still raises IndexError).
table = soup.find_all('table', limit=1)[0]

## Convert into a DataFrame

In [42]:
import pandas as pd

In [43]:
# pandas >= 2.1 deprecates passing literal HTML text to read_html, so the
# table markup is wrapped in a file-like StringIO buffer. read_html returns a
# list of DataFrames; [0] takes the first one parsed from the markup.
from io import StringIO

df = pd.read_html(StringIO(str(table)))[0]

In [44]:
# Number of rows parsed from the table.
df.shape[0]

180

In [45]:
# Build the working frame `wiki` from the parsed table.
wiki = pd.DataFrame(data=df)

In [46]:
# Preview the first five rows of the scraped table.
wiki.head(n=5)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [47]:
# Total number of cells in the frame (rows x columns).
wiki.size

540

## Create a DataFrame that contains only assigned boroughs

In [48]:
# Show the column labels as an array.
wiki.columns.values

array(['Postal code', 'Borough', 'Neighborhood'], dtype=object)

In [49]:
# Count the rows whose Borough is the placeholder string 'Not assigned'.
wiki['Borough'].eq('Not assigned').sum()

77

In [50]:
# Total number of entries in the Borough column, for comparison.
len(wiki['Borough'])

180

In [51]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
# The original row labels are preserved (the index is not reset).
clean = wiki.loc[wiki['Borough'].ne('Not assigned')]

In [52]:
# Display the filtered frame (rows with an assigned borough).
clean

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
...,...,...,...
160,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,Business reply mail Processing CentrE
169,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...


## Dataframe Shape

In [53]:
# (rows, columns) of the filtered frame.
clean.shape

(103, 3)

## Export the cleaned table to CSV, then import the csv file that includes latitude/longitude information

In [54]:
# Save the filtered table to disk, with a header row and without the index.
clean.to_csv('toronto.csv', header=True, index=False)

In [63]:
# Rename the key column to 'Postal Code' so it matches the column name in
# the coordinates file used for the merge.
clean = clean.rename({'Postal code': 'Postal Code'}, axis='columns')
clean.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [64]:
# Load the latitude/longitude table from a CSV file in the working directory.
cor = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')
cor.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Merge both dataframes together on 'Postal Code'

In [65]:
# Inner-join the borough table with the coordinates on the shared
# 'Postal Code' column; codes missing from either frame are dropped.
final = clean.merge(cor, how='inner', on='Postal Code')

In [66]:
# Preview the first five rows of the merged result.
final.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494
