# Applied Data Science Capstone Project

## Import required packages

In [1]:
import pandas as pd
import numpy as np
import requests 
requests.packages.urllib3.disable_warnings()
#Requests enables us to download raw html as text
from bs4 import BeautifulSoup 

## Scrapping Data from Wikipedia page using BeautifulSoup


In [4]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
r = requests.get(url, timeout=20 ,verify = False)

In [None]:
raw_html = r.text 
raw_html

In [None]:
soup = BeautifulSoup(raw_html,'html.parser')
print(soup.prettify())

In [9]:
soup.find('title').text

'List of postal codes of Canada: M - Wikipedia'

In [81]:
table = soup.find_all('table', attrs = {"class":"wikitable"})

In [None]:
from IPython.core.display import display,HTML #display the parsed table
display(HTML(table[0].prettify()))

In [50]:
column_headers = [th.getText() for th in 
                  table[0].find_all('tr')[0].findAll('th')]
column_headers

['Postcode', 'Borough', 'Neighbourhood\n']

In [None]:
rows = table[0].find_all('tr')
rows

In [51]:
data_rows = table[0].find_all('tr')[1:]  # skip the first 1 header rows
type(data_rows) 

list

In [52]:
data = [[td.getText() for td in data_rows[i].findAll('td')]
            for i in range(len(data_rows))]

In [55]:
df = pd.DataFrame(data,columns=column_headers)

In [56]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


## Alternative way to scrap data
### Note: using pandas.io.hmtl to get the wiki table into pandas DataFrame

In [82]:
res = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(res.content,'lxml')
wikitables = soup.find_all('table') 
df2 = pd.read_html(str(wikitables[0]), index_col=None, header=0)[0]
df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [84]:
df2.describe()

Unnamed: 0,Postcode,Borough,Neighbourhood
count,212,212,212
unique,103,11,210
top,M8Y,Etobicoke,Runnymede
freq,8,45,2


In [60]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 289 entries, 0 to 288
Data columns (total 3 columns):
Postcode          289 non-null object
Borough           289 non-null object
Neighbourhood
    289 non-null object
dtypes: object(3)
memory usage: 6.9+ KB


## Cleaning the Data

Dropping the rows where Borough is Not Assigned

In [85]:
df2 = df2[df2.Borough != 'Not assigned']

## Data Processing - 'Not assigned' to Values

In [86]:
# Iterate over the dataframe and fix 'Not assigned' for column 'Neighborhood'
for i, _ in df2.iterrows():
    if df2.loc[i]['Neighbourhood'] == 'Not assigned': df2.loc[i]['Neighbourhood'] = df2.loc[i]['Borough']

# Dataframe Shape

In [89]:
df2.shape

(212, 3)