# IBM Data Science Professional Certificate
## Applied Data Science Capstone

# Scraping Wikipedia page to obtain the table of postal codes
https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [1]:
from bs4 import BeautifulSoup
import requests

## 1. Load page content

In [2]:
# URL of the Wikipedia article that holds the postal code table.
targetPage = r'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page. A timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() stops the notebook on an HTTP
# error instead of silently handing an error page to the parser.
response = requests.get(targetPage, timeout=30)
response.raise_for_status()
source = response.text  # Decoded HTML of the page as a string

## 2. Parse HTML
### 2.1 Create soup

In [3]:
# Build the parse tree from the downloaded HTML using the lxml parser.
soup = BeautifulSoup(markup=source, features='lxml')

### 2.2 Extract table tag

In [4]:
# Take the first <table> element in the document.
# NOTE(review): this assumes the postal code table is the first table on the page - confirm if the page layout changes.
first_table = soup.find('table')
if first_table is None:
    # find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on .prettify().
    raise ValueError('No <table> element found in the downloaded page')
tableStr = first_table.prettify() # HTML of the table tag as a formatted string

## 3. Convert HTML table to pandas dataframe
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_html.html

In [5]:
from io import StringIO

import pandas as pd

# read_html returns a list of dataframes, one per table tag found in the input.
# header=0 denotes that the first row contains the column labels.
# The HTML text is wrapped in StringIO because newer pandas releases deprecate
# passing literal HTML directly; file-like input is accepted by older releases too.
# No `match` filter is applied: tableStr holds a single extracted table element,
# and the earlier match='str' only succeeded when some cell text happened to
# contain the substring "str".
df = pd.read_html(StringIO(tableStr), header=0)
postalcodes = df[0] # First dataframe in the list
postalcodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 289 entries, 0 to 288
Data columns (total 3 columns):
Postcode         289 non-null object
Borough          289 non-null object
Neighbourhood    289 non-null object
dtypes: object(3)
memory usage: 6.9+ KB


In [6]:
# Preview the first five rows of the freshly parsed table.
postalcodes.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## 4. Pre-Processing
### 4.1 Filter out the records that do not have an assigned borough

In [7]:
# Keep only the rows whose Borough differs from the 'Not assigned' placeholder,
# then renumber the index from zero.
has_borough = postalcodes['Borough'] != 'Not assigned'
postalcodes = postalcodes[has_borough].reset_index(drop=True)
postalcodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212 entries, 0 to 211
Data columns (total 3 columns):
Postcode         212 non-null object
Borough          212 non-null object
Neighbourhood    212 non-null object
dtypes: object(3)
memory usage: 5.0+ KB


In [8]:
# Preview the first five rows after dropping unassigned boroughs.
postalcodes.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


### 4.2 Collapse records where more than one neighbourhood exists in one postal code area

In [9]:
# One row per (Postcode, Borough) pair: the neighbourhood names of each group
# are concatenated into a single comma-separated string.
postalcodes = (
    postalcodes
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .apply(','.join)
    .reset_index()
)
postalcodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
Postcode         103 non-null object
Borough          103 non-null object
Neighbourhood    103 non-null object
dtypes: object(3)
memory usage: 2.5+ KB


In [10]:
# Preview the first five rows after collapsing neighbourhoods per postal code.
postalcodes.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### 4.3 Assign Neighbourhood=Borough for the records with Neighbourhood='Not assigned'

In [11]:
# List the rows whose Neighbourhood is still the 'Not assigned' placeholder.
is_unassigned = postalcodes['Neighbourhood'] == 'Not assigned'
postalcodes[is_unassigned].reset_index(drop=True)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M7A,Queen's Park,Not assigned


In [12]:
# Where the Neighbourhood is the 'Not assigned' placeholder, copy the row's
# Borough into it (values are aligned by index label).
needs_fill = postalcodes['Neighbourhood'] == 'Not assigned'
postalcodes.loc[needs_fill, 'Neighbourhood'] = postalcodes.loc[needs_fill, 'Borough']

In [13]:
# Sanity check: no rows should remain with Neighbourhood == 'Not assigned',
# so the displayed frame is expected to be empty.
still_unassigned = postalcodes['Neighbourhood'] == 'Not assigned'
postalcodes[still_unassigned].reset_index(drop=True)

Unnamed: 0,Postcode,Borough,Neighbourhood


## 5. Number of rows in the dataframe

In [14]:
# Report how many postal code records remain after pre-processing.
print('Number of Rows =', len(postalcodes))

Number of Rows = 103
