# COURSERA CAPSTONE

In [78]:
""" This notebook will be used for the coursera capstone project. """
import numpy as np
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup

In [79]:
print('Hello Capstone Project Course!')

Hello Capstone Project Course!


## 1. Fetch the data from Wikipedia ##

In [80]:
html = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(html, 'lxml')

# Find the only table of that URL:
table = soup.find('table', {'class': 'wikitable sortable'})

In [81]:
# Inspect the table
table

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>
<tr>
<td>M6A</td>

## 2. Clean the data

In [82]:
# Extract all <td> tags.
table.findAll('td')

[<td>M1A</td>, <td>Not assigned</td>, <td>Not assigned
 </td>, <td>M2A</td>, <td>Not assigned</td>, <td>Not assigned
 </td>, <td>M3A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
 </td>, <td>M4A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
 </td>, <td>M5A</td>, <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>, <td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
 </td>, <td>M5A</td>, <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>, <td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
 </td>, <td>M6A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Lawrence_Heights" title="Lawrence Heights">Lawrence Heights

In [83]:
# Put each column into a dataframe
t = pd.DataFrame()

i=0
target_column = {0: 'PostalCode', 1: 'Borough', 2: 'Neighborhood'}
for td in list(table.findAll('td')):
    col = target_column[i%3]
    value = str(td).replace('<td>', "").replace('</td>','').replace('\n', "")
    try:
        value = re.findall( r">(.*)</a>",value)[0]
    except:
       pass 
    t.loc[i//3, col] = value
    i+=1
t.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [84]:
### Clean the data to remove 'Not assigned' in Borough ###
df = t
df = df[~df['Borough'].str.contains('Not assigned')]
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [85]:
### Combine Neighboroods within the same postal code ###
list_of_multiple_neighborhoods = df['PostalCode'].value_counts().index[df['PostalCode'].value_counts()>1]
dfs = df.copy()

def combine_neighborhood():
    global dfs
    i=0
    while True:
        #print(i)
        dfs.reset_index(inplace=True, drop=True)
        maxrow = dfs.shape[0]
        if i == maxrow-1:
            break
        else:
            #print(i,dfs.shape[0])
            if dfs.iloc[i,0] == dfs.iloc[i+1,0]:
                dfs.iloc[i,2] = dfs.iloc[i,2] + ', ' + dfs.iloc[i+1,2]
                dfs.drop(i+1, axis=0, inplace=True)
            else:
                i+=1

combine_neighborhood()

   

In [86]:
### Clean the data to replace 'Not assigned' in Neighborhood to the Borough ###
filter = df['Neighborhood'].str.contains('Not assigned')
pd.set_option('mode.chained_assignment', None) # Turn off a warning that is not relevant.
df.loc[filter,'Neighborhood'] = df.loc[filter, 'Borough']
df.head(8)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue


## 3. Summary

In [89]:
print('>>> The shape of the dataframe is {}. <<<'.format(dfs.shape))

dfs.head(5)

>>> The shape of the dataframe is (103, 3). <<<


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned
