# Applied Data Science Capstone Project

## Import required packages

In [1]:
import pandas as pd
import numpy as np
import requests 
requests.packages.urllib3.disable_warnings()
#Requests enables us to download raw html as text
from bs4 import BeautifulSoup 

## Scrapping Data from Wikipedia page using BeautifulSoup


In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
r = requests.get(url, timeout=20 ,verify = False)

In [None]:
raw_html = r.text 
raw_html

In [None]:
soup = BeautifulSoup(raw_html,'html.parser')
print(soup.prettify())

In [5]:
soup.find('title').text

'List of postal codes of Canada: M - Wikipedia'

In [6]:
table = soup.find_all('table', attrs = {"class":"wikitable"})

In [7]:
from IPython.core.display import display,HTML #display the parsed table
display(HTML(table[0].prettify()))

Postcode,Borough,Neighbourhood
M1A,Not assigned,Not assigned
M2A,Not assigned,Not assigned
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,Harbourfront
M5A,Downtown Toronto,Regent Park
M6A,North York,Lawrence Heights
M6A,North York,Lawrence Manor
M7A,Queen's Park,Not assigned
M8A,Not assigned,Not assigned


In [8]:
column_headers = [th.getText() for th in 
                  table[0].find_all('tr')[0].findAll('th')]
column_headers

['Postcode', 'Borough', 'Neighbourhood\n']

In [None]:
rows = table[0].find_all('tr')
rows

In [10]:
data_rows = table[0].find_all('tr')[1:]  # skip the first 1 header rows
type(data_rows) 

list

In [11]:
data = [[td.getText() for td in data_rows[i].findAll('td')]
            for i in range(len(data_rows))]

In [12]:
df = pd.DataFrame(data,columns=column_headers)

In [13]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


## Alternative way to scrap data
### Note: using pandas.io.hmtl to get the wiki table into pandas DataFrame

In [14]:
res = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(res.content,'lxml')
wikitables = soup.find_all('table') 
df2 = pd.read_html(str(wikitables[0]), index_col=None, header=0)[0]
df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [15]:
df2.describe()

Unnamed: 0,Postcode,Borough,Neighbourhood
count,289,289,289
unique,180,12,210
top,M9V,Not assigned,Not assigned
freq,8,77,78


In [16]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 289 entries, 0 to 288
Data columns (total 3 columns):
Postcode         289 non-null object
Borough          289 non-null object
Neighbourhood    289 non-null object
dtypes: object(3)
memory usage: 6.9+ KB


## Cleaning the Data

Dropping the rows where Borough is Not Assigned

In [17]:
df2 = df2[df2.Borough != 'Not assigned']

## Data Processing - 'Not assigned' to Values

In [18]:
# Iterate over the dataframe and fix 'Not assigned' for column 'Neighborhood'
for i, _ in df2.iterrows():
    if df2.loc[i]['Neighbourhood'] == 'Not assigned': df2.loc[i]['Neighbourhood'] = df2.loc[i]['Borough']

# Dataframe Shape

In [19]:
df2.shape

(212, 3)

In [6]:
!conda install -c conda-forge geocoder --yes


Solving environment: done

# All requested packages already installed.



### Importing Geospatial_Coordinates Dataset to get the Latitude and Longitude 

In [53]:
Cordinates = pd.read_csv("Geospatial_Coordinates.csv")
Cordinates = pd.DataFrame(Cordinates,columns = ['Postal Code','Latitude','Longitude'])
Cordinates = Cordinates.rename(columns={'Postal Code':'Postcode'})
Cordinates.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Merging the dataset with cordinates data

In [55]:
new_df = pd.merge(df2, Cordinates, on='Postcode', how='outer')
new_df.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
4,M6A,North York,Lawrence Heights,43.718518,-79.464763
