## Importing the necessary libraries for the project execution

In [2]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


### Importing the Requests package to get the Wikipedia page 

In [3]:
import requests

### Reading the Wikipedia Page and return the html to the variable 'page'

In [4]:
page = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
page

<Response [200]>

### Importing the Beautifulsoup package and Parse the html in the 'page' variable, and store it in Beautiful Soup format

In [5]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')

### Use function “prettify” to look at nested structure of HTML page

In [6]:
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );
  </script>
  <script>
   (window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":857830462,"wgRevisionId":857830462,"wgArticleId":539066,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario","Postal codes in Canada","Toronto","Ontario-related lists"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wg

### Returns the content which is enclosed within the tags

In [7]:
soup.title

<title>List of postal codes of Canada: M - Wikipedia</title>

In [8]:
soup.a

<a id="top"></a>

In [9]:
soup.title.string

'List of postal codes of Canada: M - Wikipedia'

In [10]:
soup.find_all('a')

[<a id="top"></a>,
 <a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#p-search">Jump to search</a>,
 <a href="/wiki/Postal_codes_in_Canada" title="Postal codes in Canada">postal codes in Canada</a>,
 <a href="/wiki/Toronto" title="Toronto">Toronto</a>,
 <a href="/wiki/Ontario" title="Ontario">Ontario</a>,
 <a href="/wiki/Canada_Post" title="Canada Post">Canada Post</a>,
 <a href="#cite_note-1">[1]</a>,
 <a class="mw-redirect" href="/wiki/Smartphones" title="Smartphones">smartphones</a>,
 <a href="/wiki/IPhone" title="IPhone">iPhone</a>,
 <a href="/wiki/BlackBerry" title="BlackBerry">BlackBerry</a>,
 <a href="#cite_note-2">[2]</a>,
 <a href="/wiki/CD-ROM" title="CD-ROM">CD-ROMs</a>,
 <a href="/w/index.php?title=List_of_postal_codes_of_Canada:_M&amp;action=edit&amp;section=1" title="Edit section: Toronto]] - 103 Canadian postal code#Forward sortation areas|FSAs]]">edit</a>,
 <a href="/wiki/North_York" title="North York">North York</a>,
 <a hr

### Find the right table: As we are seeking a table to extract information about state capitals, we should identify the right table first. Let’s write the command to extract information within all table tags.

In [11]:
all_tables=soup.find_all('table')
all_tables

[<table class="wikitable sortable">
 <tbody><tr>
 <th>Postcode</th>
 <th>Borough</th>
 <th>Neighbourhood
 </th></tr>
 <tr>
 <td>M1A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>
 <tr>
 <td>M2A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>
 <tr>
 <td>M3A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
 </td></tr>
 <tr>
 <td>M4A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
 </td></tr>
 <tr>
 <td>M5A</td>
 <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
 <td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
 </td></tr>
 <tr>
 <td>M5A</td>
 <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
 <td><a href="/wiki/Regent_Park" title="Regent Park">Regent Pa

#### Now to identify the right table, we will use attribute “class” of table and use it to filter the right table. In chrome, you can check the class name by right click on the required table of web page –> Inspect element –> Copy the class name OR go through the output of above command find the class name of right table.

Extract the information to DataFrame: Here, we need to iterate through each row (tr) and then assign each element of tr (td) to a variable and append it to a list. 

In [12]:
right_table=soup.find('table', class_='wikitable sortable')
right_table

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>
<tr>
<td>M6A</td>

#### Now to access value of each element, we will use “find(text=True)” option with each element. 

In [13]:
A=[]
B=[]
C=[]

for row in right_table.findAll("tr"):
    cells = row.findAll('td')
    if len(cells)==3: #Only extract table body not heading
        A.append(cells[0].find(text=True))
        B.append(cells[1].find(text=True))
        C.append(cells[2].find(text=True))

len(cells)
A

['M1A',
 'M2A',
 'M3A',
 'M4A',
 'M5A',
 'M5A',
 'M6A',
 'M6A',
 'M7A',
 'M8A',
 'M9A',
 'M1B',
 'M1B',
 'M2B',
 'M3B',
 'M4B',
 'M4B',
 'M5B',
 'M5B',
 'M6B',
 'M7B',
 'M8B',
 'M9B',
 'M9B',
 'M9B',
 'M9B',
 'M9B',
 'M1C',
 'M1C',
 'M1C',
 'M2C',
 'M3C',
 'M3C',
 'M4C',
 'M5C',
 'M6C',
 'M7C',
 'M8C',
 'M9C',
 'M9C',
 'M9C',
 'M9C',
 'M1E',
 'M1E',
 'M1E',
 'M2E',
 'M3E',
 'M4E',
 'M5E',
 'M6E',
 'M7E',
 'M8E',
 'M9E',
 'M1G',
 'M2G',
 'M3G',
 'M4G',
 'M5G',
 'M6G',
 'M7G',
 'M8G',
 'M9G',
 'M1H',
 'M2H',
 'M3H',
 'M3H',
 'M3H',
 'M4H',
 'M5H',
 'M5H',
 'M5H',
 'M6H',
 'M6H',
 'M7H',
 'M8H',
 'M9H',
 'M1J',
 'M2J',
 'M2J',
 'M2J',
 'M3J',
 'M3J',
 'M4J',
 'M5J',
 'M5J',
 'M5J',
 'M6J',
 'M6J',
 'M7J',
 'M8J',
 'M9J',
 'M1K',
 'M1K',
 'M1K',
 'M2K',
 'M3K',
 'M3K',
 'M4K',
 'M4K',
 'M5K',
 'M5K',
 'M6K',
 'M6K',
 'M6K',
 'M7K',
 'M8K',
 'M9K',
 'M1L',
 'M1L',
 'M1L',
 'M2L',
 'M2L',
 'M3L',
 'M4L',
 'M4L',
 'M5L',
 'M5L',
 'M6L',
 'M6L',
 'M6L',
 'M7L',
 'M8L',
 'M9L',
 'M1M',
 'M1M',


### Converting the lists to one dataframe

In [14]:
#import pandas to convert list to data frame
import pandas as pd
df=pd.DataFrame(A,columns=['Postal_code'])
df['Borough']=B
df['Neighborhood']=C

df

Unnamed: 0,Postal_code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


#### Removing the rows with incomplete data

In [15]:
df = df[df.Borough !="Not assigned"]
df = df[df.Neighborhood !="Not assigned"]
df

Unnamed: 0,Postal_code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


#### Resetting the Index

In [16]:
df.reset_index(drop=True)

Unnamed: 0,Postal_code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [17]:
df.shape

(212, 3)

## Below step joins values in Neighborhood column based on postalCode and Borough

In [18]:
df2=df.groupby(['Postal_code','Borough'])['Neighborhood'].apply(','.join).reset_index()
df2

Unnamed: 0,Postal_code,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood]]\n,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park\n,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West\n"
9,M1N,Scarborough,"Birch Cliff,Cliffside West\n"


In [19]:
df2.shape

(103, 3)

In [20]:
postal = df.Postal_code.tolist()
len(postal)

212

### Creating 2 functions to get Latitude and Longitude
I'm using "Geolocator" tool insted of geocoder which was given to us for sample

I'm using Neighboorhod and Borough to get the latitude and longitude.

In [23]:
# Function for getting Latitude
def lat(elem):
    n,b=elem
    n1=list(n.split(","))
    a='{},{}'.format(n1[0],b)
    geolocator = Nominatim()
    location = geolocator.geocode(a)
    if location == None:
        location = geolocator.geocode(b)
        latitude = location.latitude
        longitude = location.longitude
    else:
        latitude = location.latitude
        longitude = location.longitude
    return latitude

In [24]:
# Function for getting Longitude
def lng(elem):
    n,b=elem
    n1=list(n.split(","))
    a='{},{}'.format(n1[0],b)
    #print(a)
    geolocator = Nominatim()
    location = geolocator.geocode(a)
    if location == None:
        location = geolocator.geocode(b)
        latitude = location.latitude
        longitude = location.longitude
    else:
        latitude = location.latitude
        longitude = location.longitude
    return longitude

In [25]:
df2['latitude'] = df2[['Neighborhood','Borough']].apply(lat,axis=1)



In [26]:
df2['longitude'] = df2[['Neighborhood','Borough']].apply(lng,axis=1)



In [27]:
df2.head()

Unnamed: 0,Postal_code,Borough,Neighborhood,latitude,longitude
0,M1B,Scarborough,"Rouge,Malvern",43.80493,-79.165837
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.790117,-79.173334
2,M1E,Scarborough,"Guildwood]]\n,Morningside,West Hill",43.754899,-79.197776
3,M1G,Scarborough,Woburn,43.759824,-79.225291
4,M1H,Scarborough,Cedarbrae,43.756467,-79.226692
