# This notebook was created to start my Applied Data Science Capstone Project (IBM/Coursera)


## Part 1 of 3. Create a dataset of Toronto postal codes: 

In [1]:
# Install BeautifulSoup library, we'll use it for HTML parsing
!conda install -c conda-forge bs4 --yes
from bs4 import BeautifulSoup

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [2]:
# Import all libraries
import numpy as np
import requests as rq
import pandas as pd
# Lift the pandas display limits on both rows and columns, then confirm the setup ran.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)
print('All libraries imported successfully.')

All libraries imported successfully.


In [3]:
# Load the Wikipedia page with the table of Toronto postal codes.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = rq.get(wiki_url, timeout=30)
# Fail right here on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
html_postal_codes = response.text
# Keep only the first <table> element of the page; find() returns None when there is none.
soup = BeautifulSoup(html_postal_codes, 'html.parser').find('table')
if soup is None:
    raise ValueError(f'No <table> element found at {wiki_url}')
# print(soup.prettify()) # print formatted HTML code of the table

In [45]:
# Parse table
tbl_header = soup.find_all('th')  # extract table headers
tbl_values = soup.find_all('td')  # extract table cells


def _cell_text(tag):
    """Return the text of an HTML cell up to (not including) its first newline.

    get_text() is used rather than .string because .string is None for a cell
    that has more than one child node (e.g. a link followed by plain text),
    which would make the subsequent .split() call fail.
    """
    return tag.get_text().split("\n")[0]


# Column names come from the header cells.
column_names = [_cell_text(tag) for tag in tbl_header]
n_cols = len(column_names)
if n_cols == 0:
    raise ValueError('The table has no header cells, so its rows cannot be delimited.')

# The data cells arrive as one flat list; cut it into consecutive rows of n_cols cells.
# Trailing cells that do not fill a complete row are ignored.
# All rows are collected first and the frame is built once: DataFrame.append was
# removed in pandas 2.0, and it copied the whole frame on every call.
rows = [
    [_cell_text(tag) for tag in tbl_values[start:start + n_cols]]
    for start in range(0, len(tbl_values) - n_cols + 1, n_cols)
]
postal_codes = pd.DataFrame(rows, columns=column_names)

# Filter out rows with an unassigned borough name and renumber the remaining rows.
postal_codes = postal_codes[postal_codes['Borough'] != 'Not assigned'].reset_index(drop=True)
postal_codes.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [46]:
# Sanity check: show any record whose neighbourhood is still marked 'Not assigned'.
unassigned_mask = postal_codes['Neighbourhood'].eq('Not assigned')
postal_codes.loc[unassigned_mask].head()

Unnamed: 0,Postal Code,Borough,Neighbourhood


In [47]:
# According to the Wikipedia page, Toronto has 103 postal codes.
# Report how many rows the parsed dataset holds so the two numbers can be compared.
row_count_message = f'The HTML page was parsed successfully, {postal_codes.shape[0]} data rows produced.'
print(row_count_message)

The HTML page was parsed successfully, 103 data rows produced.


## Part 2 of 3. Obtain the geographical coordinates for each neighborhood:

In [7]:
# Install & import Geocoder package, we'll use it to get coordinates for a given postal code
!conda install -c conda-forge geocoder --yes
import geocoder

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [12]:
# Try to receive coordinates of a neighborhood.
#
# The retry loop below is disabled and kept for reference only. Compared with its first
# draft it is capped at MAX_ATTEMPTS and stores the result under a single, consistently
# spelled name; the draft assigned to a misspelled variable, so its `while` condition
# could never become false.
#
# MAX_ATTEMPTS = 5
# neighb_coords = None
# for _ in range(MAX_ATTEMPTS):
#     neighb_coords = geocoder.google('M5A, Toronto, Ontario').latlng
#     if neighb_coords is not None:
#         break
# if neighb_coords is not None:
#     latitude, longitude = neighb_coords
#     print(latitude, longitude)
#
# Single request; the response object is displayed so its status can be inspected.
# NOTE(review): a REQUEST_DENIED status presumably means that no Google API key was
# supplied - confirm against the geocoder documentation.
g = geocoder.google('M5A, Toronto, Ontario')
g

<[REQUEST_DENIED] Google - Geocode [empty]>

The Geocoder package has proven to be totally unreliable: every one of our numerous calls to **geocoder.google()** returned 
`<[REQUEST_DENIED] Google - Geocode [empty]>`. Therefore, we had to use an alternative way to obtain the geographical coordinates corresponding to the postal codes in our dataframe.

In [48]:
# Read the coordinates from the .csv file supplied with the assignment and preview them.
coords_csv = 'Geospatial_Coordinates.csv'
geo_coords = pd.read_csv(coords_csv)
geo_coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [49]:
# Now combine the dataframes in order to obtain geo coords for each postal code.
# A left merge on the shared 'Postal Code' column keeps every row of postal_codes in its
# original order and does not modify either input frame. The previous in-place
# set_index calls made the cell fail with a KeyError when it was run a second time,
# because 'Postal Code' had already been moved into the index of geo_coords.
coord_columns = geo_coords.columns.difference(['Postal Code'])
postal_codes = (
    postal_codes
    # Drop coordinates left over from an earlier run so that re-running the cell
    # does not produce duplicated '_x' / '_y' columns.
    .drop(columns=coord_columns, errors='ignore')
    .merge(geo_coords, on='Postal Code', how='left')
)
postal_codes.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


## Part 3 of 3. Explore and cluster the neighborhoods in Toronto:

In [57]:
# Restrict the data to boroughs whose name includes the word 'Toronto'.
is_toronto_borough = postal_codes['Borough'].str.contains('Toronto')
toronto = postal_codes.loc[is_toronto_borough].reset_index(drop=True)
toronto

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259
