# Notebook for the peer-graded assignment on Segmenting and Clustering Neighborhoods in Toronto

#### In this assignment, the neighborhoods and boroughs in Toronto are scraped from Wikipedia using the BeautifulSoup library. Then, the latitude and longitude coordinates of each neighborhood are obtained from OpenCage, since the Google Maps Geocoding API is a subscription-based API.
##### Reference: https://opencagedata.com/

In [7]:
# libraries
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from urllib.request import urlopen

In [8]:
# data source
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# open url; the context manager closes the HTTP response once the body is read
with urlopen(url) as response:
    html = response.read()

# parse the downloaded markup
soup = BeautifulSoup(html, 'html.parser')

In [9]:
# find the table
table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    # fail with a clear message instead of an AttributeError on None below
    raise ValueError("no table with class 'wikitable sortable' found in the page")
table_rows = table.find_all('tr')

# collect the stripped text of the <td> cells of every row;
# rows without any <td> cell (e.g. a header row) are skipped
data = []
for row in table_rows:
    cells = [t.text.strip() for t in row.find_all('td')]
    if cells:
        data.append(cells)

df = pd.DataFrame(data, columns=['Postal Code', 'Borough', 'Neighbourhood'])

# delete rows with empty postal code
df = df[df['Postal Code'].notna()]

# delete rows with borough that is Not assigned
df = df[df['Borough'] != 'Not assigned']

# if a row has a borough but a Not assigned neighbourhood, use the borough name
# as the neighbourhood; mask() substitutes row by row, and assign() returns a new
# frame so no column is set on a filtered slice
df = df.assign(
    Neighbourhood=df['Neighbourhood'].mask(
        df['Neighbourhood'] == 'Not assigned', df['Borough']
    )
)

# reset index
df = df.reset_index(drop=True)

# show the dataframe
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [10]:
# instaling geocoder
! pip install opencage



In [11]:
# import geocoder library
import os
from getpass import getpass

from opencage.geocoder import OpenCageGeocode
from pprint import pprint

# API key: read from the OPENCAGE_API_KEY environment variable, or prompt for it,
# so that the secret is never stored in the notebook itself
key = os.environ.get('OPENCAGE_API_KEY') or getpass('OpenCage API key: ')

# open geocoder
geocoder = OpenCageGeocode(key)

In [13]:
# gather every value of the 'Postal Code' column into a plain Python list
PostalCodes = df.loc[:, 'Postal Code'].tolist()

In [14]:
# variable initialization: one entry per postal code, in the order of PostalCodes
latitude = []
longitude = []

# loop through all postal codes
for PostalCode in PostalCodes:
    g = geocoder.geocode(PostalCode + ', Toronto, Ontario, Canada')
    if g:
        # take the coordinates of the first returned result
        latitude.append(g[0]['geometry']['lat'])
        longitude.append(g[0]['geometry']['lng'])
    else:
        # no result for this query: store NaN so the lists stay aligned with
        # PostalCodes instead of failing on g[0]
        latitude.append(np.nan)
        longitude.append(np.nan)
    

In [15]:
# The geocoded latitude and longitude of each neighbourhood seem to differ from
# the provided csv file, so the coordinates are taken from that csv data instead
geo_df = pd.read_csv(filepath_or_buffer='http://cocl.us/Geospatial_data')
geo_df

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [17]:
# join the csv coordinates onto the neighbourhood table by postal code
# (default inner join: only postal codes present in both frames are kept)
df = df.merge(geo_df, on='Postal Code')
df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509
