# Toronto Neighbourhoods

This notebook has three parts (P1–P3), all in this one file.

In [1]:
import sys

# Extra directory to search for installed packages.
# NOTE: this is a machine-specific absolute path; installing the packages into
# the kernel's own environment would make this cell unnecessary.
EXTRA_SITE_PACKAGES = "/usr/local/lib/python3.7/site-packages"

# Only append when missing, so re-running the cell does not grow sys.path.
if EXTRA_SITE_PACKAGES not in sys.path:
    sys.path.append(EXTRA_SITE_PACKAGES)

In [2]:
#Imports
import json
from io import StringIO

import bs4 as bs
import folium
import lxml.html as lh
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

## P1: Grabbing Wiki page
Source: 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
An earlier revision of the page is used, because the format of the current page was recently changed.

In [100]:
# Fixed revision of the page (note the `oldid` parameter), so the table layout
# does not change underneath the parsing code.
url = 'https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=945633050'

# The timeout keeps the cell from hanging forever on a stalled connection.
result = requests.get(url, timeout=30)
# Fail here with a clear HTTP error instead of later, while parsing an error page.
result.raise_for_status()
print(url)
print(result.status_code)

https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=945633050
200


### Parsing Page

In [93]:
# Parse the downloaded page and keep only its first <table> element.
soup = bs.BeautifulSoup(result.content, 'lxml')
table = soup.find_all('table')[0]

# read_html returns a list of DataFrames (one per table found), so take the
# first. The markup is wrapped in StringIO because passing a literal HTML
# string to read_html is deprecated in recent pandas versions.
data = pd.read_html(StringIO(str(table)))[0]

# Use the column names that the rest of the notebook relies on.
data.columns = ['Postal Code', 'Borough', 'Neighbourhood']
data.head(20)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Etobicoke,Islington Avenue


### Filtering out 'Not assigned' values and grouping neighbourhoods that share the same postal code

In [101]:
# Drop the rows whose borough is 'Not assigned', then collapse the rows that
# share a borough and postal code into a single row with comma-joined text.
assigned = data.loc[data['Borough'].ne('Not assigned')]
fdata = assigned.groupby(['Borough', 'Postal Code'], as_index=False).agg(','.join)
fdata.head(10)

Unnamed: 0,Borough,Postal Code,Neighbourhood
0,Central Toronto,M4N,Lawrence Park
1,Central Toronto,M4P,Davisville North
2,Central Toronto,M4R,North Toronto West
3,Central Toronto,M4S,Davisville
4,Central Toronto,M4T,"Moore Park,Summerhill East"
5,Central Toronto,M4V,"Deer Park,Forest Hill SE,Rathnelly,South Hill,..."
6,Central Toronto,M5N,Roselawn
7,Central Toronto,M5P,"Forest Hill North,Forest Hill West"
8,Central Toronto,M5R,"The Annex,North Midtown,Yorkville"
9,Downtown Toronto,M4W,Rosedale


In [102]:
# Fix rows that have a borough but a 'Not assigned' neighbourhood:
# the borough name is used as the neighbourhood in that case.
unnamed = fdata['Neighbourhood'] == 'Not assigned'
fdata['Neighbourhood'] = fdata['Neighbourhood'].mask(unnamed, fdata['Borough'])


### Checking the shape of the data

In [103]:
# Dimensions of fdata as a (rows, columns) tuple; shown as the cell output.
fdata.shape

(103, 3)

## P2: Using Geospatial data


In [83]:
# Coordinates table, read from a CSV file at the URL below.
geo_url = "https://cocl.us/Geospatial_data"
geo_data = pd.read_csv(filepath_or_buffer=geo_url)
geo_data.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [95]:
# Attach latitude/longitude to each postal code. This is an inner join, so only
# postal codes present in both tables are kept.
mdata = fdata.merge(geo_data, how='inner', on='Postal Code')

In [128]:
# Preview a positional slice of the merged table (rows 20 up to, not including, 40).
mdata.iloc[20:40]

Unnamed: 0,Borough,Postal Code,Neighbourhood,Latitude,Longitude
20,Downtown Toronto,M5L,"Commerce Court,Victoria Hotel",43.648198,-79.379817
21,Downtown Toronto,M5S,"Harbord,University of Toronto",43.662696,-79.400049
22,Downtown Toronto,M5T,"Chinatown,Grange Park,Kensington Market",43.653206,-79.400049
23,Downtown Toronto,M5V,"CN Tower,Bathurst Quay,Island airport,Harbourf...",43.628947,-79.39442
24,Downtown Toronto,M5W,Stn A PO Boxes 25 The Esplanade,43.646435,-79.374846
25,Downtown Toronto,M5X,"First Canadian Place,Underground city",43.648429,-79.38228
26,Downtown Toronto,M6G,Christie,43.669542,-79.422564
27,Downtown Toronto,M7A,Queen's Park,43.662301,-79.389494
28,East Toronto,M4E,The Beaches,43.676357,-79.293031
29,East Toronto,M4K,"The Danforth West,Riverdale",43.679557,-79.352188


### P3: Explore and cluster the neighborhoods in Toronto
Keep only the boroughs whose name contains "Toronto".
Cluster what is left using KMeans and show the clusters on the map.

In [175]:
# Keep only the boroughs whose name contains "Toronto". The mask is built from
# the same frame that it filters, and the *result* is copied: a column is added
# to this frame later, and writing to an un-copied filtered frame makes pandas
# emit SettingWithCopyWarning.
mmdata = mdata[mdata['Borough'].str.contains("Toronto")].copy()

# Empty base map; the cluster markers are added to it in a later cell.
tmap = folium.Map(location=[43.70, -79.38], zoom_start=12)

mmdata[20:40]



Unnamed: 0,Borough,Postal Code,Neighbourhood,Latitude,Longitude
20,Downtown Toronto,M5L,"Commerce Court,Victoria Hotel",43.648198,-79.379817
21,Downtown Toronto,M5S,"Harbord,University of Toronto",43.662696,-79.400049
22,Downtown Toronto,M5T,"Chinatown,Grange Park,Kensington Market",43.653206,-79.400049
23,Downtown Toronto,M5V,"CN Tower,Bathurst Quay,Island airport,Harbourf...",43.628947,-79.39442
24,Downtown Toronto,M5W,Stn A PO Boxes 25 The Esplanade,43.646435,-79.374846
25,Downtown Toronto,M5X,"First Canadian Place,Underground city",43.648429,-79.38228
26,Downtown Toronto,M6G,Christie,43.669542,-79.422564
27,Downtown Toronto,M7A,Queen's Park,43.662301,-79.389494
28,East Toronto,M4E,The Beaches,43.676357,-79.293031
29,East Toronto,M4K,"The Danforth West,Riverdale",43.679557,-79.352188


In [172]:
# Cluster the remaining postal codes by location and draw one marker per row.
N_CLUSTERS = 5

# One fill colour per cluster label. Deliberately NOT named `colors`, which
# would shadow the `matplotlib.colors` module imported under that name.
cluster_colors = ['red', 'green', 'blue', 'yellow', 'grey']
assert len(cluster_colors) >= N_CLUSTERS, "need one colour per cluster"

# Feature matrix: one (latitude, longitude) row per postal code.
coords = mmdata[['Latitude', 'Longitude']].to_numpy()

# A fixed random_state makes the labels (and so the marker colours) identical
# on every run; without it KMeans initialisation is random.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=20, random_state=0).fit(coords)
clusters = kmeans.labels_

# .assign builds a new frame instead of writing into a filtered one, which
# avoids pandas' SettingWithCopyWarning.
mmdata = mmdata.assign(Cluster=clusters)

# NOTE: markers are added to the existing `tmap`; re-running this cell without
# re-creating the map first stacks a second set of markers on top.
marker_rows = mmdata[['Latitude', 'Longitude', 'Borough', 'Cluster']].itertuples(index=False, name=None)
for latitude, longitude, borough, cluster in marker_rows:
    label = folium.Popup(borough, parse_html=True)
    folium.CircleMarker(
        [latitude, longitude],
        radius=10,
        popup=label,
        color='black',
        fill=True,
        fill_color=cluster_colors[cluster],
        fill_opacity=0.7,
    ).add_to(tmap)


In [173]:
# Display the map: as the last expression in the cell it is rendered as the output.
tmap