# Clustering neighborhoods in Toronto

1. Get the neighborhoods in Toronto
2. Explore the venues in each neighborhood
3. Cluster the neighborhoods based on the category distribution of their venues

## Part I. Scrape postal codes:
https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [2]:
import requests
from bs4 import BeautifulSoup

# Download the Wikipedia page that lists the "M" postal codes and parse it.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
req = requests.get(url, timeout=30)
# Stop on an HTTP error status instead of silently parsing an error page.
req.raise_for_status()
soup = BeautifulSoup(req.content, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"3f996570-256b-4f45-92a3-87ba6c847372","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":960187814,"wgRevisionId":960187814,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Communications in Ontario","Postal codes in Canada","Toron

In [3]:
# Collect the stripped cell text of every data row in the postal-code table.
data = []
table = soup.find('table', attrs={'class':'wikitable sortable'})
if table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError('Could not find a "wikitable sortable" table in the page.')
table_body = table.find('tbody')
if table_body is None:
    # html.parser does not insert a <tbody> that is absent from the markup,
    # so fall back to searching the table itself.
    table_body = table
rows = table_body.find_all('tr')
for row in rows:
    cols = row.find_all('td')
    # Rows without any <td> cells (e.g. header rows) are skipped.
    if cols:
        data.append([ele.text.strip() for ele in cols])

In [4]:
# Turn the scraped rows into a table with one named column per cell.
from pandas import DataFrame
df = DataFrame(
    data=data,
    columns=[
        'PostalCode',
        'Borough',
        'Neighborhood',
    ],
)
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [5]:
# Keep only the rows whose borough is assigned; rows whose Borough is
# 'Not assigned' are dropped and the index is renumbered from 0.
# Tip: df['Borough'].unique() lists the distinct boroughs, which helps to
# spot any other spelling of "Not assigned".
df = df.loc[df['Borough'] != 'Not assigned'].reset_index(drop=True)
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [6]:
# Display the (rows, columns) size of df.
df.shape

(103, 3)

## Part II. Latitude and longitude coordinates of each neighborhood

In [84]:
# !conda install -c conda-forge geopy --yes 
# !pip install geocoder

In [1]:
# import geocoder

# # initialize your variable to None
# lat_lng_coords = None

# # loop until you get the coordinates
# postal_code = df['Borough'][3]
# while(lat_lng_coords is None):
#   g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#   lat_lng_coords = g.latlng

# latitude = lat_lng_coords[0]
# longitude = lat_lng_coords[1]

In [7]:
import numpy as np
from geopy.extra.rate_limiter import RateLimiter # throttle and retry geocoding requests
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

geolocator = Nominatim(user_agent="ca_explorer")
# Nominatim's public service asks for at most one request per second. The
# rate limiter enforces that delay, retries failed requests, and (with its
# default settings) returns None when a lookup ultimately fails, so one bad
# request leaves a NaN instead of aborting the whole loop.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Start with NaN everywhere so rows that cannot be geocoded stay missing.
latitude = np.full(df.shape[0], np.nan)
longitude = np.full(df.shape[0], np.nan)

for ind, (neighborhood, borough, postal_code) in enumerate(
        zip(df['Neighborhood'], df['Borough'], df['PostalCode'])):
    # Only the first neighborhood listed for a postal code is used in the query.
    address = '{}, {}, Toronto, Ontario'.format(neighborhood.split(',')[0], borough)

    location = geocode(address)
    if location:
        latitude[ind] = location.latitude
        longitude[ind] = location.longitude
    print('The geograpical coordinate of {} ({}) are {}, {}.'.format(address, postal_code, latitude[ind], longitude[ind]))

# Number of rows for which no coordinates were found.
np.sum(np.isnan(longitude))

The geograpical coordinate of Parkwoods, North York, Toronto, Ontario (M3A) are 43.7587999, -79.3201966.
The geograpical coordinate of Victoria Village, North York, Toronto, Ontario (M4A) are 43.732658, -79.3111892.
The geograpical coordinate of Regent Park, Downtown Toronto, Toronto, Ontario (M5A) are nan, nan.
The geograpical coordinate of Lawrence Manor, North York, Toronto, Ontario (M6A) are 43.7220788, -79.4375067.
The geograpical coordinate of Queen's Park, Downtown Toronto, Toronto, Ontario (M7A) are 43.663217, -79.38629.
The geograpical coordinate of Islington Avenue, Etobicoke, Toronto, Ontario (M9A) are 43.6794838, -79.5389092.
The geograpical coordinate of Malvern, Scarborough, Toronto, Ontario (M1B) are 43.8091955, -79.2217008.
The geograpical coordinate of Don Mills, North York, Toronto, Ontario (M3B) are 43.775347, -79.3459439.
The geograpical coordinate of Parkview Hill, East York, Toronto, Ontario (M4B) are nan, nan.
The geograpical coordinate of Garden District, Downto

The geograpical coordinate of Kingsview Village, Etobicoke, Toronto, Ontario (M9R) are 43.6995391, -79.5563459.
The geograpical coordinate of Agincourt, Scarborough, Toronto, Ontario (M1S) are 43.7853531, -79.2785494.
The geograpical coordinate of Davisville, Central Toronto, Toronto, Ontario (M4S) are 43.697936, -79.3972908.
The geograpical coordinate of University of Toronto, Downtown Toronto, Toronto, Ontario (M5S) are nan, nan.
The geograpical coordinate of Runnymede, West Toronto, Toronto, Ontario (M6S) are 43.6517026, -79.4759978.
The geograpical coordinate of Clarks Corners, Scarborough, Toronto, Ontario (M1T) are 43.7964095, -79.2977951.
The geograpical coordinate of Moore Park, Central Toronto, Toronto, Ontario (M4T) are 43.6903876, -79.3832965.
The geograpical coordinate of Kensington Market, Downtown Toronto, Toronto, Ontario (M5T) are nan, nan.
The geograpical coordinate of Milliken, Scarborough, Toronto, Ontario (M1V) are 43.8231743, -79.3017626.
The geograpical coordinate

20

In [8]:
# Attach the geocoded coordinates to df as two new columns, then preview it.
df['Latitude'], df['Longitude'] = latitude, longitude
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7588,-79.320197
1,M4A,North York,Victoria Village,43.732658,-79.311189
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",,
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.722079,-79.437507
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.663217,-79.38629


In [9]:
# !conda install -c conda-forge folium=0.5.0 --yes
import folium
# create map of Toronto using latitude and longitude values
address = 'Toronto, Ontario'
location = geolocator.geocode(address)
if location is None:
    # Without a center point the map cannot be built; fail with a clear message
    # instead of an AttributeError on None.
    raise RuntimeError('Could not geocode {!r}; cannot center the map.'.format(address))
latitude_to = location.latitude
longitude_to = location.longitude

map_toronto = folium.Map(location=[latitude_to, longitude_to], zoom_start=10)

# add one marker per neighborhood that has coordinates; report the others
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    # Use boolean logic rather than the bitwise `~` operator, and require both
    # coordinates to be present before drawing a marker.
    if np.isnan(lat) or np.isnan(lng):
        print('Missing: {}; {}.'.format(neighborhood,borough))
        continue
    label = '{}; {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    # NOTE(review): parse_html looks like a Popup option rather than a
    # CircleMarker one -- confirm whether it has any effect here.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

map_toronto

Missing: Regent Park, Harbourfront; Downtown Toronto.
Missing: Parkview Hill, Woodbine Gardens; East York.
Missing: St. James Town; Downtown Toronto.
Missing: Humewood-Cedarvale; York.
Missing: Berczy Park; Downtown Toronto.
Missing: Caledonia-Fairbanks; York.
Missing: Central Bay Street; Downtown Toronto.
Missing: Harbourfront East, Union Station, Toronto Islands; Downtown Toronto.
Missing: Toronto Dominion Centre, Design Exchange; Downtown Toronto.
Missing: India Bazaar, The Beaches West; East Toronto.
Missing: Commerce Court, Victoria Hotel; Downtown Toronto.
Missing: Del Ray, Mount Dennis, Keelsdale and Silverthorn; York.
Missing: Canada Post Gateway Processing Centre; Mississauga.
Missing: University of Toronto, Harbord; Downtown Toronto.
Missing: Kensington Market, Chinatown, Grange Park; Downtown Toronto.
Missing: CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport; Downtown Toronto.
Missing: Stn A PO Boxes; Downtown Toronto

In [None]:
# import pandas as pd
# postal_data = pd.read_csv('http://cocl.us/Geospatial_data')

# postal_data.head()