In [4]:
# import numpy and pandas (dataframe)
import pandas as pd
import numpy as np

# import map rendering libraries
%matplotlib inline
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

# import packages for web scrapping: beautifulsoup & requests
from bs4 import BeautifulSoup
import requests
from urllib.request import urlopen

In [3]:
!pip install folium

Collecting folium
  Downloading folium-0.11.0-py2.py3-none-any.whl (93 kB)
[K     |████████████████████████████████| 93 kB 3.4 MB/s  eta 0:00:01
Collecting branca>=0.3.0
  Downloading branca-0.4.1-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.11.0


In [5]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

html = urlopen(url) # open url
html = BeautifulSoup(html) # use Beautifulsoup to download html data and store into variable "html"

In [6]:
print(html.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"cf0149f1-3bc5-409e-b9e6-d59a5559a1a8","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":979555370,"wgRevisionId":979555370,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wikidata","Communicati

In [7]:
html.title

<title>List of postal codes of Canada: M - Wikipedia</title>

In [8]:
# extract the text within <title></title> tag

html.title.text

'List of postal codes of Canada: M - Wikipedia'

In [9]:
# identify <table> with class="wikitable sortable".
# this table contains table header (i.e columns): "postcode","borough" & 'neighbourhood' data
# and data for each column in <tr></tr>

tbl = html.find_all('table', class_="wikitable sortable") 
tbl

[<table class="wikitable sortable">
 <tbody><tr>
 <th>Postal Code
 </th>
 <th>Borough
 </th>
 <th>Neighbourhood
 </th></tr>
 <tr>
 <td>M1A
 </td>
 <td>Not assigned
 </td>
 <td>Not assigned
 </td></tr>
 <tr>
 <td>M2A
 </td>
 <td>Not assigned
 </td>
 <td>Not assigned
 </td></tr>
 <tr>
 <td>M3A
 </td>
 <td>North York
 </td>
 <td>Parkwoods
 </td></tr>
 <tr>
 <td>M4A
 </td>
 <td>North York
 </td>
 <td>Victoria Village
 </td></tr>
 <tr>
 <td>M5A
 </td>
 <td>Downtown Toronto
 </td>
 <td>Regent Park, Harbourfront
 </td></tr>
 <tr>
 <td>M6A
 </td>
 <td>North York
 </td>
 <td>Lawrence Manor, Lawrence Heights
 </td></tr>
 <tr>
 <td>M7A
 </td>
 <td>Downtown Toronto
 </td>
 <td>Queen's Park, Ontario Provincial Government
 </td></tr>
 <tr>
 <td>M8A
 </td>
 <td>Not assigned
 </td>
 <td>Not assigned
 </td></tr>
 <tr>
 <td>M9A
 </td>
 <td>Etobicoke
 </td>
 <td>Islington Avenue, Humber Valley Village
 </td></tr>
 <tr>
 <td>M1B
 </td>
 <td>Scarborough
 </td>
 <td>Malvern, Rouge
 </td></tr>
 <tr>
 <td>M2B

In [10]:
# extract header by find_all (find every tag) with <th></th> in the table
headers = tbl[0].find_all('th')
headers

[<th>Postal Code
 </th>,
 <th>Borough
 </th>,
 <th>Neighbourhood
 </th>]

In [11]:
# loop through each item in header, extract the text with .text and remove new line (i.e: \n) with .strip() in list comprehension
column = [i.text.strip() for i in headers]
column

['Postal Code', 'Borough', 'Neighbourhood']

In [12]:
data = tbl[0].find_all('td') # extract item in table body (i.e: each cell in the table)
data

[<td>M1A
 </td>,
 <td>Not assigned
 </td>,
 <td>Not assigned
 </td>,
 <td>M2A
 </td>,
 <td>Not assigned
 </td>,
 <td>Not assigned
 </td>,
 <td>M3A
 </td>,
 <td>North York
 </td>,
 <td>Parkwoods
 </td>,
 <td>M4A
 </td>,
 <td>North York
 </td>,
 <td>Victoria Village
 </td>,
 <td>M5A
 </td>,
 <td>Downtown Toronto
 </td>,
 <td>Regent Park, Harbourfront
 </td>,
 <td>M6A
 </td>,
 <td>North York
 </td>,
 <td>Lawrence Manor, Lawrence Heights
 </td>,
 <td>M7A
 </td>,
 <td>Downtown Toronto
 </td>,
 <td>Queen's Park, Ontario Provincial Government
 </td>,
 <td>M8A
 </td>,
 <td>Not assigned
 </td>,
 <td>Not assigned
 </td>,
 <td>M9A
 </td>,
 <td>Etobicoke
 </td>,
 <td>Islington Avenue, Humber Valley Village
 </td>,
 <td>M1B
 </td>,
 <td>Scarborough
 </td>,
 <td>Malvern, Rouge
 </td>,
 <td>M2B
 </td>,
 <td>Not assigned
 </td>,
 <td>Not assigned
 </td>,
 <td>M3B
 </td>,
 <td>North York
 </td>,
 <td>Don Mills
 </td>,
 <td>M4B
 </td>,
 <td>East York
 </td>,
 <td>Parkview Hill, Woodbine Gardens
 </td>,


In [13]:
total_cell, total_row = len(data), len(data)/3

print('The table contains: {} cells, which mean {} rows'.format(total_cell, total_row))

The table contains: 540 cells, which mean 180.0 rows


In [14]:
col_1 = [] # create list named "col_1"

# loop through cells in "data", start from index 0, with incremental step 3 (i.e. 0,3,6,9...)
for i in range(0, total_cell, 3):
    
    k = data[i].text.strip() # extract text within <td> tag; remove space or \n
    col_1.append(k) # append each item to "col_1"

In [15]:
# check the list of postcode
# the list should contains postcode from M*A, M*B, M*C... to M*Z; * being number 1 to 9

print(col_1, end=' ')

['M1A', 'M2A', 'M3A', 'M4A', 'M5A', 'M6A', 'M7A', 'M8A', 'M9A', 'M1B', 'M2B', 'M3B', 'M4B', 'M5B', 'M6B', 'M7B', 'M8B', 'M9B', 'M1C', 'M2C', 'M3C', 'M4C', 'M5C', 'M6C', 'M7C', 'M8C', 'M9C', 'M1E', 'M2E', 'M3E', 'M4E', 'M5E', 'M6E', 'M7E', 'M8E', 'M9E', 'M1G', 'M2G', 'M3G', 'M4G', 'M5G', 'M6G', 'M7G', 'M8G', 'M9G', 'M1H', 'M2H', 'M3H', 'M4H', 'M5H', 'M6H', 'M7H', 'M8H', 'M9H', 'M1J', 'M2J', 'M3J', 'M4J', 'M5J', 'M6J', 'M7J', 'M8J', 'M9J', 'M1K', 'M2K', 'M3K', 'M4K', 'M5K', 'M6K', 'M7K', 'M8K', 'M9K', 'M1L', 'M2L', 'M3L', 'M4L', 'M5L', 'M6L', 'M7L', 'M8L', 'M9L', 'M1M', 'M2M', 'M3M', 'M4M', 'M5M', 'M6M', 'M7M', 'M8M', 'M9M', 'M1N', 'M2N', 'M3N', 'M4N', 'M5N', 'M6N', 'M7N', 'M8N', 'M9N', 'M1P', 'M2P', 'M3P', 'M4P', 'M5P', 'M6P', 'M7P', 'M8P', 'M9P', 'M1R', 'M2R', 'M3R', 'M4R', 'M5R', 'M6R', 'M7R', 'M8R', 'M9R', 'M1S', 'M2S', 'M3S', 'M4S', 'M5S', 'M6S', 'M7S', 'M8S', 'M9S', 'M1T', 'M2T', 'M3T', 'M4T', 'M5T', 'M6T', 'M7T', 'M8T', 'M9T', 'M1V', 'M2V', 'M3V', 'M4V', 'M5V', 'M6V', 'M7V', 'M8V'

In [16]:
col_2 = [] # create list named "col_2"

# loop through cells in "data", start from index 1, with incremental step 3 (i.e. 1,4,7,10...)
for i in range(1, total_cell, 3):
    
    k = data[i].text.strip() # extract text within <td> tag; remove space or \n
    col_2.append(k) # append each item to "col_2"

In [17]:
# check the list of borough

col_2

['Not assigned',
 'Not assigned',
 'North York',
 'North York',
 'Downtown Toronto',
 'North York',
 'Downtown Toronto',
 'Not assigned',
 'Etobicoke',
 'Scarborough',
 'Not assigned',
 'North York',
 'East York',
 'Downtown Toronto',
 'North York',
 'Not assigned',
 'Not assigned',
 'Etobicoke',
 'Scarborough',
 'Not assigned',
 'North York',
 'East York',
 'Downtown Toronto',
 'York',
 'Not assigned',
 'Not assigned',
 'Etobicoke',
 'Scarborough',
 'Not assigned',
 'Not assigned',
 'East Toronto',
 'Downtown Toronto',
 'York',
 'Not assigned',
 'Not assigned',
 'Not assigned',
 'Scarborough',
 'Not assigned',
 'Not assigned',
 'East York',
 'Downtown Toronto',
 'Downtown Toronto',
 'Not assigned',
 'Not assigned',
 'Not assigned',
 'Scarborough',
 'North York',
 'North York',
 'East York',
 'Downtown Toronto',
 'West Toronto',
 'Not assigned',
 'Not assigned',
 'Not assigned',
 'Scarborough',
 'North York',
 'North York',
 'East York',
 'Downtown Toronto',
 'West Toronto',
 'Not assi

In [18]:
col_3 = [] # create list named "col_3"

# loop through cells in "data", start from index 2, with incremental step 3 (i.e. 2,5,8,11...)
for i in range(2, total_cell, 3):
    
    k = data[i].text.strip() # extract text within <td> tag; remove space or \n
    col_3.append(k) # append each item to "col_3"

In [19]:
col_3

['Not assigned',
 'Not assigned',
 'Parkwoods',
 'Victoria Village',
 'Regent Park, Harbourfront',
 'Lawrence Manor, Lawrence Heights',
 "Queen's Park, Ontario Provincial Government",
 'Not assigned',
 'Islington Avenue, Humber Valley Village',
 'Malvern, Rouge',
 'Not assigned',
 'Don Mills',
 'Parkview Hill, Woodbine Gardens',
 'Garden District, Ryerson',
 'Glencairn',
 'Not assigned',
 'Not assigned',
 'West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale',
 'Rouge Hill, Port Union, Highland Creek',
 'Not assigned',
 'Don Mills',
 'Woodbine Heights',
 'St. James Town',
 'Humewood-Cedarvale',
 'Not assigned',
 'Not assigned',
 'Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood',
 'Guildwood, Morningside, West Hill',
 'Not assigned',
 'Not assigned',
 'The Beaches',
 'Berczy Park',
 'Caledonia-Fairbanks',
 'Not assigned',
 'Not assigned',
 'Not assigned',
 'Woburn',
 'Not assigned',
 'Not assigned',
 'Leaside',
 'Central Bay Street',
 'Christie',
 'Not as

In [20]:
# initiate a dictionary
tbl_df = {}

for a, b in zip(column, [col_1, col_2, col_3]): # use zip() to combine header for each col_* and data
    tbl_df[a] = b # put header and column into dictionary

In [40]:
print(tbl_df['Postal Code'], end=' ') # take a quick look at 'Postcode' within dictionary

['M1A', 'M2A', 'M3A', 'M4A', 'M5A', 'M6A', 'M7A', 'M8A', 'M9A', 'M1B', 'M2B', 'M3B', 'M4B', 'M5B', 'M6B', 'M7B', 'M8B', 'M9B', 'M1C', 'M2C', 'M3C', 'M4C', 'M5C', 'M6C', 'M7C', 'M8C', 'M9C', 'M1E', 'M2E', 'M3E', 'M4E', 'M5E', 'M6E', 'M7E', 'M8E', 'M9E', 'M1G', 'M2G', 'M3G', 'M4G', 'M5G', 'M6G', 'M7G', 'M8G', 'M9G', 'M1H', 'M2H', 'M3H', 'M4H', 'M5H', 'M6H', 'M7H', 'M8H', 'M9H', 'M1J', 'M2J', 'M3J', 'M4J', 'M5J', 'M6J', 'M7J', 'M8J', 'M9J', 'M1K', 'M2K', 'M3K', 'M4K', 'M5K', 'M6K', 'M7K', 'M8K', 'M9K', 'M1L', 'M2L', 'M3L', 'M4L', 'M5L', 'M6L', 'M7L', 'M8L', 'M9L', 'M1M', 'M2M', 'M3M', 'M4M', 'M5M', 'M6M', 'M7M', 'M8M', 'M9M', 'M1N', 'M2N', 'M3N', 'M4N', 'M5N', 'M6N', 'M7N', 'M8N', 'M9N', 'M1P', 'M2P', 'M3P', 'M4P', 'M5P', 'M6P', 'M7P', 'M8P', 'M9P', 'M1R', 'M2R', 'M3R', 'M4R', 'M5R', 'M6R', 'M7R', 'M8R', 'M9R', 'M1S', 'M2S', 'M3S', 'M4S', 'M5S', 'M6S', 'M7S', 'M8S', 'M9S', 'M1T', 'M2T', 'M3T', 'M4T', 'M5T', 'M6T', 'M7T', 'M8T', 'M9T', 'M1V', 'M2V', 'M3V', 'M4V', 'M5V', 'M6V', 'M7V', 'M8V'

In [23]:
tbl_postcode = pd.DataFrame(tbl_df) # create dataframe based on dictionary "tbl_df"
tbl_postcode.head() # a glance of dataframe

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [24]:
# exclude "Not assigned" in "Borough"
tbl = tbl_postcode.query("Borough != 'Not assigned'")
tbl

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [26]:
# create a temporary dataframe to check number of 'Neighbourhood' within each 'postcode'
# use .to_frame() to convert series of object to dataframe
# .sort_values() > sort 'Neighbourhood' in descending order

tbl_temp = tbl.groupby(['Postal Code'])['Neighbourhood'].count().to_frame().reset_index(drop=False).sort_values(by='Neighbourhood', ascending=False)
tbl_temp

Unnamed: 0,Postal Code,Neighbourhood
0,M1B,1
65,M5R,1
75,M6G,1
74,M6E,1
73,M6C,1
...,...,...
31,M3L,1
30,M3K,1
29,M3J,1
28,M3H,1


In [30]:
# let's check 'postcode' M8Y to confirm num # of neighourhood

tbl.query("Postal Code == 'M8Y'") # result shows 8 neighbourhoods within M8Y, same as above

SyntaxError: invalid syntax (<unknown>, line 1)

In [31]:
# extract all 'postcode' with > 1 neighbourhood
# use .query() to select rows with 'Neighbourhood' > 1 record, then select 'Postcode' and convert data into array
# with .values method

multi = tbl_temp.query("Neighbourhood > 1")['Postal Code'].values
multi

array([], dtype=object)

In [33]:
single = tbl_temp.query("Neighbourhood == 1")['Postal Code'].values
single

array(['M1B', 'M5R', 'M6G', 'M6E', 'M6C', 'M6B', 'M6A', 'M5X', 'M5W',
       'M5V', 'M5T', 'M5S', 'M5P', 'M6J', 'M5N', 'M5M', 'M5L', 'M5K',
       'M5J', 'M5H', 'M5G', 'M5E', 'M5C', 'M5B', 'M6H', 'M6K', 'M4Y',
       'M8Y', 'M9V', 'M9R', 'M9P', 'M9N', 'M9M', 'M9L', 'M9C', 'M9B',
       'M9A', 'M8Z', 'M8X', 'M6L', 'M8W', 'M8V', 'M7Y', 'M7R', 'M7A',
       'M6S', 'M6R', 'M6P', 'M6N', 'M6M', 'M5A', 'M4X', 'M1C', 'M1T',
       'M2P', 'M2N', 'M2M', 'M2L', 'M2K', 'M2J', 'M2H', 'M1X', 'M1W',
       'M1V', 'M1S', 'M3A', 'M1R', 'M1P', 'M1N', 'M1M', 'M1L', 'M1K',
       'M1J', 'M1H', 'M1G', 'M1E', 'M2R', 'M3B', 'M4W', 'M4H', 'M4V',
       'M4T', 'M4S', 'M4R', 'M4P', 'M4N', 'M4M', 'M4L', 'M4K', 'M4J',
       'M4G', 'M3C', 'M4E', 'M4C', 'M4B', 'M4A', 'M3N', 'M3M', 'M3L',
       'M3K', 'M3J', 'M3H', 'M9W'], dtype=object)

In [41]:
# split 'tbl' into table which > 1 'Neighbourhood' within a 'Postcode'

t1 = tbl.loc[tbl.Postcode.isin(multi),:].sort_values(by='Postcode').reset_index(drop=True)
t1

AttributeError: 'DataFrame' object has no attribute 'postcode'

In [42]:
t2 = tbl.loc[tbl.Postcode.isin(single),:].sort_values(by='Postcode').reset_index(drop=True)
t2

AttributeError: 'DataFrame' object has no attribute 'Postcode'

In [44]:
# initiate 3 lists for 'postcode', 'borough', 'neighbourhood'

codes = []
boroughs = []
neighbours = []

for code in multi:
    
    table = t1.loc[t1.Postcode == code, :] # split 't1' to specific table by 'postcode'
    
    code = np.unique(table.Postcode) # extract unique 'postcode' in the column
    codes.append(code[0])
    
    borough = np.unique(table.Borough) # extract unique 'borough' in the column
    boroughs.append(borough[0])
    
    neighbour = table.Neighbourhood.to_list() # extract all 'neighbourhood' & convert to list format
    neighbour = ', '.join(neighbour) # use .join() method to combine each 'neighbourhood' with ", "
    neighbours.append(neighbour)

In [45]:
neighbours # take a look into result of combined 'neighbourhood'

[]

In [46]:
tbl_multi = pd.DataFrame({'Postcode': codes, 'Borough': boroughs, 'Neighbourhood': neighbours})
tbl_multi

Unnamed: 0,Postcode,Borough,Neighbourhood


In [47]:
# combine 'tbl_multi' with 't2' (table with single neighbourhood for each postcode) with concat()

new_df = pd.concat([tbl_multi, t2], axis=0).sort_values(by='Postcode').reset_index(drop=True)
print(new_df.head())

NameError: name 't2' is not defined

In [48]:
# check any row has 'Neighbourhood' that is 'Not assigned'

new_df.query("Neighbourhood == 'Not assigned'")

NameError: name 'new_df' is not defined

In [49]:
# instruction of assignment: any record in Neighbourhood with "Not assigned" will be same as name of 'Borough'
# in this case, 'Postcode' M7A >> Borough & Neighbourhood is "Queen's Park"

new_df.loc[new_df.Postcode == 'M7A', 'Neighbourhood'] = "Queen's Park" # assign new value
new_df.query("Postcode == 'M7A'") # check M7A again

NameError: name 'new_df' is not defined

In [None]:
row, column = new_df.shape # use .shape to indicate number of rows as per assignment requirement

print("Postcode table consists of {} rows & {} columns".format(row, column))

# Part 2

In [50]:
# import library for Geocoding
import geopy
import geopandas

ModuleNotFoundError: No module named 'geopandas'

In [None]:
# trial n error: search geo-location of Toronto based on "Borough"
# note: not requirement of assignment

geo_lat = {}
geo_lon = {}

geo = geopy.Nominatim(user_agent="Detector", timeout=20)

for x in np.unique(new_df.Borough):
    
    loc = geo.geocode("{}, Toronto, Ontario".format(x))
    
    geo_lat[x] = loc.latitude
    geo_lon[x] = loc.longitude

In [None]:
# quick check on latitude

geo_lat

In [None]:
# quick check on longitude

geo_lon

In [None]:
# instruction given by assignment: to extract latitude & longitutde based on "Postcode"
# API failed to search for geo-location for M1E and other postcodes.
# will use csv file provided in coursera instead

geo = geopy.Nominatim(user_agent="Detector", timeout=50)

for x, y in zip(new_df.Postcode[0:5], new_df.Borough[0:5]):
    
    loc = geo.geocode("{}, {}, Toronto, Canada".format(x, y))
    print("{}, {}: latitude {}, longitude {}".format(x, y, loc.latitude, loc.longitude))

In [None]:
# import "Geospatial_Coordinates.csv"

geo_data = pd.read_csv("Geospatial_Coordinates.csv")
geo_data.head()

In [None]:
geo_data.info() # Geospatial_Coordinates.csv has 103 rows (i.e. same as new_df)

In [None]:
# all postcodes in Geospatial_Coordinates.csv match with postcodes in new_df
# both files have 103 records

geo_df = geo_data.loc[geo_data["Postal Code"].isin(new_df.Postcode.values), :]
geo_df.shape

In [None]:
# combine new_df & geo_df with .merge() on Postcode/Postal Code columns
# note: to remove "Postcode"

geo_tbl = new_df.merge(geo_df, left_on="Postcode", right_on="Postal Code")
geo_tbl.head()

In [None]:
geo_toronto = geo_tbl.iloc[:,[3,1,2,4,5]] # exclude column with index = 0 ("Postcode") and re-arrange column names
geo_toronto.head()

In [None]:
geo_toronto.shape

# Part 3

In [None]:
# use geopy to obtain latitude/longitude of Toronto.
# coordinates will be used for map visualization

address = 'Toronto, Ontario'

geolocator = geopy.Nominatim(user_agent="ny_explorer", timeout=30)
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto {}, {}.'.format(latitude, longitude))

In [None]:
# import map rendering library

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import json # library to handle JSON files

In [None]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map based on coordinates of postcode
for lat, lng, neighbourhood in zip(geo_toronto['Latitude'], geo_toronto['Longitude'], geo_toronto['Neighbourhood']):
    label = '{}'.format(neighbourhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [None]:
# Define Foursquare Credentials and Version

CLIENT_ID = 'BTXEHYDBF0CI5H3NRC3OFYWQ3RZ341ANCJ31XKQ4YH2EQJ3Q' # your Foursquare ID
CLIENT_SECRET = 'IO52MQU0B4FSOO0IARL2QCWOO3W0WVDRBZN5B2JANN2ZAVSB' # your Foursquare Secret
VERSION = '20161225' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

In [None]:
LIMIT = 5
radius = 500

location_list = [] # initiate a list to store data from Foursquare API requests

for neighbourhood, latitude, longitude in zip(geo_toronto.Neighbourhood, geo_toronto.Latitude, geo_toronto.Longitude):
    
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    latitude, 
    longitude, 
    radius, 
    LIMIT)
    
    data = requests.get(url).json()
    
    # use len() to check if any data within "items" (len = 0 if nothing)
    length = len(data['response']['groups'][0]['items'])
    if length == 0:
        continue # skip the row if nothing is found
    else:
        venue = data['response']['groups'][0]['items'][0]['venue']
        
        # extract info within 'venue'
        name = venue['name']
        lat = venue['location']['lat']
        lon = venue['location']['lng']
        cat = venue['categories'][0]['name']
        
        location_list.append([(neighbourhood, latitude, longitude, name, lat, lon, cat)])

In [None]:
# create data frame based on 'Location_list'

temp = pd.DataFrame(x for row in location_list for x in row)
temp.columns = ['Neighbourhood','N_Latitude','N_Longitude','Venue','V_Latitude','V_Longitude','category']
temp.head()

In [None]:
print("{} nearby locations downloaded for {} neighbourhood.".format(len(temp.Venue), len(geo_toronto.Neighbourhood)))

In [None]:
cat = pd.get_dummies(temp.category) # one hot encoding with get_dummies()
df_01 = pd.concat([temp[['Neighbourhood']], cat], axis=1) # combine neighbourhood & category tables
df_01.head()

In [None]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

In [None]:
df_02 = df_01.drop('Neighbourhood', axis=1)

n_group = 6 # we will group neighbourhoods into 6 clusters

# run k-means clustering
kmeans = KMeans(n_clusters=n_group, random_state=0).fit(df_02)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

In [None]:
# add clustering labels
df_01.insert(1, 'label', kmeans.labels_)

In [None]:
df_01.head() # 'label' generated from k-means included in data frame

In [None]:
# merge 'geo_toronto' & df_01

toronto_merged = pd.merge(geo_toronto, df_01, on='Neighbourhood', how='right')
toronto_merged.head()

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(n_group)
ys = [i + x + (i*x)**2 for i in range(n_group)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['label']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters