##  Toronto Neighborhoods Segmenting and Clustering  

#### *Author: Mohammad Sayeb*

#### Let's import the relevant modules

In [1]:
import numpy as np #for dealing with multidimensional arrays and matrices
import pandas as pd #pandas data frame from efficient dataframe manipulation
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json # library to handle JSON files
import geocoder # convert an address into latitude and longitude values
!pip3 install geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests #requesting information from webpages 
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

import plotly #visualization tool

# Matplotlib and associated plotting modules
import matplotlib 
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

#tools for scrapping website for data
import bs4 #beautiful soup library for website scraping 
from bs4 import BeautifulSoup #scraping tool
import lxml #needed to convert html bs4 object to data frame


Defaulting to user installation because normal site-packages is not writeable


## Section 1

We scrape table data from the wiki page and assign it to a DataFrame table

In [2]:
URL='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

In [3]:
table = soup.find_all('table')

In [4]:
df = pd.read_html(str(table))[0]

In [5]:
df.shape

(180, 3)

Ignore the rows that don't have an assigned borough

In [6]:
df = df[df['Borough']!='Not assigned']
print (df.shape)
df.head()

(103, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Now we would like to put the neighborhoods that belong to the same postal code in the same row separated by commas. The fist step here is to see if there are any duplicated values for the Postal Codes.

In [7]:
duplicate_boolean = df.duplicated(subset=None, keep='first')
duplicate_boolean[duplicate_boolean ==True]

Series([], dtype: bool)

We see that there are no duplicated postal code values we don't need to worry about combining the Neighbourhoods that belong to the same Postal Code into one row separated by commas

If a cell has a borough but a Not assigned neighbourhood, then the neighborhood will be the same as the borough

In [8]:
df[df['Neighbourhood']=='Not assigned']

Unnamed: 0,Postal Code,Borough,Neighbourhood


There are no rows with a Not assinged neighbourhood

In [9]:
print ('the data frame has {} rows and {} columns'.format(df.shape[0],df.shape[1]))
df.shape

the data frame has 103 rows and 3 columns


(103, 3)

## Section 2

Now let's try to get the lattitude and Longitude for each neightbourhood

In [10]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


We use geocoder to get the longitude and latitude for each postal code. 

In [11]:
latitude=[]
longitude=[]
for postal_code in df['Postal Code']:

    # initialize your variable to None
    lat_lng_coords = None
#     print ('{}, Toronto, Ontario'.format(postal_code))
    # loop until you get the coordinates
    while(lat_lng_coords is None):
      g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
      lat_lng_coords = g.latlng

    latitude.append( lat_lng_coords[0])
    longitude.append( lat_lng_coords[1])

In [12]:
df['latitude'] = latitude
df['longitude'] = longitude
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,latitude,longitude
2,M3A,North York,Parkwoods,43.75245,-79.32991
3,M4A,North York,Victoria Village,43.73057,-79.31306
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188


In [13]:
df.reset_index(inplace=True)
df.drop(columns='index', inplace=True)

In [14]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,latitude,longitude
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188


## Section 3

get latitude and longitude of Toronto

In [15]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto  are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto  are 43.6534817, -79.3839347.


In [16]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(tiles = 'StamenTerrain', location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['latitude'], df['longitude'], df['Borough'], df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)
    
map_toronto

However, for illustration purposes, let's simplify the above map and segment and cluster only the neighborhoods in Toronto. So let's slice the original dataframe and create a new dataframe of the Toronto data.

In [17]:
df_toronto = df[df['Borough'].str.contains('oronto')]

In [18]:
df_toronto.reset_index(drop = True, inplace=True)
df_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,latitude,longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.65739,-79.37804
3,M5C,Downtown Toronto,St. James Town,43.65215,-79.37587
4,M4E,East Toronto,The Beaches,43.67709,-79.29547


Here we are visualizing the new neightbourhoods

In [19]:
# create map of Manhattan using latitude and longitude values
map_toronto = folium.Map(tiles = 'StamenTerrain',location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df_toronto['latitude'], df_toronto['longitude'], df_toronto['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Next, we are going to utilize the Foursquare API to explore the neighborhoods and segment them.

In [20]:
CLIENT_ID = 'ITPTFVZXK1ZNFNYDXQUIXOIBX4UTD0Q5R55AMROGVLCWFMZ5' # your Foursquare ID
CLIENT_SECRET = 'OXXKWPXMHESTLBEVMWK2VTRS0CKWNES0ZKEBMJWYTE0CTBCW' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: ITPTFVZXK1ZNFNYDXQUIXOIBX4UTD0Q5R55AMROGVLCWFMZ5
CLIENT_SECRET:OXXKWPXMHESTLBEVMWK2VTRS0CKWNES0ZKEBMJWYTE0CTBCW


#### Let's explore the first neighbourhood in our data frame

In [21]:
neighborhood_latitude = df_toronto.loc[0, 'latitude'] # neighborhood latitude value
neighborhood_longitude = df_toronto.loc[0, 'longitude'] # neighborhood longitude value
neighborhood_name = df_toronto.loc[0, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Regent Park, Harbourfront are 43.65512000000007, -79.36263999999994.


#### Let's look at the top 100 venues around Regend Park, Harbourfront neighbourhood within  a radius of 500 meters

In [24]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(\
CLIENT_ID,
CLIENT_SECRET,
VERSION,
neighborhood_latitude,
neighborhood_longitude,
radius,
LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=ITPTFVZXK1ZNFNYDXQUIXOIBX4UTD0Q5R55AMROGVLCWFMZ5&client_secret=OXXKWPXMHESTLBEVMWK2VTRS0CKWNES0ZKEBMJWYTE0CTBCW&v=20180605&ll=43.65512000000007,-79.36263999999994&radius=500&limit=100'

In [25]:
results = requests.get(url).json()  #gives result as a dictionary

#### Let's see what the result object is made of

In [27]:
print (results.keys())
print ()
print ('keys for meta are:',results['meta'].keys())
print ()
print ('keys for response are:',results['response'].keys())
print ()
print ("result['response']['groups'] are ", results['response']['groups'][0].keys())

dict_keys(['meta', 'response'])

keys for meta are: dict_keys(['code', 'requestId'])

keys for response are: dict_keys(['suggestedFilters', 'headerLocation', 'headerFullLocation', 'headerLocationGranularity', 'totalResults', 'suggestedBounds', 'groups'])

result['response']['groups'] are  dict_keys(['type', 'name', 'items'])


#### Let's define a function that gets the catagory of a venue. We will use this funciton clean up and convert our data into a dataframe format 

In [43]:
# function that extracts the category of the venue
def get_category_type(row):
    #try:
        #categories_list = row['categories'] # we don't really need this line since we only have the venue.categories as the column title
    #except:
    categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [42]:
nearby_venues.columns


Index(['venue.name', 'venue.categories', 'venue.location.lat',
       'venue.location.lng'],
      dtype='object')

In [29]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # tranform JSON file into a pandas dataframe

  This is separate from the ipykernel package so we can avoid doing imports until


In [32]:
nearby_venues.head()

Unnamed: 0,referralId,reasons.count,reasons.items,venue.id,venue.name,venue.location.address,venue.location.crossStreet,venue.location.lat,venue.location.lng,venue.location.labeledLatLngs,venue.location.distance,venue.location.postalCode,venue.location.cc,venue.location.city,venue.location.state,venue.location.country,venue.location.formattedAddress,venue.categories,venue.photos.count,venue.photos.groups,venue.venuePage.id,venue.location.neighborhood
0,e-0-54ea41ad498e9a11e9e13308-0,0,"[{'summary': 'This spot is popular', 'type': '...",54ea41ad498e9a11e9e13308,Roselle Desserts,362 King St E,Trinity St,43.653447,-79.362017,"[{'label': 'display', 'lat': 43.65344672305267...",192,M5A 1K9,CA,Toronto,ON,Canada,"[362 King St E (Trinity St), Toronto ON M5A 1K...","[{'id': '4bf58dd8d48988d16a941735', 'name': 'B...",0,[],,
1,e-0-53b8466a498e83df908c3f21-1,0,"[{'summary': 'This spot is popular', 'type': '...",53b8466a498e83df908c3f21,Tandem Coffee,368 King St E,at Trinity St,43.653559,-79.361809,"[{'label': 'display', 'lat': 43.65355870959944...",186,,CA,Toronto,ON,Canada,"[368 King St E (at Trinity St), Toronto ON, Ca...","[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",0,[],,
2,e-0-4af59046f964a520e0f921e3-2,0,"[{'summary': 'This spot is popular', 'type': '...",4af59046f964a520e0f921e3,Figs Breakfast & Lunch,344 Queen St. E.,at Parliament St.,43.655675,-79.364503,"[{'label': 'display', 'lat': 43.65567455427388...",162,M5A 1S8,CA,Toronto,ON,Canada,"[344 Queen St. E. (at Parliament St.), Toronto...","[{'id': '4bf58dd8d48988d143941735', 'name': 'B...",0,[],,
3,e-0-4b58dd55f964a5208f6f28e3-3,0,"[{'summary': 'This spot is popular', 'type': '...",4b58dd55f964a5208f6f28e3,The Yoga Lounge,106 Sherbourne St.,at Adelaide St. East,43.655515,-79.364955,"[{'label': 'display', 'lat': 43.65551522261721...",191,,CA,Toronto,ON,Canada,"[106 Sherbourne St. (at Adelaide St. East), To...","[{'id': '4bf58dd8d48988d102941735', 'name': 'Y...",0,[],,
4,e-0-50760559e4b0e8c7babe2497-4,0,"[{'summary': 'This spot is popular', 'type': '...",50760559e4b0e8c7babe2497,Body Blitz Spa East,497 King Street East,btwn Sackville St and Sumach St,43.654735,-79.359874,"[{'label': 'display', 'lat': 43.65473505045365...",226,M5A 1L9,CA,Toronto,ON,Canada,[497 King Street East (btwn Sackville St and S...,"[{'id': '4bf58dd8d48988d1ed941735', 'name': 'S...",0,[],,


#### We are interested in the venue names, categories, and location. Let's filter them out.

In [33]:
interested_columns = ['venue.name','venue.categories','venue.location.lat','venue.location.lng']
nearby_venues = nearby_venues.loc[:,interested_columns]
nearby_venues.head()

Unnamed: 0,venue.name,venue.categories,venue.location.lat,venue.location.lng
0,Roselle Desserts,"[{'id': '4bf58dd8d48988d16a941735', 'name': 'B...",43.653447,-79.362017
1,Tandem Coffee,"[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",43.653559,-79.361809
2,Figs Breakfast & Lunch,"[{'id': '4bf58dd8d48988d143941735', 'name': 'B...",43.655675,-79.364503
3,The Yoga Lounge,"[{'id': '4bf58dd8d48988d102941735', 'name': 'Y...",43.655515,-79.364955
4,Body Blitz Spa East,"[{'id': '4bf58dd8d48988d1ed941735', 'name': 'S...",43.654735,-79.359874


In [49]:
nearby_venues['venue.categories'][0]

[{'id': '4bf58dd8d48988d16a941735',
  'name': 'Bakery',
  'pluralName': 'Bakeries',
  'shortName': 'Bakery',
  'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/bakery_',
   'suffix': '.png'},
  'primary': True}]

#### We see that categories is consisted of dictionary with multiple keys. However, we are only interested in the name of the category for now. Therefore, let's apply **get_category_type** to each row of our dataframe in order to get only the name of the venue instead of other extra information

In [50]:
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

#### though this step is not absolutely necessary but let's make the column names more readable

In [60]:
nearby_venues.columns

Index(['name', 'categories', 'lat', 'lng'], dtype='object')

In [61]:
nearby_venues.columns = [column.split('.')[-1] for column in nearby_venues.columns]
nearby_venues.columns

Index(['name', 'categories', 'lat', 'lng'], dtype='object')

In [62]:
nearby_venues.head(5)

Unnamed: 0,name,categories,lat,lng
0,Roselle Desserts,Bakery,43.653447,-79.362017
1,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,Figs Breakfast & Lunch,Breakfast Spot,43.655675,-79.364503
3,The Yoga Lounge,Yoga Studio,43.655515,-79.364955
4,Body Blitz Spa East,Spa,43.654735,-79.359874


In [63]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

21 venues were returned by Foursquare.


### Now let's explore the neighbourhoods in Toronto