# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
# Libraries
from bs4 import BeautifulSoup
import requests

In [2]:
source = requests.get('https://en.m.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

In [3]:
soup = BeautifulSoup(source,'lxml')

# To get column headers
headers = soup.find('tr').text.split('\n')
headers.remove('')
headers

# To get all the rows
rows = []
for tr in soup.find_all('tr')[1:]:
    try:
        tds = tr.find_all('td')
        x = tds[0].text
        y = tds[1].text
        z = tds[2].text.strip('\n')
        rows.append([x,y,z])
        
    except IndexError:
        continue

In [4]:
import pandas as pd
df = pd.DataFrame(rows, columns =headers)

In [5]:
# To drop the Not assigned Boroughs
df = df[df.Borough != 'Not assigned']

# Reset the index
df.reset_index(inplace = True)
del df['index']

# Groupby Postal Codes
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [6]:
 for i,j in zip(df['Borough'],df['Neighbourhood']):
        if j == 'Not assigned':
            j = i

In [7]:
df_gr = df.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(', '.join)

In [8]:
df_gr = pd.DataFrame(df_gr).reset_index()

In [9]:
df_gr.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [10]:
df_gr.shape

(103, 3)

In [11]:
df_coordinates = pd.read_csv('Geospatial_Coordinates.csv')
df_coordinates = df_coordinates.rename(columns={"Postal Code": "Postcode"})
df_coordinates.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
df_final = pd.merge(df_gr, df_coordinates, on=['Postcode'], how='inner')
df_final.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [13]:
from geopy.geocoders import Nominatim
address = 'Toronto, Canada'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of the City of Toronto are {}, {}.'.format(latitude, longitude))

  after removing the cwd from sys.path.


The geograpical coordinate of the City of Toronto are 43.653963, -79.387207.


In [14]:
import folium

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_final['Latitude'], df_final['Longitude'], df_final['Borough'], df_final['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

# Clustering Toronto Neighbourhoods

In [15]:
CLIENT_ID = 'VYCTCPRYMD2COQAQO5J113QD5GNHHT2TI1XIZODTJ2ZD2EJN'
CLIENT_SECRET = 'VXBFNPBYMLAYKXG2YPT5RLX1FA5JJSLRZTLWOHNFU415TXZT'
VERSION = '20191108'

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: VYCTCPRYMD2COQAQO5J113QD5GNHHT2TI1XIZODTJ2ZD2EJN
CLIENT_SECRET:VXBFNPBYMLAYKXG2YPT5RLX1FA5JJSLRZTLWOHNFU415TXZT


In [18]:
# Considering only Toronto Neighborhoods
df_toronto = df_final[df_final['Borough'].str.contains('Toronto')]

df_toronto = df_toronto.reset_index(drop=True)
df_toronto

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


In [21]:
#Toronto neighborhoods on map
toronto_bor = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
        parse_html=False).add_to(toronto_bor)  
    
toronto_bor


## Exploring the Neighbourhood near Lawrence Park

In [29]:
df_toronto.loc[4, 'Neighbourhood']

'Lawrence Park'

In [31]:
neighbourhood_latitude = df_toronto.loc[4, 'Latitude']
neighbourhood_longitude = df_toronto.loc[4, 'Longitude']

neighbourhood_name = df_toronto.loc[4, 'Neighbourhood']

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

Latitude and longitude values of Lawrence Park are 43.7280205, -79.3887901.


In [32]:
LIMIT = 100
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=VYCTCPRYMD2COQAQO5J113QD5GNHHT2TI1XIZODTJ2ZD2EJN&client_secret=VXBFNPBYMLAYKXG2YPT5RLX1FA5JJSLRZTLWOHNFU415TXZT&v=20191108&ll=43.7280205,-79.3887901&radius=500&limit=100'

In [33]:
results = requests.get(url).json()
results


{'meta': {'code': 200, 'requestId': '5dc746f0cad1b60039964348'},
  'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 3,
  'suggestedBounds': {'ne': {'lat': 43.7325205045, 'lng': -79.3825744605273},
   'sw': {'lat': 43.7235204955, 'lng': -79.3950057394727}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '50e6da19e4b0d8a78a0e9794',
       'name': 'Lawrence Park Ravine',
       'location': {'address': '3055 Yonge Street',
        'crossStreet': 'Lawrence Avenue East',
        'lat': 43.72696303913755,
        'lng': -79.39438246708775,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.72696303913755,
          'lng': -79.39438246708775}],
        'distance': 465,
        'cc': 'CA',
  

In [34]:
# function to get the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [38]:
import json
from pandas.io.json import json_normalize

venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Lawrence Park Ravine,Park,43.726963,-79.394382
1,Zodiac Swim School,Swim School,43.728532,-79.38286
2,TTC Bus #162 - Lawrence-Donway,Bus Line,43.728026,-79.382805
