<a href="https://colab.research.google.com/github/rezzix/Capstone-Project/blob/master/Toronto_neighbourhood_segmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Segmenting and Clustering Neighborhoods in Toronto

### install useful modules

In [1]:
# beautiful soup for web scrapping
!pip install beautifulsoup4
# geocoder for geolocalisation
!pip install geocoder
# folium for map rendering
!pip install folium

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 2.2MB/s 
Collecting ratelim
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


### import useful libraries

In [2]:
import numpy as np
import pandas as pd
import requests
import re
import os
from bs4 import BeautifulSoup
import geocoder
from getpass import getpass
import folium

### start scrapping the wikipedia page for neighborhoods of Toronto

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(url, allow_redirects=True)
soup = BeautifulSoup(page.content, 'html.parser')

In [4]:
postalcodes_tab = soup.find('table',class_='wikitable')

neighb_df = pd.DataFrame(columns=['PostalCode','Borough','Neighborhood'])
i=0

for neighborhood_tr in postalcodes_tab.find_all('tr'):
  if (len(neighborhood_tr.find_all('td')) == 3) :
    neighb_row = [td.text.rstrip() for td in neighborhood_tr.find_all('td')]
    if (neighb_row[1] != 'Not assigned') :
      neighb_df.loc[i] = neighb_row
      i+=1

neighb_df.head(10)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [5]:
neighb_df.shape


(103, 3)

In [7]:
opencage_api_key = getpass("what is your opencage api key : ")
#print (geocoder.opencage('North York, Victoria Village, CA', key=openkage_api_key).latlng)
#print (geocoder.osm('North York, Victoria Village, CA').latlng)

what is your opencage api key : ··········


### add geolocalisation data to the frame

In [8]:
neighb_df['address'] = neighb_df['Neighborhood'] + ', ' + neighb_df['Borough']+', Toronto, Canada'
#
neighb_df['lat'] = neighb_df['PostalCode']
neighb_df['lng'] = neighb_df['PostalCode']

#neighb_df_tst = neighb_df.head(3)

#neighb_df_tst['coordinates']=neighb_df_tst['adress'].apply(geocoder.osm).apply(lambda x: x.latlng if x != None else None)

for index, row in neighb_df.iterrows():
  latlng = geocoder.opencage(row['address'], key=opencage_api_key).latlng
  #print (row['adress'], geocoder.opencage(repr(row['adress']), key=opencage_api_key).latlng)
  if (latlng is not None) :
    row['lat'], row['lng'] = latlng[0], latlng[1]

neighb_df
#neighb_df['lat'] = geocoder.osm(neighb_df['adress']).lat
#geocoder.osm('M3A, Parkwoods, North York, CA').latlng

Unnamed: 0,PostalCode,Borough,Neighborhood,address,lat,lng
0,M3A,North York,Parkwoods,"Parkwoods, North York, Toronto, Canada",43.7611,-79.3241
1,M4A,North York,Victoria Village,"Victoria Village, North York, Toronto, Canada",43.7327,-79.3112
2,M5A,Downtown Toronto,"Regent Park, Harbourfront","Regent Park, Harbourfront, Downtown Toronto, T...",43.7001,-79.4163
3,M6A,North York,"Lawrence Manor, Lawrence Heights","Lawrence Manor, Lawrence Heights, North York, ...",43.7001,-79.4163
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government","Queen's Park, Ontario Provincial Government, D...",43.7001,-79.4163
...,...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North","The Kingsway, Montgomery Road, Old Mill North,...",43.7001,-79.4163
99,M4Y,Downtown Toronto,Church and Wellesley,"Church and Wellesley, Downtown Toronto, Toront...",43.6615,-79.3829
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...","Business reply mail Processing Centre, South C...",45.7236,7.4575
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...","Old Mill South, King's Mill Park, Sunnylea, Hu...",43.7001,-79.4163


In [9]:
address = 'Toronto, CA'

toronto_latlng = geocoder.opencage(address, key=opencage_api_key).latlng

print('The geograpical coordinate of Toronto are {}, {}.'.format(toronto_latlng[0], toronto_latlng[1]))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [10]:
condition = neighb_df['Borough'].str.contains('Toronto')

neighb_toronto_df = neighb_df[condition]

In [11]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[toronto_latlng[0], toronto_latlng[1]], zoom_start=12)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighb_toronto_df['lat'], neighb_toronto_df['lng'], neighb_toronto_df['Borough'], neighb_toronto_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Clustering by categories of trending venues

In [16]:
# test the response of one call
CLIENT_ID = 'LETUVSLX3N1JYS23O4KJIJTAMSE2K1WYBCFTZZC52TJ5U5XC' # your Foursquare ID
CLIENT_SECRET = 'HEJX3D2KAC3UN5EOE1PHAONNFQVVRG4KECXPWFODDHXFW2UG' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, VERSION, 43.7327,	-79.3112, 500, 10)
# make the GET request
results = requests.get(url).json()

results

{'meta': {'code': 200, 'requestId': '5f0b9197b7e6df71744cd79f'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-550df684498ea2dd2c87bb5a-0',
      'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/thai_',
          'suffix': '.png'},
         'id': '4bf58dd8d48988d149941735',
         'name': 'Thai Restaurant',
         'pluralName': 'Thai Restaurants',
         'primary': True,
         'shortName': 'Thai'}],
       'id': '550df684498ea2dd2c87bb5a',
       'location': {'address': '1744  Victoria Park',
        'cc': 'CA',
        'city': 'North York',
        'country': 'Canada',
        'crossStreet': 'Surrey Ave',
        'distance': 482,
        'formattedAddress': ['1744  Victoria Park (Surrey Ave)',
         'North York ON M1R 1R4',
         'Canada'],
        'labeledLa

In [17]:


def getExploreVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(name, lat, lng, v['venue']['name'], v['venue']['location']['lat'], v['venue']['location']['lng'], v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
    
    return(nearby_venues)

toronto_venues = getExploreVenues(names=neighb_toronto_df['Neighborhood'], latitudes=neighb_toronto_df['lat'], longitudes=neighb_toronto_df['lng'] ) 

Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
The Danforth West, Riverdale
Toronto Dominion Centre, Design Exchange
Brockton, Parkdale Village, Exhibition Place
India Bazaar, The Beaches West
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West, Forest Hill Road Park
High Park, The Junction South
North Toronto West,  Lawrence Park
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
University of Toronto, Harbord
Runnymede, Swansea
Moore Park, Summerhill East
Kensington Market, Chinatown, Grange Park
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport


In [22]:
toronto_venues['Venue Category'].unique()

# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues['Venue Category'])

In [24]:
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']

In [26]:
toronto_onehot.head()

Unnamed: 0,African Restaurant,American Restaurant,Antique Shop,Art Museum,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Bakery,Bank,Bar,Beach,Beer Bar,Beer Store,Bookstore,Boutique,Brazilian Restaurant,Breakfast Spot,Bubble Tea Shop,Burger Joint,Burrito Place,Café,Caribbean Restaurant,Cheese Shop,Chinese Restaurant,Chocolate Shop,Clothing Store,Cocktail Bar,Coffee Shop,Comfort Food Restaurant,Comic Shop,Concert Hall,Convenience Store,Cosmetics Shop,Creperie,Deli / Bodega,Department Store,Dessert Shop,Diner,Dive Bar,Donut Shop,...,Museum,Music Venue,Nail Salon,Neighborhood,New American Restaurant,Park,Pet Store,Pharmacy,Pizza Place,Plaza,Poke Place,Pub,Ramen Restaurant,Restaurant,Salon / Barbershop,Sandwich Place,Seafood Restaurant,Shoe Store,Shopping Mall,Skating Rink,Smoke Shop,Speakeasy,Stationery Store,Steakhouse,Supermarket,Sushi Restaurant,Taco Place,Tailor Shop,Tanning Salon,Tattoo Parlor,Tea Room,Thai Restaurant,Theater,Thrift / Vintage Store,Tibetan Restaurant,Toy / Game Store,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [28]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean()