In [2]:
#get required libraries

import os
from io import StringIO

import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from IPython.display import display_html
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes
!pip install folium
import folium

print('Libraries import done')

Libraries import done


In [3]:
#Scrape the Wikipedia page that lists the Canadian postal codes starting with "M"
page_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Fail fast on network/HTTP problems instead of silently parsing an error page
response = requests.get(page_url, timeout=30)
response.raise_for_status()
source = response.text

# NOTE(review): the 'xml' parser is kept to preserve the existing behaviour;
# consider 'lxml' or 'html.parser', which are intended for HTML documents.
soup=BeautifulSoup(source,'xml')

# soup.table is the first <table> in the document, or None when there is none;
# str(None) would otherwise produce the useless string 'None' for the next cell.
if soup.table is None:
    raise ValueError("No <table> element found at {}".format(page_url))

# Serialise the first table back to an HTML string for pandas.read_html
tab = str(soup.table)

In [5]:
#Convert the scraped HTML table to a pandas DataFrame to enable cleaning and processing of data

# pandas >= 2.1 deprecates passing a literal HTML string to read_html;
# wrapping the markup in StringIO works on old and new versions alike.
dfs = pd.read_html(StringIO(tab))
# read_html returns a list with one DataFrame per table found; keep the first
df=dfs[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [6]:
#Keep only the rows whose borough has been assigned
df1 = df.loc[df['Borough'].ne('Not assigned')]

#Collapse rows sharing a postal code/borough, joining their neighbourhoods with ', '
df2 = (
    df1.groupby(['Postal Code','Borough'], sort=False)
       .agg(', '.join)
       .reset_index()
)

#Where the neighbourhood is still 'Not assigned', fall back to the borough name
df2['Neighbourhood'] = df2['Neighbourhood'].mask(
    df2['Neighbourhood'] == 'Not assigned',
    df2['Borough'],
)

df2.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [7]:
# Display the (rows, columns) shape of the cleaned postal-code dataframe
df2.shape

(103, 3)

In [8]:
#Load the latitude and longitude of each postal code from the provided CSV
la_lo = pd.read_csv('https://cocl.us/Geospatial_data')

#Attach the coordinates to each neighbourhood by joining the two dataframes on postal code
df_canada_geo = df2.merge(la_lo, on='Postal Code')
df_canada_geo.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [10]:
# Number of postal-code rows per borough
df_canada_geo.groupby('Borough')['Neighbourhood'].count()

Borough
Central Toronto      9
Downtown Toronto    19
East Toronto         5
East York            5
Etobicoke           12
Mississauga          1
North York          24
Scarborough         17
West Toronto         6
York                 5
Name: Neighbourhood, dtype: int64

In [11]:
#Keep only the rows whose borough name contains the literal text 'Toronto'
df_toronto_geo = df_canada_geo.loc[
    df_canada_geo['Borough'].str.contains('Toronto', regex=False)
]
# Rows remaining per borough after the filter
df_toronto_geo.groupby('Borough')['Neighbourhood'].count()

Borough
Central Toronto      9
Downtown Toronto    19
East Toronto         5
West Toronto         6
Name: Neighbourhood, dtype: int64

In [65]:
# Look up the latitude and longitude of Toronto (used to centre the maps)
geolocator = Nominatim (user_agent="ny_explorer")
location = geolocator.geocode('Toronto')

# geocode() returns None when the query has no match; stop with a clear
# message rather than an AttributeError on the attribute access below.
if location is None:
    raise ValueError("Nominatim returned no result for 'Toronto'")

lat_toronto = location.latitude
lng_toronto = location.longitude
print (lat_toronto)
print (lng_toronto)

43.6534817
-79.3839347


In [13]:
#Plot every Toronto neighbourhood as a circle marker on a map centred on the city
map_toronto = folium.Map(location=[lat_toronto,lng_toronto],zoom_start=10)

marker_fields = df_toronto_geo[['Latitude', 'Longitude', 'Borough', 'Neighbourhood']]
for lat, lng, borough, neighbourhood in marker_fields.itertuples(index=False, name=None):
    # Popup text reads "<neighbourhood>, <borough>"
    popup = folium.Popup(f'{neighbourhood}, {borough}', parse_html=True)
    marker = folium.CircleMarker(
        [lat,lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)
map_toronto

In [14]:
#Foursquare API settings
# Credentials are read from the environment so that secrets are never stored in
# the notebook. Keys that were previously committed here should be revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # Client ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # Client Secret
if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET '
          'before calling the Foursquare API')
VERSION = '20200804'  # sent as the `v` query parameter of each request
LIMIT = 150  # sent as the `limit` query parameter of each request

In [32]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the Foursquare venues found around each named location.

    For every (name, latitude, longitude) triple one request is sent to the
    Foursquare ``venues/explore`` endpoint, using the module-level
    ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and ``LIMIT`` settings.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences; they are consumed together with ``zip``.
    radius : int, optional
        Value sent as the ``radius`` query parameter (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        is empty (but still has these columns) when nothing was returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    columns = ['Neighbourhood',
               'Neighbourhood Latitude',
               'Neighbourhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; surface HTTP errors instead of a KeyError later
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            # a venue may come back without any category; record None then
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None,
            ))

    # Passing columns= keeps the schema even when no venue was collected
    return pd.DataFrame(rows, columns=columns)

In [33]:
# Fetch the venues around every Toronto neighbourhood (one API request per row)
toronto_venues = getNearbyVenues(
    df_toronto_geo['Neighbourhood'],
    df_toronto_geo['Latitude'],
    df_toronto_geo['Longitude'],
)

Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
The Danforth West, Riverdale
Toronto Dominion Centre, Design Exchange
Brockton, Parkdale Village, Exhibition Place
India Bazaar, The Beaches West
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West, Forest Hill Road Park
High Park, The Junction South
North Toronto West, Lawrence Park
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
University of Toronto, Harbord
Runnymede, Swansea
Moore Park, Summerhill East
Kensington Market, Chinatown, Grange Park
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
R

In [35]:
# Count the non-null values in each column per neighbourhood, i.e. how many venue rows each one has
toronto_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,57,57,57,57,57,57
"Brockton, Parkdale Village, Exhibition Place",23,23,23,23,23,23
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",16,16,16,16,16,16
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",16,16,16,16,16,16
Central Bay Street,59,59,59,59,59,59
Christie,16,16,16,16,16,16
Church and Wellesley,78,78,78,78,78,78
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
Davisville,35,35,35,35,35,35
Davisville North,9,9,9,9,9,9


In [37]:
# Check that at least one venue is categorised as an Indian restaurant
'Indian Restaurant' in set(toronto_venues['Venue Category'])

True

In [38]:
# One-hot encode the venue categories: one indicator column per category
to_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# put the neighbourhood name in front of the indicator columns
to_onehot.insert(0, 'Neighbourhoods', toronto_venues['Neighbourhood'])

print(to_onehot.shape)
to_onehot.head()

(1599, 236)


Unnamed: 0,Neighbourhoods,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
# Average the indicators per neighbourhood: each value becomes the share of
# that neighbourhood's venues falling in the category
to_grouped = to_onehot.groupby("Neighbourhoods", as_index=False).mean()

print(to_grouped.shape)
to_grouped

(39, 236)


Unnamed: 0,Neighbourhoods,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017544,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0625,0.0625,0.0625,0.125,0.125,0.125,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.016949,0.0,0.0,0.016949,0.0,0.016949
5,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.012821,0.0,0.0,0.0,0.0,0.0,0.0,0.012821,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025641
7,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0
8,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
# Keep only the neighbourhood name and its Indian-restaurant share
to_indian = to_grouped.loc[:, ["Neighbourhoods", "Indian Restaurant"]]
to_indian.head(9)

Unnamed: 0,Neighbourhoods,Indian Restaurant
0,Berczy Park,0.017544
1,"Brockton, Parkdale Village, Exhibition Place",0.0
2,"Business reply mail Processing Centre, South C...",0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0
4,Central Bay Street,0.016949
5,Christie,0.0
6,Church and Wellesley,0.012821
7,"Commerce Court, Victoria Hotel",0.0
8,Davisville,0.028571


In [46]:
#Use k-means to cluster the neighbourhoods by their share of Indian restaurants
k=3
# drop(labels, 1) relied on a positional `axis` argument that pandas 2.0
# removed; columns= is accepted by both old and new versions.
toronto_cluster = to_indian.drop(columns=["Neighbourhoods"])
# n_init is pinned to 10 (the long-standing default) because newer
# scikit-learn releases changed the default, which can alter the labels.
kmeans1 = KMeans(n_clusters=k, random_state=1, n_init=10).fit(toronto_cluster)
# one cluster label per row of to_indian, in the same order
kmeans1.labels_

array([0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 2, 1, 0, 1, 1], dtype=int32)

In [60]:
# Work on a renamed copy so the key column matches toronto_venues
to_merged = to_indian.rename(columns={"Neighbourhoods": "Neighbourhood"})

# add clustering labels
to_merged = to_merged.assign(**{"Cluster Labels": kmeans1.labels_})
to_merged.head(5)

Unnamed: 0,Cluster Labels,Neighbourhood,Indian Restaurant
0,0,Berczy Park,0.017544
1,1,"Brockton, Parkdale Village, Exhibition Place",0.0
2,1,"Business reply mail Processing Centre, South C...",0.0
3,1,"CN Tower, King and Spadina, Railway Lands, Har...",0.0
4,0,Central Bay Street,0.016949


In [61]:
# Attach every venue row to its neighbourhood's cluster label and restaurant share
venues_by_neighbourhood = toronto_venues.set_index("Neighbourhood")
to_merged = to_merged.join(venues_by_neighbourhood, on="Neighbourhood")

print(to_merged.shape)

to_merged.head()

(1599, 9)


Unnamed: 0,Cluster Labels,Neighbourhood,Indian Restaurant,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,0,Berczy Park,0.017544,43.644771,-79.373306,The Keg Steakhouse + Bar - Esplanade,43.646712,-79.374768,Restaurant
0,0,Berczy Park,0.017544,43.644771,-79.373306,LCBO,43.642944,-79.37244,Liquor Store
0,0,Berczy Park,0.017544,43.644771,-79.373306,Fresh On Front,43.647815,-79.374453,Vegetarian / Vegan Restaurant
0,0,Berczy Park,0.017544,43.644771,-79.373306,Goose Island Brewhouse,43.647329,-79.373541,Beer Bar
0,0,Berczy Park,0.017544,43.644771,-79.373306,Hockey Hall Of Fame (Hockey Hall of Fame),43.646974,-79.377323,Museum


In [63]:
# Order the rows by cluster label
to_merged = to_merged.sort_values("Cluster Labels")
to_merged.head()

Unnamed: 0,Cluster Labels,Neighbourhood,Indian Restaurant,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,0,Berczy Park,0.017544,43.644771,-79.373306,The Keg Steakhouse + Bar - Esplanade,43.646712,-79.374768,Restaurant
4,0,Central Bay Street,0.016949,43.657952,-79.387383,Marshalls,43.659308,-79.382462,Department Store
4,0,Central Bay Street,0.016949,43.657952,-79.387383,The Yoga Sanctuary,43.661499,-79.383636,Yoga Studio
4,0,Central Bay Street,0.016949,43.657952,-79.387383,The Library Specialty Coffee,43.654413,-79.390902,Coffee Shop
4,0,Central Bay Street,0.016949,43.657952,-79.387383,Silver Snail Comics,43.657031,-79.381403,Comic Shop


In [69]:
map_clusters = folium.Map(location=[lat_toronto, lng_toronto],zoom_start=14)

# set color scheme for the clusters
markers_colors = {
    0: 'red',
    1: 'blue',
    2: 'green',
    3: 'yellow',
    4: 'cyan',
    5: 'black',
}

# to_merged holds one row per venue, so plot each neighbourhood only once
# instead of stacking one marker per venue at the same coordinates.
cluster_points = to_merged.drop_duplicates(subset='Neighbourhood')

for lat, lng, neighbourhood, cluster in zip(cluster_points['Neighbourhood Latitude'],
                                            cluster_points['Neighbourhood Longitude'],
                                            cluster_points['Neighbourhood'],
                                            cluster_points['Cluster Labels']):
    # CircleMarker (unlike Marker) honours radius/color/fill options, so the
    # cluster colour is actually visible on the map.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup('{} (cluster {})'.format(neighbourhood, cluster), parse_html=True),
        color=markers_colors[cluster],
        fill=True,
        fill_color=markers_colors[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [70]:
#Cluster 0: Indian restaurants in the neighbourhoods labelled 0
to_merged[
    to_merged['Cluster Labels'].eq(0)
    & to_merged['Venue Category'].eq('Indian Restaurant')
]

Unnamed: 0,Cluster Labels,Neighbourhood,Indian Restaurant,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
4,0,Central Bay Street,0.016949,43.657952,-79.387383,Colaba Junction,43.66094,-79.385635,Indian Restaurant
30,0,"St. James Town, Cabbagetown",0.021739,43.667967,-79.367675,Butter Chicken Factory,43.667072,-79.369184,Indian Restaurant
0,0,Berczy Park,0.017544,43.644771,-79.373306,Bindia Indian Bistro,43.648559,-79.371816,Indian Restaurant
6,0,Church and Wellesley,0.012821,43.66586,-79.38316,Kothur Indian Cuisine,43.667872,-79.385659,Indian Restaurant
36,0,"The Danforth West, Riverdale",0.023256,43.679557,-79.352188,Sher-E-Punjab,43.677308,-79.353066,Indian Restaurant
31,0,Stn A PO Boxes,0.010204,43.646435,-79.374846,Bindia Indian Bistro,43.648559,-79.371816,Indian Restaurant
14,0,"Harbourfront East, Union Station, Toronto Islands",0.01,43.640816,-79.381752,Indian Roti House,43.63906,-79.385422,Indian Restaurant
8,0,Davisville,0.028571,43.704324,-79.38879,Marigold Indian Bistro,43.702881,-79.388008,Indian Restaurant


In [85]:
#Cluster 1: Indian restaurants in the neighbourhoods labelled 1
to_merged[
    to_merged['Cluster Labels'].eq(1)
    & to_merged['Venue Category'].eq('Indian Restaurant')
]

Unnamed: 0,Cluster Labels,Neighbourhood,Indian Restaurant,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category


In [84]:
#Cluster 2: Indian restaurants in the neighbourhoods labelled 2
to_merged[
    to_merged['Cluster Labels'].eq(2)
    & to_merged['Venue Category'].eq('Indian Restaurant')
]

Unnamed: 0,Cluster Labels,Neighbourhood,Indian Restaurant,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
34,2,"The Annex, North Midtown, Yorkville",0.052632,43.67271,-79.405678,Roti Cuisine of India,43.674618,-79.408249,Indian Restaurant
