# CLUSTERING NEIGHBOORHOOD IN TORONTO

## Part 1

## Scraping Data From Wikipedia

In [87]:
#Importing Libraries
import pandas as pd
import numpy as np

In [88]:
# Loading data
tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

In [89]:
# Checking the type of output
type(tables)

list

In [90]:
# How many tables inside
len(tables)

3

In [91]:
# Let's check first
table[0]

Unnamed: 0,index,Postal Code,Borough,Neighbourhood
0,0,M1A,Not assigned,Not assigned
1,1,M2A,Not assigned,Not assigned
2,2,M3A,North York,Parkwoods
3,3,M4A,North York,Victoria Village
4,4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...,...
175,175,M5Z,Not assigned,Not assigned
176,176,M6Z,Not assigned,Not assigned
177,177,M7Z,Not assigned,Not assigned
178,178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [129]:
# Converting list into dataframe
df = pd.DataFrame(table[0])

In [130]:
# Checking head
df.head()

Unnamed: 0,index,Postal Code,Borough,Neighbourhood
0,0,M1A,Not assigned,Not assigned
1,1,M2A,Not assigned,Not assigned
2,2,M3A,North York,Parkwoods
3,3,M4A,North York,Victoria Village
4,4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [132]:
# Checking shape
df.shape

(180, 3)

In [133]:
# Null values check
df.isnull().sum()

Postal Code      0
Borough          0
Neighbourhood    0
dtype: int64

In [134]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Postal Code    180 non-null    object
 1   Borough        180 non-null    object
 2   Neighbourhood  180 non-null    object
dtypes: object(3)
memory usage: 4.3+ KB


In [135]:
# Check values of Postal Code
df['Postal Code'].value_counts()

M9K    1
M7S    1
M6Y    1
M3L    1
M1X    1
      ..
M4M    1
M5Y    1
M1V    1
M7H    1
M1H    1
Name: Postal Code, Length: 180, dtype: int64

In [136]:
# Check values of Neighbourhood
df['Neighbourhood'].value_counts()

Not assigned                                77
Downsview                                    4
Don Mills                                    2
Caledonia-Fairbanks                          1
Guildwood, Morningside, West Hill            1
                                            ..
Regent Park, Harbourfront                    1
Glencairn                                    1
Victoria Village                             1
Runnymede, The Junction North                1
North Park, Maple Leaf Park, Upwood Park     1
Name: Neighbourhood, Length: 100, dtype: int64

In [137]:
# Check values of Borough
df['Borough'].value_counts()

Not assigned        77
North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East York            5
East Toronto         5
York                 5
Mississauga          1
Name: Borough, dtype: int64

*There are 77 Not assigned Borough, those rows will be dropped*

In [138]:
# Dropping Borough Not assigned rows, assigning as df2
df2 = df[df['Borough']!='Not assigned']

In [139]:
df2.shape

(103, 3)

In [140]:
df2['Postal Code'].value_counts()

M5R    1
M1E    1
M3M    1
M9M    1
M4N    1
      ..
M5X    1
M4E    1
M5S    1
M5K    1
M1H    1
Name: Postal Code, Length: 103, dtype: int64

In [141]:
#Checking M5A Postal Code row, if there are multiple entries
df2[df2['Postal Code']=='M5A']

Unnamed: 0,Postal Code,Borough,Neighbourhood
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


*Each Postal Code used only once*

In [142]:
df2['Neighbourhood'].value_counts()

Downsview                                        4
Don Mills                                        2
The Kingsway, Montgomery Road, Old Mill North    1
Willowdale, Willowdale West                      1
Victoria Village                                 1
                                                ..
Upper Rouge                                      1
Stn A PO Boxes                                   1
Dufferin, Dovercourt Village                     1
York Mills, Silver Hills                         1
Islington Avenue, Humber Valley Village          1
Name: Neighbourhood, Length: 99, dtype: int64

In [143]:
# Checking for Not assigned Neighbourhood values
df2[df2['Neighbourhood']=='Not assigned']

Unnamed: 0,Postal Code,Borough,Neighbourhood


*New dataframe has not Not assigned Neighboorhood*

In [152]:
df2.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [169]:
# Resetting index
df2 = df2.reset_index()
df2.drop('index',axis=1,inplace=True)

In [171]:
df2.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [172]:
df2.shape

(103, 3)

*This is end of Part 1*

## Part 2

### Adding DataFrame Coordinates

In [79]:
# convert an address into latitude and longitude values 
from geopy.geocoders import Nominatim 

In [82]:
# Checking coordinates of Toronto
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


In [173]:
df2.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [178]:
df2['Postal Code'][102]

'M8Z'

In [258]:
# For loop for the coordinates
coor_dict={'Latitude':[],'Longitude':[]} # Creating an empty dictionary
geolocator = Nominatim(user_agent="ny_explorer") # Initializing geolocator

for i in range(len(df2)): # Range of row number in dataframe
    location = geolocator.geocode(df2['Borough'][i]) # Entering Borough name as an adress
    coor_dict['Latitude'].append(location.latitude) # Adding latitute to dictionary
    coor_dict['Longitude'].append(location.longitude) # Adding longitude to dictionary

In [259]:
# Cheking dictionary
coor_dict

{'Latitude': [43.7543263,
  43.7543263,
  43.6541737,
  43.7543263,
  43.6541737,
  43.6435559,
  54.2820009,
  43.7543263,
  43.699971000000005,
  43.6541737,
  43.7543263,
  43.6435559,
  54.2820009,
  43.7543263,
  43.699971000000005,
  43.6541737,
  53.9590555,
  43.6435559,
  54.2820009,
  43.72178945,
  43.6541737,
  53.9590555,
  54.2820009,
  43.699971000000005,
  43.6541737,
  43.6541737,
  54.2820009,
  43.7543263,
  43.7543263,
  43.699971000000005,
  43.6541737,
  43.6534817,
  54.2820009,
  43.7543263,
  43.7543263,
  43.699971000000005,
  43.6541737,
  43.6534817,
  54.2820009,
  43.7543263,
  43.7543263,
  43.72178945,
  43.6541737,
  43.6534817,
  54.2820009,
  43.7543263,
  43.7543263,
  43.72178945,
  43.6541737,
  43.7543263,
  43.7543263,
  54.2820009,
  43.7543263,
  43.7543263,
  43.72178945,
  43.7543263,
  53.9590555,
  43.7543263,
  54.2820009,
  43.7543263,
  43.7543263,
  43.6534817,
  43.6534817,
  53.9590555,
  53.9590555,
  54.2820009,
  43.7543263,
  43.6

In [254]:
# From dictionary to dataframe
pd.DataFrame(coor_dict).head()

Unnamed: 0,Latitude,Longitude
0,43.754326,-79.449117
1,43.754326,-79.449117
2,43.654174,-79.380812
3,43.754326,-79.449117
4,43.654174,-79.380812


In [255]:
# Adding dictionary to dataframe as new columns, assigning as df3
df3 = pd.concat([df2,pd.DataFrame(coor_dict)], axis=1)

In [261]:
df3.shape

(103, 5)

In [262]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Postal Code    103 non-null    object 
 1   Borough        103 non-null    object 
 2   Neighbourhood  103 non-null    object 
 3   Latitude       103 non-null    float64
 4   Longitude      103 non-null    float64
dtypes: float64(2), object(3)
memory usage: 4.1+ KB


In [257]:
# New dataframe
df3.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.754326,-79.449117
1,M4A,North York,Victoria Village,43.754326,-79.449117
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654174,-79.380812
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.754326,-79.449117
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.654174,-79.380812


*This is end of Part 2*

## Part 3

### Geopy library to get coordinates of Toronto

In [273]:
# Getting coordinates of Toronto
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


### Map of Toronto with Neighbourhoods markers

In [270]:
# Importing folium for map
import folium

In [277]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df3['Latitude'], df3['Longitude'], df3['Borough'], df3['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

### Filtering boroughs containing word Toronto

In [285]:
# filter borough names that contain the word Toronto
borough_names = df3['Borough'].unique().tolist()

borough_toronto = []

for i in borough_names:
    if "toronto" in i.lower():
        borough_toronto.append(i)
        
borough_toronto

['Downtown Toronto', 'East Toronto', 'West Toronto', 'Central Toronto']

In [289]:
# create a new DataFrame with only boroughs that contain the word Toronto
df4 = df3[df3['Borough'].isin(borough_toronto)]#.reset_index(drop=True)
print(df4.shape)
df4.head()

(39, 5)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654174,-79.380812
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.654174,-79.380812
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.654174,-79.380812
15,M5C,Downtown Toronto,St. James Town,43.654174,-79.380812
19,M4E,East Toronto,The Beaches,43.721789,-79.374027


In [291]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df4['Latitude'], df4['Longitude'], df4['Borough'], df4['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

**Explore Neighborhoods in Toronto Using Foursquare API**

In [292]:
# define Foursquare Credentials and Version
CLIENT_ID = 'PMAZFKEO2ZSMXR1QU3VJOQLPVAG1QQG0OZ0MR3RYJPRUBQCS' # your Foursquare ID
CLIENT_SECRET = 'CXU2UCKAXEUC2JJX1PBBQYGWWFKAR5NCVC30QDTQ1VIGTQSC' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: PMAZFKEO2ZSMXR1QU3VJOQLPVAG1QQG0OZ0MR3RYJPRUBQCS
CLIENT_SECRET:CXU2UCKAXEUC2JJX1PBBQYGWWFKAR5NCVC30QDTQ1VIGTQSC


**Getting the top 100 venues that are in Toronto within a radius of 500 meters.**

In [295]:
# Importing requests library
import requests

In [296]:
radius = 500
LIMIT = 100

venues = []

for lat, long, post, borough, neighborhood in zip(df4['Latitude'], df4['Longitude'], df4['Postal Code'], df4['Borough'], df4['Neighbourhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    for venue in results:
        venues.append((
            post, 
            borough,
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [297]:
# convert the venues list into a new DataFrame
venues_df = pd.DataFrame(venues)

# define the column names
venues_df.columns = ['PostalCode', 'Borough', 'Neighborhood', 'BoroughLatitude', 'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print(venues_df.shape)
venues_df.head()

(2920, 9)


Unnamed: 0,PostalCode,Borough,Neighborhood,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654174,-79.380812,Elgin And Winter Garden Theatres,43.653394,-79.378507,Theater
1,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654174,-79.380812,UNIQLO ユニクロ,43.65591,-79.380641,Clothing Store
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654174,-79.380812,Indigo,43.653515,-79.380696,Bookstore
3,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654174,-79.380812,CF Toronto Eaton Centre,43.654447,-79.380952,Shopping Mall
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654174,-79.380812,LUSH,43.653557,-79.3804,Cosmetics Shop


In [306]:
# Checking how many venues were returned for each Neighborhood
venues_df.groupby("Neighborhood").count()

Unnamed: 0_level_0,PostalCode,Borough,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Berczy Park,95,95,95,95,95,95,95,95
"Brockton, Parkdale Village, Exhibition Place",73,73,73,73,73,73,73,73
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",4,4,4,4,4,4,4,4
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",95,95,95,95,95,95,95,95
Central Bay Street,95,95,95,95,95,95,95,95
Christie,95,95,95,95,95,95,95,95
Church and Wellesley,95,95,95,95,95,95,95,95
"Commerce Court, Victoria Hotel",95,95,95,95,95,95,95,95
Davisville,73,73,73,73,73,73,73,73
Davisville North,73,73,73,73,73,73,73,73


In [301]:
# See how many unique categories can be created from all values
print('There are {} uniques categories.'.format(len(venues_df['VenueCategory'].unique())))

There are 71 uniques categories.


In [302]:
venues_df['VenueCategory'].value_counts()

Coffee Shop             289
Clothing Store          170
Hotel                   140
Restaurant               92
Diner                    87
                       ... 
Gym / Fitness Center     15
Steakhouse               15
Opera House              15
Concert Hall             15
Deli / Bodega             5
Name: VenueCategory, Length: 71, dtype: int64

### Analyzing Each Neighborhood

In [309]:
venues_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654174,-79.380812,Elgin And Winter Garden Theatres,43.653394,-79.378507,Theater
1,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654174,-79.380812,UNIQLO ユニクロ,43.65591,-79.380641,Clothing Store
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654174,-79.380812,Indigo,43.653515,-79.380696,Bookstore
3,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654174,-79.380812,CF Toronto Eaton Centre,43.654447,-79.380952,Shopping Mall
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654174,-79.380812,LUSH,43.653557,-79.3804,Cosmetics Shop


In [321]:
# one hot encoding
toronto_onehot = pd.get_dummies(venues_df['VenueCategory'])

# add postal, borough and neighborhood column back to dataframe
toronto_onehot['PostalCode'] = venues_df['PostalCode'] 
toronto_onehot['Borough'] = venues_df['Borough'] 
toronto_onehot['Neighborhoods'] = venues_df['Neighborhood'] 

# move postal, borough and neighborhood column to the first column
fixed_columns = (toronto_onehot.columns[-3:]).tolist() + (toronto_onehot.columns[:-3]).tolist()
toronto_onehot = toronto_onehot[fixed_columns]

print(toronto_onehot.shape)
toronto_onehot.head()

(2920, 74)


Unnamed: 0,PostalCode,Borough,Neighborhoods,American Restaurant,Art Gallery,Art Museum,Bank,Bar,Bookstore,Breakfast Spot,...,Shopping Mall,Smoothie Shop,Steakhouse,Sushi Restaurant,Tanning Salon,Thai Restaurant,Theater,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,M5A,Downtown Toronto,"Regent Park, Harbourfront",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,M5A,Downtown Toronto,"Regent Park, Harbourfront",0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [267]:
# Importing Kmeans from sklearn
from sklearn.cluster import KMeans

In [268]:
# set number of clusters
kclusters = 5

Toronto_clustering = pd.get_dummies(df3)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 0, 2, 0, 4, 1, 2, 0, 0])