# Code for Toronto neighbourhood clustering Assignment

### Importing required Packages first

In [1]:
from bs4 import BeautifulSoup as soup
import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import matplotlib.cm as cm
import matplotlib.colors as colors
!conda install -c conda-forge folium --yes
import folium # map rendering library
import requests # library to handle requests
import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from sklearn.cluster import KMeans

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



#### Getting Data for making Toronto dataset

For this, I am using BeautifulSoup package to extract web page data.

In [2]:
webpage = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

In [3]:
data=soup(webpage, 'lxml')

In [4]:
table = data.find_all('table',  class_= 'wikitable sortable' )
df = pd.read_html(str(table))
df=pd.DataFrame(df[0])
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


In [5]:
filter=(df['Borough']!='Not assigned')
df=df[filter]
df.reset_index(inplace = True, drop = True)
for f in range(len(df)):
    if(df.iloc[f]['Neighbourhood']=='Not assigned'):
        df.iloc[f]['Neighbourhood']=df.iloc[f]['Borough']

df=df.groupby(['Postcode','Borough'])['Neighbourhood'].apply(lambda x: ','.join(x)).reset_index()
print(df.head())

  Postcode      Borough                         Neighbourhood
0      M1B  Scarborough                         Rouge,Malvern
1      M1C  Scarborough  Highland Creek,Rouge Hill,Port Union
2      M1E  Scarborough       Guildwood,Morningside,West Hill
3      M1G  Scarborough                                Woburn
4      M1H  Scarborough                             Cedarbrae


In [6]:
df.shape

(103, 3)

In [7]:
latlong=pd.read_csv('http://cocl.us/Geospatial_data')
latlong.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [8]:
df['Latitude']=latlong['Latitude']
df['Longitude']=latlong['Longitude']
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


##### Lets Explore the neighbourhoods of Central Toronto

I am taking Central Toronto as my location, and clustering Neighbourhoods of Central Toronto.

In [9]:
print('The dataframe has {} boroughs'.format(
        len(df['Borough'].unique())
    )
)
print(df['Borough'].unique())
Toronto_data=df[df['Borough']=='Central Toronto'].reset_index(drop=True)
Toronto_data.head()

The dataframe has 11 boroughs
['Scarborough' 'North York' 'East York' 'East Toronto' 'Central Toronto'
 'Downtown Toronto' 'York' 'West Toronto' 'Mississauga' 'Etobicoke'
 "Queen's Park"]


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
1,M4P,Central Toronto,Davisville North,43.712751,-79.390197
2,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
3,M4S,Central Toronto,Davisville,43.704324,-79.38879
4,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316


#### Getting the Geo cordinates of Central Toronto, Canada

In [10]:
address = 'Central Toronto, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [11]:
# create map of Toronto using latitude and longitude values
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, label in zip(Toronto_data['Latitude'], Toronto_data['Longitude'], Toronto_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto

#### Defining Foursquare Credentials and Version

In [12]:
CLIENT_ID = 'ILA30HSV5MS1XD04JHRFC05AD21FSJK2R0B2002P2IYQCLX3' # your Foursquare ID
CLIENT_SECRET = 'CHD0IUG3VJNHL0QSASHIESUOJVC0BLKJFOJET3VS2ZTHKBGQ' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT= 100
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: ILA30HSV5MS1XD04JHRFC05AD21FSJK2R0B2002P2IYQCLX3
CLIENT_SECRET:CHD0IUG3VJNHL0QSASHIESUOJVC0BLKJFOJET3VS2ZTHKBGQ


In [13]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

##### Creating function for getting venues of neighbourhood of Central Toronto

In [14]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

##### Getting the venues of Neighbourhood of Central Toronto

In [15]:
Toronto_venues = getNearbyVenues(names=Toronto_data['Neighbourhood'],
                                   latitudes=Toronto_data['Latitude'],
                                   longitudes=Toronto_data['Longitude']
                                  )
print(Toronto_venues.shape)
Toronto_venues.head()

Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park,Summerhill East
Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West
Roselawn
Forest Hill North,Forest Hill West
The Annex,North Midtown,Yorkville
(115, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Lawrence Park,43.72802,-79.38879,Lawrence Park Ravine,43.726963,-79.394382,Park
1,Lawrence Park,43.72802,-79.38879,Zodiac Swim School,43.728532,-79.38286,Swim School
2,Lawrence Park,43.72802,-79.38879,TTC Bus #162 - Lawrence-Donway,43.728026,-79.382805,Bus Line
3,Davisville North,43.712751,-79.390197,Sherwood Park,43.716551,-79.387776,Park
4,Davisville North,43.712751,-79.390197,Summerhill Market North,43.715499,-79.392881,Food & Drink Shop


In [16]:
Toronto_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Davisville,36,36,36,36,36,36
Davisville North,8,8,8,8,8,8
"Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West",14,14,14,14,14,14
"Forest Hill North,Forest Hill West",4,4,4,4,4,4
Lawrence Park,3,3,3,3,3,3
"Moore Park,Summerhill East",4,4,4,4,4,4
North Toronto West,21,21,21,21,21,21
Roselawn,4,4,4,4,4,4
"The Annex,North Midtown,Yorkville",21,21,21,21,21,21


In [17]:
print('There are {} uniques categories.'.format(len(Toronto_venues['Venue Category'].unique())))

There are 64 uniques categories.


## 3. Analyze Each Neighborhood

In [18]:
# one hot encoding
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_onehot['Neighbourhood'] = Toronto_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_grouped =Toronto_onehot.groupby('Neighbourhood').mean().reset_index()
Toronto_grouped

Unnamed: 0,Neighbourhood,American Restaurant,BBQ Joint,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,Chinese Restaurant,Clothing Store,...,Supermarket,Sushi Restaurant,Swim School,Tennis Court,Thai Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Yoga Studio
0,Davisville,0.0,0.0,0.0,0.027778,0.0,0.0,0.055556,0.0,0.0,...,0.0,0.055556,0.0,0.0,0.027778,0.027778,0.0,0.0,0.0,0.0
1,Davisville North,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.071429,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0
3,"Forest Hill North,Forest Hill West",0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,...,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0
4,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,...,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Moore Park,Summerhill East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.25,0.0,0.0,0.25,0.0,0.0,0.0
6,North Toronto West,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.047619,0.190476,...,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.047619
7,Roselawn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,"The Annex,North Midtown,Yorkville",0.047619,0.047619,0.0,0.0,0.047619,0.0,0.142857,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0


#### Let's print each neighborhood along with the top 5 most common venues

In [19]:
num_top_venues = 5

for hood in Toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = Toronto_grouped[Toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Davisville----
            venue  freq
0  Sandwich Place  0.08
1    Dessert Shop  0.08
2     Coffee Shop  0.06
3             Gym  0.06
4            Café  0.06


----Davisville North----
               venue  freq
0  Food & Drink Shop  0.12
1              Hotel  0.12
2                Gym  0.12
3   Department Store  0.12
4       Dance Studio  0.12


----Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West----
                   venue  freq
0                    Pub  0.14
1            Coffee Shop  0.14
2    American Restaurant  0.07
3    Fried Chicken Joint  0.07
4  Vietnamese Restaurant  0.07


----Forest Hill North,Forest Hill West----
                 venue  freq
0        Jewelry Store  0.25
1                Trail  0.25
2             Bus Line  0.25
3     Sushi Restaurant  0.25
4  American Restaurant  0.00


----Lawrence Park----
                 venue  freq
0             Bus Line  0.33
1                 Park  0.33
2          Swim School  0.33
3  American Restaurant  0.00
4 

#### First, let's write a function to sort the venues in descending order.

In [20]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [21]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = Toronto_grouped['Neighbourhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Davisville,Dessert Shop,Sandwich Place,Coffee Shop,Pizza Place,Sushi Restaurant,Café,Italian Restaurant,Gym,Greek Restaurant,Diner
1,Davisville North,Dance Studio,Breakfast Spot,Sandwich Place,Hotel,Department Store,Gym,Food & Drink Shop,Park,Yoga Studio,Garden
2,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",Pub,Coffee Shop,American Restaurant,Sushi Restaurant,Restaurant,Fried Chicken Joint,Supermarket,Sports Bar,Liquor Store,Light Rail Station
3,"Forest Hill North,Forest Hill West",Jewelry Store,Trail,Bus Line,Sushi Restaurant,Greek Restaurant,Food & Drink Shop,Fried Chicken Joint,Garden,Gas Station,Gourmet Shop
4,Lawrence Park,Bus Line,Swim School,Park,Yoga Studio,Greek Restaurant,Fried Chicken Joint,Garden,Gas Station,Gourmet Shop,Health & Beauty Service


#### Clustering Neighbbourhoods

In [22]:
# set number of clusters
kclusters = 3

Toronto_grouped_clustering = Toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 1, 0, 0, 2, 0])

In [23]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Labels', kmeans.labels_)

Toronto_merged = Toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
Toronto_merged = Toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

Toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,1,Bus Line,Swim School,Park,Yoga Studio,Greek Restaurant,Fried Chicken Joint,Garden,Gas Station,Gourmet Shop,Health & Beauty Service
1,M4P,Central Toronto,Davisville North,43.712751,-79.390197,0,Dance Studio,Breakfast Spot,Sandwich Place,Hotel,Department Store,Gym,Food & Drink Shop,Park,Yoga Studio,Garden
2,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,0,Clothing Store,Coffee Shop,Yoga Studio,Spa,Metro Station,Mexican Restaurant,Park,Fast Food Restaurant,Diner,Dessert Shop
3,M4S,Central Toronto,Davisville,43.704324,-79.38879,0,Dessert Shop,Sandwich Place,Coffee Shop,Pizza Place,Sushi Restaurant,Café,Italian Restaurant,Gym,Greek Restaurant,Diner
4,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316,0,Restaurant,Trail,Playground,Tennis Court,Gourmet Shop,Fast Food Restaurant,Food & Drink Shop,Fried Chicken Joint,Garden,Gas Station


#### Now let's visualize the Neighbourhood clusters

In [24]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Neighbourhood'], Toronto_merged['Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Examining each clusters

##### Cluster 1

In [25]:
Toronto_merged.loc[Toronto_merged['Labels'] == 0, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Central Toronto,0,Dance Studio,Breakfast Spot,Sandwich Place,Hotel,Department Store,Gym,Food & Drink Shop,Park,Yoga Studio,Garden
2,Central Toronto,0,Clothing Store,Coffee Shop,Yoga Studio,Spa,Metro Station,Mexican Restaurant,Park,Fast Food Restaurant,Diner,Dessert Shop
3,Central Toronto,0,Dessert Shop,Sandwich Place,Coffee Shop,Pizza Place,Sushi Restaurant,Café,Italian Restaurant,Gym,Greek Restaurant,Diner
4,Central Toronto,0,Restaurant,Trail,Playground,Tennis Court,Gourmet Shop,Fast Food Restaurant,Food & Drink Shop,Fried Chicken Joint,Garden,Gas Station
5,Central Toronto,0,Pub,Coffee Shop,American Restaurant,Sushi Restaurant,Restaurant,Fried Chicken Joint,Supermarket,Sports Bar,Liquor Store,Light Rail Station
7,Central Toronto,0,Jewelry Store,Trail,Bus Line,Sushi Restaurant,Greek Restaurant,Food & Drink Shop,Fried Chicken Joint,Garden,Gas Station,Gourmet Shop
8,Central Toronto,0,Café,Sandwich Place,Coffee Shop,American Restaurant,Cosmetics Shop,History Museum,Liquor Store,Middle Eastern Restaurant,Park,Pharmacy


##### Cluster 2

In [26]:
Toronto_merged.loc[Toronto_merged['Labels'] == 1, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,1,Bus Line,Swim School,Park,Yoga Studio,Greek Restaurant,Fried Chicken Joint,Garden,Gas Station,Gourmet Shop,Health & Beauty Service


##### Cluster 3

In [27]:
Toronto_merged.loc[Toronto_merged['Labels'] == 2, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Central Toronto,2,Ice Cream Shop,Health & Beauty Service,Garden,Pool,Yoga Studio,Italian Restaurant,Indian Restaurant,Hotel,History Museum,Gym


After the above Analysis, I may go for Cluster 3 Neighbourhoods due to the fact that it has more numbers of venues compared to cluster 1 and 2.