# Clustering Toronto Neighborhoods

### Getting Venues Data

In [1]:
import json
import os

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

In [2]:
# Foursquare credentials are read from environment variables so that secrets are
# never stored in (or committed with) the notebook. Export FOURSQUARE_CLIENT_ID,
# FOURSQUARE_CLIENT_SECRET and FOURSQUARE_ACCESS_TOKEN before starting the kernel.
# A missing variable yields an empty string, which makes the Foursquare request fail.
# NOTE: any keys that were previously hardcoded here should be revoked and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
ACCESS_TOKEN = os.environ.get('FOURSQUARE_ACCESS_TOKEN', '') # your FourSquare Access Token
VERSION = '20180604' # Foursquare API version date sent as the `v` parameter
LIMIT = 30 # maximum number of venues requested per location
# NOTE(review): `radius` is defined here but the getNearbyVenues call below does not
# pass it, so that function's own default (500) is what is actually used.
radius=800

* Here I load the neighborhood data from the CSV created in the previous notebook.

In [3]:
# Load the neighborhood table; the first CSV column is used as the row index.
neighborhoods=pd.read_csv('Toronto_Neigh_Latlong.csv',index_col=0)


* There are boroughs that have more than one neighborhood in a single row, so let's split them.

In [4]:
# Turn each comma-separated 'Neighborhood' string into a list of names (one list per row).
# NOTE(review): a plain ',' split keeps any whitespace that follows a comma in the
# source data — confirm whether the names need stripping.
neighborhoods['Neighborhood']=neighborhoods['Neighborhood'].str.split(',')


In [5]:
# Expand every list into one row per neighborhood; the remaining columns and the
# index label are repeated for each new row.
neighborhoods=neighborhoods.explode('Neighborhood')

* Now I have one row for each neighborhood.

In [6]:
# Preview the exploded table: one neighborhood per row.
neighborhoods.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Postal Code.1,Latitude,Longitude
0,M1B,Scarborough,Malvern,M1B,43.806686,-79.194353
0,M1B,Scarborough,Rouge,M1B,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill,M1C,43.784535,-79.160497
1,M1C,Scarborough,Port Union,M1C,43.784535,-79.160497
1,M1C,Scarborough,Highland Creek,M1C,43.784535,-79.160497


* Drop the duplicated postal code column

In [7]:
# 'Postal Code.1' repeats 'Postal Code', so only the original column is kept.
neighborhoods = neighborhoods.drop(columns='Postal Code.1')
neighborhoods.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern,43.806686,-79.194353
0,M1B,Scarborough,Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill,43.784535,-79.160497
1,M1C,Scarborough,Port Union,43.784535,-79.160497
1,M1C,Scarborough,Highland Creek,43.784535,-79.160497


In [8]:
# (rows, columns) after exploding and dropping the duplicated column.
neighborhoods.shape

(216, 5)

In [9]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare `venues/explore` endpoint around each location.

    The module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT values are sent
    with every request.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of every location to explore; they are consumed
        in parallel with ``zip``.
    radius : number, default 500
        Value forwarded as the request's ``radius`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty (but keeps these columns) when nothing is found.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an error status code.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # let requests build and encode the query string; the timeout keeps a
        # stalled connection from hanging the notebook indefinitely
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        # surface bad credentials / exceeded quota as a clear HTTP error
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            # a venue may come back without any category; record it as missing
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # passing the columns to the constructor also works when no venue was found
    return pd.DataFrame(venue_rows, columns=columns)

In [10]:
# Fetch the venues around every neighborhood (one Foursquare request per row).
# NOTE(review): no `radius` argument is passed, so the function default (500) is
# used rather than the `radius` value set in the configuration cell — confirm
# which one is intended.
venues=getNearbyVenues(names=neighborhoods['Neighborhood'] ,
                       latitudes=neighborhoods['Latitude'],
                       longitudes=neighborhoods['Longitude']
                      )
print("Done!")


Done!


* After using the getNearbyVenues function I get a DataFrame with each venue and its geographical information

In [11]:
# Preview the venue table returned by getNearbyVenues.
venues.head()


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Malvern,43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
1,Rouge,43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
2,Rouge Hill,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
3,Port Union,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
4,Highland Creek,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar


### Create One Hot with categories

In [12]:
# The neighborhood coordinates are not needed for the category analysis.
venues = venues.drop(columns=['Neighborhood Latitude', 'Neighborhood Longitude'])

In [13]:
# One indicator column per venue category; the empty prefix and separator keep the
# raw category names as column names.
onehot=pd.get_dummies(venues['Venue Category'],prefix="", prefix_sep="")

In [14]:
# Attach the neighborhood of every venue row so the indicators can be grouped by it.
# NOTE(review): if a venue category is itself called 'Neighborhood', this assignment
# replaces that indicator column — confirm against the category list.
onehot['Neighborhood']=venues['Neighborhood']

* Set Neighborhood as first column

In [15]:
# pop() removes the column and hands it back, so it can be re-inserted at position 0.
onehot.insert(0, 'Neighborhood', onehot.pop('Neighborhood'))

* Group by Neighborhood and calculate the mean of each category (i.e. the share of the neighborhood's venues that fall in that category)

In [16]:
# as_index=False keeps 'Neighborhood' as a regular first column instead of the index.
grouped = onehot.groupby('Neighborhood', as_index=False).mean()
grouped.head()

Unnamed: 0,Neighborhood,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Truck Stop,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Adelaide,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,...,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Agincourt North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Albion Gardens,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bathurst Quay,0.0,0.066667,0.066667,0.066667,0.133333,0.2,0.133333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Beaumond Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Most common venues for each Neighborhood

In [17]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of `row`.

    The first entry of `row` is skipped, the remaining entries are ordered from
    highest to lowest value, and the index labels of the first `num_top_venues`
    of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [18]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    rank = int(ind) + 1
    # ranks ending in 1, 2 or 3 take 'st', 'nd', 'rd' — except 11, 12 and 13,
    # which take 'th' like every other rank
    if rank % 10 in (1, 2, 3) and rank % 100 not in (11, 12, 13):
        suffix = indicators[rank % 10 - 1]
    else:
        suffix = 'th'
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# build every row first (neighborhood name followed by its top categories), then
# create the dataframe in a single step instead of filling it cell by cell
top_venue_rows = [
    [grouped['Neighborhood'].iloc[ind],
     *return_most_common_venues(grouped.iloc[ind, :], num_top_venues)]
    for ind in range(grouped.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(top_venue_rows, columns=columns)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Café,Pizza Place,American Restaurant,Vegetarian / Vegan Restaurant,Hotel,Coffee Shop,Speakeasy,Gastropub,Seafood Restaurant,Lounge
1,Agincourt North,Playground,Intersection,Park,Sculpture Garden,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant
2,Albion Gardens,Grocery Store,Fried Chicken Joint,Coffee Shop,Discount Store,Sandwich Place,Pizza Place,Beer Store,Fast Food Restaurant,Pharmacy,Mediterranean Restaurant
3,Bathurst Quay,Airport Service,Airport Lounge,Airport Terminal,Boat or Ferry,Airport,Rental Car Location,Plane,Harbor / Marina,Airport Gate,Airport Food Court
4,Beaumond Heights,Grocery Store,Fried Chicken Joint,Coffee Shop,Discount Store,Sandwich Place,Pizza Place,Beer Store,Fast Food Restaurant,Pharmacy,Mediterranean Restaurant


### Clustering  Neighborhoods


In [19]:
kclusters = 5

# k-means only takes the numeric category frequencies, so the name column is removed.
# `columns=` is used because passing the axis positionally (`drop('Neighborhood', 1)`)
# is deprecated and rejected by pandas 2.x.
grouped_clustering = grouped.drop(columns='Neighborhood')

# run k-means clustering (fixed random_state for reproducible labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0,
       0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 4, 4, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 4, 0, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       4, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0,
       0, 0, 0, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 0, 0,
       0, 0, 1, 0, 3, 0, 3, 0, 0, 0, 0, 3])

* Combine the dataframe containing the neighborhood information with the dataframe of most common venues, joining on Neighborhood; the result is stored in `merged`
 * Some neighborhoods have no venue information, so they appear as NaN. They are dropped, as are duplicated neighborhoods

In [20]:
# Left-join the top-venue table onto the neighborhood table, keep the first row of
# each neighborhood, discard rows with any missing value and renumber the index.
merged = (
    neighborhoods
    .join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
    .drop_duplicates(subset=['Neighborhood'])
    .dropna()
    .reset_index(drop=True)
)
merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,Malvern,43.806686,-79.194353,Fast Food Restaurant,Accessories Store,Malay Restaurant,Martial Arts School,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant
1,M1B,Scarborough,Rouge,43.806686,-79.194353,Fast Food Restaurant,Accessories Store,Malay Restaurant,Martial Arts School,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant
2,M1C,Scarborough,Rouge Hill,43.784535,-79.160497,Bar,Accessories Store,Malay Restaurant,Martial Arts School,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant
3,M1C,Scarborough,Port Union,43.784535,-79.160497,Bar,Accessories Store,Malay Restaurant,Martial Arts School,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant
4,M1C,Scarborough,Highland Creek,43.784535,-79.160497,Bar,Accessories Store,Malay Restaurant,Martial Arts School,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant


* Make Neighborhood the first column and sort the rows by it

In [21]:
# Move 'Neighborhood' to the front, then order the rows alphabetically by it and
# renumber the index.
merged.insert(0, 'Neighborhood', merged.pop('Neighborhood'))
merged = merged.sort_values(by=['Neighborhood']).reset_index(drop=True)
merged.head()

Unnamed: 0,Neighborhood,Postal Code,Borough,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,M5H,Downtown Toronto,43.650571,-79.384568,Café,Pizza Place,American Restaurant,Vegetarian / Vegan Restaurant,Hotel,Coffee Shop,Speakeasy,Gastropub,Seafood Restaurant,Lounge
1,Agincourt North,M1V,Scarborough,43.815252,-79.284577,Playground,Intersection,Park,Sculpture Garden,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant
2,Albion Gardens,M9V,Etobicoke,43.739416,-79.588437,Grocery Store,Fried Chicken Joint,Coffee Shop,Discount Store,Sandwich Place,Pizza Place,Beer Store,Fast Food Restaurant,Pharmacy,Mediterranean Restaurant
3,Bathurst Quay,M5V,Downtown Toronto,43.628947,-79.39442,Airport Service,Airport Lounge,Airport Terminal,Boat or Ferry,Airport,Rental Car Location,Plane,Harbor / Marina,Airport Gate,Airport Food Court
4,Beaumond Heights,M9V,Etobicoke,43.739416,-79.588437,Grocery Store,Fried Chicken Joint,Coffee Shop,Discount Store,Sandwich Place,Pizza Place,Beer Store,Fast Food Restaurant,Pharmacy,Mediterranean Restaurant


* Insert the clustering information into the DataFrame

In [22]:
# kmeans.labels_ follows the row order of `grouped`, so look each label up by
# neighborhood name instead of relying on `merged` having the same order and length.
cluster_by_neighborhood = pd.Series(kmeans.labels_, index=grouped['Neighborhood'])
cluster_labels = merged['Neighborhood'].map(cluster_by_neighborhood)
assert cluster_labels.notna().all(), "every neighborhood in `merged` must have a cluster label"
merged.insert(5,"Cluster Label",cluster_labels.astype(int))

In [23]:
# Preview the table with the new 'Cluster Label' column.
merged.head()


Unnamed: 0,Neighborhood,Postal Code,Borough,Latitude,Longitude,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,M5H,Downtown Toronto,43.650571,-79.384568,0,Café,Pizza Place,American Restaurant,Vegetarian / Vegan Restaurant,Hotel,Coffee Shop,Speakeasy,Gastropub,Seafood Restaurant,Lounge
1,Agincourt North,M1V,Scarborough,43.815252,-79.284577,0,Playground,Intersection,Park,Sculpture Garden,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant
2,Albion Gardens,M9V,Etobicoke,43.739416,-79.588437,0,Grocery Store,Fried Chicken Joint,Coffee Shop,Discount Store,Sandwich Place,Pizza Place,Beer Store,Fast Food Restaurant,Pharmacy,Mediterranean Restaurant
3,Bathurst Quay,M5V,Downtown Toronto,43.628947,-79.39442,0,Airport Service,Airport Lounge,Airport Terminal,Boat or Ferry,Airport,Rental Car Location,Plane,Harbor / Marina,Airport Gate,Airport Food Court
4,Beaumond Heights,M9V,Etobicoke,43.739416,-79.588437,0,Grocery Store,Fried Chicken Joint,Coffee Shop,Discount Store,Sandwich Place,Pizza Place,Beer Store,Fast Food Restaurant,Pharmacy,Mediterranean Restaurant


### Plotting dots on map

In [24]:
address = 'Toronto, Canada'

# look up the coordinates used to centre the map
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the address cannot be resolved
    raise RuntimeError('Could not geocode {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(merged['Latitude'], merged['Longitude'], merged['Neighborhood'],merged['Cluster Label']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # labels run from 0 to kclusters-1, so they index the colour list directly
    marker_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### What can I get from this analysis?

* It seems that the neighborhoods in Toronto are pretty homogeneous (most of them fall into a single cluster), at least without modifying the original venue data.
* Considering the top venues, it seems that there are plenty of things to do everywhere in the city.