<a href="https://colab.research.google.com/github/shubhamnarkhede/Coursera_Capstone/blob/main/Segmenting_and_Clustering_Neighborhoods_in_Toronto_(WEEK_3).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

PART 1

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup 
import ssl
import json
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

ssl._create_default_https_context = ssl._create_unverified_context

In [2]:

source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

soup = BeautifulSoup(source, 'html5lib')

postal_codes_dict = {} # initialize an empty dictionary to save the data in
for table_cell in soup.find_all('td'):
    try:
        postal_code = table_cell.p.b.text # get the postal code
        postal_code_investigate = table_cell.span.text
        neighborhoods_data = table_cell.span.text # get the rest of the data in the cell
        borough = neighborhoods_data.split('(')[0] # get the borough in the cell
        
        # if the cell is not assigned then ignore it
        if neighborhoods_data == 'Not assigned':
            neighborhoods = []
        # else process the data and add it to the dictionary
        else:
            postal_codes_dict[postal_code] = {}
            
            try:
                neighborhoods = neighborhoods_data.split('(')[1]
            
                # remove parantheses from neighborhoods string
                neighborhoods = neighborhoods.replace('(', ' ')
                neighborhoods = neighborhoods.replace(')', ' ')

                neighborhoods_names = neighborhoods.split('/')
                neighborhoods_clean = ', '.join([name.strip() for name in neighborhoods_names])
            except:
                borough = borough.strip('\n')
                neighborhoods_clean = borough
 
            # add borough and neighborhood to dictionary
            postal_codes_dict[postal_code]['borough'] = borough
            postal_codes_dict[postal_code]['neighborhoods'] = neighborhoods_clean
    except:
        pass
    
# create an empty dataframe
columns = ['PostalCode', 'Borough', 'Neighborhood']
toronto_data = pd.DataFrame(columns=columns)
toronto_data

# populate dataframe with data from dictionary
for ind, postal_code in enumerate(postal_codes_dict):
    borough = postal_codes_dict[postal_code]['borough']
    neighborhood = postal_codes_dict[postal_code]['neighborhoods']
    toronto_data = toronto_data.append({"PostalCode": postal_code, 
                                        "Borough": borough, 
                                        "Neighborhood": neighborhood},
                                        ignore_index=True)

# print number of rows of dataframe
toronto_data.shape[0]

103

In [3]:
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Queen's Park


Part 2

In [5]:
# geocoder seems to be very unreliable, so we can use the link to the csv file to get latitude and longitude
url='https://cocl.us/Geospatial_data/Geospatial_Coordinates.csv'
lat_long_df = pd.read_csv(url)

# since the latitude-longitude table has the same data ordered as of combined_df, we can just create a new dataframe
# with the required columns
detailed_df = pd.DataFrame({'PostalCode':toronto_data['PostalCode'], 
                            'Borough':toronto_data['Borough'], 
                            'Neighborhood':toronto_data['Neighborhood'], 
                            'Latitude':lat_long_df['Latitude'], 
                            'Longitude':lat_long_df['Longitude']})


print("Toronto has a total of {} boroughs and {} neighborhoods.".format(len(detailed_df.Borough.unique()), len(detailed_df.Neighborhood.unique())))

Toronto has a total of 15 boroughs and 103 neighborhoods.


#Cluster analysis of the neighborhoods in Toronto
Toronto neighborhood has a total of 11 boroughs and 100+ neighborhoods. It will become a tedious task to analyse all 11 boroughs, so we will work on boroughs that has the word 'Toronto' in them. There are totally 4 boroughs that has 'toronto' in their name: "Downtown Toronto", "Central Toronto", "West Toronto", and "East Toronto".

In [6]:
# Analysing number of postal codes in '** toronto' borough
toronto_borough = ['Downtown Toronto', 'Central Toronto', 'West Toronto', 'East Toronto']
for tor in toronto_borough:
    print("{} has a total of {} postal codes.".format(tor, detailed_df[detailed_df['Borough'] == tor].PostalCode.count()))

Downtown Toronto has a total of 17 postal codes.
Central Toronto has a total of 9 postal codes.
West Toronto has a total of 6 postal codes.
East Toronto has a total of 4 postal codes.


In [7]:
# Creating a new dataframe for the cluster analysis of 'Toronto' Boroughs
d_t = detailed_df[detailed_df['Borough'] == 'Downtown Toronto']
c_t = detailed_df[detailed_df['Borough'] == 'Central Toronto']
w_t = detailed_df[detailed_df['Borough'] == 'West Toronto']
e_t = detailed_df[detailed_df['Borough'] == 'East Toronto']

combined = pd.concat([d_t, c_t, w_t, e_t], sort=False)
toronto_dataframe = combined.reset_index(drop=True)

print(toronto_dataframe.shape)
toronto_dataframe.head(10)

(36, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.763573,-79.188711
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.692657,-79.264848
2,M5C,Downtown Toronto,St. James Town,43.799525,-79.318389
3,M5E,Downtown Toronto,Berczy Park,43.75749,-79.374714
4,M5G,Downtown Toronto,Central Bay Street,43.782736,-79.442259
5,M6G,Downtown Toronto,Christie,43.753259,-79.329656
6,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.737473,-79.464763
7,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.695344,-79.318389
8,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.668999,-79.315572
9,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel",43.689574,-79.38316


In [8]:
# Using geopy to get the latitude and longitude values of Toronto
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('Coordinates of Toronto are {}, {}. '.format(latitude, longitude))

Coordinates of Toronto are 43.6534817, -79.3839347. 


Creating a map of Toronto with all its neighborhoods superimposed on Top (filtered by Borough that has the word 'toronto')

In [9]:
# creating a map of toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# adding markers
for lat, lng, borough, neighborhood in zip(toronto_dataframe['Latitude'], toronto_dataframe['Longitude'], toronto_dataframe['Borough'], toronto_dataframe['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.5,
        parse_html=False).add_to(map_toronto)
    
map_toronto

Exploring first neighborhood and it's venues with Foursquare API

In [10]:
# foursquare credentials to access their APIs
client_id = 'PEQPJEZXPG4HRTFTMTZHL3P3QRZ1SV55525PX2YO22F3Q0JB'
client_secret = 'JEDPFGTRY4SISHI1S2W3RTQGIG3UUROA5OG4I3NW4UXBBLB1'
version = '20180605'

In [11]:
# first neighborhood in toronto_dataframe
toronto_dataframe.loc[0, 'Neighborhood']

'Regent Park, Harbourfront'

In [12]:
# getting Regent_Park latitude and longitude values
Regent_Park_latitude = toronto_dataframe.loc[0, 'Latitude']
Regent_Park_longitude = toronto_dataframe.loc[0, 'Longitude']
print('Regent Park\'s latitude and longitude values are {}, {}.'.format(Regent_Park_latitude,Regent_Park_longitude))


Regent Park's latitude and longitude values are 43.7635726, -79.1887115.


In [13]:
# getting the top 100 venues that are in Rosedale within a radius of 500 meteres
no_of_venues = 100
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(client_id, client_secret, version, Regent_Park_latitude, Regent_Park_longitude, radius, no_of_venues)

response = requests.get(url).json()
response

{'meta': {'code': 200, 'requestId': '604ddc62511b4e7cb6e6f40f'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-4beee041e24d20a1cd857314-0',
      'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/shops/financial_',
          'suffix': '.png'},
         'id': '4bf58dd8d48988d10a951735',
         'name': 'Bank',
         'pluralName': 'Banks',
         'primary': True,
         'shortName': 'Bank'}],
       'id': '4beee041e24d20a1cd857314',
       'location': {'address': '4374 KINGSTON RD',
        'cc': 'CA',
        'city': 'Scarborough',
        'country': 'Canada',
        'crossStreet': 'Kingston & Lawrence',
        'distance': 408,
        'formattedAddress': ['4374 KINGSTON RD (Kingston & Lawrence)',
         'Scarborough ON M1E 2M8',
         'Canada'],
        'labeledLatL

In [14]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [15]:
# cleaning the response json and structuring it into pandas dataframe
venues = response['response']['groups'][0]['items']

# flatten JSON
nearby_venues = json_normalize(venues)

# filtering only needed columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues

  """


Unnamed: 0,name,categories,lat,lng
0,RBC Royal Bank,Bank,43.76679,-79.191151
1,G & G Electronics,Electronics Store,43.765309,-79.191537
2,Sail Sushi,Restaurant,43.765951,-79.191275
3,Big Bite Burrito,Mexican Restaurant,43.766299,-79.19072
4,Enterprise Rent-A-Car,Rental Car Location,43.764076,-79.193406
5,Woburn Medical Centre,Medical Center,43.766631,-79.192286
6,Lawrence Ave E & Kingston Rd,Intersection,43.767704,-79.18949
7,Eggsmart,Breakfast Spot,43.7678,-79.190466


Exploring all neighborhoods and it's venues with Foursquare API
Now we are going to explore all the neighborhoods of four different Boroughs (ending with 'toronto') using Foursquare APIs and above mentioned process

In [16]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(client_id, client_secret, version, lat, lng, radius, no_of_venues)
        
        results = requests.get(url).json()["response"]['groups'][0]['items']
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])
        
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                            'Neighborhood Latitude',
                            'Neighborhood Longitude',
                            'Venue',
                            'Venue Latitude',
                            'Venue Longitude',
                            'Venue Category']
    
    return(nearby_venues)

In [17]:
# code to run the above function on each neighborhood and create a new dataframe called toronto_venues
toronto_venues = getNearbyVenues(names=toronto_dataframe['Neighborhood'],
                                 latitudes=toronto_dataframe['Latitude'],
                                 longitudes=toronto_dataframe['Longitude'])


Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Rosedale
St. James Town, Cabbagetown
First Canadian Place, Underground city
Church and Wellesley
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West
North Toronto West
The Annex, North Midtown, Yorkville
Davisville
Moore Park, Summerhill East
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
Dufferin, Dovercourt Village
Little Portugal, Trinity
Brockton, Parkdale Village, Exhibition Place
High Park, The Junction South
Parkdale, Roncesvalles
Runnymede, Swansea
The Beaches
The Danforth West, Riverdale
India Bazaar, The Beaches West
St

In [18]:
# checking how many venues were returned overall
print(toronto_venues.shape)
toronto_venues.head(10)

(710, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.763573,-79.188711,RBC Royal Bank,43.76679,-79.191151,Bank
1,"Regent Park, Harbourfront",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
2,"Regent Park, Harbourfront",43.763573,-79.188711,Sail Sushi,43.765951,-79.191275,Restaurant
3,"Regent Park, Harbourfront",43.763573,-79.188711,Big Bite Burrito,43.766299,-79.19072,Mexican Restaurant
4,"Regent Park, Harbourfront",43.763573,-79.188711,Enterprise Rent-A-Car,43.764076,-79.193406,Rental Car Location
5,"Regent Park, Harbourfront",43.763573,-79.188711,Woburn Medical Centre,43.766631,-79.192286,Medical Center
6,"Regent Park, Harbourfront",43.763573,-79.188711,Lawrence Ave E & Kingston Rd,43.767704,-79.18949,Intersection
7,"Regent Park, Harbourfront",43.763573,-79.188711,Eggsmart,43.7678,-79.190466,Breakfast Spot
8,"Garden District, Ryerson",43.692657,-79.264848,The Birchcliff,43.691666,-79.264532,Café
9,"Garden District, Ryerson",43.692657,-79.264848,Birchmount Community Centre,43.695175,-79.262161,General Entertainment


In [19]:
# number of venues for each neighborhood
toronto_venues.groupby('Neighborhood')['Venue'].count()

Neighborhood
Brockton, Parkdale Village, Exhibition Place                                                                   36
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport     13
Central Bay Street                                                                                              5
Christie                                                                                                        3
Church and Wellesley                                                                                            8
Commerce Court, Victoria Hotel                                                                                  2
Davisville                                                                                                      4
Davisville North                                                                                               61
Dufferin, Dovercourt Village                                               

In [20]:
# number of unique categories of venues
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 190 uniques categories.


One-hot encoding all neighborhoods based on the Venue Category

In [21]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.shape

(710, 190)

Grouping rows by neighborhood and by taking the mean of the frequency of occurrence of each category


In [22]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
print(toronto_grouped.shape)
toronto_grouped.head(5)

(35, 190)


Unnamed: 0,Neighborhood,Yoga Studio,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Baby Store,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Basketball Court,Basketball Stadium,Bed & Breakfast,Beer Bar,Beer Store,Belgian Restaurant,Bistro,Boat or Ferry,Bookstore,Boutique,Breakfast Spot,Brewery,Bubble Tea Shop,Burger Joint,Burrito Place,Business Service,Butcher,Café,Camera Store,Candy Store,...,Poutine Place,Pub,Ramen Restaurant,Record Shop,Rental Car Location,Restaurant,Salad Place,Salon / Barbershop,Sandwich Place,Sculpture Garden,Seafood Restaurant,Shoe Store,Shopping Mall,Skate Park,Skating Rink,Smoke Shop,Smoothie Shop,Soup Place,South American Restaurant,Spa,Sporting Goods Shop,Stationery Store,Steakhouse,Supermarket,Supplement Shop,Sushi Restaurant,Tailor Shop,Tanning Salon,Tea Room,Thai Restaurant,Theater,Tibetan Restaurant,Toy / Game Store,Trail,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Wine Bar,Women's Store
0,"Brockton, Parkdale Village, Exhibition Place",0.027778,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.027778,0.027778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027778,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Church and Wellesley,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Each neighborhood with top 5 common venues

In [23]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("[--------"+hood+"--------]")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue', 'freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq':2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print("\n")

[--------Brockton, Parkdale Village, Exhibition Place--------]
                 venue  freq
0          Coffee Shop  0.08
1                 Café  0.06
2            Gastropub  0.06
3               Bakery  0.06
4  American Restaurant  0.06


[--------CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport--------]
                venue  freq
0          Comic Shop  0.08
1          Skate Park  0.08
2       Garden Center  0.08
3              Garden  0.08
4  Light Rail Station  0.08


[--------Central Bay Street--------]
           venue  freq
0    Pizza Place   0.2
1        Butcher   0.2
2       Pharmacy   0.2
3    Coffee Shop   0.2
4  Grocery Store   0.2


[--------Christie--------]
                        venue  freq
0                        Park  0.33
1           Convenience Store  0.33
2           Food & Drink Shop  0.33
3  Modern European Restaurant  0.00
4                Liquor Store  0.00


[--------Church and Wellesley--------]
     

Creating a dataframe that has top 10 venues for each neighborhood


In [24]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [25]:
# create a dataframe to display top 10 venues for each neighborhood
num_top_venues = 10
indicators = ['st', 'nd', 'rd', 'th']

columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
         columns.append('{}th Most Common Venue'.format(ind+1))
        

# create the new dataframe
neighborhood_venues_sorted = pd.DataFrame(columns=columns)
neighborhood_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhood_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

    
#neighborhood_venues_sorted.head()
neighborhood_venues_sorted.shape

(35, 11)

Run K-Means to cluster the neighborhoods

In [26]:
# set the number of clusters
kclusters = 4
toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_init=300 , n_clusters=kclusters, random_state=5).fit(toronto_grouped_clustering)

#checking cluster lables generated for each row in the dataframe
kmeans.labels_[0:20]

array([2, 2, 2, 0, 2, 2, 0, 2, 0, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
      dtype=int32)

New dataframe that includes the clusters

In [27]:
# add cluster labels
neighborhood_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
toronto_merged = toronto_dataframe

# merge toronto_grouped with toronto_dataframe to add lat/long for each neighborhood
toronto_merged = toronto_merged.join(neighborhood_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.763573,-79.188711,2.0,Electronics Store,Mexican Restaurant,Breakfast Spot,Intersection,Bank,Restaurant,Rental Car Location,Medical Center,Cuban Restaurant,Curling Ice
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.692657,-79.264848,2.0,General Entertainment,Café,College Stadium,Skating Rink,Construction & Landscaping,Convenience Store,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Doner Restaurant
2,M5C,Downtown Toronto,St. James Town,43.799525,-79.318389,2.0,Coffee Shop,Fast Food Restaurant,Pizza Place,Pharmacy,Bank,Camera Store,Breakfast Spot,Supermarket,Chinese Restaurant,Sandwich Place
3,M5E,Downtown Toronto,Berczy Park,43.75749,-79.374714,,,,,,,,,,,
4,M5G,Downtown Toronto,Central Bay Street,43.782736,-79.442259,2.0,Coffee Shop,Butcher,Pharmacy,Grocery Store,Pizza Place,Dumpling Restaurant,Doner Restaurant,Dog Run,Distribution Center,Discount Store


Visualizing the clusters

In [28]:
# creating map
cluster_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# setting color schemes
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# adding markers to the map
toronto_merged=toronto_merged.dropna() 
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
    [lat, lon],
    radius=9,
    popup=label,
    colors=rainbow[int(cluster)-1],
    fill=True,
    fill_color=rainbow[int(cluster)-1],
    fill_opacity=2.0).add_to(cluster_map)
        
cluster_map

Examining the Clusters
Looking at each cluster into detail to determine the discriminating venue that distinguish each cluster


In [29]:
# Cluster - 0
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(4, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Downtown Toronto,-79.329656,0.0,Park,Convenience Store,Food & Drink Shop,Department Store,Eastern European Restaurant,Dumpling Restaurant,Doner Restaurant,Dog Run,Distribution Center,Discount Store
6,Downtown Toronto,-79.464763,0.0,Park,Airport,Business Service,Department Store,Eastern European Restaurant,Dumpling Restaurant,Doner Restaurant,Dog Run,Distribution Center,Discount Store
22,Central Toronto,-79.453512,0.0,Park,Pool,Women's Store,Airport Food Court,Deli / Bodega,Eastern European Restaurant,Dumpling Restaurant,Doner Restaurant,Dog Run,Distribution Center
23,Central Toronto,-79.490074,0.0,Park,Construction & Landscaping,Basketball Court,Bakery,Dessert Shop,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Doner Restaurant,Dog Run
26,West Toronto,-79.506944,0.0,Shopping Mall,Park,Grocery Store,Bank,Department Store,Eastern European Restaurant,Dumpling Restaurant,Doner Restaurant,Dog Run,Distribution Center
30,West Toronto,-79.422564,0.0,Grocery Store,Café,Park,Coffee Shop,Baby Store,Candy Store,Restaurant,Italian Restaurant,Nightclub,Frozen Yogurt Shop


In [30]:
# Cluster - 1
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(4, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
14,Downtown Toronto,-79.565963,1.0,Pizza Place,Intersection,Women's Store,Deli / Bodega,Dumpling Restaurant,Doner Restaurant,Dog Run,Distribution Center,Discount Store,Diner
31,West Toronto,-79.487262,1.0,Pizza Place,Convenience Store,Women's Store,Deli / Bodega,Eastern European Restaurant,Dumpling Restaurant,Doner Restaurant,Dog Run,Distribution Center,Discount Store


In [31]:
# Cluster - 2
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(4, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,-79.188711,2.0,Electronics Store,Mexican Restaurant,Breakfast Spot,Intersection,Bank,Restaurant,Rental Car Location,Medical Center,Cuban Restaurant,Curling Ice
1,Downtown Toronto,-79.264848,2.0,General Entertainment,Café,College Stadium,Skating Rink,Construction & Landscaping,Convenience Store,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Doner Restaurant
2,Downtown Toronto,-79.318389,2.0,Coffee Shop,Fast Food Restaurant,Pizza Place,Pharmacy,Bank,Camera Store,Breakfast Spot,Supermarket,Chinese Restaurant,Sandwich Place
4,Downtown Toronto,-79.442259,2.0,Coffee Shop,Butcher,Pharmacy,Grocery Store,Pizza Place,Dumpling Restaurant,Doner Restaurant,Dog Run,Distribution Center,Discount Store
7,Downtown Toronto,-79.318389,2.0,Skating Rink,Park,Spa,Athletics & Sports,Beer Store,Curling Ice,Video Store,Concert Hall,Dessert Shop,Eastern European Restaurant
8,Downtown Toronto,-79.315572,2.0,Fast Food Restaurant,Coffee Shop,Fish & Chips Shop,Restaurant,Brewery,Italian Restaurant,Movie Theater,Pub,Ice Cream Shop,Liquor Store
9,Downtown Toronto,-79.38316,2.0,Gym,Trail,Women's Store,Deli / Bodega,Eastern European Restaurant,Dumpling Restaurant,Doner Restaurant,Dog Run,Distribution Center,Discount Store
10,Downtown Toronto,-79.476013,2.0,Turkish Restaurant,Convenience Store,Sandwich Place,Discount Store,Women's Store,Deli / Bodega,Dumpling Restaurant,Doner Restaurant,Dog Run,Distribution Center
11,Downtown Toronto,-79.48445,2.0,Coffee Shop,Café,Pub,Italian Restaurant,Sushi Restaurant,Pizza Place,Dessert Shop,Bar,Smoothie Shop,Latin American Restaurant
12,Downtown Toronto,-79.321558,2.0,Gym / Fitness Center,Burrito Place,Farmers Market,Light Rail Station,Restaurant,Pizza Place,Brewery,Fast Food Restaurant,Park,Skate Park


In [32]:
# Cluster - 3
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(4, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
13,Downtown Toronto,-79.498509,3.0,Construction & Landscaping,Baseball Field,Women's Store,Dessert Shop,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Doner Restaurant,Dog Run,Distribution Center
15,Downtown Toronto,-79.532242,3.0,Baseball Field,Women's Store,Falafel Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Doner Restaurant,Dog Run,Distribution Center,Discount Store
