In [1]:
import numpy as np 
import pandas as pd 
import json
from geopy.geocoders import Nominatim 
import requests 
from pandas.io.json import json_normalize 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
!pip install folium

print('Libraries imported.')

Libraries imported.


In [2]:
# since new url has changed the format of the postal code table in Wiki page, we are using the old url
url = "https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=1011037969"
old_url = requests.get(url)
old_url

<Response [200]>

In [3]:
# we will scrape the data from html to text format using pandas
raw_data = pd.read_html(old_url.text)
raw_data

[    Postal Code           Borough  \
 0           M1A      Not assigned   
 1           M2A      Not assigned   
 2           M3A        North York   
 3           M4A        North York   
 4           M5A  Downtown Toronto   
 ..          ...               ...   
 175         M5Z      Not assigned   
 176         M6Z      Not assigned   
 177         M7Z      Not assigned   
 178         M8Z         Etobicoke   
 179         M9Z      Not assigned   
 
                                          Neighbourhood  
 0                                         Not assigned  
 1                                         Not assigned  
 2                                            Parkwoods  
 3                                     Victoria Village  
 4                            Regent Park, Harbourfront  
 ..                                                 ...  
 175                                       Not assigned  
 176                                       Not assigned  
 177                

In [4]:
# now we will make the text data into a pandas data frame keeping only the postal codes section
raw_data = raw_data[0]
raw_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [5]:
# we will remove the not assigned rows and reset the index
df = raw_data[raw_data["Borough"] != "Not assigned"]
df = df.reset_index()
df.head()

Unnamed: 0,index,Postal Code,Borough,Neighbourhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,5,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [6]:
# removing the old index
df.drop(['index'], axis = 'columns', inplace = True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [7]:
df.shape

(103, 3)

### now we have 103 postal codes of all the borough and neighborhoods of Toronto

In [8]:
# now we will download latitude & longitude of all the postal codes, using the link below
data = pd.read_csv("https://cocl.us/Geospatial_data")
data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
data.shape

(103, 3)

In [10]:
# since both the data set has 103 postal codes, we will assume they are same, and join the datasets using the postal codes
combined_data = df.join(data.set_index('Postal Code'), on='Postal Code', how='inner')
combined_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [11]:
combined_data.shape

(103, 5)

### We have a complete dataset of all 103 postal codes. Next we will map the neighborhood on the map of Toronto, using folium pacakge

In [12]:
!pip install geopy
from geopy.geocoders import Nominatim



In [13]:
# getting the lat & lon of Toronto using geocoder
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The coordinates of Toronto are 43.6534817, -79.3839347.


In [14]:
!conda install -c conda-forge folium=0.5.0 --yes 
import folium

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python-3.7-main

  added / updated specs:
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    _libgcc_mutex-0.1          |      conda_forge           3 KB  conda-forge
    _openmp_mutex-4.5          |           1_llvm           5 KB  conda-forge
    _py-xgboost-mutex-2.0      |            cpu_0           8 KB  conda-forge
    _pytorch_select-0.2        |            gpu_0           2 KB
    absl-py-0.12.0             |     pyhd8ed1ab_0          96 KB  conda-forge
    aiohttp-3.7.4              |   py37h5e8e339_0  

toml-0.10.2          | 18 KB     | ##################################### | 100% 
giflib-5.2.1         | 77 KB     | ##################################### | 100% 
attrs-20.3.0         | 41 KB     | ##################################### | 100% 
cachetools-4.2.1     | 13 KB     | ##################################### | 100% 
markdown-3.3.4       | 67 KB     | ##################################### | 100% 
jupyter_core-4.7.1   | 72 KB     | ##################################### | 100% 
_libgcc_mutex-0.1    | 3 KB      | ##################################### | 100% 
liblapack-3.9.0      | 11 KB     | ##################################### | 100% 
notebook-6.2.0       | 6.2 MB    | ##################################### | 100% 
pytest-6.2.2         | 430 KB    | ##################################### | 100% 
pillow-8.1.2         | 668 KB    | ##################################### | 100% 
libev-4.33           | 104 KB    | ##################################### | 100% 
jsonschema-3.2.0     | 45 KB

multidict-5.1.0      | 67 KB     | ##################################### | 100% 
html5lib-1.1         | 89 KB     | ##################################### | 100% 
keras-applications-1 | 30 KB     | ##################################### | 100% 
cffi-1.14.5          | 225 KB    | ##################################### | 100% 
pyqt-impl-5.12.3     | 5.9 MB    | ##################################### | 100% 
hdf5-1.10.6          | 3.1 MB    | ##################################### | 100% 
libxml2-2.9.10       | 1.3 MB    | ##################################### | 100% 
pycparser-2.20       | 94 KB     | ##################################### | 100% 
gmpy2-2.1.0b1        | 206 KB    | ##################################### | 100% 
networkx-2.5         | 1.2 MB    | ##################################### | 100% 
cycler-0.10.0        | 9 KB      | ##################################### | 100% 
libpng-1.6.37        | 306 KB    | ##################################### | 100% 
nltk-3.4.4           | 1.1 M

done


In [15]:
# Creating the map of Toronto
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# adding markers to map
for latitude, longitude, borough, neighbourhood in zip(combined_data['Latitude'], combined_data['Longitude'], combined_data['Borough'], combined_data['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup=label,
        color='red',
        fill=True
        ).add_to(map_Toronto)  
    
map_Toronto

### Now we will initialize Foursqare API and 'explore' the venue data for all the Neighborhoods

In [16]:
CLIENT_ID = 'DJOPEKTYQH0YT04MQGNYX2R5F12RULW3THGSS0YRWGHN5OPS' 
CLIENT_SECRET = 'A43RAXWFTQ2ZLBRFYD0TSA40KRLMPXFTMODET32RG5KTQXGY'
VERSION = '20180604' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: DJOPEKTYQH0YT04MQGNYX2R5F12RULW3THGSS0YRWGHN5OPS
CLIENT_SECRET:A43RAXWFTQ2ZLBRFYD0TSA40KRLMPXFTMODET32RG5KTQXGY


In [17]:
# we are creating a function to get all the venue catagories for all the neighborhoods
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius
            )
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Category']
    
    return(nearby_venues)

In [18]:
# collecting the venues in Toronto for all neighborhoods, within 500 mtrs radius
venues_in_toronto = getNearbyVenues(combined_data['Neighbourhood'], combined_data['Latitude'], combined_data['Longitude'])

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Queen's Park, Ontario Provincial Government
Islington Avenue, Humber Valley Village
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto, Broadview North (Old East York)
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmo

In [19]:
venues_in_toronto.head()

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Portugril,Portuguese Restaurant
4,Victoria Village,43.725882,-79.315572,Tim Hortons,Coffee Shop


In [20]:
venues_in_toronto.shape

(1337, 5)

In [21]:
# checking by venue catagories with max frequency
venues_in_toronto.groupby('Venue Category').max()

Unnamed: 0_level_0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue
Venue Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Accessories Store,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,Ardene Shoes Outlet
Adult Boutique,Church and Wellesley,43.665860,-79.383160,Seduction
Airport,Downsview,43.737473,-79.394420,Toronto Downsview Airport (YZD)
Airport Food Court,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.394420,Billy Bishop Café
Airport Gate,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.394420,Gate 8
...,...,...,...,...
Warehouse Store,Thorncliffe Park,43.705369,-79.349372,Costco
Wine Bar,"Toronto Dominion Centre, Design Exchange",43.653206,-79.379817,The National Club
Wings Joint,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Wingporium
Women's Store,Caledonia-Fairbanks,43.689026,-79.453512,Maximum Woman


In [22]:
# now we will get dummies for all the venue catagories

toronto_venue_cat = pd.get_dummies(venues_in_toronto[['Venue Category']], prefix="", prefix_sep="")
toronto_venue_cat

Unnamed: 0,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Truck Stop,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1332,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1333,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1334,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1335,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Adding the Neighborhood column in the encoded dataset

toronto_venue_cat['Neighbourhood'] = venues_in_toronto['Neighbourhood'] 

# moving neighborhood column to the first column
fixed_columns = [toronto_venue_cat.columns[-1]] + list(toronto_venue_cat.columns[:-1])
toronto_venue_cat = toronto_venue_cat[fixed_columns]

toronto_venue_cat.head()

Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Truck Stop,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### We will now group the data by Neighborhoods and get the mean of venue catagories for each neighborhood

In [24]:
toronto_grouped = toronto_venue_cat.groupby('Neighbourhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Truck Stop,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Now we will get the top most venues for each neighborhood

In [25]:
# first we will make a funstion to get the top most venues
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [26]:
# now plug in the function to get top 5 venues for each neighborhood
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Agincourt,Latin American Restaurant,Breakfast Spot,Skating Rink,Lounge,Yoga Studio
1,"Alderwood, Long Branch",Pizza Place,Gym,Skating Rink,Coffee Shop,Pub
2,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Pharmacy,Chinese Restaurant,Shopping Mall
3,Bayview Village,Bank,Café,Japanese Restaurant,Chinese Restaurant,Yoga Studio
4,"Bedford Park, Lawrence Manor East",Sandwich Place,Coffee Shop,Italian Restaurant,Pharmacy,Thai Restaurant
...,...,...,...,...,...,...
91,"Willowdale, Willowdale East",Ramen Restaurant,Coffee Shop,Café,Sandwich Place,Pet Store
92,"Willowdale, Willowdale West",Grocery Store,Supermarket,Coffee Shop,Pizza Place,Pharmacy
93,Woburn,Coffee Shop,Soccer Field,Korean BBQ Restaurant,Yoga Studio,Deli / Bodega
94,Woodbine Heights,Athletics & Sports,Skating Rink,Spa,Curling Ice,Intersection


### Now we will make model to cluster the Neighborhoods

In [27]:
from sklearn.cluster import KMeans

In [28]:
# set number of clusters
k_num_clusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=k_num_clusters, random_state=0).fit(toronto_grouped_clustering)
kmeans

KMeans(n_clusters=5, random_state=0)

In [29]:
kmeans.labels_[0:50]

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2,
       2, 2, 0, 2, 2, 1], dtype=int32)

In [30]:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [31]:
neighborhoods_venues_sorted.head()

Unnamed: 0,Cluster Labels,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,2,Agincourt,Latin American Restaurant,Breakfast Spot,Skating Rink,Lounge,Yoga Studio
1,2,"Alderwood, Long Branch",Pizza Place,Gym,Skating Rink,Coffee Shop,Pub
2,2,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Pharmacy,Chinese Restaurant,Shopping Mall
3,2,Bayview Village,Bank,Café,Japanese Restaurant,Chinese Restaurant,Yoga Studio
4,2,"Bedford Park, Lawrence Manor East",Sandwich Place,Coffee Shop,Italian Restaurant,Pharmacy,Thai Restaurant


### Now we will merge this table with the combined_data table and prepare for plotting

In [32]:
toronto_merged = combined_data

toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

toronto_merged

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,0.0,Park,Food & Drink Shop,Yoga Studio,Deli / Bodega,Electronics Store
1,M4A,North York,Victoria Village,43.725882,-79.315572,2.0,French Restaurant,Pizza Place,Coffee Shop,Portuguese Restaurant,Intersection
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636,2.0,Coffee Shop,Park,Breakfast Spot,Café,Bakery
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,2.0,Clothing Store,Furniture / Home Store,Accessories Store,Coffee Shop,Vietnamese Restaurant
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,2.0,Coffee Shop,Diner,Sushi Restaurant,Distribution Center,Sandwich Place
...,...,...,...,...,...,...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944,2.0,Pool,Smoke Shop,River,Yoga Studio,Dance Studio
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160,2.0,Coffee Shop,Café,Japanese Restaurant,Sushi Restaurant,Ramen Restaurant
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,2.0,Pizza Place,Smoke Shop,Skate Park,Light Rail Station,Park
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509,4.0,Baseball Field,Yoga Studio,Falafel Restaurant,Escape Room,Electronics Store


In [33]:
# we will remove the NaN values to prevent data skew
toronto_merged_nonan = toronto_merged.dropna(subset=['Cluster Labels'])

In [34]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [35]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(k_num_clusters)
ys = [i + x + (i*x)**2 for i in range(k_num_clusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged_nonan['Latitude'], toronto_merged_nonan['Longitude'], toronto_merged_nonan['Neighbourhood'], toronto_merged_nonan['Cluster Labels']):
    label = folium.Popup('Cluster ' + str(int(cluster) +1) + '\n' + str(poi) , parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster-1)],
        fill=True,
        fill_color=rainbow[int(cluster-1)]
        ).add_to(map_clusters)
        
map_clusters

### Now lets verify each of the clusters

In [36]:
toronto_merged_nonan.loc[toronto_merged_nonan['Cluster Labels'] == 0, toronto_merged_nonan.columns[[1] + list(range(5, toronto_merged_nonan.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,North York,0.0,Park,Food & Drink Shop,Yoga Studio,Deli / Bodega,Electronics Store
32,Scarborough,0.0,Playground,College Gym,Escape Room,Electronics Store,Eastern European Restaurant
35,East York,0.0,Pizza Place,Park,Convenience Store,Intersection,Deli / Bodega
52,North York,0.0,Park,Yoga Studio,Deli / Bodega,Escape Room,Electronics Store
61,Central Toronto,0.0,Park,Swim School,Bus Line,Yoga Studio,Deli / Bodega
64,York,0.0,Park,Jewelry Store,Yoga Studio,Deli / Bodega,Electronics Store
66,North York,0.0,Park,Convenience Store,Yoga Studio,Deli / Bodega,Escape Room
85,Scarborough,0.0,Playground,Park,Intersection,Dance Studio,Electronics Store
91,Downtown Toronto,0.0,Park,Trail,Playground,Yoga Studio,Deli / Bodega


## Cluster 2:

In [37]:
toronto_merged_nonan.loc[toronto_merged_nonan['Cluster Labels'] == 1, toronto_merged_nonan.columns[[1] + list(range(5, toronto_merged_nonan.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
6,Scarborough,1.0,Fast Food Restaurant,Yoga Studio,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant


## Cluster 3:

In [38]:
toronto_merged_nonan.loc[toronto_merged_nonan['Cluster Labels'] == 2, toronto_merged_nonan.columns[[1] + list(range(5, toronto_merged_nonan.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,North York,2.0,French Restaurant,Pizza Place,Coffee Shop,Portuguese Restaurant,Intersection
2,Downtown Toronto,2.0,Coffee Shop,Park,Breakfast Spot,Café,Bakery
3,North York,2.0,Clothing Store,Furniture / Home Store,Accessories Store,Coffee Shop,Vietnamese Restaurant
4,Downtown Toronto,2.0,Coffee Shop,Diner,Sushi Restaurant,Distribution Center,Sandwich Place
7,North York,2.0,Gym,Beer Store,Restaurant,Coffee Shop,Asian Restaurant
...,...,...,...,...,...,...,...
97,Downtown Toronto,2.0,Café,Coffee Shop,Restaurant,Hotel,Seafood Restaurant
98,Etobicoke,2.0,Pool,Smoke Shop,River,Yoga Studio,Dance Studio
99,Downtown Toronto,2.0,Coffee Shop,Café,Japanese Restaurant,Sushi Restaurant,Ramen Restaurant
100,East Toronto,2.0,Pizza Place,Smoke Shop,Skate Park,Light Rail Station,Park


## Cluster 4:

In [39]:
toronto_merged_nonan.loc[toronto_merged_nonan['Cluster Labels'] == 3, toronto_merged_nonan.columns[[1] + list(range(5, toronto_merged_nonan.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
11,Etobicoke,3.0,Bakery,Yoga Studio,Falafel Restaurant,Escape Room,Electronics Store


## Cluster 5:

In [40]:
toronto_merged_nonan.loc[toronto_merged_nonan['Cluster Labels'] == 4, toronto_merged_nonan.columns[[1] + list(range(5, toronto_merged_nonan.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
57,North York,4.0,Baseball Field,Yoga Studio,Falafel Restaurant,Escape Room,Electronics Store
101,Etobicoke,4.0,Baseball Field,Yoga Studio,Falafel Restaurant,Escape Room,Electronics Store


# End of Assignment