# Notebook for Applied Data Science Capstone

In [43]:
# !conda update -n base -c defaults conda --yes
!conda install BeautifulSoup4 lxml wget folium --yes
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
# !pip install beautifulsoup4

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.9.11          |           py36_0         154 KB

The following packages will be UPDATED:

    certifi: 2019.9.11-py36_0  conda-forge --> 2019.9.11-py36_0 
    openssl: 1.1.1c-h516909a_0 conda-forge --> 1.1.1d-h7b6447c_3


Downloading and Extracting Packages
certifi-2019.9.11    | 154 KB    | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda upd

In [2]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests 

Here we scrape the data and remove null values, we assume that all null values are named 'not assigned'

In [3]:
# Scraping from wikipedia
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
r = requests.get(URL) 
  
soup = BeautifulSoup(r.content, 'lxml') 

# Extracting the table
My_table = soup.find('table',{'class':'wikitable sortable'})

Heads = My_table.findAll('tr')[0].findAll('th')
Table = {}
Headers   = []

# Extracting the head names
for head in Heads:
    item    = head.text.split('\n')[0]
    Table[item] = []
    Headers.append(item)
    
# Extracting vales from table and inserting into dictionary
for row in My_table.findAll('tr')[1:]:
    for num, column in enumerate(row.findAll('td')):
        item = column.text.split('\n')[0]
        Table[Headers[num]].append(item)
        
canada_post = pd.DataFrame.from_dict(Table)

# Clearing up the nullvalues
nullval = 'Not assigned'
canada_post = canada_post[canada_post.Borough != nullval]

# Set all nullvalues for the neighbourhood to the value for the borough
canada_post.loc[canada_post.Neighbourhood == nullval, 'Neighbourhood'] = canada_post.loc[
    canada_post.Neighbourhood == nullval, 'Borough']

display(canada_post.head(8))

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue


Here we clean up the data by combining multiple entries that have the same postcodes

In [4]:
def Neighbourhood_combine(canada_post, postcode):
    rows2concat = canada_post[canada_post.Postcode == postcode]
    first = rows2concat.iloc[0]
    Neighbourhoods = []

    for index, row in rows2concat.iterrows():
        Neighbourhoods.append(row.Neighbourhood)

    Replacement = {'Postcode': first.Postcode, 'Borough': first.Borough, 'Neighbourhood': Neighbourhoods}
    return Replacement

Multi_post = canada_post.groupby('Postcode').count()['Neighbourhood'] 

df = []

for unique_post in canada_post['Postcode'].unique():
    item = Neighbourhood_combine(canada_post,unique_post)
#     print(item)
    df.append(item)

Unique_post = pd.DataFrame(df)
display(Unique_post.head())
print('The shape is: ', Unique_post.shape)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,[Parkwoods]
1,M4A,North York,[Victoria Village]
2,M5A,Downtown Toronto,"[Harbourfront, Regent Park]"
3,M6A,North York,"[Lawrence Heights, Lawrence Manor]"
4,M7A,Queen's Park,[Queen's Park]


The shape is:  (103, 3)


## Getting the coordinates for the different areas in Toronto

Downloading the location data

In [5]:
!wget -q -O 'Geospatial.csv' https://cocl.us/Geospatial_data

Read the first 5 lines

In [6]:
with open('Geospatial.csv') as coordinates:
    for a in range(5):
        display(coordinates.readline())

'Postal Code,Latitude,Longitude\n'

'M1B,43.8066863,-79.1943534\n'

'M1C,43.7845351,-79.1604971\n'

'M1E,43.7635726,-79.1887115\n'

'M1G,43.7709921,-79.2169174\n'

Read into pandas

In [7]:
geospatial = pd.read_csv('Geospatial.csv')
geospatial.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Find out the type of the datas

In [8]:
for index in geospatial.axes[1]:
    print('The dtype for index : {} is : {}.'.format(index, geospatial[index].dtype))

The dtype for index : Postal Code is : object.
The dtype for index : Latitude is : float64.
The dtype for index : Longitude is : float64.


Now lets merge the dataframes

In [9]:
canada_coord = Unique_post.merge(geospatial, how = 'inner', left_on='Postcode',
                                 right_on='Postal Code')
canada_coord.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M3A,North York,[Parkwoods],M3A,43.753259,-79.329656
1,M4A,North York,[Victoria Village],M4A,43.725882,-79.315572
2,M5A,Downtown Toronto,"[Harbourfront, Regent Park]",M5A,43.65426,-79.360636
3,M6A,North York,"[Lawrence Heights, Lawrence Manor]",M6A,43.718518,-79.464763
4,M7A,Queen's Park,[Queen's Park],M7A,43.662301,-79.389494


## Now lets get the venues near the neighbourhoods 

#### Let's create a function to repeat the same process to all the neighborhoods in Canada

In [10]:
CLIENT_ID = '0H0Y52X0LLK1YK4OHZ0HKFQWXWEL1V1QNMDFRWJ24YBQMDVW' # your Foursquare ID
CLIENT_SECRET = 'BVWD4JRTRKNWN3UDD2IEZU4AKSIFVFXFDCWMYAO4IYFJQL5L' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 0H0Y52X0LLK1YK4OHZ0HKFQWXWEL1V1QNMDFRWJ24YBQMDVW
CLIENT_SECRET:BVWD4JRTRKNWN3UDD2IEZU4AKSIFVFXFDCWMYAO4IYFJQL5L


In [11]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Now write the code to run the above function on each neighborhood and create a new dataframe called *canada_venues*.

In [12]:
# type your answer here

canada_venues = getNearbyVenues(names=canada_coord['Postcode'],
                                   latitudes=canada_coord['Latitude'],
                                   longitudes=canada_coord['Longitude']
                                  )

M3A
M4A
M5A
M6A
M7A
M9A
M1B
M3B
M4B
M5B
M6B
M9B
M1C
M3C
M4C
M5C
M6C
M9C
M1E
M4E
M5E
M6E
M1G
M4G
M5G
M6G
M1H
M2H
M3H
M4H
M5H
M6H
M1J
M2J
M3J
M4J
M5J
M6J
M1K
M2K
M3K
M4K
M5K
M6K
M1L
M2L
M3L
M4L
M5L
M6L
M9L
M1M
M2M
M3M
M4M
M5M
M6M
M9M
M1N
M2N
M3N
M4N
M5N
M6N
M9N
M1P
M2P
M4P
M5P
M6P
M9P
M1R
M2R
M4R
M5R
M6R
M7R
M9R
M1S
M4S
M5S
M6S
M1T
M4T
M5T
M1V
M4V
M5V
M8V
M9V
M1W
M4W
M5W
M8W
M9W
M1X
M4X
M5X
M8X
M4Y
M7Y
M8Y
M8Z


In [13]:
canada_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M3A,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,M3A,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,M4A,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,M4A,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,M4A,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


# Analyse data

Here we will cluster the neighborhoods based on the types of venues around each neighborhood.

In [14]:
# one hot encoding
canada_onehot = pd.get_dummies(canada_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
canada_onehot['Neighborhood'] = canada_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [canada_onehot.columns[-1]] + list(canada_onehot.columns[:-1])
canada_onehot = canada_onehot[fixed_columns]

canada_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


And let's examine the new dataframe size.

In [15]:
canada_onehot.shape

(2256, 273)

#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [16]:
canada_grouped = canada_onehot.groupby('Neighborhood').mean().reset_index()
canada_grouped.head()

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [118]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = canada_grouped['Neighborhood']

for ind in np.arange(canada_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(canada_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Fast Food Restaurant,Women's Store,Dog Run,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Doner Restaurant
1,M1C,Bar,Women's Store,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Event Space
2,M1E,Electronics Store,Medical Center,Mexican Restaurant,Pizza Place,Rental Car Location,Breakfast Spot,Intersection,Department Store,Dessert Shop,Dim Sum Restaurant
3,M1G,Coffee Shop,Korean Restaurant,Convenience Store,Dog Run,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Doner Restaurant
4,M1H,Fried Chicken Joint,Hakka Restaurant,Athletics & Sports,Bakery,Bank,Thai Restaurant,Caribbean Restaurant,Diner,Department Store,Dessert Shop


<a id='item4'></a>

## 4. Cluster Neighborhoods

Run *k*-means to cluster the neighborhood into 5 clusters.

In [119]:
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 5

canada_grouped_clustering = canada_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=100,max_iter=500).fit(canada_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 2, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [120]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

canada_merged = canada_venues

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
canada_merged = canada_merged.merge(neighborhoods_venues_sorted, how ='inner', left_on='Neighborhood', right_on = 'Neighborhood')

display(canada_merged.head()) # check the last columns!
print(canada_merged.shape)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park,0,Park,Food & Drink Shop,Women's Store,Discount Store,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Dog Run
1,M3A,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop,0,Park,Food & Drink Shop,Women's Store,Discount Store,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Dog Run
2,M4A,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena,1,Hockey Arena,Coffee Shop,Portuguese Restaurant,French Restaurant,Women's Store,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant
3,M4A,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop,1,Hockey Arena,Coffee Shop,Portuguese Restaurant,French Restaurant,Women's Store,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant
4,M4A,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant,1,Hockey Arena,Coffee Shop,Portuguese Restaurant,French Restaurant,Women's Store,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant


(2256, 18)


Let's find the coordinates of Toronto, Canada

In [121]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
address = 'Toronto, canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


Now, let's visualise. Unfortunately, we cannot display all the pins only 1000 of them

In [122]:
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

canada_merged1 = canada_merged[0:1000]

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(canada_merged1['Venue Latitude'], canada_merged1['Venue Longitude'], canada_merged1['Neighborhood'], canada_merged1['Cluster Labels']):
    label = folium.Popup('Post code: {}, Cluster: {}'.format(str(poi), str(cluster)), parse_html=True)
#     print(lat,lon)
    folium.CircleMarker(
        [lat, lon],
        radius=2,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

<a id='item5'></a>

## 5. Examine Clusters

Now, you can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, you can then assign a name to each cluster. I will leave this exercise to you.

#### Cluster 1

It appears that this cluster would be good for people interested in Hockey and coffee

In [108]:
cluster1 = canada_merged.loc[canada_merged['Cluster Labels'] == 0, canada_merged.columns[[1] + list(range(5, canada_merged.shape[1]))]]
display(cluster1.head())
print(cluster1.shape)

Unnamed: 0,Neighborhood Latitude,Venue Longitude,Venue Category,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,43.725882,-79.315635,Hockey Arena,0,Hockey Arena,Coffee Shop,Portuguese Restaurant,French Restaurant,Women's Store,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant
3,43.725882,-79.313103,Coffee Shop,0,Hockey Arena,Coffee Shop,Portuguese Restaurant,French Restaurant,Women's Store,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant
4,43.725882,-79.312785,Portuguese Restaurant,0,Hockey Arena,Coffee Shop,Portuguese Restaurant,French Restaurant,Women's Store,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant
5,43.725882,-79.317418,French Restaurant,0,Hockey Arena,Coffee Shop,Portuguese Restaurant,French Restaurant,Women's Store,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant
6,43.65426,-79.362017,Bakery,0,Coffee Shop,Park,Bakery,Café,Theater,Breakfast Spot,Pub,Mexican Restaurant,Ice Cream Shop,Chocolate Shop


(2221, 14)


#### Cluster 2

These neighbourhoods would be ideal for those that like parks and women's stores

In [109]:
cluster2 = canada_merged.loc[canada_merged['Cluster Labels'] == 1, canada_merged.columns[[1] + list(range(5, canada_merged.shape[1]))]]
display(cluster2.head())
print(cluster2.shape)

Unnamed: 0,Neighborhood Latitude,Venue Longitude,Venue Category,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,43.753259,-79.33214,Park,1,Park,Food & Drink Shop,Women's Store,Discount Store,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Dog Run
1,43.753259,-79.333114,Food & Drink Shop,1,Park,Food & Drink Shop,Women's Store,Discount Store,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Dog Run
439,43.689026,-79.456326,Fast Food Restaurant,1,Park,Women's Store,Fast Food Restaurant,Market,Dog Run,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store
440,43.689026,-79.4563,Park,1,Park,Women's Store,Fast Food Restaurant,Market,Dog Run,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store
441,43.689026,-79.456333,Women's Store,1,Park,Women's Store,Fast Food Restaurant,Market,Dog Run,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store


(31, 14)


#### Cluster 3

These neighbourhoods would be ideal for those that like Fast food and women's stores

In [110]:
cluster3 = canada_merged.loc[canada_merged['Cluster Labels'] == 2, canada_merged.columns[[1] + list(range(5, canada_merged.shape[1]))]]
display(cluster3.head())
print(cluster3.shape)

Unnamed: 0,Neighborhood Latitude,Venue Longitude,Venue Category,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
110,43.806686,-79.199056,Fast Food Restaurant,2,Fast Food Restaurant,Women's Store,Dog Run,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Doner Restaurant


(1, 14)


#### Cluster 4

These neighbourhoods would be ideal for those that like Pizza and Empanada

In [111]:
cluster4 = canada_merged.loc[canada_merged['Cluster Labels'] == 3, canada_merged.columns[[1] + list(range(5, canada_merged.shape[1]))]]
display(cluster4.head())
print(cluster4.shape)

Unnamed: 0,Neighborhood Latitude,Venue Longitude,Venue Category,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1304,43.756303,-79.567195,Pizza Place,3,Pizza Place,Empanada Restaurant,College Arts Building,College Auditorium,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Donut Shop,Doner Restaurant
1305,43.756303,-79.570637,Empanada Restaurant,3,Pizza Place,Empanada Restaurant,College Arts Building,College Auditorium,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Donut Shop,Doner Restaurant


(2, 14)


#### Cluster 5

These neighbourhoods would be ideal for those that like Bars and Women's stores

In [112]:
cluster5 = canada_merged.loc[canada_merged['Cluster Labels'] == 4, canada_merged.columns[[1] + list(range(5, canada_merged.shape[1]))]]
display(cluster5.head())
print(cluster5.shape)

Unnamed: 0,Neighborhood Latitude,Venue Longitude,Venue Category,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
232,43.784535,-79.163085,Bar,4,Bar,Women's Store,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Event Space


(1, 14)
