

# Coursera Capstone Project for Clustering and Segmentation




In [1]:
#Install beautifulsoup
!conda install -c conda-forge beautifulsoup4 --yes

# Import packages
from bs4 import BeautifulSoup
import urllib as ur
import requests as rq

In [3]:
# Use Beautiful soup
source = rq.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'lxml')
article = soup.find('table', class_='wikitable sortable')



In [4]:
# Preprocessing to get the right data frame
codes_list=[]
borough_list=[]
neighborhood_list=[]
i=1
for tag in soup.table.find_all('td'):
    if i == 1:
        codes_list.append(tag.text)
    if i == 2:
        borough_list.append(tag.text)
    if i == 3: 
        row = tag.text
        row = row.replace('\n', '')
        neighborhood_list.append(row)
    i = i+1
    if i==4:
        i=1

In [5]:
len (codes_list[0:])

289

In [6]:
# Convert list to pandas dataframe

import pandas as pd
Canada_Codes = pd.DataFrame(
    {'Postcode': codes_list,
     'Borough': borough_list,
     'Neighbourhood': neighborhood_list
    })

In [7]:

Canada_Codes2 = Canada_Codes
Canada_Codes.head(7)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [8]:
# Remove Not assigned
Canada_Codes2.drop(Canada_Codes2[Canada_Codes2['Borough']=="Not assigned"].index,axis=0, inplace=True)


In [9]:
Canada_Codes2.head(7)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned


In [10]:
Canada_Codes2=Canada_Codes2.groupby("Postcode").agg(lambda x:','.join(set(x)))
Canada_Codes2.head(7)

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern,Rouge"
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
M1E,Scarborough,"Morningside,Guildwood,West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
M1J,Scarborough,Scarborough Village
M1K,Scarborough,"Ionview,Kennedy Park,East Birchmount Park"


In [11]:
# Not assigned neighborhood, then the neighborhood will be the same as the borough.

Canada_Codes2.loc[Canada_Codes2['Neighbourhood']=="Not assigned",'Neighbourhood']=Canada_Codes2.loc[Canada_Codes2['Neighbourhood']=="Not assigned",'Borough']

Canada_Codes2.index.name = 'Postcode'
Canada_Codes2.reset_index(inplace=True)

Canada_Codes2.shape

(103, 3)

In [12]:
# Final desired dataframe
Canada_Codes2.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern,Rouge"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Morningside,Guildwood,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Ionview,Kennedy Park,East Birchmount Park"
7,M1L,Scarborough,"Golden Mile,Clairlea,Oakridge"
8,M1M,Scarborough,"Cliffside,Scarborough Village West,Cliffcrest"
9,M1N,Scarborough,"Cliffside West,Birch Cliff"


In [13]:
# Read Geo Data
geo_data=pd.read_csv("https://cocl.us/Geospatial_data")
geo_data.head(10)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [14]:
# Create new columns and add it original frame
Canada_Codes2['Latitude']=geo_data['Latitude'].values
Canada_Codes2['Longitude']=geo_data['Longitude'].values

Canada_Codes2.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Morningside,Guildwood,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Ionview,Kennedy Park,East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile,Clairlea,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside,Scarborough Village West,Cliffcrest",43.716316,-79.239476
9,M1N,Scarborough,"Cliffside West,Birch Cliff",43.692657,-79.264848


In [15]:

# Get Toronto codes
Toronto_Codes = Canada_Codes2.drop(Canada_Codes2[Canada_Codes2['Borough'].str.contains("Toronto")==False].index, axis=0, inplace=False)

#Reset Index
Toronto_Codes.index = pd.RangeIndex(len(Toronto_Codes.index))

#to view Dataframe
Toronto_Codes.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"Riverdale,The Danforth West",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Summerhill East,Moore Park",43.689574,-79.38316
9,M4V,Central Toronto,"Summerhill West,Forest Hill SE,Deer Park,South...",43.686412,-79.400049


In [16]:
len(Toronto_Codes['Postcode'].unique())

38

In [17]:

import json # library to handle JSON files

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# Importing to use the Foursquare API lab
import folium # map rendering library

print('Foursquare and map plotting Libraries imported.')

Foursquare and map plotting Libraries imported.


In [18]:

print('The dataframe has {} boroughs spanning across 38 Postcodes and {}  neighborhood groups.'.format(
        len(Toronto_Codes['Borough'].unique()),
        Toronto_Codes.shape[0]
    )
)

The dataframe has 4 boroughs spanning across 38 Postcodes and 38  neighborhood groups.


In [19]:
import time
from geopy.geocoders import Nominatim

In [20]:
geolocator = Nominatim(user_agent="capstone_agent")

In [21]:
# Get Toronto geo info
address = 'Toronto, Ontario, Canada'
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [22]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(Toronto_Codes['Latitude'], Toronto_Codes['Longitude'], Toronto_Codes['Borough'], Toronto_Codes['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [26]:
# Get values for Harbourfront,Regent Park
import numpy as np
neighborhood_latitude = np.float(Toronto_Codes.loc[13,['Latitude']].values)
neighborhood_longitude =  np.float(Toronto_Codes.loc[13,['Longitude']].values)
Toronto_Codes.loc[13]

Postcode                              M5A
Borough                  Downtown Toronto
Neighbourhood    Harbourfront,Regent Park
Latitude                          43.6543
Longitude                        -79.3606
Name: 13, dtype: object

In [28]:
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?client_id=W2RZKFIYGSSLPRR3OE14Q2UYYK5MQZYBV42IYCADG5YZ4PZR&client_secret=2YHEQZDDQ2HBTLH3PFONKEW2DAMNXKSW1HZP5VECDLNFRWWC&ll=43.6543,-79.3606&v=20180604&radius=500&limit=30'

In [29]:
url

'https://api.foursquare.com/v2/venues/explore?client_id=W2RZKFIYGSSLPRR3OE14Q2UYYK5MQZYBV42IYCADG5YZ4PZR&client_secret=2YHEQZDDQ2HBTLH3PFONKEW2DAMNXKSW1HZP5VECDLNFRWWC&ll=43.6543,-79.3606&v=20180604&radius=500&limit=30'

In [79]:
# get results
results = rq.get(url).json()


In [31]:

# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [32]:

# Get nearby venues for Harbourfront and Regent Park
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Roselle Desserts,Bakery,43.653447,-79.362017
1,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,Toronto Cooper Koo Family Cherry St YMCA Centre,Gym / Fitness Center,43.653191,-79.357947
3,Morning Glory Cafe,Breakfast Spot,43.653947,-79.361149
4,Body Blitz Spa East,Spa,43.654735,-79.359874


In [35]:

# Store Credentials
CLIENT_ID = 'W2RZKFIYGSSLPRR3OE14Q2UYYK5MQZYBV42IYCADG5YZ4PZR' # your Foursquare ID
CLIENT_SECRET = '2YHEQZDDQ2HBTLH3PFONKEW2DAMNXKSW1HZP5VECDLNFRWWC' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30

print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

30 venues were returned by Foursquare.


In [39]:
# Get venue and other info for all neighborhoods of Tornoto
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = rq.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [40]:
toronto_venues = getNearbyVenues(names=Toronto_Codes['Neighbourhood'],
                                   latitudes=Toronto_Codes['Latitude'],
                                   longitudes=Toronto_Codes['Longitude']
                                  )

The Beaches
Riverdale,The Danforth West
The Beaches West,India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Summerhill East,Moore Park
Summerhill West,Forest Hill SE,Deer Park,South Hill,Rathnelly
Rosedale
St. James Town,Cabbagetown
Church and Wellesley
Harbourfront,Regent Park
Ryerson,Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide,Richmond,King
Harbourfront East,Toronto Islands,Union Station
Design Exchange,Toronto Dominion Centre
Commerce Court,Victoria Hotel
Roselawn
Forest Hill West,Forest Hill North
Yorkville,The Annex,North Midtown
University of Toronto,Harbord
Kensington Market,Grange Park,Chinatown
South Niagara,Bathurst Quay,King and Spadina,Railway Lands,CN Tower,Harbourfront West,Island airport
Stn A PO Boxes 25 The Esplanade
Underground city,First Canadian Place
Christie
Dovercourt Village,Dufferin
Little Portugal,Trinity
Exhibition Place,Parkdale Village,Brockton
High Park,The Junction South
Parkdale,Roncesvall

In [41]:
print(toronto_venues.shape)
toronto_venues.head()

(830, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
1,The Beaches,43.676357,-79.293031,Starbucks,43.678798,-79.298045,Coffee Shop
2,The Beaches,43.676357,-79.293031,Glen Stewart Ravine,43.6763,-79.294784,Other Great Outdoors
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,"Riverdale,The Danforth West",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


In [50]:
# Show the count of venues for each neighborhood
toronto_venues.groupby('Neighborhood').count().iloc[0:,4]

Neighborhood
Adelaide,Richmond,King                                                                                  30
Berczy Park                                                                                             30
Business Reply Mail Processing Centre 969 Eastern                                                       18
Central Bay Street                                                                                      30
Christie                                                                                                16
Church and Wellesley                                                                                    30
Commerce Court,Victoria Hotel                                                                           30
Davisville                                                                                              30
Davisville North                                                                                         9
Design Exchange,Toronto 

In [51]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 191 uniques categories.


In [52]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Thai Restaurant,Theater,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
toronto_onehot.shape

(830, 191)

In [54]:
### Get the mean for each unique venue by neighborhood
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Thai Restaurant,Theater,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop
0,"Adelaide,Richmond,King",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,...,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Business Reply Mail Processing Centre 969 Eastern,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.033333,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0
4,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Church and Wellesley,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0
6,"Commerce Court,Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.033333,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Design Exchange,Toronto Dominion Centre",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,...,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0


In [55]:

toronto_grouped.shape

(38, 191)

In [56]:

## Top 5 venue for each neighborhood
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide,Richmond,King----
                 venue  freq
0           Steakhouse  0.10
1                 Café  0.07
2  American Restaurant  0.07
3                Hotel  0.07
4     Asian Restaurant  0.07


----Berczy Park----
                venue  freq
0        Cocktail Bar  0.10
1  Seafood Restaurant  0.07
2                Café  0.07
3      Farmers Market  0.07
4              Bakery  0.07


----Business Reply Mail Processing Centre 969 Eastern----
                venue  freq
0  Light Rail Station  0.11
1         Yoga Studio  0.06
2       Auto Workshop  0.06
3          Comic Shop  0.06
4         Pizza Place  0.06


----Central Bay Street----
                venue  freq
0         Coffee Shop  0.23
1                Café  0.07
2                 Spa  0.07
3  Italian Restaurant  0.07
4                Park  0.03


----Christie----
           venue  freq
0  Grocery Store  0.19
1           Café  0.19
2           Park  0.12
3      Nightclub  0.06
4     Baby Store  0.06


----Church and Welles

In [57]:
## Get most common venues
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [58]:
num_top_venues = 7

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue
0,"Adelaide,Richmond,King",Steakhouse,Asian Restaurant,American Restaurant,Hotel,Café,Speakeasy,Smoke Shop
1,Berczy Park,Cocktail Bar,Bakery,Seafood Restaurant,Café,Farmers Market,Bistro,Italian Restaurant
2,Business Reply Mail Processing Centre 969 Eastern,Light Rail Station,Butcher,Skate Park,Smoke Shop,Restaurant,Spa,Brewery
3,Central Bay Street,Coffee Shop,Spa,Italian Restaurant,Café,Vegetarian / Vegan Restaurant,Steakhouse,Modern European Restaurant
4,Christie,Grocery Store,Café,Park,Athletics & Sports,Coffee Shop,Diner,Nightclub
5,Church and Wellesley,Gay Bar,Burger Joint,Gastropub,Ramen Restaurant,Ethiopian Restaurant,Bookstore,Salon / Barbershop
6,"Commerce Court,Victoria Hotel",Café,Coffee Shop,Hotel,Restaurant,Deli / Bodega,Gastropub,Seafood Restaurant
7,Davisville,Dessert Shop,Coffee Shop,Café,Pizza Place,Sandwich Place,Seafood Restaurant,Italian Restaurant
8,Davisville North,Clothing Store,Food & Drink Shop,Park,Sandwich Place,Breakfast Spot,Burger Joint,Hotel
9,"Design Exchange,Toronto Dominion Centre",Coffee Shop,Deli / Bodega,Café,Restaurant,Gastropub,Art Gallery,Pub


In [70]:

# Perform Clustering
kclusters = 4

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood',1)
#print(toronto_grouped_clustering)
#print(toronto_grouped)
# run k-means clustering
kmeans = KMeans(init = "k-means++", n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
labels = kmeans.labels_[0:37] 
print(labels)

[2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 0 2 2 2 2 0 1 2 2 2 2 2 2 3 2 2 2 2 2 2]


In [71]:
toronto_merged = Toronto_Codes
print(toronto_merged.shape)
toronto_merged.rename(columns={'Neighbourhood': 'Neighborhood'}, inplace=True)
toronto_merged.head(4)

(38, 5)


Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"Riverdale,The Danforth West",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923


In [72]:
labels = np.append(labels,labels[0])
print(labels.shape)



(38,)


In [73]:
# add clustering labels
toronto_merged['Cluster Labels'] = labels.tolist()

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,2,Coffee Shop,Other Great Outdoors,Pub,Cuban Restaurant,Eastern European Restaurant,Dumpling Restaurant,Dog Run
1,M4K,East Toronto,"Riverdale,The Danforth West",43.679557,-79.352188,2,Greek Restaurant,Italian Restaurant,Ice Cream Shop,Spa,Pub,Pizza Place,Juice Bar
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572,2,Sandwich Place,Park,Steakhouse,Movie Theater,Pet Store,Pizza Place,Burrito Place
3,M4M,East Toronto,Studio District,43.659526,-79.340923,2,Café,Coffee Shop,American Restaurant,Italian Restaurant,Bakery,Chinese Restaurant,Cheese Shop
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,2,Park,Dim Sum Restaurant,Bus Line,Swim School,Cuban Restaurant,Eastern European Restaurant,Dumpling Restaurant


In [74]:
# Visualize Clusters
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [75]:
## Explore First Cluster
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue
12,Downtown Toronto,0,Gay Bar,Burger Joint,Gastropub,Ramen Restaurant,Ethiopian Restaurant,Bookstore,Salon / Barbershop
17,Downtown Toronto,0,Coffee Shop,Spa,Italian Restaurant,Café,Vegetarian / Vegan Restaurant,Steakhouse,Modern European Restaurant
22,Central Toronto,0,Garden,Wine Shop,Dance Studio,Ethiopian Restaurant,Eastern European Restaurant,Dumpling Restaurant,Dog Run


In [76]:
## Explore Second Cluster
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]


Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue
23,Central Toronto,1,Park,Jewelry Store,Trail,Sushi Restaurant,Wine Shop,Dance Studio,Dumpling Restaurant


In [77]:
## Explore Third Cluster
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue
0,East Toronto,2,Coffee Shop,Other Great Outdoors,Pub,Cuban Restaurant,Eastern European Restaurant,Dumpling Restaurant,Dog Run
1,East Toronto,2,Greek Restaurant,Italian Restaurant,Ice Cream Shop,Spa,Pub,Pizza Place,Juice Bar
2,East Toronto,2,Sandwich Place,Park,Steakhouse,Movie Theater,Pet Store,Pizza Place,Burrito Place
3,East Toronto,2,Café,Coffee Shop,American Restaurant,Italian Restaurant,Bakery,Chinese Restaurant,Cheese Shop
4,Central Toronto,2,Park,Dim Sum Restaurant,Bus Line,Swim School,Cuban Restaurant,Eastern European Restaurant,Dumpling Restaurant
5,Central Toronto,2,Clothing Store,Food & Drink Shop,Park,Sandwich Place,Breakfast Spot,Burger Joint,Hotel
6,Central Toronto,2,Coffee Shop,Sporting Goods Shop,Yoga Studio,Dessert Shop,Mexican Restaurant,Fast Food Restaurant,Diner
7,Central Toronto,2,Dessert Shop,Coffee Shop,Café,Pizza Place,Sandwich Place,Seafood Restaurant,Italian Restaurant
8,Central Toronto,2,Playground,Wine Shop,Cuban Restaurant,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Discount Store
9,Central Toronto,2,Coffee Shop,Pub,Sushi Restaurant,Bagel Shop,Supermarket,Convenience Store,Pizza Place


In [78]:
## Explore Fourth Cluster
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue
30,Downtown Toronto,3,Grocery Store,Café,Park,Athletics & Sports,Coffee Shop,Diner,Nightclub


In [None]:
## End of Analysis