## Segmenting and Clustering - Neighborhoods in Toronto

#### __*Question 1*__

#### __Data Extraction__

Retrieve a listing of neighbourhoods in Ontario, Canada identified by postal codes

In [449]:
# Retrieve list of canadian neighbourhoods by postal code and borough

import pandas as pd

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
df_ca = pd.read_html(url, match="Postal Code") [0]
df_ca


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


#### __Data Preparation__ 

Remove rows with missing borough values and fill missing neighbourhood values with the associated borough value.

In [456]:
# Drop cells with a borough that is Not assigned.

df = df_ca[df_ca["Borough"] != "Not assigned"]
print("\nError count - (Borough == 'Not Assigned'): ", df[df["Borough"] == "Not Assigned"].shape[0], "\n")



Error count - (Borough == 'Not Assigned'):  0 



In [457]:
# Set neighbourhood to borough if neighbourhood is Not Assigned

#print(df.apply(lambda x:(x.Borough if x.Neighbourhood=="Not Assigned" else x.Neighbourhood), axis=1))

for i in range(df.shape[0]):
   if df.iloc[i]["Neighbourhood"] == 'Not Assigned':
      df.iloc[i]["Neighbourhood"] = df.iloc[i]["Borough"]
    
print("\nError count - (Neighbourhood == 'Not Assigned'): ", df[df["Neighbourhood"]=="Not Assigned"].shape[0], "\n")



Error count - (Neighbourhood == 'Not Assigned'):  0 



In [389]:
# Join multiple neighbourhoods by postalcode and borough

df = df.groupby(by=["Postal Code", "Borough"]).agg({'Neighbourhood': ', '.join}).reset_index()
df


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [390]:
# Print the number of rows in the dataframe

print("Dataframe: \t\t", "df")
print("Dataframe shape: \t", df.shape)
print("Number of rows: \t", df.shape[0])
print("Boroughs: \t\t", len(df["Borough"].unique()))
print("Neighbourhoods: \t", len(df["Neighbourhood"].unique()))

Dataframe: 		 df
Dataframe shape: 	 (103, 3)
Number of rows: 	 103
Boroughs: 		 10
Neighbourhoods: 	 99


#### __*Question 2*__

#### __Data Enhancement__

Extend the neighbourhood dataframe to include latitide and longitude coordinates for each neighbourhood

In [234]:
# Install geocoder if not installed

#!pip install geocoder  

In [235]:
# Perform test of geocoder lookup of latitude and longitude for a postal code

import geocoder

pc = "M1B"
borough = "Scarborough"

df_ll = (geocoder.google("{}, {}, Ontario".format(pc, borough))).latlng
print(df_ll) 

# the geocoder call is not returning any results
# switching to file retrieval for latitude and longitude

None


In [236]:
# Retrieve the latitude and logitude coordinates for each postal code in Ontario, Canada

df_ll = pd.read_csv("http://cocl.us/Geospatial_data")
df_ll

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [237]:
# Add the latitude and longitude cooordinates to the dataframe of postal codes using an outer left join

df = pd.merge(df, df_ll, on="Postal Code", how="left")
df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [347]:
# Print the number of rows in the dataframe

print("Dataframe: \t\t", "df")
print("Dataframe shape: \t", df.shape)
print("Number of rows: \t", df.shape[0])
print("Boroughs: \t\t", len(df["Borough"].unique()))
print("Neighbourhoods: \t", len(df["Neighbourhood"].unique()))

Dataframe: 		 df
Dataframe shape: 	 (103, 5)
Number of rows: 	 103
Boroughs: 		 10
Neighbourhoods: 	 99


#### __*Question 3*__

#### __Data Preparation__

In [239]:
# install geopy to retrieve latitude and longtitude values for a given address

#!conda install -c conda-forge geopy --yes  

In [240]:
from geopy.geocoders import Nominatim 

In [241]:
# retrieve the latitude and longitude for the city of toronto

address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
lat_to = location.latitude
long_to = location.longitude

print('Toronto - latitude, longitude: ({}, {})'.format(lat_to, long_to))

Toronto - latitude, longitude: (43.6534817, -79.3839347)


In [242]:
# filter the neighbourhood dataframe for boroughs that are located in the city of toronto

df_to = df[df["Borough"].str.contains("Toronto")].sort_values(["Borough", "Postal Code"]).reset_index(drop=True)
df_to


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
1,M4P,Central Toronto,Davisville North,43.712751,-79.390197
2,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
3,M4S,Central Toronto,Davisville,43.704324,-79.38879
4,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
5,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049
6,M5N,Central Toronto,Roselawn,43.711695,-79.416936
7,M5P,Central Toronto,"Forest Hill North & West, Forest Hill Road Park",43.696948,-79.411307
8,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.67271,-79.405678
9,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529


In [348]:
# Print the number of rows in the dataframe

print("Dataframe: \t\t", "df_to")
print("Dataframe shape: \t", df_to.shape)
print("Number of rows: \t", df_to.shape[0])
print("Boroughs: \t\t", len(df_to["Borough"].unique()))
print("Neighbourhoods: \t", len(df_to["Neighbourhood"].unique()))

Dataframe: 		 df_to
Dataframe shape: 	 (39, 5)
Number of rows: 	 39
Boroughs: 		 4
Neighbourhoods: 	 39


#### __Data Visualization__

#### _Display map of Toronto neighbourhoods using the Folium API_

In [244]:
# install folium for leaflet mapping if not installed

#!pip install folium    

In [245]:
import folium

In [246]:
# create map of Toronto using latitude and longitude values

map_to = folium.Map(location=[lat_to, long_to], titles="Toronto Neighbourhoods", zoom_start=12)

# add neighbourhood markers to map

for lat, long, borough, neighbourhood in zip(df_to['Latitude'], df_to['Longitude'], df_to['Borough'], df_to['Neighbourhood']):

   folium.Marker(
      [lat, long],
      popup = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True),
      color = 'blue',
      fill = True,
      fill_color = '#3186cc',
      fill_opacity = 0.7,
      parse_html = False).add_to(map_to)  
    
map_to

#### __Data Exploration__

Explore the Toronto neighbourhoods using the FourSquare API

In [247]:
import requests

In [1]:
# Define Foursquare credentials and version

CLIENT_ID = 'XXXX'       # commented out for distribution
CLIENT_SECRET = 'XXXX'   # commented out for distribution
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('FourSquare Credentails: \n')
print('CLIENT_ID: \t', CLIENT_ID)
print('CLIENT_SECRET: \t', CLIENT_SECRET)


FourSquare Credentails: 

CLIENT_ID: 	 XXXX
CLIENT_SECRET: 	 XXXX


Retrieve listing of nearby venuee for each neighbourhood

In [249]:
# function to retrieve nearby venues for each of the Toronto neighburhoods

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
            'Neighbourhood Latitude', 
            'Neighbourhood Longitude', 
            'Venue', 
            'Venue Latitude', 
            'Venue Longitude', 
            'Venue Category']
    
    return(nearby_venues)

In [250]:
venues_to = getNearbyVenues(names=df_to['Neighbourhood'], latitudes=df_to['Latitude'], longitudes=df_to['Longitude'])
venues_to

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Lawrence Park,43.728020,-79.388790,Lawrence Park Ravine,43.726963,-79.394382,Park
1,Lawrence Park,43.728020,-79.388790,HYC Design Inc.,43.726793,-79.391681,Business Service
2,Lawrence Park,43.728020,-79.388790,Zodiac Swim School,43.728532,-79.382860,Swim School
3,Lawrence Park,43.728020,-79.388790,TTC Bus #162 - Lawrence-Donway,43.728026,-79.382805,Bus Line
4,Davisville North,43.712751,-79.390197,Summerhill Market North,43.715499,-79.392881,Food & Drink Shop
...,...,...,...,...,...,...,...
1603,"Runnymede, Swansea",43.651571,-79.484450,West End Mamas,43.648703,-79.484919,Health Food Store
1604,"Runnymede, Swansea",43.651571,-79.484450,(The New) Moksha Yoga Bloor West,43.648658,-79.485242,Yoga Studio
1605,"Runnymede, Swansea",43.651571,-79.484450,The Coffee Bouquets,43.648785,-79.485940,Coffee Shop
1606,"Runnymede, Swansea",43.651571,-79.484450,BloorWest Apple Specialist,43.650132,-79.480806,Electronics Store


In [349]:
# Print the number of rows in the dataframe

print("Dataframe: \t\t", "venues_to")
print("Dataframe shape: \t", venues_to.shape)
print("Number of rows: \t", venues_to.shape[0])
print("Neighbourhoods: \t", len(venues_to["Neighbourhood"].unique()))
print("Venue Categories: \t", len(venues_to["Venue Category"].unique()))
print("Venues: \t\t", len(venues_to["Venue"].unique()))


Dataframe: 		 venues_to
Dataframe shape: 	 (1608, 7)
Number of rows: 	 1608
Neighbourhoods: 	 39
Venue Categories: 	 235
Venues: 		 1037


In [252]:
# Print count of venues grouped by neighbourhood

venues_to.groupby(["Neighbourhood"]).size().reset_index(name="Venues")


Unnamed: 0,Neighbourhood,Venues
0,Berczy Park,56
1,"Brockton, Parkdale Village, Exhibition Place",22
2,"Business reply mail Processing Centre, South C...",16
3,"CN Tower, King and Spadina, Railway Lands, Har...",16
4,Central Bay Street,62
5,Christie,16
6,Church and Wellesley,79
7,"Commerce Court, Victoria Hotel",100
8,Davisville,36
9,Davisville North,9


In [253]:
# Encode the venue categories as numeric column attributes

oneshot_to = pd.get_dummies(venues_to[['Venue Category']], prefix="", prefix_sep="")
oneshot_to["Neighbourhood"] = venues_to["Neighbourhood"] 

cols = [oneshot_to.columns[-1]] + list(oneshot_to.columns[:-1])
oneshot_to = oneshot_to[cols]
 
oneshot_to

Unnamed: 0,Neighbourhood,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Davisville North,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1603,"Runnymede, Swansea",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1604,"Runnymede, Swansea",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1605,"Runnymede, Swansea",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1606,"Runnymede, Swansea",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [342]:
# Print the number of rows in the encoded dataframe

print("Dataframe: \t\t", "oneshot_to")
print("Dataframe shape: \t", oneshot_to.shape)
print("Number of rows: \t", oneshot_to.shape[0])
print("Neighbourhoods: \t", len(oneshot_to["Neighbourhood"].unique()))
print("Venue Categories: \t",oneshot_to.shape[1]-1)
print("Venues: \t\t", (oneshot_to.sum(axis=1,numeric_only=True)).sum(axis=0))


Dataframe: 		 oneshot_to
Dataframe shape: 	 (1608, 236)
Number of rows: 	 1608
Neighbourhoods: 	 39
Venue Categories: 	 235
Venues: 		 1608


In [255]:
# Compute the average frequency of venues in each category for each of the neighbourhoods

grouped_to = oneshot_to.groupby('Neighbourhood').mean().reset_index()
grouped_to

Unnamed: 0,Neighbourhood,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.017857,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0625,0.0625,0.0625,0.125,0.125,0.125,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.016129,0.0,0.0,0.016129,0.016129
5,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.012658,0.0,0.0,0.0,0.0,0.0,0.0,0.012658,0.0,...,0.012658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025316
7,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,...,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0
8,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.027778,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [344]:
 # Print the number of rows in the dataframe

print("Dataframe: \t\t", "grouped_to")
print("Dataframe shape: \t", grouped_to.shape)
print("Number of rows: \t", grouped_to.shape[0])
print("Neighbourhoods: \t", len(grouped_to["Neighbourhood"].unique()))
print("Venue Categories: \t",grouped_to.shape[1]-1)


Dataframe: 		 grouped_to
Dataframe shape: 	 (39, 236)
Number of rows: 	 39
Neighbourhoods: 	 39
Venue Categories: 	 235


In [318]:
# Print count of venues summarized by neighbourhood and venue category to show the most frequent venue category 
 
count_to = venues_to.groupby(["Neighbourhood", "Venue Category"]).size().reset_index(name="Count") 
count_to = count_to.sort_values(by=["Neighbourhood", "Count"], ascending=[True, False], axis=0).reset_index(drop=True)

# Filter the first neigbourhood as too mank rows to show all neighbours

count_to[count_to["Neighbourhood"]=="Berczy Park"] # filer the first neigbourhood as too mank rows to show for all neighbours

Unnamed: 0,Neighbourhood,Venue Category,Count
0,Berczy Park,Coffee Shop,5
1,Berczy Park,Cocktail Bar,3
2,Berczy Park,Bakery,2
3,Berczy Park,Beer Bar,2
4,Berczy Park,Cheese Shop,2
5,Berczy Park,Farmers Market,2
6,Berczy Park,Restaurant,2
7,Berczy Park,Seafood Restaurant,2
8,Berczy Park,Art Gallery,1
9,Berczy Park,Bagel Shop,1


In [351]:
# Print the top venue categories by neighbourhood sorted by mean frequency 

pd.options.display.float_format = '{:.2f}'.format # display 2 decimals for floats

n_cat = 5  # maximum number of venue categories
for h in grouped_to['Neighbourhood']:
    print("\n---- "+h+" ----")
    df_t = grouped_to[grouped_to['Neighbourhood']==h].T[1:].reset_index().set_axis(['Venue','Freq'],axis=1)
    df_t = df_t.sort_values(by=["Freq","Venue"],ascending=[False,True]).reset_index(drop=True).head(n_cat)
    print (df_t)



---- Berczy Park ----
          Venue Freq
0   Coffee Shop 0.09
1  Cocktail Bar 0.05
2        Bakery 0.04
3      Beer Bar 0.04
4   Cheese Shop 0.04

---- Brockton, Parkdale Village, Exhibition Place ----
            Venue Freq
0            Café 0.14
1  Breakfast Spot 0.09
2     Coffee Shop 0.09
3          Bakery 0.05
4             Bar 0.05

---- Business reply mail Processing Centre, South Central Letter Processing Plant Toronto ----
                Venue Freq
0  Light Rail Station 0.12
1       Auto Workshop 0.06
2             Brewery 0.06
3       Burrito Place 0.06
4             Butcher 0.06

---- CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport ----
                Venue Freq
0      Airport Lounge 0.12
1     Airport Service 0.12
2    Airport Terminal 0.12
3             Airport 0.06
4  Airport Food Court 0.06

---- Central Bay Street ----
                Venue Freq
0         Coffee Shop 0.18
1                Café 0.06
2  Itali

In [None]:
# Display the top venue categories for each neighbourhood in toronto

In [420]:
# Display dataframe for the top venue categories for neighbourhoods using the grouped mean frequency of venue categories

n_cat = 10  # number of categories in the ranking
cols = ["Neighbourhood"] # list of column attributes for the top venue dataframe

for i in range(1,n_cat+1):
    cols.append("Top {}".format(i))
    
top_to = pd.DataFrame(columns=cols)
top_to["Neighbourhood"] = grouped_to["Neighbourhood"]

# Retrieve the top venue categories and update dataframe for each of the neighbourhoods

for n in range(len(top_to)):
    top_to.iloc[n, 1:] = grouped_to.iloc[n, 1:].sort_values(ascending=False).index.values[0:n_cat]

top_to

Unnamed: 0,Neighbourhood,Top 1,Top 2,Top 3,Top 4,Top 5,Top 6,Top 7,Top 8,Top 9,Top 10
0,Berczy Park,Coffee Shop,Cocktail Bar,Farmers Market,Bakery,Restaurant,Cheese Shop,Beer Bar,Seafood Restaurant,Clothing Store,Lounge
1,"Brockton, Parkdale Village, Exhibition Place",Café,Breakfast Spot,Coffee Shop,Burrito Place,Intersection,Bar,Italian Restaurant,Bakery,Restaurant,Climbing Gym
2,"Business reply mail Processing Centre, South C...",Light Rail Station,Auto Workshop,Park,Comic Shop,Pizza Place,Butcher,Restaurant,Burrito Place,Brewery,Skate Park
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Airport Terminal,Coffee Shop,Harbor / Marina,Plane,Rental Car Location,Sculpture Garden,Bar,Boat or Ferry
4,Central Bay Street,Coffee Shop,Café,Sandwich Place,Italian Restaurant,Salad Place,Bubble Tea Shop,Burger Joint,Office,Juice Bar,Korean Restaurant
5,Christie,Grocery Store,Café,Park,Athletics & Sports,Italian Restaurant,Restaurant,Baby Store,Candy Store,Nightclub,Coffee Shop
6,Church and Wellesley,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Fast Food Restaurant,Gay Bar,Restaurant,Pub,Men's Store,Mediterranean Restaurant,Smoke Shop
7,"Commerce Court, Victoria Hotel",Coffee Shop,Restaurant,Hotel,Café,Gym,American Restaurant,Seafood Restaurant,Italian Restaurant,Japanese Restaurant,Deli / Bodega
8,Davisville,Pizza Place,Dessert Shop,Sandwich Place,Coffee Shop,Café,Gym,Italian Restaurant,Thai Restaurant,Sushi Restaurant,Gas Station
9,Davisville North,Gym / Fitness Center,Hotel,Park,Breakfast Spot,Sandwich Place,Food & Drink Shop,Department Store,Dance Studio,Pizza Place,Concert Hall


In [421]:
# Print the number of rows in the dataframe

print("Dataframe: \t\t", "top_to")
print("Dataframe shape: \t", top_to.shape)
print("Number of rows: \t", top_to.shape[0])
print("Neighbourhoods: \t", len(top_to["Neighbourhood"].unique()))
 

Dataframe: 		 top_to
Dataframe shape: 	 (39, 11)
Number of rows: 	 39
Neighbourhoods: 	 39


#### __Modeling__

Cluster the neighbourhoods in toronto based on the venue categories using the KMeans clustering model 

In [None]:
from sklearn.cluster import KMeans

In [410]:
# number of clusters

n_groups = 5
cluster_to = grouped_to.drop(["Neighbourhood"], axis=1)

# define and fit KMeans model
model = KMeans(n_clusters=n_groups, random_state=0).fit(cluster_to)

# print cluster labels generated for each row in the dataframe
model.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 2, 1,
       1, 1, 1, 1, 0, 3, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1])

#### __Data Preparation__

In [424]:
# combine dataframe with location coordinates with dataframe containing venue ranking by neighbourhood

df_all = df_to.join(top_to.set_index("Neighbourhood"), on="Neighbourhood")

# add cluster labels from model execution

df_all.insert(3, 'Cluster Labels', model.labels_)
df_all


Unnamed: 0,Postal Code,Borough,Neighbourhood,Cluster Labels,Latitude,Longitude,Top 1,Top 2,Top 3,Top 4,Top 5,Top 6,Top 7,Top 8,Top 9,Top 10
0,M4N,Central Toronto,Lawrence Park,1,43.73,-79.39,Park,Business Service,Swim School,Bus Line,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant
1,M4P,Central Toronto,Davisville North,1,43.71,-79.39,Gym / Fitness Center,Hotel,Park,Breakfast Spot,Sandwich Place,Food & Drink Shop,Department Store,Dance Studio,Pizza Place,Concert Hall
2,M4R,Central Toronto,"North Toronto West, Lawrence Park",1,43.72,-79.41,Clothing Store,Coffee Shop,Yoga Studio,Sporting Goods Shop,Café,Chinese Restaurant,Diner,Fast Food Restaurant,Gift Shop,Mexican Restaurant
3,M4S,Central Toronto,Davisville,1,43.7,-79.39,Pizza Place,Dessert Shop,Sandwich Place,Coffee Shop,Café,Gym,Italian Restaurant,Thai Restaurant,Sushi Restaurant,Gas Station
4,M4T,Central Toronto,"Moore Park, Summerhill East",1,43.69,-79.38,Lawyer,Playground,Tennis Court,Deli / Bodega,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant
5,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",1,43.69,-79.4,Coffee Shop,Pub,Vietnamese Restaurant,Light Rail Station,Supermarket,Liquor Store,Sushi Restaurant,American Restaurant,Pizza Place,Bagel Shop
6,M5N,Central Toronto,Roselawn,1,43.71,-79.42,Music Venue,Home Service,Garden,Dance Studio,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run
7,M5P,Central Toronto,"Forest Hill North & West, Forest Hill Road Park",1,43.7,-79.41,Jewelry Store,Trail,Mexican Restaurant,Sushi Restaurant,Yoga Studio,Department Store,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop
8,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",1,43.67,-79.41,Sandwich Place,Café,Coffee Shop,Park,History Museum,Liquor Store,Burger Joint,Middle Eastern Restaurant,Indian Restaurant,Pub
9,M4W,Downtown Toronto,Rosedale,1,43.68,-79.38,Park,Playground,Trail,Deli / Bodega,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant


In [431]:
# Print the number of rows in the dataframe

print("Dataframe: \t\t", "df_all")
print("Dataframe shape: \t", df_all.shape)
print("Number of rows: \t", df_all.shape[0])
print("Neighbourhoods: \t", len(df_all["Neighbourhood"].unique()))

Dataframe: 		 df_all
Dataframe shape: 	 (39, 16)
Number of rows: 	 39
Neighbourhoods: 	 39


#### __Data Visualization__

In [None]:
Display a map of toronto showing the neighbourhoods colored by the modeled cluster 

In [480]:
# import matplotlib and associated plotting modules

import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np

# create map of neighbourhoods colored by clusters using latitude and longitude values

map_to = folium.Map(location=[lat_to, long_to], titles="Toronto Neighbourhoods", zoom_start=12)

# set color scheme for the clusters

#x = np.arange(n_groups)
#ys = [i + x + (i*x)**2 for i in range(n_groups)]
#colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
#rainbow = [colors.rgb2hex(i) for i in colors_array]


# setup a color scheme for the clusters

colors = ['green', 'purple', 'orange', 'red','blue', 'black', 'beige']

# add neighbourhood markers to map
 
for lat, long, nh, cluster in zip(df_all['Latitude'], df_all['Longitude'], df_to['Neighbourhood'], df_all["Cluster Labels"]):

   folium.CircleMarker(
      [lat, long],
      popup = folium.Popup('Cluster ' + str(cluster+1) + "\n" + nh, parse_html=True),
      color = colors[cluster],
      fill = True,
      fill_color = colors[cluster],
      fill_opacity = 0.8,
      parse_html = False).add_to(map_to)  

map_to

#### __Observations__

The majority of the defined neighbourhoods in the city of toronto (34 out of 39) are grouped into a single cluster based on the distribution of venues located in the neighbourhood. 

The main city core in Toronto along Yonge street is comprised of a mix of businesses, shopping, coffee shops, restaurants, entertainment and related venues.There are other areas in the east end of the city along Queen street such as the Beaches that contain an urban mix of retail and business venues. Shopping centers such as Dufferin Mall and high traffic areas like High Park are also included in the main cluster 2.