# Coursera Capstone Project - Segmenting and Clustering part 3

In [1]:
# !pip install geopy
# !pip install folium

In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import matplotlib.cm as cm 
import matplotlib.colors as colors
from geopy.geocoders import Nominatim
import folium
import requests
from sklearn.cluster import KMeans

In [3]:
df = pd.read_csv("data/geocoded_df.csv")
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


## Basic map

Let's find the coordinates of Toronto, CA to center our map. 

In [4]:
address = "Toronto, CA"
geoloc = Nominatim(user_agent="toronto_explorer")
loc = geoloc.geocode(address)
lat = loc.latitude
long = loc.longitude
print(lat, long)

43.653963 -79.387207


We can now plot a map of Toronto annotated with its different neighborhoods. 

In [5]:
toronto_map = folium.Map(location=[lat, long], zoom_start=11)
for lat, lng, borough, neighborhood, postcode in zip(df["Latitude"], df["Longitude"], 
                                                     df["Borough"], df["Neighborhood"], df["PostalCode"]):
    label = "{}, {} ({})".format(neighborhood, borough, postcode)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color="#ff7f00",
        fill=True,
        fill_color='#fdbf6f',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)

toronto_map

## Exploring venues

We can use data from Foursquare to cluster neighborhoods in Toronto based on the type of venues that are most common in each neighborhood. 

In [6]:
CLIENT_ID = "XXXXXXXX"
CLIENT_SECRET = "XXXXXXXX" 
VERSION = "20180605" 
LIMIT = 20
RADIUS = 500

Let's find the top 20 venues in each neighborhood within a radius of 500 meters. 

In [7]:
def getNearbyVenues(names, latitudes, longitudes):
    
    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        url = "https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
            CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, RADIUS, LIMIT)
        results = requests.get(url).json()["response"]["groups"][0]["items"]
        
        venues_list.append([(name, lat, lng, 
            v["venue"]["name"], 
            v["venue"]["location"]["lat"], 
            v["venue"]["location"]["lng"],  
            v["venue"]["categories"][0]["name"]) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ["Neighborhood", "Neighborhood Latitude", "Neighborhood Longitude", 
                             "Venue", "Venue Latitude", "Venue Longitude", "Venue Category"]
    
    return(nearby_venues)

In [8]:
toronto_venues = getNearbyVenues(df["Neighborhood"], df["Latitude"], df["Longitude"])

In [9]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,KFC,43.754387,-79.333021,Fast Food Restaurant
2,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop


Now we can create a one-hot encoded dataframe, where we store the presence or absence of any Venue Category in each Toronto neighborhood. 

In [10]:
toronto_onehot = pd.get_dummies(toronto_venues[["Venue Category"]], prefix="", prefix_sep="")
# there actually is a category named Neighborhood, so we'll use "Name" from now on to refer to the actual neighborhood name
toronto_onehot["Name"] = toronto_venues["Neighborhood"] 
toronto_onehot = toronto_onehot[[toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])]

toronto_onehot.head()

Unnamed: 0,Name,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Let's group venues together per neighborhood, and find the mean frequency of occurrence of each venue category. 

In [11]:
toronto_grps = toronto_onehot.groupby("Name").mean().reset_index()
toronto_grps.head()

Unnamed: 0,Name,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,...,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0
4,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


It is time to find the 10 most common venue category for each neighborhood. 

In [12]:
def most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [13]:
num_top_venues = 20

columns = ["Name"]
for ind in np.arange(num_top_venues):
    columns.append("Most Common Venue {}".format(ind + 1))

neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted["Name"] = toronto_grps["Name"]

for ind in np.arange(toronto_grps.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = most_common_venues(toronto_grps.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Name,Most Common Venue 1,Most Common Venue 2,Most Common Venue 3,Most Common Venue 4,Most Common Venue 5,Most Common Venue 6,Most Common Venue 7,Most Common Venue 8,Most Common Venue 9,...,Most Common Venue 11,Most Common Venue 12,Most Common Venue 13,Most Common Venue 14,Most Common Venue 15,Most Common Venue 16,Most Common Venue 17,Most Common Venue 18,Most Common Venue 19,Most Common Venue 20
0,"Adelaide, King, Richmond",Asian Restaurant,Steakhouse,Gym / Fitness Center,Neighborhood,Pizza Place,Concert Hall,Plaza,Café,Seafood Restaurant,...,Speakeasy,Bar,Hotel,Food Court,Coffee Shop,Vegetarian / Vegan Restaurant,Greek Restaurant,American Restaurant,Food Truck,Furniture / Home Store
1,Agincourt,Lounge,Chinese Restaurant,Breakfast Spot,Sandwich Place,Dance Studio,Eastern European Restaurant,Drugstore,Dog Run,Discount Store,...,Dim Sum Restaurant,Dessert Shop,Department Store,Deli / Bodega,Curling Ice,Empanada Restaurant,Cuban Restaurant,Creperie,Cosmetics Shop,Convenience Store
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Playground,Park,Asian Restaurant,Yoga Studio,Curling Ice,Eastern European Restaurant,Drugstore,Dog Run,Discount Store,...,Dim Sum Restaurant,Dessert Shop,Department Store,Deli / Bodega,Dance Studio,Creperie,Cuban Restaurant,Empanada Restaurant,Cosmetics Shop,Convenience Store
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Beer Store,Fried Chicken Joint,Fast Food Restaurant,Liquor Store,Coffee Shop,Pizza Place,Sandwich Place,Pharmacy,...,American Restaurant,Deli / Bodega,Electronics Store,Airport Gate,Eastern European Restaurant,Drugstore,Dog Run,Discount Store,Diner,Dim Sum Restaurant
4,"Alderwood, Long Branch",Pizza Place,Gym,Coffee Shop,Pharmacy,Skating Rink,Dance Studio,Sandwich Place,Pool,Pub,...,Deli / Bodega,Curling Ice,Dessert Shop,Department Store,Creperie,Dim Sum Restaurant,Diner,Cuban Restaurant,Construction & Landscaping,Cosmetics Shop


## Clustering

Now we can finally cluster Toronto neighborhoods based on the most common venue categories.  
We will use k-Means clustering, and we can arbitrarily set the number of clusters to be equal to 7. 

In [14]:
kclusters = 9
toronto_grp_feats = toronto_grps.drop("Name", axis=1)

model = KMeans(n_clusters=kclusters)
model.fit(toronto_grp_feats)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=9, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

Our model is trained, so we can build a dataframe with latitude, longitude, cluster label and most common venue categories for each neighborhood in Toronto. We will use it to create the final map. 

In [15]:
toronto_final = pd.merge(neighborhoods_venues_sorted, df.rename({"Neighborhood": "Name"}, axis=1), 
                         on="Name", how="left")
toronto_final["Cluster"] = model.labels_

toronto_final.head() 

Unnamed: 0,Name,Most Common Venue 1,Most Common Venue 2,Most Common Venue 3,Most Common Venue 4,Most Common Venue 5,Most Common Venue 6,Most Common Venue 7,Most Common Venue 8,Most Common Venue 9,...,Most Common Venue 16,Most Common Venue 17,Most Common Venue 18,Most Common Venue 19,Most Common Venue 20,PostalCode,Borough,Latitude,Longitude,Cluster
0,"Adelaide, King, Richmond",Asian Restaurant,Steakhouse,Gym / Fitness Center,Neighborhood,Pizza Place,Concert Hall,Plaza,Café,Seafood Restaurant,...,Vegetarian / Vegan Restaurant,Greek Restaurant,American Restaurant,Food Truck,Furniture / Home Store,M5H,Downtown Toronto,43.650571,-79.384568,7
1,Agincourt,Lounge,Chinese Restaurant,Breakfast Spot,Sandwich Place,Dance Studio,Eastern European Restaurant,Drugstore,Dog Run,Discount Store,...,Empanada Restaurant,Cuban Restaurant,Creperie,Cosmetics Shop,Convenience Store,M1S,Scarborough,43.7942,-79.262029,7
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Playground,Park,Asian Restaurant,Yoga Studio,Curling Ice,Eastern European Restaurant,Drugstore,Dog Run,Discount Store,...,Creperie,Cuban Restaurant,Empanada Restaurant,Cosmetics Shop,Convenience Store,M1V,Scarborough,43.815252,-79.284577,1
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Beer Store,Fried Chicken Joint,Fast Food Restaurant,Liquor Store,Coffee Shop,Pizza Place,Sandwich Place,Pharmacy,...,Drugstore,Dog Run,Discount Store,Diner,Dim Sum Restaurant,M9V,Etobicoke,43.739416,-79.588437,7
4,"Alderwood, Long Branch",Pizza Place,Gym,Coffee Shop,Pharmacy,Skating Rink,Dance Studio,Sandwich Place,Pool,Pub,...,Dim Sum Restaurant,Diner,Cuban Restaurant,Construction & Landscaping,Cosmetics Shop,M8W,Etobicoke,43.602414,-79.543484,7


We can finally create our final map with clusters. 

In [16]:
map_clusters = folium.Map(location=[lat, long], zoom_start=11)

x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, poi, cluster in zip(toronto_final["Latitude"], toronto_final["Longitude"], 
                                  toronto_final["Name"], toronto_final["Cluster"]):
    label = folium.Popup(str(poi) + " Cluster " + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster - 1],
        fill=True,
        fill_color=rainbow[cluster - 1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

We can further inspect each cluster to determine which venues are most common in it. 

In [17]:
from IPython.display import display
for i in range(kclusters): 
    print("Cluster {}".format(i))
    display(toronto_final[toronto_final["Cluster"] == i]
     .describe(include="all")[2:3]
     .drop(["Name", "PostalCode", "Borough", "Latitude", "Longitude", "Cluster"], axis=1)
    )

Cluster 0


Unnamed: 0,Most Common Venue 1,Most Common Venue 2,Most Common Venue 3,Most Common Venue 4,Most Common Venue 5,Most Common Venue 6,Most Common Venue 7,Most Common Venue 8,Most Common Venue 9,Most Common Venue 10,Most Common Venue 11,Most Common Venue 12,Most Common Venue 13,Most Common Venue 14,Most Common Venue 15,Most Common Venue 16,Most Common Venue 17,Most Common Venue 18,Most Common Venue 19,Most Common Venue 20
top,Tennis Court,Yoga Studio,Cuban Restaurant,Electronics Store,Drugstore,Drugstore,Dog Run,Diner,Diner,Dessert Shop,Dessert Shop,Department Store,Dance Studio,Dance Studio,Cuban Restaurant,Clothing Store,Creperie,Cosmetics Shop,Convenience Store,Construction & Landscaping


Cluster 1


Unnamed: 0,Most Common Venue 1,Most Common Venue 2,Most Common Venue 3,Most Common Venue 4,Most Common Venue 5,Most Common Venue 6,Most Common Venue 7,Most Common Venue 8,Most Common Venue 9,Most Common Venue 10,Most Common Venue 11,Most Common Venue 12,Most Common Venue 13,Most Common Venue 14,Most Common Venue 15,Most Common Venue 16,Most Common Venue 17,Most Common Venue 18,Most Common Venue 19,Most Common Venue 20
top,Park,Park,Fast Food Restaurant,Yoga Studio,Yoga Studio,Eastern European Restaurant,Drugstore,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop,Department Store,Deli / Bodega,Dance Studio,Creperie,Cuban Restaurant,Creperie,Cosmetics Shop,Convenience Store


Cluster 2


Unnamed: 0,Most Common Venue 1,Most Common Venue 2,Most Common Venue 3,Most Common Venue 4,Most Common Venue 5,Most Common Venue 6,Most Common Venue 7,Most Common Venue 8,Most Common Venue 9,Most Common Venue 10,Most Common Venue 11,Most Common Venue 12,Most Common Venue 13,Most Common Venue 14,Most Common Venue 15,Most Common Venue 16,Most Common Venue 17,Most Common Venue 18,Most Common Venue 19,Most Common Venue 20
top,Cafeteria,Yoga Studio,Curling Ice,Electronics Store,Eastern European Restaurant,Drugstore,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop,Department Store,Deli / Bodega,Dance Studio,Cuban Restaurant,Event Space,Creperie,Cosmetics Shop,Convenience Store,Construction & Landscaping


Cluster 3


Unnamed: 0,Most Common Venue 1,Most Common Venue 2,Most Common Venue 3,Most Common Venue 4,Most Common Venue 5,Most Common Venue 6,Most Common Venue 7,Most Common Venue 8,Most Common Venue 9,Most Common Venue 10,Most Common Venue 11,Most Common Venue 12,Most Common Venue 13,Most Common Venue 14,Most Common Venue 15,Most Common Venue 16,Most Common Venue 17,Most Common Venue 18,Most Common Venue 19,Most Common Venue 20
top,Bank,Yoga Studio,Falafel Restaurant,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Drugstore,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop,Department Store,Deli / Bodega,Dance Studio,Curling Ice,Cuban Restaurant,Creperie,Cosmetics Shop,Convenience Store


Cluster 4


Unnamed: 0,Most Common Venue 1,Most Common Venue 2,Most Common Venue 3,Most Common Venue 4,Most Common Venue 5,Most Common Venue 6,Most Common Venue 7,Most Common Venue 8,Most Common Venue 9,Most Common Venue 10,Most Common Venue 11,Most Common Venue 12,Most Common Venue 13,Most Common Venue 14,Most Common Venue 15,Most Common Venue 16,Most Common Venue 17,Most Common Venue 18,Most Common Venue 19,Most Common Venue 20
top,Coffee Shop,Park,Convenience Store,Arts & Crafts Store,Supermarket,Electronics Store,Eastern European Restaurant,Drugstore,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop,Department Store,Dessert Shop,Cuban Restaurant,Curling Ice,Dance Studio,Creperie,Cosmetics Shop


Cluster 5


Unnamed: 0,Most Common Venue 1,Most Common Venue 2,Most Common Venue 3,Most Common Venue 4,Most Common Venue 5,Most Common Venue 6,Most Common Venue 7,Most Common Venue 8,Most Common Venue 9,Most Common Venue 10,Most Common Venue 11,Most Common Venue 12,Most Common Venue 13,Most Common Venue 14,Most Common Venue 15,Most Common Venue 16,Most Common Venue 17,Most Common Venue 18,Most Common Venue 19,Most Common Venue 20
top,Baseball Field,Yoga Studio,Falafel Restaurant,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Drugstore,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop,Department Store,Deli / Bodega,Dance Studio,Curling Ice,Cuban Restaurant,Creperie,Cosmetics Shop,Convenience Store


Cluster 6


Unnamed: 0,Most Common Venue 1,Most Common Venue 2,Most Common Venue 3,Most Common Venue 4,Most Common Venue 5,Most Common Venue 6,Most Common Venue 7,Most Common Venue 8,Most Common Venue 9,Most Common Venue 10,Most Common Venue 11,Most Common Venue 12,Most Common Venue 13,Most Common Venue 14,Most Common Venue 15,Most Common Venue 16,Most Common Venue 17,Most Common Venue 18,Most Common Venue 19,Most Common Venue 20
top,Park,Yoga Studio,Curling Ice,Electronics Store,Eastern European Restaurant,Drugstore,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop,Department Store,Deli / Bodega,Dance Studio,Cuban Restaurant,Event Space,Creperie,Cosmetics Shop,Convenience Store,Construction & Landscaping


Cluster 7


Unnamed: 0,Most Common Venue 1,Most Common Venue 2,Most Common Venue 3,Most Common Venue 4,Most Common Venue 5,Most Common Venue 6,Most Common Venue 7,Most Common Venue 8,Most Common Venue 9,Most Common Venue 10,Most Common Venue 11,Most Common Venue 12,Most Common Venue 13,Most Common Venue 14,Most Common Venue 15,Most Common Venue 16,Most Common Venue 17,Most Common Venue 18,Most Common Venue 19,Most Common Venue 20
top,Café,Coffee Shop,Breakfast Spot,Sandwich Place,Yoga Studio,Eastern European Restaurant,Drugstore,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop,Department Store,Deli / Bodega,Curling Ice,Curling Ice,Diner,Creperie,Discount Store,Diner


Cluster 8


Unnamed: 0,Most Common Venue 1,Most Common Venue 2,Most Common Venue 3,Most Common Venue 4,Most Common Venue 5,Most Common Venue 6,Most Common Venue 7,Most Common Venue 8,Most Common Venue 9,Most Common Venue 10,Most Common Venue 11,Most Common Venue 12,Most Common Venue 13,Most Common Venue 14,Most Common Venue 15,Most Common Venue 16,Most Common Venue 17,Most Common Venue 18,Most Common Venue 19,Most Common Venue 20
top,Bar,Construction & Landscaping,Yoga Studio,Deli / Bodega,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Drugstore,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop,Department Store,Dance Studio,Falafel Restaurant,Curling Ice,Cuban Restaurant,Creperie,Cosmetics Shop
