### Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

We first import all of our dependent modules.

In [67]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from pprint import pprint
import pandas as pd
import numpy as np
import folium
import requests
import os
from sklearn.cluster import KMeans

We load the Toronto dataset.

In [46]:
# Read the Toronto postal-code table, then report its size and preview it.
toronto_csv = os.path.join('..', 'data', 'week3_b.csv')
df = pd.read_csv(toronto_csv)
print(df.shape)
df.head()

(103, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


We define a function for loading client id, secret, and Foursquare version from file.

In [47]:
def get_credentials():
    """Load the Foursquare client id, client secret and API version from disk.

    Each value is read from its own text file under ``../data``
    (``client_id.txt``, ``client_secret.txt`` and ``version.txt``).
    Surrounding whitespace, such as a trailing newline left by an editor,
    is stripped so the values can be placed in a request unchanged.

    Returns:
        tuple: ``(client_id, client_secret, version)`` as strings.

    Raises:
        OSError: If any of the three files cannot be opened.
    """
    data_dir = os.path.join('..', 'data')

    def read_value(filename):
        # Read one credential file and drop leading/trailing whitespace.
        with open(os.path.join(data_dir, filename)) as f:
            return f.read().strip()

    return (
        read_value('client_id.txt'),
        read_value('client_secret.txt'),
        read_value('version.txt'),
    )

We define a function to pull venues from Foursquare, based on latitude and longitude.

In [48]:
def venue_explore(id, secret, version, lat, lon, radius=500, limit=100):
    """Query the Foursquare ``venues/explore`` endpoint around a coordinate.

    Args:
        id: Foursquare client id (sent as ``client_id``).
        secret: Foursquare client secret (sent as ``client_secret``).
        version: API version string (sent as ``v``).
        lat: Latitude of the search centre.
        lon: Longitude of the search centre.
        radius: Value sent as the ``radius`` query parameter.
        limit: Value sent as the ``limit`` query parameter.

    Returns:
        list: The ``items`` of the first group in the JSON response.

    Raises:
        requests.HTTPError: If the API answers with a non-success status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    url = 'https://api.foursquare.com/v2/venues/explore'

    # Let requests build and escape the query string instead of formatting
    # raw values into the URL by hand.
    params = {
        'client_id': id,
        'client_secret': secret,
        'v': version,
        'll': '{},{}'.format(lat, lon),
        'radius': radius,
        'limit': limit,
    }

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, params=params, timeout=30)
    # Fail with the HTTP status rather than a KeyError on the payload below.
    response.raise_for_status()

    return response.json()["response"]['groups'][0]['items']

We define a function that iterates through our Toronto data frame and finds the venues near each row's latitude and longitude, labelling every venue with that row's borough.

In [49]:
def nearby_venues(id_, secret_, v_, locations=None):
    """Collect Foursquare venues around every row of a locations data frame.

    Args:
        id_: Foursquare client id, forwarded to ``venue_explore``.
        secret_: Foursquare client secret, forwarded to ``venue_explore``.
        v_: Foursquare API version, forwarded to ``venue_explore``.
        locations: Data frame with ``Borough``, ``Latitude`` and ``Longitude``
            columns. Defaults to the notebook-level ``df`` so existing
            three-argument calls behave as before.

    Returns:
        pandas.DataFrame: One row per venue with the columns ``Borough``,
        ``Borough Latitude``, ``Borough Longitude``, ``Venue``,
        ``Venue Latitude``, ``Venue Longitude`` and ``Venue Category``.
        The frame is empty, with the same columns, when no venues are found.
    """
    if locations is None:
        locations = df

    columns = ['Borough', 'Borough Latitude', 'Borough Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']

    records = []
    for _, row in locations.iterrows():
        lat = row['Latitude']
        lon = row['Longitude']

        for v in venue_explore(id_, secret_, v_, lat, lon):
            venue = v['venue']
            categories = venue['categories']
            records.append((
                row['Borough'], lat, lon,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue with an empty category list would raise IndexError;
                # keep the venue and record its category as missing instead.
                categories[0]['name'] if categories else None,
            ))

    # Passing the column names to the constructor also works with zero rows,
    # where assigning ``.columns`` afterwards would raise a length mismatch.
    return pd.DataFrame(records, columns=columns)

The following code was run once to get venue data from Foursquare and save it to the data file `week3_c_venues.csv`.

In [50]:
# Intentionally left disabled: running these lines needs the private Foursquare
# credential files and network access. Uncomment them to refresh the venue
# file that the next cell reads.
#client_id, client_secret, fsq_version = get_credentials()
#df_venues = nearby_venues(client_id, client_secret, fsq_version)
#df_venues.to_csv(os.path.join('..', 'data', 'week3_c_venues.csv'), index=False)

We will get venue data from that file, instead of pulling live data from Foursquare, so that our client id and secret can remain private.

In [51]:
# Load the previously saved Foursquare venues, then report size and preview.
venues_csv = os.path.join('..', 'data', 'week3_c_venues.csv')
df_venues = pd.read_csv(venues_csv)
print(df_venues.shape)
df_venues.head()

(2259, 7)


Unnamed: 0,Borough,Borough Latitude,Borough Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Scarborough,43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,Scarborough,43.806686,-79.194353,Interprovincial Group,43.80563,-79.200378,Print Shop
2,Scarborough,43.784535,-79.160497,Chris Effects Painting,43.784343,-79.163742,Construction & Landscaping
3,Scarborough,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
4,Scarborough,43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place


We show the number of venues per borough.

In [52]:
# Count the venue names recorded for each borough.
df_venues.groupby('Borough')['Venue'].count()

Borough
Central Toronto      111
Downtown Toronto    1288
East Toronto         122
East York             77
Etobicoke             72
Mississauga           10
North York           251
Queen's Park          39
Scarborough           89
West Toronto         180
York                  20
Name: Venue, dtype: int64

In [53]:
# Number of distinct venue categories (a missing value counts as one, as before).
category_count = df_venues['Venue Category'].nunique(dropna=False)
print('There are {} uniques categories.'.format(category_count))

There are 280 uniques categories.


We one-hot encode the venue categories, producing one indicator column per category.

In [54]:
# One indicator column per venue category, named after the category itself.
venue_categories = df_venues[['Venue Category']]
toronto_onehot = pd.get_dummies(venue_categories, prefix="", prefix_sep="")
toronto_onehot.head()

Unnamed: 0,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Borough is moved to the front of the data frame.

In [55]:
# Attach each venue's borough and place that column first. The column order is
# built by name rather than by position, so re-running this cell leaves
# 'Borough' in front instead of rotating the last category column forward.
category_columns = [col for col in toronto_onehot.columns if col != 'Borough']
toronto_onehot = toronto_onehot.assign(Borough=df_venues['Borough'])[['Borough'] + category_columns]
toronto_onehot.head()

Unnamed: 0,Borough,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Scarborough,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Scarborough,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Scarborough,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Scarborough,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Scarborough,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


For each borough we compute the mean frequency of every venue category, then reset the index so that `Borough` is a regular column again.

In [56]:
# Mean of each indicator column per borough = share of that borough's venues
# falling in the category. Borough is then restored as an ordinary column.
borough_means = toronto_onehot.groupby('Borough').mean()
toronto_grouped = borough_means.reset_index()

print(toronto_grouped.shape)
toronto_grouped.head()

(11, 281)


Unnamed: 0,Borough,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Central Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.009009,0.0,0.0,0.009009,0.0,0.0,0.0,0.0,0.009009
1,Downtown Toronto,0.0,0.000776,0.000776,0.000776,0.000776,0.000776,0.001553,0.001553,0.001553,...,0.002329,0.013199,0.002329,0.000776,0.003882,0.0,0.006211,0.000776,0.0,0.002329
2,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02459
3,East York,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.012987,0.0,0.0,0.0,0.012987
4,Etobicoke,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.013889,0.0,0.0,0.0,0.013889,0.0,0.0


Next we create `df_top`, which contains the five most frequent venue categories for each borough.

In [57]:
def most_common_venues(row, num_venues):
    """Return the labels of the highest-valued entries of a row.

    The first element of ``row`` is skipped (in this notebook it holds the
    borough name); the remaining entries are ranked from largest to smallest.

    Args:
        row: pandas Series whose first element is a label and whose other
            elements are comparable values indexed by venue category.
        num_venues: Maximum number of labels to return.

    Returns:
        numpy.ndarray: Up to ``num_venues`` index labels, highest value first.
    """
    frequencies = row.iloc[1:]
    ranked = frequencies.sort_values(ascending=False)
    return ranked.index.values[:num_venues]

In [58]:
num_venues = 5

# 'Borough' plus one column per rank. The count follows num_venues, so changing
# it above keeps the headers and the values below in step.
cols = ['Borough'] + ['#{} Venue'.format(rank) for rank in range(1, num_venues + 1)]

# One record per borough: its name followed by its top venue categories.
top_rows = [
    [grouped_row['Borough']] + list(most_common_venues(grouped_row, num_venues))
    for _, grouped_row in toronto_grouped.iterrows()
]

# Build the frame in one step rather than filling it row by row.
df_top = pd.DataFrame(top_rows, columns=cols)

df_top.head()

Unnamed: 0,Borough,#1 Venue,#2 Venue,#3 Venue,#4 Venue,#5 Venue
0,Central Toronto,Coffee Shop,Pizza Place,Sandwich Place,Park,Café
1,Downtown Toronto,Coffee Shop,Café,Restaurant,Hotel,Bakery
2,East Toronto,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Café
3,East York,Coffee Shop,Park,Burger Joint,Sporting Goods Shop,Pizza Place
4,Etobicoke,Pizza Place,Sandwich Place,Pharmacy,Coffee Shop,Liquor Store


We use `KMeans` to cluster the boroughs based on their venue-category frequencies.

In [59]:
kclusters = 5

# Cluster on the category frequencies only; the borough name is a label, not a
# feature. The axis is given by keyword because pandas 2.0 no longer accepts
# it as a positional argument to drop().
toronto_grouped_clustering = toronto_grouped.drop(columns='Borough')

# n_init is pinned to scikit-learn's long-standing default of 10 so the
# clustering does not change with newer releases that use a different default.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10)
kmeans.fit(toronto_grouped_clustering)

kmeans.labels_[:5]

array([1, 0, 0, 1, 1], dtype=int32)

We merge the clustering results with Toronto borough data.

In [60]:
# Add the labels to a copy of df_top instead of mutating it. Inserting into
# df_top itself makes this cell fail on a second run, because the
# 'Cluster Labels' column already exists. drop(..., errors='ignore') also
# copes with a df_top that was labelled by an earlier run.
df_top_clustered = df_top.drop(columns='Cluster Labels', errors='ignore')
df_top_clustered.insert(0, 'Cluster Labels', kmeans.labels_)

# Each postal-code row of df picks up the cluster and top venues of its borough.
df_merged = df.join(df_top_clustered.set_index('Borough'), on='Borough')
df_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,#1 Venue,#2 Venue,#3 Venue,#4 Venue,#5 Venue
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,1,Fast Food Restaurant,Chinese Restaurant,Bakery,Coffee Shop,Breakfast Spot
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,1,Fast Food Restaurant,Chinese Restaurant,Bakery,Coffee Shop,Breakfast Spot
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,1,Fast Food Restaurant,Chinese Restaurant,Bakery,Coffee Shop,Breakfast Spot
3,M1G,Scarborough,Woburn,43.770992,-79.216917,1,Fast Food Restaurant,Chinese Restaurant,Bakery,Coffee Shop,Breakfast Spot
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,1,Fast Food Restaurant,Chinese Restaurant,Bakery,Coffee Shop,Breakfast Spot


Now, we map each postal code in Toronto and color-code it by the cluster of its borough.

In [61]:
toronto = folium.Map(location=[43.674105, -79.367693], zoom_start=11, tiles='cartodbpositron')

# One colour per KMeans label, indexed directly by the label (0-4).
# The earlier list had four colours looked up with `label - 1`, so label 0
# wrapped around to the last entry and clusters 0 and 4 were both drawn blue.
# Labels 1-4 keep their previous colours; label 0 gets its own.
colors = ['purple', 'red', 'yellow', 'green', 'blue']

for index, row in df_merged.iterrows():
    c = colors[int(row['Cluster Labels'])]
    folium.CircleMarker(
        location=(row['Latitude'], row['Longitude']),
        radius=6,
        popup='{}, {}'.format(row['PostalCode'], row['Borough']),
        fill=True,
        color=c,
        fill_opacity=0.6
        ).add_to(toronto)

toronto

If the map doesn't load, it should look like this.

![Toronto](https://github.com/r3w0p/Coursera_Capstone/blob/master/data/toronto.png?raw=true)

Below, we show the first rows of the merged data frame for each of clusters 1 to 4.

In [62]:
# First rows whose borough was assigned to cluster 1.
in_cluster = df_merged['Cluster Labels'] == 1
df_merged[in_cluster].head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,#1 Venue,#2 Venue,#3 Venue,#4 Venue,#5 Venue
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,1,Fast Food Restaurant,Chinese Restaurant,Bakery,Coffee Shop,Breakfast Spot
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,1,Fast Food Restaurant,Chinese Restaurant,Bakery,Coffee Shop,Breakfast Spot
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,1,Fast Food Restaurant,Chinese Restaurant,Bakery,Coffee Shop,Breakfast Spot
3,M1G,Scarborough,Woburn,43.770992,-79.216917,1,Fast Food Restaurant,Chinese Restaurant,Bakery,Coffee Shop,Breakfast Spot
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,1,Fast Food Restaurant,Chinese Restaurant,Bakery,Coffee Shop,Breakfast Spot


In [63]:
# First rows whose borough was assigned to cluster 2.
in_cluster = df_merged['Cluster Labels'] == 2
df_merged[in_cluster].head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,#1 Venue,#2 Venue,#3 Venue,#4 Venue,#5 Venue
73,M6C,York,Humewood-Cedarvale,43.693781,-79.428191,2,Park,Restaurant,Trail,Discount Store,Tennis Court
74,M6E,York,Caledonia-Fairbanks,43.689026,-79.453512,2,Park,Restaurant,Trail,Discount Store,Tennis Court
80,M6M,York,"Del Ray, Keelesdale, Mount Dennis, Silverthorn",43.691116,-79.476013,2,Park,Restaurant,Trail,Discount Store,Tennis Court
81,M6N,York,"The Junction North, Runnymede",43.673185,-79.487262,2,Park,Restaurant,Trail,Discount Store,Tennis Court
98,M9N,York,Weston,43.706876,-79.518188,2,Park,Restaurant,Trail,Discount Store,Tennis Court


In [64]:
# First rows whose borough was assigned to cluster 3.
in_cluster = df_merged['Cluster Labels'] == 3
df_merged[in_cluster].head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,#1 Venue,#2 Venue,#3 Venue,#4 Venue,#5 Venue
86,M7R,Mississauga,Canada Post Gateway Processing Centre,43.636966,-79.615819,3,Hotel,Coffee Shop,Gym / Fitness Center,Burrito Place,Fried Chicken Joint


In [65]:
# First rows whose borough was assigned to cluster 4.
in_cluster = df_merged['Cluster Labels'] == 4
df_merged[in_cluster].head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,#1 Venue,#2 Venue,#3 Venue,#4 Venue,#5 Venue
85,M7A,Queen's Park,Queen's Park,43.662301,-79.389494,4,Coffee Shop,Park,Gym,Japanese Restaurant,Sushi Restaurant


Save final data frame to file.

In [66]:
# Write the clustered table to disk without the row index, then show its size.
output_csv = os.path.join('..', 'data', 'week3_c.csv')
df_merged.to_csv(output_csv, index=False)
df_merged.shape

(103, 11)