### Part III: Explore and Cluster Cleaned Neighborhood Data
#### Import necessary libraries 

In [1]:
# Just imported all libraries from sample lab
import numpy as np
import pandas as pd

import json

from geopy.geocoders import Nominatim

import requests
from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium

#### Import csv file from Part II

In [18]:
postcode_data = pd.read_csv('comp_pc_data.csv', index_col=0)
postcode_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [19]:
# Check number of boroughs and neighborhoods in Toronto
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(postcode_data['Borough'].unique()),
        postcode_data.shape[0]))

The dataframe has 11 boroughs and 103 neighborhoods.


#### Use geopy library to get latitude and longitude values of Toronto

In [20]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of Toronto are {}, {}.'.format(
        latitude, longitude))

The geographical coordinates of Toronto are 43.653963, -79.387207.


#### Create a map of Toronto with postcode markers

In [21]:
# initialize folium map
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers
for lat, lng, pc, borough, nhood in zip(
        postcode_data['Latitude'],
        postcode_data['Longitude'],
        postcode_data['PostalCode'],
        postcode_data['Borough'],
        postcode_data['Neighborhood']):
    label = '{}, {}, {}'.format(nhood, borough, pc)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

map_toronto

#### I'm going to focus on Downtown Toronto
 - Create a dataframe with only Downtown Toronto data
 - Create a folium map using geographical data of Downtown Toronto

In [22]:
dt_data = postcode_data[postcode_data['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
dt_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
4,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


In [25]:
# create map of Downtown Toronto
map_dt = folium.Map(location=[latitude, longitude], zoom_start=13)

# add markers to map
for lat, lng, label in zip(dt_data['Latitude'], dt_data['Longitude'], dt_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_dt)  
    
map_dt

#### Segmenting postal code data using Foursquare API

In [30]:
# Client ID and Client Secret in local txt file
filepath = 'foursquare_cred.txt'
with open(filepath) as cred:
    CLIENT_ID = cred.readline()
    CLIENT_SECRET = cred.readline()

VERSION = '20190704'

In [39]:
# Function to get venues for each postal code in Downtown Torontox
def getNearbyVenues(postalcodes, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for pc, lat, lng in zip(postalcodes, latitudes, longitudes):
        print(pc)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            100) # limit set to 100 results
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            pc, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['PostalCode', 
                  'PostalCode Latitude', 
                  'PostalCode Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [42]:
# Run getNearbyVenues on Downtown Toronto Postal Codes
dt_venues = getNearbyVenues(postalcodes=dt_data['PostalCode'],
                            latitudes=dt_data['Latitude'],
                            longitudes=dt_data['Longitude'])

M4W
M4X
M4Y
M5A
M5B
M5C
M5E
M5G
M5H
M5J
M5K
M5L
M5S
M5T
M5V
M5W
M5X
M6G


In [43]:
# Check size of resulting dataframe
print(dt_venues.shape)
dt_venues.head()

(1288, 7)


Unnamed: 0,PostalCode,PostalCode Latitude,PostalCode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M4W,43.679563,-79.377529,Mooredale House,43.678631,-79.380091,Building
1,M4W,43.679563,-79.377529,Rosedale Park,43.682328,-79.378934,Playground
2,M4W,43.679563,-79.377529,Whitney Park,43.682036,-79.373788,Park
3,M4W,43.679563,-79.377529,Alex Murray Parkette,43.6783,-79.382773,Park
4,M4W,43.679563,-79.377529,Milkman's Lane,43.676352,-79.373842,Trail


In [44]:
# Check how many venues for each Postal Code
dt_venues.groupby('PostalCode').count()

Unnamed: 0_level_0,PostalCode Latitude,PostalCode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M4W,5,5,5,5,5,5
M4X,46,46,46,46,46,46
M4Y,87,87,87,87,87,87
M5A,48,48,48,48,48,48
M5B,100,100,100,100,100,100
M5C,100,100,100,100,100,100
M5E,55,55,55,55,55,55
M5G,88,88,88,88,88,88
M5H,100,100,100,100,100,100
M5J,100,100,100,100,100,100


In [45]:
# Check how many unique categories there are
print('There are {} unique categories.'.format(len(dt_venues['Venue Category'].unique())))

There are 208 unique categories.


In [50]:
# one hot encoding
dt_onehot = pd.get_dummies(dt_venues[['Venue Category']], prefix="", prefix_sep="")
dt_onehot['PostalCode'] = dt_venues['PostalCode'] 

# move postal code column to the first column
fixed_columns = [dt_onehot.columns[-1]] + list(dt_onehot.columns[:-1])
dt_onehot = dt_onehot[fixed_columns]

print(dt_onehot.shape)
dt_onehot.head()

(1288, 209)


Unnamed: 0,PostalCode,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M4W,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M4W,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M4W,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4W,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4W,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [52]:
dtoh_grouped = dt_onehot.groupby('PostalCode').mean().reset_index()
print(dtoh_grouped.shape)
dtoh_grouped

(18, 209)


Unnamed: 0,PostalCode,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M4W,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M4Y,0.011494,0.011494,0.0,0.0,0.0,0.0,0.0,0.0,0.011494,...,0.0,0.0,0.0,0.011494,0.0,0.011494,0.0,0.011494,0.0,0.011494
3,M5A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020833
4,M5B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,...,0.0,0.0,0.01,0.01,0.0,0.01,0.01,0.0,0.0,0.0
5,M5C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0.0
6,M5E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.018182,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,M5G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011364,...,0.0,0.0,0.011364,0.0,0.011364,0.0,0.011364,0.0,0.0,0.011364
8,M5H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0.0
9,M5J,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.01,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0


#### Clustering based on most common venues

In [53]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [80]:
num_top_venues = 3

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['PostalCode']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
pc_venues_sorted = pd.DataFrame(columns=columns)
pc_venues_sorted['PostalCode'] = dtoh_grouped['PostalCode']

for ind in np.arange(dtoh_grouped.shape[0]):
    pc_venues_sorted.iloc[ind, 1:] = return_most_common_venues(dtoh_grouped.iloc[ind, :], num_top_venues)

pc_venues_sorted.head()

Unnamed: 0,PostalCode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,M4W,Park,Playground,Trail
1,M4X,Coffee Shop,Restaurant,Bakery
2,M4Y,Coffee Shop,Japanese Restaurant,Sushi Restaurant
3,M5A,Coffee Shop,Pub,Bakery
4,M5B,Coffee Shop,Clothing Store,Café


#### Using KMeans Clustering

In [81]:
kclusters = 5
dtoh_clustering = dtoh_grouped.drop('PostalCode', 1)

kmeans = KMeans(n_clusters=kclusters).fit(dtoh_clustering)

kmeans.labels_[0:10]

array([1, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [82]:
pc_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

dt_clustered = dt_data.join(pc_venues_sorted.set_index('PostalCode'), on='PostalCode')

print(dt_clustered.shape)
dt_clustered

(18, 9)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,1,Park,Playground,Trail
1,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675,2,Coffee Shop,Restaurant,Bakery
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,2,Coffee Shop,Japanese Restaurant,Sushi Restaurant
3,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,2,Coffee Shop,Pub,Bakery
4,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,2,Coffee Shop,Clothing Store,Café
5,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,2,Coffee Shop,Café,Hotel
6,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,2,Coffee Shop,Cocktail Bar,Farmers Market
7,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,2,Coffee Shop,Café,Italian Restaurant
8,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568,2,Coffee Shop,Café,Steakhouse
9,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",43.640816,-79.381752,2,Coffee Shop,Hotel,Aquarium


In [83]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(dt_clustered['Latitude'],
                                  dt_clustered['Longitude'],
                                  dt_clustered['PostalCode'],
                                  dt_clustered['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Examining the Clusters
After trial and error, looking at the top 3 venue categories and 5 clusters helped come up with something more meaningful. If I were to continue going into more detail, I would remove the outskirts/suburban parts of Toronto and run kmeans on the main large cluster

##### Cluster 1
Possibly influenced by U of Toronto

In [85]:
dt_clustered.loc[dt_clustered['Cluster Labels'] == 0, dt_clustered.columns[[2] + list(range(5, dt_clustered.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
12,"Harbord, University of Toronto",0,Café,Restaurant,Bookstore
13,"Chinatown, Grange Park, Kensington Market",0,Café,Vegetarian / Vegan Restaurant,Mexican Restaurant


##### Cluster 2
Clearly suburban by looking at the venues

In [86]:
dt_clustered.loc[dt_clustered['Cluster Labels'] == 1, dt_clustered.columns[[2] + list(range(5, dt_clustered.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,Rosedale,1,Park,Playground,Trail


##### Cluster 3
The heart of the city and requires more analysis.
Toronto loves its Coffee Shops and Cafes.

In [87]:
dt_clustered.loc[dt_clustered['Cluster Labels'] == 2, dt_clustered.columns[[2] + list(range(5, dt_clustered.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
1,"Cabbagetown, St. James Town",2,Coffee Shop,Restaurant,Bakery
2,Church and Wellesley,2,Coffee Shop,Japanese Restaurant,Sushi Restaurant
3,"Harbourfront, Regent Park",2,Coffee Shop,Pub,Bakery
4,"Ryerson, Garden District",2,Coffee Shop,Clothing Store,Café
5,St. James Town,2,Coffee Shop,Café,Hotel
6,Berczy Park,2,Coffee Shop,Cocktail Bar,Farmers Market
7,Central Bay Street,2,Coffee Shop,Café,Italian Restaurant
8,"Adelaide, King, Richmond",2,Coffee Shop,Café,Steakhouse
9,"Harbourfront East, Toronto Islands, Union Station",2,Coffee Shop,Hotel,Aquarium
10,"Design Exchange, Toronto Dominion Centre",2,Coffee Shop,Café,Hotel


##### Cluster 4
Literally just the airport. Makes sense that nothing else is there.

In [88]:
dt_clustered.loc[dt_clustered['Cluster Labels'] == 3, dt_clustered.columns[[2] + list(range(5, dt_clustered.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
14,"CN Tower, Bathurst Quay, Island airport, Harbo...",3,Airport Lounge,Airport Service,Airport Terminal


##### Cluster 5
Also quite suburban, with a slightly different flavor from the other one.
A bit closer to the city.

In [89]:
dt_clustered.loc[dt_clustered['Cluster Labels'] == 4, dt_clustered.columns[[2] + list(range(5, dt_clustered.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
17,Christie,4,Grocery Store,Café,Park
