# Segmenting and Clustering Neighborhoods in Toronto - part 3

## create complete neighborhoods dataset

In [44]:
import pandas as pd
import numpy as np
import urllib.request
!conda install -c conda-forge beautifulsoup4 --yes
print('finished!')
import bs4 as bs

source = urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').read()
soup = bs.BeautifulSoup(source)

table = soup.find('table')
table_rows = table.find_all('tr')

wiki_table = []

for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    wiki_table.append(row)
    
df = pd.DataFrame(wiki_table, columns=["PostalCode", "Borough", "Neighborhood"])
df = df.iloc[1:]
df.replace(r'\s', '', regex = True, inplace = True)
df = df[df.Borough != "Notassigned"]
df = df.reset_index(drop=True)

index = 0
size = len(df)-1
while index < size:
    if df.iloc[index]['PostalCode'] == df.iloc[index+1]["PostalCode"]:
        df.iloc[index]['Neighborhood'] += ", "+df.iloc[index+1]['Neighborhood']
        df.drop(index+1, inplace=True)
        df = df.reset_index(drop=True)
        index -= 1
        size -= 1
    index += 1
    
index = 0
size = len(df)-1

while index < size:
    if df.iloc[index]['Neighborhood'] == 'Notassigned':
        df.iloc[index]['Neighborhood'] = df.iloc[index]['Borough']
    index += 1

postal_df = pd.read_csv('http://cocl.us/Geospatial_data', delimiter=',')

new_df =  df.merge(postal_df, left_on='PostalCode', right_on='Postal Code')
new_df.drop('Postal Code', axis=1, inplace=True)


new_df.head(11)

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.

finished!


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,NorthYork,Parkwoods,43.753259,-79.329656
1,M4A,NorthYork,VictoriaVillage,43.725882,-79.315572
2,M5A,DowntownToronto,Harbourfront,43.65426,-79.360636
3,M6A,NorthYork,"LawrenceHeights, LawrenceManor",43.718518,-79.464763
4,M7A,Queen'sPark,Queen'sPark,43.662301,-79.389494
5,M9A,Queen'sPark,Queen'sPark,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,NorthYork,DonMillsNorth,43.745906,-79.352188
8,M4B,EastYork,"WoodbineGardens, ParkviewHill",43.706397,-79.309937
9,M5B,DowntownToronto,"Ryerson, GardenDistrict",43.657162,-79.378937


In [46]:
temp_df = new_df
temp_df.drop(['PostalCode'], axis=1, inplace=True)
temp_df.head(5)

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,NorthYork,Parkwoods,43.753259,-79.329656
1,NorthYork,VictoriaVillage,43.725882,-79.315572
2,DowntownToronto,Harbourfront,43.65426,-79.360636
3,NorthYork,"LawrenceHeights, LawrenceManor",43.718518,-79.464763
4,Queen'sPark,Queen'sPark,43.662301,-79.389494


## Import packages for Fourthsquare API, K-Means clustering and vizualisation

In [23]:
import json
import requests
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
!conda install -c conda-forge folium=0.5.0 --yes
import folium

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.



## Fourthsquare API => search for venues for each neighborhood

In [47]:
CLIENT_ID = 'LSFPPJXBATJ1YEJ5BH35NJB2MWQ54MH1J3YILBF35HW4NDUU' # your Foursquare ID
CLIENT_SECRET = 'BIEY1PNHYWK5P2U5AR2ZPEOG0QRGAH2YZB25I4JHRRYDJKC0' # your Foursquare Secret
VERSION = '20191125' # Foursquare API version

In [48]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    LIMIT = 100
    radius = 500
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [49]:
toronto_venues = getNearbyVenues(names=new_df['Neighborhood'],
                                   latitudes=new_df['Latitude'],
                                   longitudes=new_df['Longitude']
                                  )

Parkwoods
VictoriaVillage
Harbourfront
LawrenceHeights, LawrenceManor
Queen'sPark
Queen'sPark
Rouge, Malvern
DonMillsNorth
WoodbineGardens, ParkviewHill
Ryerson, GardenDistrict
Glencairn
Cloverdale, Islington, MartinGrove, PrincessGardens, WestDeanePark
HighlandCreek, RougeHill, PortUnion
FlemingdonPark, DonMillsSouth
WoodbineHeights
St.JamesTown
Humewood-Cedarvale
BloordaleGardens, Eringate, MarklandWood, OldBurnhamthorpe
Guildwood, Morningside, WestHill
TheBeaches
BerczyPark
Caledonia-Fairbanks
Woburn
Leaside
CentralBayStreet
Christie
Cedarbrae
HillcrestVillage
BathurstManor, DownsviewNorth, WilsonHeights
ThorncliffePark
Adelaide, King, Richmond
DovercourtVillage, Dufferin
ScarboroughVillage
Fairview, HenryFarm, Oriole
NorthwoodPark, YorkUniversity
EastToronto
HarbourfrontEast, TorontoIslands, UnionStation
LittlePortugal, Trinity
EastBirchmountPark, Ionview, KennedyPark
BayviewVillage
CFBToronto, DownsviewEast
TheDanforthWest, Riverdale
DesignExchange, TorontoDominionCentre
Brockton,

KeyError: 'groups'

In [50]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,VictoriaVillage,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,VictoriaVillage,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,VictoriaVillage,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


In [51]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

In [52]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
             venue  freq
0      Coffee Shop  0.07
1             Café  0.05
2       Steakhouse  0.04
3              Bar  0.04
4  Thai Restaurant  0.04


----Agincourt----
                       venue  freq
0               Skating Rink  0.25
1                     Lounge  0.25
2             Breakfast Spot  0.25
3  Latin American Restaurant  0.25
4                Men's Store  0.00


----AgincourtNorth, L'AmoreauxEast, Milliken, SteelesEast----
                             venue  freq
0                       Playground   0.5
1                             Park   0.5
2                      Yoga Studio   0.0
3                      Men's Store   0.0
4  Molecular Gastronomy Restaurant   0.0


----AlbionGardens, BeaumondHeights, Humbergate, Jamestown, MountOlive, Silverstone, SouthSteeles, Thistletown----
                 venue  freq
0             Pharmacy  0.14
1        Grocery Store  0.14
2           Beer Store  0.14
3       Sandwich Place  0.14
4  Fried Chicken

In [53]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [54]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Bar,Steakhouse,Thai Restaurant,Sushi Restaurant,American Restaurant,Restaurant,Breakfast Spot,Burger Joint
1,Agincourt,Lounge,Latin American Restaurant,Skating Rink,Breakfast Spot,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
2,"AgincourtNorth, L'AmoreauxEast, Milliken, Stee...",Park,Playground,Women's Store,Donut Shop,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant
3,"AlbionGardens, BeaumondHeights, Humbergate, Ja...",Grocery Store,Pizza Place,Sandwich Place,Beer Store,Fried Chicken Joint,Fast Food Restaurant,Pharmacy,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
4,"Alderwood, LongBranch",Pizza Place,Pub,Pool,Gym,Skating Rink,Coffee Shop,Pharmacy,Athletics & Sports,Sandwich Place,Department Store


## Neighborhood clustering relying on top venues

In [65]:
kclusters = 5
toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

kmeans.labels_[0:10] 

array([0, 0, 2, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [66]:
toronto_data = temp_df

In [81]:
toronto_merged = toronto_data

toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.dropna(inplace=True)

toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype(int)

toronto_merged.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,NorthYork,Parkwoods,43.753259,-79.329656,2,Food & Drink Shop,Park,Women's Store,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
1,NorthYork,VictoriaVillage,43.725882,-79.315572,0,French Restaurant,Hockey Arena,Intersection,Coffee Shop,Portuguese Restaurant,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant
2,DowntownToronto,Harbourfront,43.65426,-79.360636,0,Coffee Shop,Bakery,Pub,Park,Mexican Restaurant,Theater,Breakfast Spot,Restaurant,Café,Shoe Store
3,NorthYork,"LawrenceHeights, LawrenceManor",43.718518,-79.464763,0,Clothing Store,Furniture / Home Store,Accessories Store,Boutique,Fraternity House,Vietnamese Restaurant,Event Space,Coffee Shop,Miscellaneous Shop,Doner Restaurant
4,Queen'sPark,Queen'sPark,43.662301,-79.389494,0,Coffee Shop,Park,Gym,Sushi Restaurant,Nightclub,Seafood Restaurant,Sandwich Place,Burger Joint,Burrito Place,Restaurant


## vizualize clusters

In [82]:
toronto_latitude = 43.653226
toronto_longitude = -79.3831843

map_clusters = folium.Map(location=[toronto_latitude, toronto_longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
    
map_clusters
