## Data exploration and clustering on Toronto  neighborhoods dataset

In [56]:
#!conda install -c conda-forge geopy --yes        # if needed
import numpy as np
import pandas as pd
import json
import folium
import requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

In [13]:
df_toronto = pd.read_csv('Toronto_neighborhoods_v2.csv')
df_toronto.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [15]:
toronto_latitude = 43.65; toronto_longitude = -79.38
map_toronto = folium.Map(location = [toronto_latitude, toronto_longitude], zoom_start = 10.7)
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    

map_toronto

## 1. Use Foursquare API to explore neighbourhoods

In [93]:
def get_category_type(row):
    try:
        clist = row['categories']
    except:
        clist = row['venue.categories']
        
    if len(clist) == 0:
        return None
    else:
        return clist[0]['name']
    
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### 1.1 Get top 100 venues in selected neighbourhood

In [74]:
# select neighbourhood by index i
i=10
VERSION = '20180604'
CLIENT_ID='EU0PEJSWFHJKDPV1RBJTINXOMYPUI0PDXI3LRPEAQTCBPB4R'
CLIENT_SECRET='QRWV1SRQ0GGDJHURKTYII3XOSXVBFSTQ1UQH21EXVSMHBV4X'
LIMIT=100
radius=500
lat = df_toronto.iloc[i]['Latitude']
lng = df_toronto.iloc[i]['Longitude']

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(            CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION, 
        lat, 
        lng, 
        radius, 
        LIMIT)  
results = requests.get(url).json()
venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues)

filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]
nearby_venues
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))
nearby_venues.head()


6 venues were returned by Foursquare.




Unnamed: 0,name,categories,lat,lng
0,Kim Kim restaurant,Chinese Restaurant,43.753833,-79.276611
1,Kairali,Indian Restaurant,43.754915,-79.276945
2,Karaikudi Chettinad South Indian Restaurant,Indian Restaurant,43.756042,-79.276276
3,Pho Vietnam,Vietnamese Restaurant,43.75777,-79.278572
4,Big Al's Pet Supercentre,Pet Store,43.759279,-79.278325


### 1.2. Analyze each neighbourhood

In [76]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [115]:
latitude_scar = 43.773077
longitude_scar = -79.257774
scarborough_data = df_toronto[df_toronto['Borough'] == 'Scarborough'].reset_index(drop=True)
scarborough_data.head(7)
scarborough_venues = getNearbyVenues(names=scarborough_data['Neighborhood'],
                                   latitudes=scarborough_data['Latitude'],
                                   longitudes=scarborough_data['Longitude']
                                  )


Malvern / Rouge
Rouge Hill / Port Union / Highland Creek
Guildwood / Morningside / West Hill
Woburn
Cedarbrae
Scarborough Village
Kennedy Park / Ionview / East Birchmount Park
Golden Mile / Clairlea / Oakridge
Cliffside / Cliffcrest / Scarborough Village West
Birch Cliff / Cliffside West
Dorset Park / Wexford Heights / Scarborough Town Centre
Wexford / Maryvale
Agincourt
Clarks Corners / Tam O'Shanter / Sullivan
Milliken / Agincourt North / Steeles East / L'Amoreaux East
Steeles West / L'Amoreaux West
Upper Rouge


In [84]:
# get one hot encoding
scarb_onehot = pd.get_dummies(scarborough_venues[['Venue Category']], prefix="", prefix_sep="")
scarb_onehot['Neighborhood'] = scarborough_venues['Neighborhood'] 
fixed_columns = [scarb_onehot.columns[-1]] + list(scarb_onehot.columns[:-1])
scarb_onehot = scarb_onehot[fixed_columns]
scarb_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,Athletics & Sports,Bakery,Bank,Bar,Breakfast Spot,Brewery,Bubble Tea Shop,Burger Joint,...,Pizza Place,Playground,Rental Car Location,Shopping Mall,Skating Rink,Smoke Shop,Soccer Field,Thai Restaurant,Thrift / Vintage Store,Vietnamese Restaurant
0,Malvern / Rouge,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Rouge Hill / Port Union / Highland Creek,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Guildwood / Morningside / West Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Guildwood / Morningside / West Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Guildwood / Morningside / West Hill,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [85]:
scarb_onehot.shape

(85, 52)

In [95]:
scarb_grouped = scarb_onehot.groupby('Neighborhood').mean().reset_index()
scarb_grouped.head(7)

Unnamed: 0,Neighborhood,American Restaurant,Athletics & Sports,Bakery,Bank,Bar,Breakfast Spot,Brewery,Bubble Tea Shop,Burger Joint,...,Pizza Place,Playground,Rental Car Location,Shopping Mall,Skating Rink,Smoke Shop,Soccer Field,Thai Restaurant,Thrift / Vintage Store,Vietnamese Restaurant
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0
1,Birch Cliff / Cliffside West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0
2,Cedarbrae,0.0,0.125,0.125,0.125,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0
3,Clarks Corners / Tam O'Shanter / Sullivan,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,...,0.153846,0.0,0.0,0.076923,0.0,0.0,0.0,0.076923,0.0,0.0
4,Cliffside / Cliffcrest / Scarborough Village West,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Dorset Park / Wexford Heights / Scarborough To...,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667
6,Golden Mile / Clairlea / Oakridge,0.0,0.0,0.222222,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0


In [102]:
num_top_venues=10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))
    
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = scarb_grouped['Neighborhood']
for ind in np.arange(scarb_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(scarb_grouped.iloc[ind, :], num_top_venues)
neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Skating Rink,Breakfast Spot,Latin American Restaurant,Lounge,Vietnamese Restaurant,Chinese Restaurant,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store
1,Birch Cliff / Cliffside West,Café,General Entertainment,Skating Rink,College Stadium,Caribbean Restaurant,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Department Store
2,Cedarbrae,Thai Restaurant,Athletics & Sports,Bakery,Bank,Gas Station,Fried Chicken Joint,Hakka Restaurant,Caribbean Restaurant,Vietnamese Restaurant,Coffee Shop
3,Clarks Corners / Tam O'Shanter / Sullivan,Pizza Place,Pharmacy,Fried Chicken Joint,Gas Station,Noodle House,Italian Restaurant,Chinese Restaurant,Shopping Mall,Bank,Fast Food Restaurant
4,Cliffside / Cliffcrest / Scarborough Village West,American Restaurant,Motel,Hakka Restaurant,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Department Store,Convenience Store
5,Dorset Park / Wexford Heights / Scarborough To...,Indian Restaurant,Vietnamese Restaurant,Brewery,Pet Store,Chinese Restaurant,Caribbean Restaurant,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store
6,Golden Mile / Clairlea / Oakridge,Bus Line,Bakery,Ice Cream Shop,Intersection,Park,Bus Station,Soccer Field,Bar,Coffee Shop,Gas Station
7,Guildwood / Morningside / West Hill,Bank,Intersection,Breakfast Spot,Rental Car Location,Electronics Store,Medical Center,Mexican Restaurant,Vietnamese Restaurant,Chinese Restaurant,Fried Chicken Joint
8,Kennedy Park / Ionview / East Birchmount Park,Department Store,Convenience Store,Coffee Shop,Bus Station,Vietnamese Restaurant,Caribbean Restaurant,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant
9,Malvern / Rouge,Fast Food Restaurant,Vietnamese Restaurant,Caribbean Restaurant,General Entertainment,Gas Station,Fried Chicken Joint,Electronics Store,Department Store,Convenience Store,College Stadium


## 2. Clustering

### 2.1 Do KMeans clustering

In [107]:
from sklearn.cluster import KMeans

scarb_data = scarborough_data.drop(16)
kclusters = 5
scarb_grouped_clustering = scarb_grouped.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(scarb_grouped_clustering)
# check cluster labels generated for each row in the dataframe
#len(kmeans.labels_)#=16
#scarborough_data.shape

In [111]:
scarb_merged = scarb_data

# add clustering labels
scarb_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
scarb_merged = scarb_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

scarb_merged

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353,1,Fast Food Restaurant,Vietnamese Restaurant,Caribbean Restaurant,General Entertainment,Gas Station,Fried Chicken Joint,Electronics Store,Department Store,Convenience Store,College Stadium
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497,1,Bar,Vietnamese Restaurant,Caribbean Restaurant,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Department Store,Convenience Store
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711,1,Bank,Intersection,Breakfast Spot,Rental Car Location,Electronics Store,Medical Center,Mexican Restaurant,Vietnamese Restaurant,Chinese Restaurant,Fried Chicken Joint
3,M1G,Scarborough,Woburn,43.770992,-79.216917,1,Coffee Shop,Korean Restaurant,Vietnamese Restaurant,Hakka Restaurant,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Department Store
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,1,Thai Restaurant,Athletics & Sports,Bakery,Bank,Gas Station,Fried Chicken Joint,Hakka Restaurant,Caribbean Restaurant,Vietnamese Restaurant,Coffee Shop
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476,1,Playground,Convenience Store,Vietnamese Restaurant,Caribbean Restaurant,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Department Store
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park,43.727929,-79.262029,1,Department Store,Convenience Store,Coffee Shop,Bus Station,Vietnamese Restaurant,Caribbean Restaurant,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge,43.711112,-79.284577,1,Bus Line,Bakery,Ice Cream Shop,Intersection,Park,Bus Station,Soccer Field,Bar,Coffee Shop,Gas Station
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West,43.716316,-79.239476,1,American Restaurant,Motel,Hakka Restaurant,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Department Store,Convenience Store
9,M1N,Scarborough,Birch Cliff / Cliffside West,43.692657,-79.264848,3,Café,General Entertainment,Skating Rink,College Stadium,Caribbean Restaurant,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Department Store


### 2.2 Visualize clusters on map

In [117]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location = [latitude_scar, longitude_scar], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(scarb_merged['Latitude'], scarb_merged['Longitude'], scarb_merged['Neighborhood'], scarb_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [114]:
scarborough_data

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park,43.727929,-79.262029
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge,43.711112,-79.284577
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West,43.716316,-79.239476
9,M1N,Scarborough,Birch Cliff / Cliffside West,43.692657,-79.264848
