### Create Toronto Neighborhoods DataFrame

In [1]:
import pandas as pd
import numpy as np
#!conda install lxml --yes
import lxml # library to read html

#!conda install -c conda-forge geopy --yes # install geopy to pull coordinates
from geopy.geocoders import Nominatim

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

## Load Toronto neighborhood list from wikipedia.

In [2]:
url='https://en.wikipedia.org/wiki/List_of_city-designated_neighbourhoods_in_Toronto'
dfs=pd.read_html(url)
Toronto_neighbour=dfs[0]
print(Toronto_neighbour.shape)
Toronto_neighbour.head()

(140, 5)


Unnamed: 0,CDN number,City-designated area,Former city/borough,Neighbourhoods covered,Map
0,129,Agincourt North,Scarborough,Agincourt and Brimwood,
1,128,Agincourt South-Malvern West,Scarborough,Agincourt and Malvern,
2,20,Alderwood,Etobicoke,Alderwood,
3,95,Annex,Old City of Toronto,The Annex and Seaton Village,
4,42,Banbury-Don Mills,North York,Don Mills,


### Clean up the dataframe

In [3]:
Toronto_neighbour=Toronto_neighbour[['Former city/borough','City-designated area']]
Toronto_neighbour.columns=['Borough','Neighborhood']
Toronto_neighbour.head()

Unnamed: 0,Borough,Neighborhood
0,Scarborough,Agincourt North
1,Scarborough,Agincourt South-Malvern West
2,Etobicoke,Alderwood
3,Old City of Toronto,Annex
4,North York,Banbury-Don Mills


### Extract coordinates to each neighbourhood

In [4]:
# Pull coordinates
Toronto_neighbour.insert(2,'Lat',np.nan)
Toronto_neighbour.insert(3,'Long',np.nan)
geolocator=Nominatim(user_agent='TOR')

for i in range(len(Toronto_neighbour)):
    try:
        address=Toronto_neighbour.iloc[i,1]+', '+Toronto_neighbour.iloc[i,0]
        location=geolocator.geocode(address)
        Toronto_neighbour.iloc[i,2]=location.latitude
        Toronto_neighbour.iloc[i,3]=location.longitude
    except:
        try:
            address=Toronto_neighbour.iloc[i,1]+', Toronto'
            location=geolocator.geocode(address)
            Toronto_neighbour.iloc[i,2]=location.latitude
            Toronto_neighbour.iloc[i,3]=location.longitude
        except:
            try:
                address=Toronto_neighbour.iloc[i,1].split('-')[0]+', '+TOR_Neigh.iloc[i,0]
                location=geolocator.geocode(address)
                Toronto_neighbour.iloc[i,2]=location.latitude
                Toronto_neighbour.iloc[i,3]=location.longitude
            except:
                try:
                    address=Toronto_neighbour.iloc[i,1].split('-')[1]+', '+Toronto_neighbour.iloc[i,0]
                    location=geolocator.geocode(address)
                    Toronto_neighbour.iloc[i,2]=location.latitude
                    Toronto_neighbour.iloc[i,3]=location.longitude
                except:
                    try:
                        address=Toronto_neighbour.iloc[i,1].split(' ')[1]+', '+Toronto_neighbour.iloc[i,0]
                        location=geolocator.geocode(address)
                        Toronto_neighbour.iloc[i,2]=location.latitude
                        Toronto_neighbour.iloc[i,3]=location.longitude
                    except:
                        None
                        
Toronto_neighbour.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,Borough,Neighborhood,Lat,Long
0,Scarborough,Agincourt North,43.808038,-79.266439
1,Scarborough,Agincourt South-Malvern West,43.775718,-79.259021
2,Etobicoke,Alderwood,43.601717,-79.545232
3,Old City of Toronto,Annex,43.670338,-79.407117
4,North York,Banbury-Don Mills,43.734804,-79.357243


### Select Boroughs that contain Toronto

In [5]:
Toronto_neighbour_filtered=Toronto_neighbour[Toronto_neighbour['Borough'].str.contains('Toronto')]
Toronto_neighbour_filtered.shape

(44, 4)

### Explore Neighbourhoods of Toronto

In [6]:
client_id='GYU440FFLMNB1IBZLBCVSMUJCOXHFPDVUUDB4W5UJNVVRLWN'
client_secret='2JUAIXJV1DY5ODZRIE2GI4RYHYHR12KSGLDB21OFCSAJSAAJ'
version='20190907'

In [7]:
def getNearbyVenues(names, latitudes, longitudes, radius):
    
    venues_list=[]
    for name, lat, long in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            client_id, 
            client_secret, 
            version, 
            lat, 
            long, 
            radius, 
            limit)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            long, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [8]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [9]:
limit=100
r=1000

Toronto_venues = getNearbyVenues(names=Toronto_neighbour_filtered['Neighborhood'],
                                   latitudes=Toronto_neighbour_filtered['Lat'],
                                   longitudes=Toronto_neighbour_filtered['Long'],
                                   radius=r
                                )

Annex
Bay Street Corridor
Blake-Jones
Cabbagetown-South St. James Town
Casa Loma
Church-Yonge Corridor
Corso Italia-Davenport
Danforth Village - Toronto
Dovercourt-Wallace Emerson-Junction
Dufferin Grove
East End-Danforth
Forest Hill North
Forest Hill South
Greenwood-Coxwell
High Park North
High Park-Swansea
Junction Area
Kensington-Chinatown
Lawrence Park North
Lawrence Park South
Little Portugal
Moss Park
Mount Pleasant East
Mount Pleasant West
Niagara
North Riverdale
North St. James Town
Palmerston-Little Italy
Parkdale
Playter Estates-Danforth
Regent Park
Roncesvalles
Rosedale-Moore Park
Runnymede-Bloor West Village
South Riverdale
The Beaches
Trinity–Bellwoods
University
Waterfront Communities-The Island
Weston-Pellam Park
Woodbine Corridor
Wychwood
Yonge and Eglinton
Yonge-St.Clair


In [10]:
print(Toronto_venues.shape)
Toronto_venues.head()

(3377, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Annex,43.670338,-79.407117,Jean Sibelius Square,43.671426,-79.408831,Park
1,Annex,43.670338,-79.407117,Fresh on Bloor,43.666755,-79.403491,Vegetarian / Vegan Restaurant
2,Annex,43.670338,-79.407117,Roti Cuisine of India,43.674618,-79.408249,Indian Restaurant
3,Annex,43.670338,-79.407117,Fuwa Fuwa Japanese Pancakes,43.66588,-79.40784,Pastry Shop
4,Annex,43.670338,-79.407117,BMV Books,43.66586,-79.406824,Bookstore


### Show venue categories in columns.

In [11]:
# one hot encoding
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_onehot['Neighborhood'] = Toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Afghan Restaurant,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Arcade,Art Gallery,Art Museum,...,Udon Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Group by neighborhood. Estimate the means of frequency of each category.

In [12]:
Toronto_grouped = Toronto_onehot.groupby('Neighborhood').mean().reset_index()
Toronto_grouped.head()

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Afghan Restaurant,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Arcade,Art Gallery,...,Udon Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,Annex,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,...,0.0,0.03,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0
1,Bay Street Corridor,0.01,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.01,0.01,0.0,0.0,0.0,0.01,0.01
2,Blake-Jones,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0
3,Cabbagetown-South St. James Town,0.0,0.0,0.0,0.011905,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.011905,0.0,0.0,0.011905,0.0,0.0
4,Casa Loma,0.012048,0.0,0.0,0.012048,0.0,0.0,0.0,0.0,0.0,...,0.0,0.024096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Identify the most popular venues.

In [13]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [14]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Toronto_grouped['Neighborhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

print(neighborhoods_venues_sorted.shape)
neighborhoods_venues_sorted.head()

(44, 11)


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Annex,Café,Restaurant,Bakery,Italian Restaurant,Pizza Place,Korean Restaurant,Japanese Restaurant,Vegetarian / Vegan Restaurant,Coffee Shop,Beer Bar
1,Bay Street Corridor,Coffee Shop,Italian Restaurant,Park,Sushi Restaurant,Burger Joint,Café,Japanese Restaurant,Ramen Restaurant,Dance Studio,Ice Cream Shop
2,Blake-Jones,Greek Restaurant,Coffee Shop,Café,Pizza Place,Burger Joint,Caribbean Restaurant,Fast Food Restaurant,Pub,Ice Cream Shop,Sandwich Place
3,Cabbagetown-South St. James Town,Coffee Shop,Gay Bar,Park,Diner,Thai Restaurant,Gastropub,Restaurant,Café,Japanese Restaurant,Men's Store
4,Casa Loma,Coffee Shop,Sandwich Place,Pizza Place,History Museum,Café,Park,Mexican Restaurant,Dessert Shop,Diner,Burger Joint


### Cluster Neighborhoods
Run K-means to cluster the neighborhoods into 4 clusters.

In [16]:
kclusters = 5

Toronto_grouped_clustering = Toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 3, 3, 3, 3, 3, 1, 4, 1, 1])

In [17]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

Toronto_final = Toronto_neighbour_filtered

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
Toronto_final = Toronto_final.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

Toronto_final.head()

Unnamed: 0,Borough,Neighborhood,Lat,Long,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Old City of Toronto,Annex,43.670338,-79.407117,1,Café,Restaurant,Bakery,Italian Restaurant,Pizza Place,Korean Restaurant,Japanese Restaurant,Vegetarian / Vegan Restaurant,Coffee Shop,Beer Bar
6,Old City of Toronto,Bay Street Corridor,43.664286,-79.387114,3,Coffee Shop,Italian Restaurant,Park,Sushi Restaurant,Burger Joint,Café,Japanese Restaurant,Ramen Restaurant,Dance Studio,Ice Cream Shop
14,Old City of Toronto,Blake-Jones,43.675693,-79.33898,3,Greek Restaurant,Coffee Shop,Café,Pizza Place,Burger Joint,Caribbean Restaurant,Fast Food Restaurant,Pub,Ice Cream Shop,Sandwich Place
19,Old City of Toronto,Cabbagetown-South St. James Town,43.666068,-79.370842,3,Coffee Shop,Gay Bar,Park,Diner,Thai Restaurant,Gastropub,Restaurant,Café,Japanese Restaurant,Men's Store
21,Old City of Toronto,Casa Loma,43.678101,-79.409416,3,Coffee Shop,Sandwich Place,Pizza Place,History Museum,Café,Park,Mexican Restaurant,Dessert Shop,Diner,Burger Joint


In [49]:
# create map using Toronto coordinates
latitude=43.6432
longitude=-79.3832
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=15)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# # add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_final['Lat'], Toronto_final['Long'], 
                                  Toronto_final['Neighborhood'], Toronto_final['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)

    folium.features.CircleMarker(
        [lat, lon],
        radius=3,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7
    ).add_to(map_clusters)
       
map_clusters