In [2]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

#!conda install -c anaconda beautifulsoup4

print('Libraries imported.')

Libraries imported.


#### Import Toronto Postal Codes data and copy to dataframe

In [3]:
from bs4 import BeautifulSoup
import requests
r  = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
data = r.text
soup = BeautifulSoup(data)

table = soup.find( "table", {"class":"wikitable sortable"} )

rows=list()
for row in table.findAll("tr"):
   rows.append(row)

l = []
for tr in rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    l.append(row)
toronto_df = pd.DataFrame(l, columns=['PostalCode','Borough','Neighborhood'])

#### Clean data

In [4]:
toronto_df = toronto_df.replace(to_replace='none', value=np.nan).dropna()
toronto_df = toronto_df[toronto_df['Borough'] != 'Not assigned']
toronto_df.reset_index(drop=True)
toronto_df['Neighborhood'] = toronto_df['Neighborhood'].str[:-1]

In [5]:
toronto_df.groupby('PostalCode').agg(dict(Borough = 'min', Neighborhood = lambda y: '%s'%', '.join(y)))
                             #   .agg(dict(A = 'sum', B = 'sum', C = lambda x: '{%s}'%', '.join(x)))
toronto_df.loc[toronto_df['Neighborhood'].str.contains("Not assign"), 'Neighborhood'] = toronto_df['Borough']

In [6]:
toronto_df[toronto_df['Borough'] == 'Queen\'s Park']
toronto_df.shape

(211, 3)

#### Get geolocation data and merge with PostalCodes data

In [7]:
pc  = requests.get("http://cocl.us/Geospatial_data")
data_pc = pc.text
soup_pc = BeautifulSoup(data_pc)
postalCodes = soup_pc.find('p').getText()

In [8]:
postalCodes.splitlines()
postalCodes_df = pd.DataFrame(postalCodes.splitlines())
postalCodes_df = pd.DataFrame(postalCodes_df[0].str.split(',').tolist(),
                                   columns = ['PostalCode','Latitude','Longitude'])
postalCodes_df = postalCodes_df[postalCodes_df['PostalCode'] != 'Postal Code']
toronto_data = pd.merge(toronto_df, postalCodes_df, left_on='PostalCode', right_on='PostalCode', how='left')
toronto_data = toronto_data[toronto_data['Borough'].str.contains('Toronto')]
toronto_data

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,Harbourfront,43.6542599,-79.3606359
3,M5A,Downtown Toronto,Regent Park,43.6542599,-79.3606359
13,M5B,Downtown Toronto,Ryerson,43.6571618,-79.3789371
14,M5B,Downtown Toronto,Garden District,43.6571618,-79.3789371
27,M5C,Downtown Toronto,St. James Town,43.6514939,-79.3754179
36,M4E,East Toronto,The Beaches,43.6763574,-79.2930312
37,M5E,Downtown Toronto,Berczy Park,43.6447708,-79.3733064
41,M5G,Downtown Toronto,Central Bay Street,43.6579524,-79.3873826
42,M6G,Downtown Toronto,Christie,43.669542,-79.4225637
49,M5H,Downtown Toronto,Adelaide,43.6505712,-79.3845675


#### Creat Map with Toronto Neighborhoods

In [9]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="ca_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


In [10]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

toronto_data[["Latitude", "Longitude"]] = toronto_data[["Latitude", "Longitude"]].apply(pd.to_numeric)

# add markers to map
for lat, lng, label in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

#### Call Foursquare to return venues

In [12]:
CLIENT_ID = '3Y43KRY30SIR454E01CPDGENENG5W24N20ZGBW3CKZTYG1A3' # your Foursquare ID
CLIENT_SECRET = 'QFFXGQLHJY1RXPDJEGRGLSHXI5Y1KQNPX3I22NO5S2TEJPTC' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

In [13]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude',
                  'Venue Category']
    
    return(nearby_venues)

In [14]:
toronto_venues = getNearbyVenues(names=toronto_data['Neighborhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )

Harbourfront
Regent Park
Ryerson
Garden District
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Adelaide
King
Richmond
Dovercourt Village
Dufferin
Harbourfront East
Toronto Islands
Union Station
Little Portugal
Trinity
The Danforth West
Riverdale
Design Exchange
Toronto Dominion Centre
Brockton
Exhibition Place
Parkdale Village
The Beaches West
India Bazaar
Commerce Court
Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North
Forest Hill West
High Park
The Junction South
North Toronto West
The Annex
North Midtown
Yorkville
Parkdale
Roncesvalles
Davisville
Harbord
University of Toronto
Runnymede
Swansea
Moore Park
Summerhill East
Chinatown
Grange Park
Kensington Market
Deer Park
Forest Hill SE
Rathnelly
South Hill
Summerhill West
CN Tower
Bathurst Quay
Island airport
Harbourfront West
King and Spadina
Railway Lands
South Niagara
Rosedale
Stn A PO Boxes 25 The Esplanade
Cabbagetown
St. James Town
First Canadian Place
Underground city


#### Clean venues data and order by most common venues

In [15]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Steakhouse,Café,Thai Restaurant,Bar,Sushi Restaurant,American Restaurant,Hotel,Restaurant,Gym
1,Bathurst Quay,Airport Lounge,Airport Terminal,Airport Service,Plane,Sculpture Garden,Boutique,Boat or Ferry,Harbor / Marina,Airport Gate,Airport
2,Berczy Park,Coffee Shop,Cocktail Bar,Pub,Cheese Shop,Steakhouse,Seafood Restaurant,Farmers Market,Restaurant,Café,Bakery
3,Brockton,Coffee Shop,Breakfast Spot,Café,Yoga Studio,Burrito Place,Restaurant,Caribbean Restaurant,Climbing Gym,Pet Store,Bar
4,Business Reply Mail Processing Centre 969 Eastern,Light Rail Station,Yoga Studio,Garden,Farmers Market,Spa,Fast Food Restaurant,Brewery,Burrito Place,Restaurant,Auto Workshop


#### Cluster Toronto neighborhoods based on most common venues and display map

In [19]:
# set number of clusters
kclusters = 4

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,1,Coffee Shop,Park,Café,Pub,Bakery,Breakfast Spot,Mexican Restaurant,Theater,Ice Cream Shop,Chocolate Shop
3,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636,1,Coffee Shop,Park,Café,Pub,Bakery,Breakfast Spot,Mexican Restaurant,Theater,Ice Cream Shop,Chocolate Shop
13,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937,1,Coffee Shop,Clothing Store,Café,Cosmetics Shop,Middle Eastern Restaurant,Restaurant,Diner,Ramen Restaurant,Pizza Place,Bubble Tea Shop
14,M5B,Downtown Toronto,Garden District,43.657162,-79.378937,1,Coffee Shop,Clothing Store,Café,Cosmetics Shop,Middle Eastern Restaurant,Restaurant,Diner,Ramen Restaurant,Pizza Place,Bubble Tea Shop
27,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1,Coffee Shop,Restaurant,Café,Bakery,Hotel,Italian Restaurant,Breakfast Spot,Gastropub,Park,Diner


In [20]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Display 4 clusters (0 - Trails and Parks; 1 - Coffee shops and snack bars; 2 - Top restaurants and gyms (central Toronto); 3 - Airport services)

In [81]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
112,Central Toronto,0,Bus Line,Park,Lake,Swim School,Women's Store,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant
122,Central Toronto,0,Trail,Park,Jewelry Store,Sushi Restaurant,Dumpling Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Women's Store
123,Central Toronto,0,Trail,Park,Jewelry Store,Sushi Restaurant,Dumpling Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Women's Store
183,Downtown Toronto,0,Park,Trail,Playground,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Women's Store


In [82]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Downtown Toronto,1,Coffee Shop,Park,Café,Pub,Bakery,Breakfast Spot,Mexican Restaurant,Theater,Ice Cream Shop,Chocolate Shop
3,Downtown Toronto,1,Coffee Shop,Park,Café,Pub,Bakery,Breakfast Spot,Mexican Restaurant,Theater,Ice Cream Shop,Chocolate Shop
13,Downtown Toronto,1,Coffee Shop,Clothing Store,Café,Cosmetics Shop,Middle Eastern Restaurant,Tea Room,Bubble Tea Shop,Pizza Place,Italian Restaurant,Diner
14,Downtown Toronto,1,Coffee Shop,Clothing Store,Café,Cosmetics Shop,Middle Eastern Restaurant,Tea Room,Bubble Tea Shop,Pizza Place,Italian Restaurant,Diner
27,Downtown Toronto,1,Coffee Shop,Restaurant,Café,Breakfast Spot,Hotel,Italian Restaurant,Bakery,Gastropub,Park,Farmers Market
36,East Toronto,1,Health Food Store,Coffee Shop,Pub,Women's Store,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
37,Downtown Toronto,1,Coffee Shop,Cocktail Bar,Pub,Cheese Shop,Steakhouse,Seafood Restaurant,Farmers Market,Restaurant,Café,Bakery
41,Downtown Toronto,1,Coffee Shop,Italian Restaurant,Burger Joint,Chinese Restaurant,Bubble Tea Shop,Bar,Café,Thai Restaurant,Spa,Sushi Restaurant
42,Downtown Toronto,1,Grocery Store,Café,Park,Nightclub,Diner,Baby Store,Athletics & Sports,Restaurant,Italian Restaurant,Coffee Shop
49,Downtown Toronto,1,Coffee Shop,Café,Steakhouse,Bar,Thai Restaurant,Restaurant,Sushi Restaurant,American Restaurant,Hotel,Bakery


In [83]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
150,Central Toronto,2,Restaurant,Gym,Playground,Tennis Court,Doner Restaurant,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Dumpling Restaurant
151,Central Toronto,2,Restaurant,Gym,Playground,Tennis Court,Doner Restaurant,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Dumpling Restaurant


In [84]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
164,Downtown Toronto,3,Airport Lounge,Airport Service,Airport Terminal,Boutique,Harbor / Marina,Boat or Ferry,Airport,Airport Food Court,Airport Gate,Plane
165,Downtown Toronto,3,Airport Lounge,Airport Service,Airport Terminal,Boutique,Harbor / Marina,Boat or Ferry,Airport,Airport Food Court,Airport Gate,Plane
166,Downtown Toronto,3,Airport Lounge,Airport Service,Airport Terminal,Boutique,Harbor / Marina,Boat or Ferry,Airport,Airport Food Court,Airport Gate,Plane
167,Downtown Toronto,3,Airport Lounge,Airport Service,Airport Terminal,Boutique,Harbor / Marina,Boat or Ferry,Airport,Airport Food Court,Airport Gate,Plane
168,Downtown Toronto,3,Airport Lounge,Airport Service,Airport Terminal,Boutique,Harbor / Marina,Boat or Ferry,Airport,Airport Food Court,Airport Gate,Plane
169,Downtown Toronto,3,Airport Lounge,Airport Service,Airport Terminal,Boutique,Harbor / Marina,Boat or Ferry,Airport,Airport Food Court,Airport Gate,Plane
170,Downtown Toronto,3,Airport Lounge,Airport Service,Airport Terminal,Boutique,Harbor / Marina,Boat or Ferry,Airport,Airport Food Court,Airport Gate,Plane
