# My Toronto assignment

## Week 3 - Part I

In [1]:
import pandas as pd
!pip install bs4
import bs4 as bs4
!pip install lxml
import lxml as lxml
import numpy as np

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans
import requests # library to handle requests
import random # library for random number generation

!pip install geopy
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
! pip install folium==0.5.0
import folium # plotting library

print('Libraries imported.')

Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25ldone
[?25h  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1272 sha256=19c96c10101750ab31b0227e27c947bad0cda3ce4b03294a4c80debabd318900
  Stored in directory: /tmp/wsuser/.cache/pip/wheels/0a/9e/ba/20e5bbc1afef3a491f0b3bb74d508f99403aabe76eda2167ca
Successfully built bs4
Installing collected packages: bs4
Successfully installed bs4-0.0.1
Collecting folium==0.5.0
  Downloading folium-0.5.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 8.0 MB/s  eta 0:00:01
[?25hCollecting branca
  Downloading branca-0.4.1-py3-none-any.whl (24 kB)
Building wheels for collected packages: folium
  Building wheel for folium (setup.py) ... [?25ldone
[?25h  Created wheel for folium: filename=folium-0.5.0-py3-none-any.whl size=76240 sha256=601f2069dac55ae64f8ce0d024fb545635e27b614374a018f3a6e72c73b40d3c
  Stored in directory: /tm

In [2]:
#Extract data from HTML
# keep_default_na=False keeps the literal text 'Not assigned' as a string instead of NaN.
tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', header=0,
                      keep_default_na=False)

headings = ['Postal Code', 'Borough', 'Neighbourhood']

#Iterate through all tables and pick the first one whose leading columns match `headings`
for table in tables:
    current_headings = list(table.columns[:3])
    if current_headings == headings:
        break
else:
    # Without this guard `table` would silently stay bound to the last table parsed,
    # and the failure would only show up later as a confusing KeyError.
    raise ValueError('No table with leading columns {} found on the page'.format(headings))


In [3]:
#Build dataframe

# Take an explicit copy: the following cells modify df_toronto, and working on a
# slice of `table` would trigger SettingWithCopyWarning.
df_toronto = table[headings].copy()


In [4]:
#Clean dataframe

# Keep only rows whose Borough is assigned and store every column as str.
# Filtering into a new frame (instead of drop(..., inplace=True)) avoids mutating a
# possible slice of `table` and keeps the cell safe to re-run.
df_toronto = df_toronto.loc[df_toronto['Borough'] != 'Not assigned'].astype(str)


In [5]:
#Combine Neighbourhoods with same Postal Code and Borough into one row

# groupby(...).agg collapses each (Postal Code, Borough) group into a single row.
# The previous transform() only rewrote the Neighbourhood text and left one row per
# original entry, so duplicates survived and a re-run joined the joined strings again.
# sort=False keeps the groups in order of first appearance.
df_toronto = (
    df_toronto.astype('str')
    .groupby(['Postal Code', 'Borough'], as_index=False, sort=False)['Neighbourhood']
    .agg(','.join)
)


In [6]:
#Replace a 'Not assigned' Neighbourhood with the Borough value of the same row

neighbourhood_missing = df_toronto['Neighbourhood'] == 'Not assigned'
df_toronto['Neighbourhood'] = df_toronto['Neighbourhood'].mask(neighbourhood_missing, df_toronto['Borough'])


## Week 3 - Part II


In [7]:
#Extract data from CSV
# Load the postal-code coordinates; the first row is the header and empty
# strings are kept as-is rather than converted to NaN.
tables_coordinates = pd.read_csv(
    'https://cocl.us/Geospatial_data',
    keep_default_na=False,
    header=0,
)


Postal Code     object
Latitude       float64
Longitude      float64
dtype: object

In [8]:
#Combine CSV and Wikipedia data

# Inner join on the shared 'Postal Code' column, coordinates on the left.
df_toronto_complete = pd.merge(tables_coordinates, df_toronto, on='Postal Code')

# Display only: the sorted frame is shown but not stored.
df_toronto_complete.sort_values('Neighbourhood')


Unnamed: 0,Postal Code,Latitude,Longitude,Borough,Neighbourhood
12,M1S,43.794200,-79.262029,Scarborough,Agincourt
89,M8W,43.602414,-79.543484,Etobicoke,"Alderwood, Long Branch"
28,M3H,43.754328,-79.442259,North York,"Bathurst Manor, Wilson Heights, Downsview North"
19,M2K,43.786947,-79.385975,North York,Bayview Village
62,M5M,43.733283,-79.419750,North York,"Bedford Park, Lawrence Manor East"
...,...,...,...,...,...
24,M2R,43.782736,-79.442259,North York,"Willowdale, Willowdale West"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
36,M4C,43.695344,-79.318389,East York,Woodbine Heights
23,M2P,43.752758,-79.400049,North York,York Mills West


In [9]:
#filter neighbourhoods containing 'West' in their name

# Substring match, so names such as "Weston" are selected as well.
has_west = df_toronto_complete['Neighbourhood'].str.contains("West")
df_west = df_toronto_complete.loc[has_west]
df_west


Unnamed: 0,Postal Code,Latitude,Longitude,Borough,Neighbourhood
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood, Morningside, West Hill"
8,M1M,43.716316,-79.239476,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,43.692657,-79.264848,Scarborough,"Birch Cliff, Cliffside West"
15,M1W,43.799525,-79.318389,Scarborough,"Steeles West, L'Amoreaux West"
23,M2P,43.752758,-79.400049,North York,York Mills West
24,M2R,43.782736,-79.442259,North York,"Willowdale, Willowdale West"
41,M4K,43.679557,-79.352188,East Toronto,"The Danforth West, Riverdale"
42,M4L,43.668999,-79.315572,East Toronto,"India Bazaar, The Beaches West"
46,M4R,43.715383,-79.405678,Central Toronto,"North Toronto West, Lawrence Park"
49,M4V,43.686412,-79.400049,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest..."


## Week 3 - Part III

In [10]:
#set parameter for Foursquare query

import os

# Credentials must not be committed with the notebook: read them from the environment.
# Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') # your Foursquare Secret
if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError('Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET environment variables')

VERSION = '20180604'  # value sent as the API's `v` parameter
LIMIT = 30            # value sent as the API's `limit` parameter
radius = 5000
# Map centre used when the clusters are drawn
latitude = 43.763573
longitude = -79.188711


In [11]:
#find nearby venues

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Fetch venues around each location from the Foursquare ``venues/explore`` endpoint.

    Uses the module-level ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and ``LIMIT``.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Parallel sequences; one request is made per (name, lat, lng) triple.
    radius : int, optional
        Forwarded unchanged as the API's ``radius`` parameter (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty, but has these columns, when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (e.g. bad credentials).
    """
    column_names = ['Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # let requests build and escape the query string
        query = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; fail loudly on HTTP errors instead of a later KeyError
        response = requests.get(endpoint, params=query, timeout=30)
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without any category must not abort the whole run
                categories[0]['name'] if categories else None))

    # passing `columns` here also works for an empty result, where assigning
    # `.columns` afterwards would fail with a length mismatch
    return pd.DataFrame(rows, columns=column_names)

# Restrict the venue search to the neighbourhoods selected in df_west
toronto_data = df_west

toronto_venues = getNearbyVenues(
    toronto_data['Neighbourhood'],
    toronto_data['Latitude'],
    toronto_data['Longitude'],
)

# Number of venues returned per neighbourhood
toronto_venues.groupby('Neighborhood').count()

Guildwood, Morningside, West Hill
Cliffside, Cliffcrest, Scarborough Village West
Birch Cliff, Cliffside West
Steeles West, L'Amoreaux West
York Mills West
Willowdale, Willowdale West
The Danforth West, Riverdale
India Bazaar, The Beaches West
North Toronto West, Lawrence Park
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
Forest Hill North & West, Forest Hill Road Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Mimico NW, The Queensway West, South of Bloor, Kingsway Park South West, Royal York South West
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Weston
Westmount
Northwest, West Humber - Clairville


Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Birch Cliff, Cliffside West",4,4,4,4,4,4
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",16,16,16,16,16,16
"Cliffside, Cliffcrest, Scarborough Village West",2,2,2,2,2,2
"Forest Hill North & West, Forest Hill Road Park",4,4,4,4,4,4
"Guildwood, Morningside, West Hill",8,8,8,8,8,8
"India Bazaar, The Beaches West",19,19,19,19,19,19
"Mimico NW, The Queensway West, South of Bloor, Kingsway Park South West, Royal York South West",16,16,16,16,16,16
"North Toronto West, Lawrence Park",18,18,18,18,18,18
"Northwest, West Humber - Clairville",4,4,4,4,4,4
"Steeles West, L'Amoreaux West",12,12,12,12,12,12


In [12]:
# one hot encoding: one indicator column per venue category
category_dummies = pd.get_dummies(toronto_venues['Venue Category'])

# A venue category that is itself called 'Neighborhood' would collide with the label
# column added below (and the old "move the last column to the front" step would then
# move the wrong column), so give such a category an unambiguous name first.
category_dummies = category_dummies.rename(columns={'Neighborhood': 'Neighborhood (Venue Category)'})

# neighborhood label as the first column, followed by the category indicators
toronto_onehot = pd.concat([toronto_venues[['Neighborhood']], category_dummies], axis=1)

toronto_onehot.head()

Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Bagel Shop,Bakery,...,Steakhouse,Supermarket,Supplement Shop,Sushi Restaurant,Tanning Salon,Thrift / Vintage Store,Trail,Vietnamese Restaurant,Wings Joint,Yoga Studio
0,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
#group venues by neighborhood

# The mean of the one-hot rows gives, per neighborhood, the share of venues in each category.
toronto_grouped = (
    toronto_onehot
    .groupby('Neighborhood')
    .mean()
    .reset_index()
)
toronto_grouped

Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Bagel Shop,Bakery,...,Steakhouse,Supermarket,Supplement Shop,Sushi Restaurant,Tanning Salon,Thrift / Vintage Store,Trail,Vietnamese Restaurant,Wings Joint,Yoga Studio
0,"Birch Cliff, Cliffside West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"CN Tower, King and Spadina, Railway Lands, Har...",0.0625,0.0625,0.0625,0.125,0.125,0.0625,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Cliffside, Cliffcrest, Scarborough Village West",0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Forest Hill North & West, Forest Hill Road Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.25,0.0,0.0,0.25,0.0,0.0,0.0
4,"Guildwood, Morningside, West Hill",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"India Bazaar, The Beaches West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.052632,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0
6,"Mimico NW, The Queensway West, South of Bloor,...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,...,0.0,0.0,0.0625,0.0,0.0625,0.0625,0.0,0.0,0.0625,0.0
7,"North Toronto West, Lawrence Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556
8,"Northwest, West Humber - Clairville",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Steeles West, L'Amoreaux West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
#define function to return top venues

def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest entries of `row`.

    The first element of `row` is skipped (callers pass rows whose first
    entry is the neighborhood name); the remaining values are ranked in
    descending order and the index labels of the leading entries are returned
    as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [15]:
#find top venues

num_top_venues = 10

indicators = ['st', 'nd', 'rd']


def _ordinal(rank):
    """Return `rank` with its English ordinal suffix, e.g. 1st, 2nd, 3rd, 4th, 11th, 21st."""
    if 10 <= rank % 100 <= 20:
        return '{}th'.format(rank)
    last_digit = rank % 10
    suffix = indicators[last_digit - 1] if 1 <= last_digit <= 3 else 'th'
    return '{}{}'.format(rank, suffix)


# create columns according to number of top venues
# (explicit suffix logic replaces the bare `except:` that produced names like "4the ...")
columns = ['Neighborhood'] + [
    '{} Most Common Venue'.format(_ordinal(rank)) for rank in range(1, num_top_venues + 1)
]

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# fill each row with the most frequent venue categories of that neighborhood
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4the Most Common Venue,5the Most Common Venue,6the Most Common Venue,7the Most Common Venue,8the Most Common Venue,9the Most Common Venue,10the Most Common Venue
0,"Birch Cliff, Cliffside West",General Entertainment,College Stadium,Skating Rink,Café,Diner,Chinese Restaurant,Clothing Store,Coffee Shop,Convenience Store,Cosmetics Shop
1,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Airport,Bar,Plane,Boutique,Sculpture Garden,Coffee Shop,Boat or Ferry,Harbor / Marina
2,"Cliffside, Cliffcrest, Scarborough Village West",Motel,American Restaurant,Yoga Studio,Discount Store,Clothing Store,Coffee Shop,College Stadium,Convenience Store,Cosmetics Shop,Dessert Shop
3,"Forest Hill North & West, Forest Hill Road Park",Park,Trail,Sushi Restaurant,Jewelry Store,Yoga Studio,Diner,Chinese Restaurant,Clothing Store,Coffee Shop,College Stadium
4,"Guildwood, Morningside, West Hill",Mexican Restaurant,Medical Center,Electronics Store,Rental Car Location,Intersection,Restaurant,Bank,Breakfast Spot,Yoga Studio,Convenience Store


In [16]:
# set number of clusters
kclusters = 5

# cluster on the category frequencies only; the neighborhood label is not a feature.
# `columns=` replaces the positional axis argument (`drop('Neighborhood', 1)`),
# which newer pandas versions no longer accept.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering (fixed random_state for reproducible labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 3, 2, 3, 3, 3, 3, 3, 3, 3], dtype=int32)

In [17]:
# add clustering labels
# (remove a column left over from a previous run first: insert() raises if it already exists)
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# join the cluster labels and top venues onto toronto_data, which carries latitude/longitude
# for each neighbourhood; rows without a match in neighborhoods_venues_sorted get NaN
toronto_merged = toronto_data.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Latitude,Longitude,Borough,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4the Most Common Venue,5the Most Common Venue,6the Most Common Venue,7the Most Common Venue,8the Most Common Venue,9the Most Common Venue,10the Most Common Venue
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood, Morningside, West Hill",3,Mexican Restaurant,Medical Center,Electronics Store,Rental Car Location,Intersection,Restaurant,Bank,Breakfast Spot,Yoga Studio,Convenience Store
8,M1M,43.716316,-79.239476,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",2,Motel,American Restaurant,Yoga Studio,Discount Store,Clothing Store,Coffee Shop,College Stadium,Convenience Store,Cosmetics Shop,Dessert Shop
9,M1N,43.692657,-79.264848,Scarborough,"Birch Cliff, Cliffside West",0,General Entertainment,College Stadium,Skating Rink,Café,Diner,Chinese Restaurant,Clothing Store,Coffee Shop,Convenience Store,Cosmetics Shop
15,M1W,43.799525,-79.318389,Scarborough,"Steeles West, L'Amoreaux West",3,Fast Food Restaurant,Grocery Store,Bank,Gym,Indian Restaurant,Coffee Shop,Chinese Restaurant,Pharmacy,Breakfast Spot,Sandwich Place
23,M2P,43.752758,-79.400049,North York,York Mills West,1,Park,Convenience Store,Yoga Studio,Diner,Chinese Restaurant,Clothing Store,Coffee Shop,College Stadium,Cosmetics Shop,Dessert Shop


In [18]:
#display cluster on map

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Rows without a cluster label (NaN after the join) cannot be coloured; skip them.
# A single NaN also turns the whole column into floats, hence the int() cast below.
clustered = toronto_merged.dropna(subset=['Cluster Labels'])

# add markers to the map
for lat, lon, poi, cluster in zip(clustered['Latitude'], clustered['Longitude'], clustered['Neighbourhood'], clustered['Cluster Labels']):
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # index by the label itself; `cluster-1` only worked for label 0 through
        # negative-index wraparound
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters