In [1]:
import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup


#### Scrape the Wikipedia page for neighborhoods with postal codes starting with M0 and wrangle the data

In [2]:
# get the link of the wiki website
wikiurl = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text

# Pars the HTML file using BeautifulSoup
soup = BeautifulSoup(wikiurl,'lxml')

# find the table
M0_table = soup.find('table',{'class':'wikitable sortable'})

# convert the HTML file to a Panda dataframe
M0_df =pd.read_html(str(M0_table))
M0_df =pd.DataFrame(M0_df[0])

# Drop rows with "Not assigned"
M0_df.drop(M0_df.loc[M0_df['Borough']=='Not assigned'].index, inplace=True)

# reset the dataframe index
M0_df.reset_index(drop=True, inplace=True)

print('This dataframe has {} rows and {} columns'.format(M0_df.shape[0], M0_df.shape[1]))
M0_df.head()

This dataframe has 103 rows and 3 columns


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Get the coordinates of each neighborhood

In [3]:
'''
# Use geocoder to get the coordinates

#!pip install geocoder
import geocoder

M0_df['Latitude'] = np.zeros(M0_df.shape[0])
M0_df['Longitude'] = np.zeros(M0_df.shape[0])

for k in range(len(M0_df)):
    # initialize variable to None
    lat_lng_coords = None
    
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(M0_df['Postal Code'][k]))
        lat_lng_coords = g.latlng
        
    M0_df['Latitude'][k] = lat_lng_coords[0]
    M0_df['Longitude'][k] = lat_lng_coords[1]
'''

# The geocoder package took too long, so I used the provided csv file 
Coordinates = pd.read_csv("https://cocl.us/Geospatial_data")
M0_df = M0_df.join(Coordinates.set_index('Postal Code'), on='Postal Code')

M0_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


#### Reduce the dataset to the Toronto Neighborhoods

In [4]:
# initialize
Toronto_df = pd.DataFrame({'Postal Code' : [], 'Borough' : [], 'Neighbourhood' : [], 'Latitude' : [], 'Longitude' : []})
i = 0

# Make a new dataframe specific to the boroughs that contain the word Toronto
for row in range(len(M0_df)):
    if M0_df['Borough'][row].split()[-1] == 'Toronto':
        Toronto_df.loc[i]=M0_df.loc[row]
        i= i+1
        
Toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


#### Using the Foursquare API to explore the neighborhoods in Toronto

In [5]:
# Define Foursquare Credentials and Version
CLIENT_ID = 'RYKR4MC4MELEBLYWPCDVBI5LDEO4VL1MTGUHP3HPJ1MMMLIQ' # your Foursquare ID
CLIENT_SECRET = 'VYUM1W30DA1GVMAFAMDRU4UHVQUABS5EUCAIYOTG201ATOPY' # your Foursquare Secret


#### Using the getNearbyVenues function defined in the lab, list the top 100 venues Toronto

In [6]:
VERSION = '20180605' # Foursquare API version
LIMIT = 100

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [7]:
Toronto_venues= getNearbyVenues(names = Toronto_df['Neighbourhood'], latitudes = Toronto_df['Latitude'], longitudes = Toronto_df['Longitude'], radius=500)
Toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park, Harbourfront",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


#### Group neighborhoods and take the mean of the frequency of occurrence of each category

In [8]:
# one hot encoding
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_onehot['Neighbourhood'] = Toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

# group rows by neighborhoods
Toronto_grouped = Toronto_onehot.groupby('Neighbourhood').mean().reset_index()
Toronto_grouped

Unnamed: 0,Neighbourhood,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.018182,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.055556,0.055556,0.055556,0.111111,0.166667,0.111111,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.016667,0.0,0.0,0.016667,0.0,0.016667
5,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.0125,0.0,0.0,0.0,0.0,0.0,0.0,0.0125,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025
7,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0
8,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.029412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Find the top 10 venues for each neighborhood

In [9]:
# make the an empty nighborhoods_venues_sorted dataframe

indicators = ['st', 'nd', 'rd']
Col_name = ['Neighbourhood']
indicators
for k in range(10):
    if k <3:
        Col_name.append('{}{} most common'.format(k+1, indicators[k]))
    else:
        Col_name.append('{}th most common'.format(k+1))

neighbourhoods_venues_sorted = pd.DataFrame(columns=Col_name)
neighbourhoods_venues_sorted['Neighbourhood'] = Toronto_grouped['Neighbourhood']


In [10]:
# for each nighborhoods find the top 10 venues by sorting the data descending
for row in range(len(neighbourhoods_venues_sorted)):
    neighbourhoods_venues_sorted.iloc[row, 1:] = Toronto_grouped.iloc[row, :].iloc[1:].astype(float).sort_values(ascending=False).head(10).index.values[0:]

neighbourhoods_venues_sorted.head()


Unnamed: 0,Neighbourhood,1st most common,2nd most common,3rd most common,4th most common,5th most common,6th most common,7th most common,8th most common,9th most common,10th most common
0,Berczy Park,Coffee Shop,Cocktail Bar,Bakery,Cheese Shop,Restaurant,Farmers Market,Beer Bar,Seafood Restaurant,Greek Restaurant,Basketball Stadium
1,"Brockton, Parkdale Village, Exhibition Place",Café,Breakfast Spot,Nightclub,Coffee Shop,Pet Store,Stadium,Bar,Intersection,Bakery,Restaurant
2,"Business reply mail Processing Centre, South C...",Gym / Fitness Center,Farmers Market,Skate Park,Auto Workshop,Burrito Place,Garden,Fast Food Restaurant,Garden Center,Light Rail Station,Park
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Airport Terminal,Sculpture Garden,Harbor / Marina,Plane,Boat or Ferry,Rental Car Location,Boutique,Bar
4,Central Bay Street,Coffee Shop,Italian Restaurant,Sandwich Place,Café,Burger Joint,Salad Place,Bubble Tea Shop,Poke Place,Portuguese Restaurant,Pizza Place


#### Cluster the neighborhoods


In [49]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 7

Toronto_grouped_clustering = Toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)



#### Add a column to the Toronto dataframe indicating the cluster number

In [50]:
Toronto_merged= Toronto_df
Toronto_merged['Cluster'] = kmeans.labels_
Toronto_merged.head()
Toronto_merged = Toronto_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

Toronto_merged.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster,1st most common,2nd most common,3rd most common,4th most common,5th most common,6th most common,7th most common,8th most common,9th most common,10th most common
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,6,Coffee Shop,Pub,Bakery,Park,Breakfast Spot,Café,Theater,Farmers Market,French Restaurant,Wine Shop
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0,Coffee Shop,Sushi Restaurant,Yoga Studio,Bar,Smoothie Shop,Sandwich Place,Burrito Place,Café,Portuguese Restaurant,Park
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Clothing Store,Coffee Shop,Hotel,Bubble Tea Shop,Cosmetics Shop,Café,Italian Restaurant,Middle Eastern Restaurant,Japanese Restaurant,Diner
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,6,Coffee Shop,Café,Cocktail Bar,American Restaurant,Gastropub,Cosmetics Shop,Clothing Store,Seafood Restaurant,Beer Bar,Farmers Market
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,6,Coffee Shop,Health Food Store,Neighborhood,Trail,Pub,Yoga Studio,Dog Run,Diner,Discount Store,Distribution Center


#### Generate a map to visualize the clustered neighborhoods 

In [55]:
# creat a map
#!pip install folium
import folium

Colors = ['blue', 'red', 'pink', 'yellow', 'orange', 'purple', 'green']

map_clusters = folium.Map(location=[Toronto_merged['Latitude'].mean(), Toronto_merged['Longitude'].mean()], zoom_start=12)
for Neighb, lat, lng, cluster in zip(Toronto_merged['Neighbourhood'], Toronto_merged['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Cluster']):
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup='{}, \n cluster{}'.format(Neighb, cluster),
        color=Colors[cluster],
        fill=True,
        fill_color=Colors[cluster],
        fill_opacity=0.7).add_to(map_clusters)
map_clusters