# Clustering Neighbourhoods in Toronto

## Part 1: Scrape neighbourhoods data into pandas dataframe and pre-process dataset

#### Setup Libraries & Packages

In [12]:
pip install lxml html5lib beautifulsoup4


Note: you may need to restart the kernel to use updated packages.


In [13]:
import pandas as pd

#### Scrape neighbourhoods data

In [56]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
dfs = pd.read_html(url)
df1 = dfs[0]

#### Remove records where a borough is not assigned

In [57]:
df1.drop(df[df['Borough']=="Not assigned"].index, inplace = True) 

In [58]:
df1

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


#### Identify the number of rows and columns in cleaned dataframe

In [59]:
print('The dataframe has ', df1.shape[0], ' rows and ', df1.shape[1], ' columns.')

The dataframe has  180  rows and  3  columns.


## Part 2: Obtain latitude and longitude coordinates for each neighbourhood

#### Import csv file

In [60]:
path = "https://raw.githubusercontent.com/sarahmoakler/Coursera_Capstone/master/Geospatial_Coordinates.csv"
df2 = pd.read_csv(path)
df2

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


#### Merge the dataframes into one

In [61]:
df = df1.merge(df2, on="Postal Code")

In [62]:
df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


#### Identify the number of rows and columns in merged dataframe

In [63]:
print('The dataframe has ', df.shape[0], ' rows and ', df.shape[1], ' columns.')

The dataframe has  103  rows and  5  columns.


## Part 3: Explore and Cluster Neighbourhoods

#### Work with boroughs that include North York (report any observations, generate maps to visualize and how they cluster)

#### Determine which borough has the most neighbourhoods

In [64]:
df['Borough'].value_counts()

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
York                 5
East York            5
East Toronto         5
Mississauga          1
Name: Borough, dtype: int64

#### Since North York has the most neighbourhoods. I will create a new data frame for  North York and cluster this borough.

In [66]:
NorthYork_df = df[df['Borough']=='North York'].reset_index(drop=True)

#### Identify the number of rows and columns in the North York dataframe

In [69]:
print('The dataframe has ', NorthYork_df.shape[0], ' rows and ', NorthYork_df.shape[1], ' columns.')

The dataframe has  24  rows and  5  columns.


#### The North York dataframe has 24 rows which matches the original count from the original dataframe.

#### Import more libraries and packages

In [76]:
!conda install -c conda-forge folium=0.5.0 --yes

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: \ 
Found conflicts! Looking for incompatible packages.
This can take several minutes.  Press CTRL-C to abort.
                                                                                                                    \failed

UnsatisfiableError: The following specifications were found
to be incompatible with the existing python installation in your environment:

Specifications:

  - cffi -> python[version='2.7.*|3.5.*|3.6.*|3.6.12|3.6.12|>=3.6,<3.7.0a0|>=3.7,<3.8.0a0|>=3.9,<3.10.0a0|>=3.8,<3.9.0a0|3.7.9|3.6.9|3.6.9|3.6.9|>=2.7,<2.8.0a0|3.6.9|>=3.5,<3.6.0a0|3.4.*',build='0_73_pypy|2_73_pypy|3_73_pypy|5_73_pypy|4_73_pypy|1_73_pypy']
  - rsa -> python[version='2.7.*|3.4.*|3.5.*|3.6.*']


In [114]:
import numpy as np
import json
from geopy.geocoders import Nominatim
import requests
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
!pip install folium
import folium



#### Utilize Foursquare API to explore North York's neighbourhoods and segment them

In [177]:
{
    "tags": [
        "remove-input",
    ]
}
CLIENT_ID = '0K1GTLHJ5ZCI4DMT4LF4CVL01G4VIOO3SK5AQ55AZPVNVYK1'
CLIENT_SECRET = 'ROW3NMQDMAQWJTAEID52BNLGQDTYIIOLYZ15B5XHXA3ZBNIW'
VERSION = '20180605'
LIMIT = 100

#### Create function to get nearby venues

In [88]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Run the function to get venues for each North York neighbourhood

In [89]:
NorthYork_venues = getNearbyVenues(names=NorthYork_df['Neighbourhood'], 
                                   latitudes=NorthYork_df['Latitude'],
                                   longitudes=NorthYork_df['Longitude'])

Parkwoods
Victoria Village
Lawrence Manor, Lawrence Heights
Don Mills
Glencairn
Don Mills
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Fairview, Henry Farm, Oriole
Northwood Park, York University
Bayview Village
Downsview
York Mills, Silver Hills
Downsview
North Park, Maple Leaf Park, Upwood Park
Humber Summit
Willowdale, Newtonbrook
Downsview
Bedford Park, Lawrence Manor East
Humberlea, Emery
Willowdale, Willowdale East
Downsview
York Mills West
Willowdale, Willowdale West


#### Check the size of the new North York venues dataframe

In [93]:
print('The dataframe has ', NorthYork_venues.shape[0], ' rows and ', NorthYork_venues.shape[1], ' columns.')

The dataframe has  243  rows and  7  columns.


#### One hot encode the venues

In [105]:
NorthYork_onehot = pd.get_dummies(NorthYork_venues[['Venue Category']], prefix="", prefix_sep="")
NorthYork_onehot['Neighbourhood'] = NorthYork_venues['Neighborhood']
fixed_columns = [NorthYork_onehot.columns[-1]] + list(NorthYork_onehot.columns[:-1])
NorthYork_onehot = NorthYork_onehot[fixed_columns]

#### Check the size of the North York one hot encoded dataframe

In [108]:
print('The dataframe has ', NorthYork_onehot.shape[0], ' rows and ', NorthYork_onehot.shape[1], ' columns.')

The dataframe has  243  rows and  103  columns.


#### Group each neighbourhood by taking the mean of the frequency of occurence for each venue category

In [109]:
NorthYork_grouped = NorthYork_onehot.groupby('Neighbourhood').mean().reset_index()

#### Check the size of the North York grouped dataframe

In [111]:
print('The dataframe has ', NorthYork_grouped.shape[0], ' rows and ', NorthYork_grouped.shape[1], ' columns.')

The dataframe has  19  rows and  103  columns.


#### Create a dataframe for the top 10 venues in each neighbourhood

In [112]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [142]:
num_top_venues = 10
indicators = ['st', 'nd', 'rd']
columns = ['Neighbourhood']

for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

NorthYork_venues_sorted = pd.DataFrame(columns=columns)
NorthYork_venues_sorted['Neighbourhood'] = NorthYork_grouped['Neighbourhood']

for ind in np.arange(NorthYork_grouped.shape[0]):
    NorthYork_venues_sorted.iloc[ind, 1:] = return_most_common_venues(NorthYork_grouped.iloc[ind, :], num_top_venues)

NorthYork_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Fried Chicken Joint,Pharmacy,Pizza Place,Bridal Shop,Diner,Ice Cream Shop,Park,Deli / Bodega
1,Bayview Village,Chinese Restaurant,Café,Bank,Japanese Restaurant,Women's Store,Distribution Center,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega
2,"Bedford Park, Lawrence Manor East",Italian Restaurant,Thai Restaurant,Coffee Shop,Sandwich Place,Greek Restaurant,Restaurant,Grocery Store,Indian Restaurant,Juice Bar,Liquor Store
3,Don Mills,Gym,Japanese Restaurant,Coffee Shop,Beer Store,Restaurant,Discount Store,Italian Restaurant,Bike Shop,Dim Sum Restaurant,Sandwich Place
4,Downsview,Grocery Store,Park,Athletics & Sports,Electronics Store,Liquor Store,Baseball Field,Food Truck,Bank,Gym / Fitness Center,Airport


#### Cluster North York neighbourhoods into 5 clusters

In [143]:
kclusters = 5
NorthYork_grouped_clustering = NorthYork_grouped.drop('Neighbourhood', 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(NorthYork_grouped_clustering)

#### Create a new dataframe for the clusters that includes the top 10 venues for each neighbourhood

In [163]:
#NorthYork_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
NorthYork_merged = NorthYork_df
NorthYork_merged = NorthYork_merged.join(NorthYork_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

In [164]:
NorthYork_merged = NorthYork_merged.dropna(subset=['Cluster Labels']).reset_index(drop=True)

In [166]:
NorthYork_merged['Cluster Labels'] = NorthYork_merged['Cluster Labels'].astype(int)

#### Obtain geographical coordinates of North York

In [168]:
address = 'North York, ON'
geolocator = Nominatim(user_agent="on_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('Geograpical coordinate of North York is {}, {}'.format(latitude, longitude))

Geograpical coordinate of North York is 43.7543263, -79.44911696639593


#### Visualize the clusters

In [175]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(NorthYork_merged['Latitude'], NorthYork_merged['Longitude'], NorthYork_merged['Neighbourhood'], NorthYork_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster - 1],
        fill=True,
        fill_color=rainbow[cluster - 1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters