# Peer-grade Assignment: Segmenting and Clustering Neighborhoods in Toronto (week3)

## Part 1: Scraping data from a Wikipedia page and cleaning up our dataframe

In [43]:
# Import the key libraries for scraping and processing the data
import requests
import lxml.html as lh
import pandas as pd
import numpy as np

In [2]:
# Now we assign the link of the website through which we are going to scrape the data
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Create a handle, page, to handle the contents of the website
page = requests.get(url)

# Store the contents of the website under doc
doc = lh.fromstring(page.content)

# Parse data that are stored between <tr>..</tr> of HTML
tr_elements = doc.xpath('//tr')

In [3]:
# Create empty list col and a counter index i which with initial value 0
col=[]
i=0

# For each row, store each first element (header) and an empty list
for t in tr_elements[0]:
    i+=1
    name=t.text_content()
    print ('%d: %s' % (i,name))
    col.append((name,[]))

1: Postal code

2: Borough

3: Neighborhood



In [4]:
# For sanity check, ensure that all the rows have the same width. If not, we probably got something more than just the table.
#Check the length of the first 12 rows
[len(T) for T in tr_elements[:12]]

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

In [5]:
# Creating Pandas DataFrame - each header is appended to a tuple along with an empty list.
# Since out first row is the header, data is stored on the second row onwards
for j in range(1,len(tr_elements)):
    #T is our j'th row
    T=tr_elements[j]
    
    #If row is not of size 3, the //tr data is not from our table 
    if len(T)!=3:
        break
    
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 
        #Check if row is empty
        if i>0:
        #Convert any numerical value to integers
            try:
                data=int(data)
            except:
                pass
        #Append the data to the empty list of the i'th column
        col[i][1].append(data)
        #Increment i for the next column
        i+=1

In [6]:
# Just to be sure, let’s check the length of each column. Ideally, they should all be the same.
[len(C) for (title,C) in col]

[181, 181, 181]

In [7]:
# Now we are ready to create the DataFrame:
Dict={title:column for (title,column) in col}
df=pd.DataFrame(Dict)

# The column names and the values now have \n at the end, let's remove it:
df.replace(regex=r'\n', value='',inplace=True)
df.rename(columns={"Postal code\n":"Postal code","Borough\n":"Borough","Neighborhood\n":"Neighborhood"},inplace=True)

# When there are multiple values in Neighborhood  column, they are separated by "/", let's use "," instead:
df.replace(regex=r' /', value=',',inplace=True)

In [8]:
# Remove all the rows where Borough has a "Not assigned" value
df_cleaned = df[df.Borough !="Not assigned"]

# The last row of the table does not have observations anymore, so let's remove it as well
df_final = df_cleaned[df_cleaned.Borough !="Canadian postal codes"]

In [9]:
# Reset the index counter to start from 0 again and omit values that were removed
df_final.reset_index(inplace=True, drop=True)

# Show the final version of the dataframe after all the amendments
df_final

# Use the .shape method to print the number of rows of your dataframe
df_final.shape[0]

103

## Part 2: Getting the latitude and the longitude coordinates of each neighborhood

In [10]:
# Import additional libraries to handle coordinates
import geocoder # import geocoder

In [11]:
col_one_list = df_final['Postal code'].tolist()

In [12]:
import pgeocode
nomi = pgeocode.Nominatim('ca')
df_output = nomi.query_postal_code(col_one_list)

In [13]:
df_tomerge = df_output[["postal_code","latitude","longitude"]]

In [14]:
df_merged = df_final.merge(df_tomerge, left_on="Postal code", right_on="postal_code") 
df_merged.head()

Unnamed: 0,Postal code,Borough,Neighborhood,postal_code,latitude,longitude
0,M3A,North York,Parkwoods,M3A,43.7545,-79.33
1,M4A,North York,Victoria Village,M4A,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",M5A,43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",M6A,43.7223,-79.4504
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",M7A,43.6641,-79.3889


In [15]:
# Omit the column postal_code to avoid duplication
df_merged = df_merged.drop(["postal_code"], axis=1)
df_merged.head()

Unnamed: 0,Postal code,Borough,Neighborhood,latitude,longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889


## Part 3: Exploring and clustering the neighborhoods in Toronto

## i) Explore dataset and define necessary functions

First, let's explore the dataframe we have created and check which Borough has the highest frequency in our sample. We will then explore this particular Borough further and run a comparable exercise to the one from the lab for Manhattan data.

In [50]:
# Let's first import all the necessary libraries and function we will use in this section
import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

print('Libraries imported')

Libraries imported


In [51]:
# Let's explore check the description of the key characteristics of our dataframe, including the non-numeric series
df_merged.describe(include="all")

Unnamed: 0,Postal code,Borough,Neighborhood,latitude,longitude
count,102,102,102,102.0,102.0
unique,102,9,97,,
top,M6E,North York,Downsview,,
freq,1,24,4,,
mean,,,,43.706716,-79.393987
std,,,,0.053028,0.096185
min,,,,43.6021,-79.5909
25%,,,,43.6611,-79.45175
50%,,,,43.70395,-79.3888
75%,,,,43.7491,-79.33775


Given that "North York" is the most frequent Borough, let's explore it in more detail


In [52]:
# Let's create a separate dataframe for North York borough:
northyork_data = df_merged[df_merged['Borough'] == 'North York'].reset_index(drop=True)
northyork_data.head()

Unnamed: 0,Postal code,Borough,Neighborhood,latitude,longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
3,M3B,North York,Don Mills,43.745,-79.359
4,M6B,North York,Glencairn,43.7081,-79.4479


In [53]:
# Get the coordinates of North York as the average of actual latitude and longitude that we have in our sample
latitude = northyork_data.latitude.mean()
longitude = northyork_data.longitude.mean()

print('The geograpical coordinate of North York, Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of North York, Toronto are 43.7509125, -79.42906666666666.


In [54]:
# create map of North York using latitude and longitude values
map_northyork = folium.Map(location=[latitude, longitude], zoom_start=12)
map_northyork

In [55]:
# Define Foursquare Credentials and Version
CLIENT_ID = 'ATQ4JJEQ2RG0DIKCZDRPVNHT3KIA4NVDT3XNM0C055FOHRPF' # your Foursquare ID
CLIENT_SECRET = 'TILNUCGBVRS41JMMWNMMM4A2JXYL4APWFY4MN23KW3SSBYQ0' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: ATQ4JJEQ2RG0DIKCZDRPVNHT3KIA4NVDT3XNM0C055FOHRPF
CLIENT_SECRET:TILNUCGBVRS41JMMWNMMM4A2JXYL4APWFY4MN23KW3SSBYQ0


In [56]:
# Let's explore the first neighborhood in our dataframe, get the neighborhood's name.
df_merged.loc[0, 'Neighborhood']


neighborhood_latitude = df_merged.loc[0, 'latitude'] # neighborhood latitude value
neighborhood_longitude = df_merged.loc[0, 'longitude'] # neighborhood longitude value

neighborhood_name = df_merged.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

# type your answer here
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

# Run the API request 
results = requests.get(url).json()

Latitude and longitude values of Parkwoods are 43.7545, -79.33.


In [57]:
# Similarly to the analysis in the lab, let's define a function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [58]:
# Now we are ready to clean the json and structure it into a pandas dataframe
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

  after removing the cwd from sys.path.


Unnamed: 0,name,categories,lat,lng
0,Brookbanks Park,Park,43.751976,-79.33214
1,Variety Store,Food & Drink Shop,43.751974,-79.333114
2,Corrosion Service Company Limited,Construction & Landscaping,43.752432,-79.334661


In [59]:
# Define a function that will repeat the same process for all the neighborhoods

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [32]:
# Run the above function for our selected Borough North York and create new dataframe called northyork_venues
northyork_venues = getNearbyVenues(names=northyork_data['Neighborhood'],
                                   latitudes=northyork_data['latitude'],
                                   longitudes=northyork_data['longitude']
                                  )

Parkwoods
Victoria Village
Lawrence Manor, Lawrence Heights
Don Mills
Glencairn
Don Mills
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Fairview, Henry Farm, Oriole
Northwood Park, York University
Bayview Village
Downsview
York Mills, Silver Hills
Downsview
North Park, Maple Leaf Park, Upwood Park
Humber Summit
Willowdale, Newtonbrook
Downsview
Bedford Park, Lawrence Manor East
Humberlea, Emery
Willowdale
Downsview
York Mills West
Willowdale


In [60]:
# Let's check the size of the resulting dataframe
print(northyork_venues.shape)
northyork_venues.head()

(316, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.7545,-79.33,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.7545,-79.33,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Parkwoods,43.7545,-79.33,Corrosion Service Company Limited,43.752432,-79.334661,Construction & Landscaping
3,Victoria Village,43.7276,-79.3148,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.7276,-79.3148,Tim Hortons,43.725517,-79.313103,Coffee Shop


In [61]:
# Let's check how many venues were returned for each neighborhood
northyork_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Bathurst Manor, Wilson Heights, Downsview North",6,6,6,6,6,6
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",24,24,24,24,24,24
Don Mills,7,7,7,7,7,7
Downsview,27,27,27,27,27,27
"Fairview, Henry Farm, Oriole",60,60,60,60,60,60
Glencairn,11,11,11,11,11,11
Hillcrest Village,2,2,2,2,2,2
Humber Summit,3,3,3,3,3,3
"Humberlea, Emery",6,6,6,6,6,6


In [62]:
# Let's find our how many unique categories can be curated from all the returned venues
print('There are {} uniques categories.'.format(len(northyork_venues['Venue Category'].unique())))

There are 116 uniques categories.


## ii)  Analyze Each Neighborhood

In [63]:
# one hot encoding
northyork_onehot = pd.get_dummies(northyork_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
northyork_onehot['Neighborhood'] = northyork_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [northyork_onehot.columns[-1]] + list(northyork_onehot.columns[:-1])
northyork_onehot = northyork_onehot[fixed_columns]

northyork_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Baby Store,Bakery,Bank,Bar,...,Sushi Restaurant,Tailor Shop,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Trail,Video Game Store,Vietnamese Restaurant,Women's Store
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [64]:
# Let's check the new dataframe size
northyork_onehot.shape

(316, 117)

In [65]:
# Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
northyork_grouped = northyork_onehot.groupby('Neighborhood').mean().reset_index()

In [66]:
# Let's print each neighborhood along with the top 5 most common venues
num_top_venues = 5

for hood in northyork_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = northyork_grouped[northyork_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Bathurst Manor, Wilson Heights, Downsview North----
                       venue  freq
0  Middle Eastern Restaurant  0.17
1                Pizza Place  0.17
2                Coffee Shop  0.17
3   Mediterranean Restaurant  0.17
4        Fried Chicken Joint  0.17


----Bayview Village----
               venue  freq
0              Trail  0.25
1               Park  0.25
2        Flower Shop  0.25
3        Gas Station  0.25
4  Accessories Store  0.00


----Bedford Park, Lawrence Manor East----
                venue  freq
0         Pizza Place  0.08
1      Sandwich Place  0.08
2  Italian Restaurant  0.08
3          Restaurant  0.08
4         Coffee Shop  0.08


----Don Mills----
                venue  freq
0                Park  0.29
1                 Gym  0.29
2                Pool  0.14
3               River  0.14
4  Golf Driving Range  0.14


----Downsview----
            venue  freq
0  Discount Store  0.07
1     Pizza Place  0.07
2     Coffee Shop  0.07
3   Grocery Store  0.07
4   Sh

In [67]:
# Let's put that into a pandas dataframe
# First, let's write a function to sort the venues in descending order:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [68]:
#Now let's create the new dataframe and display the top 10 venues for each neighborhood.
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = northyork_grouped['Neighborhood']

for ind in np.arange(northyork_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(northyork_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor, Wilson Heights, Downsview North",Mediterranean Restaurant,Fried Chicken Joint,Deli / Bodega,Coffee Shop,Middle Eastern Restaurant,Pizza Place,Gym / Fitness Center,Gym,Ice Cream Shop,Department Store
1,Bayview Village,Trail,Park,Flower Shop,Gas Station,Women's Store,Fried Chicken Joint,Deli / Bodega,Department Store,Dessert Shop,Discount Store
2,"Bedford Park, Lawrence Manor East",Italian Restaurant,Sandwich Place,Coffee Shop,Restaurant,Pizza Place,Comfort Food Restaurant,Juice Bar,Thai Restaurant,Pharmacy,Café
3,Don Mills,Park,Gym,Pool,Golf Driving Range,River,Women's Store,Food Court,Deli / Bodega,Department Store,Dessert Shop
4,Downsview,Pizza Place,Shopping Mall,Discount Store,Coffee Shop,Grocery Store,Park,Pharmacy,Pool,Fast Food Restaurant,Caribbean Restaurant


## 3iii) Cluster Neighborhoods

In [69]:
# Run k-means to cluster the neighborhood into 5 clusters:
# set number of clusters
kclusters = 5

northyork_grouped_clustering = northyork_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(northyork_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 2, 0, 0])

In [70]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

northyork_merged = northyork_data

# merge northyork_grouped with northyork_data to add latitude/longitude for each neighborhood
northyork_merged = northyork_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

northyork_merged.head() # check the last columns!

Unnamed: 0,Postal code,Borough,Neighborhood,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.7545,-79.33,2,Construction & Landscaping,Park,Food & Drink Shop,Fried Chicken Joint,Deli / Bodega,Department Store,Dessert Shop,Discount Store,Distribution Center,Electronics Store
1,M4A,North York,Victoria Village,43.7276,-79.3148,0,Intersection,Portuguese Restaurant,Pizza Place,Hockey Arena,French Restaurant,Coffee Shop,Park,Department Store,Food Court,Discount Store
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504,0,Clothing Store,Coffee Shop,Cosmetics Shop,Restaurant,Women's Store,Men's Store,Furniture / Home Store,Sandwich Place,Electronics Store,Sporting Goods Shop
3,M3B,North York,Don Mills,43.745,-79.359,0,Park,Gym,Pool,Golf Driving Range,River,Women's Store,Food Court,Deli / Bodega,Department Store,Dessert Shop
4,M6B,North York,Glencairn,43.7081,-79.4479,0,Pizza Place,Gas Station,Latin American Restaurant,Fast Food Restaurant,Grocery Store,Mediterranean Restaurant,Asian Restaurant,Japanese Restaurant,Ice Cream Shop,Dessert Shop


In [71]:
# Let's visualize resulting clusters
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(northyork_merged['latitude'], northyork_merged['longitude'], northyork_merged['Neighborhood'], northyork_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters