# Description:

Purpose of the notebook: This notebook contains the work performed by myself, Roel van der Zee, for the assignment, 'Segmenting and Clustering Neighborhoods in Toronto,' as part of the IBM Data Science Professional Certificate - Applied Data Science Capstone course - week 3.
Course URL: https://www.coursera.org/learn/applied-data-science-capstone/

Assignment description: Scrape a Wikipedia page to obtain information on neighborhoods in Toronto using BeautifulSoup package. Perform data wrangling. Explore and cluster neighborhoods in Toronto using Foursquare API and K-means clustering.

#### Additional resources:

Wiki page URL: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

Neighborhood latitude and longitude coordinates document: http://cocl.us/Geospatial_data

## Step 1: Scrape data from Wikipedia page containing information about the neighborhoods in Toronto, Canada into a dataframe

In [1]:
# install beautifulsoup web scraping package and other required libraries
!pip install beautifulsoup4 
!pip install lxml # parser
!pip install html5lib
!pip install requests
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
print('Libraries imported!')



ERROR: Invalid requirement: '#'


Libraries imported!


In [2]:
# parse the html data from the Wikipedia page into a BeautifulSoup object
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(URL)

soup = BeautifulSoup(page.content, 'lxml')

In [3]:
# create a new dataframe to store neighborfood data
columns = ['postalcode','borough','neighborhood']
neigh_df = pd.DataFrame(columns=columns)
neigh_df

Unnamed: 0,postalcode,borough,neighborhood


In [4]:
# create empty lists to store Postal Code, Borough, and Neigborhood data
postalcode_list = []
borough_list = []
neighborhood_list = []

# loop through soup and append values with tag 'tr' and td' to the 3 lists created above
for tr in soup.find_all('tr')[1:]:
    tds = tr.find_all('td')
    try: # use try/ except to avoid errors
        postalcode_list.append(tds[0].text)
        borough_list.append(tds[1].text)
        neighborhood_list.append(tds[2].text[:-1]) # remove unwanted characters from neighborhood names
    except Exception as e:
        pass

# delete unnecessary items from each list
del(postalcode_list[287:292])
del(borough_list[287:291])
del(neighborhood_list[287:290])

# confirm that each list has the same number of elements & that neighborhood list contains all neighborhoods from the wiki table
print(len(postalcode_list))
print(len(borough_list))
print(len(neighborhood_list))
print(neighborhood_list)

184
184
183
['Not assigned', 'Not assigned', 'Parkwoods', 'Victoria Village', 'Regent Park, Harbourfront', 'Lawrence Manor, Lawrence Heights', "Queen's Park, Ontario Provincial Government", 'Not assigned', 'Islington Avenue, Humber Valley Village', 'Malvern, Rouge', 'Not assigned', 'Don Mills', 'Parkview Hill, Woodbine Gardens', 'Garden District, Ryerson', 'Glencairn', 'Not assigned', 'Not assigned', 'West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale', 'Rouge Hill, Port Union, Highland Creek', 'Not assigned', 'Don Mills', 'Woodbine Heights', 'St. James Town', 'Humewood-Cedarvale', 'Not assigned', 'Not assigned', 'Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood', 'Guildwood, Morningside, West Hill', 'Not assigned', 'Not assigned', 'The Beaches', 'Berczy Park', 'Caledonia-Fairbanks', 'Not assigned', 'Not assigned', 'Not assigned', 'Woburn', 'Not assigned', 'Not assigned', 'Leaside', 'Central Bay Street', 'Christie', 'Not assigned', 'Not assigned', 'Not 

In [5]:
neigh_df['postalcode'] = postalcode_list
neigh_df['borough'] = borough_list
neigh_df['neighborhood'] = neighborhood_list
neigh_df.head()

ValueError: Length of values (183) does not match length of index (184)

In [None]:
# delete rows which have 'Not assigned' as borough
neigh_df = neigh_df[neigh_df.borough != 'Not assigned'].reset_index()
print("Number of rows before combining: ", neigh_df.shape[0])
neigh_df.head()

In [None]:
# more than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.

# aggregate rows
neigh_df = neigh_df.groupby(['postalcode', 'borough'], as_index = False).agg({'neighborhood': ', '.join})
print("Number of rows after combining: ", neigh_df.shape[0])
neigh_df.head()

In [None]:
# if a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough

x = 0

for i in range(len(neigh_df['neighborhood'])):
    if neigh_df.loc[i, 'neighborhood'] == 'Not assigned':
        neigh_df.loc[i, 'neighborhood'] = neigh_df.loc[i, 'borough']
        x = x + 1 # count number of updates
print("Number of updates made: ", x)

## Step 2: Add latitude and longitude coordinates to the dataframe for each neighborhood

In [None]:
# read the longtitude, latitude document into a data frame
geo_df = pd.read_csv("http://cocl.us/Geospatial_data")
geo_df.head()

In [None]:
# add longtitude and latitude coordinates to the data frame for each neighborhood
for i in range(len(neigh_df['neighborhood'])):
    if neigh_df.loc[i,'postalcode'] == geo_df.loc[i,'Postal Code']:
        neigh_df.loc[i,'latitude'] = geo_df.loc[i,'Latitude']
        neigh_df.loc[i,'longitude'] = geo_df.loc[i,'Longitude']
        
neigh_df.head()

## Step 3: Explore and cluster neighborhoods in Toronto using Foursquare API

In [None]:
# create a new dataframe to store neighborfood data
columns = ['postalcode','borough','neighborhood','latitude','longitude']
toronto_df = pd.DataFrame(columns=columns)
toronto_df

In [None]:
# filter the toronto_df to retain only neighborhoods in Toronto
x = 0 # create counter x

for i in range(len(neigh_df['neighborhood'])):
    if "Toronto" in neigh_df.loc[i,'borough']:
        toronto_df.loc[i] = neigh_df.loc[i]
        x = x + 1

# reset index
toronto_df.reset_index(drop=True, inplace=True)

# print output
print('There are',x, 'neighborhoods in Toronto')
toronto_df.head()

In [None]:
# import folium library for mapping
!conda install -c conda-forge folium=0.5.0 --yes
import folium
print("done!")

In [None]:
# create variables to store latitude and longitude coordinates of Toronto
latitude = 43.7001100
longitude = -79.4163000

# # create map of Manhattan using latitude and longitude values

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(toronto_df['latitude'], toronto_df['longitude'], toronto_df['neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [None]:
# The code was removed by Watson Studio for sharing.

In [None]:
# limit number of venues returned by Foursquare API to 100
LIMIT = 100

# set radius of 500 meters
radius = 500 

# create a custom function to retrieve nearby venues from Foursquare API
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['neighborhood', 
                  'neighborhood latitude', 
                  'neighborhood longitude', 
                  'venue', 
                  'venue latitude', 
                  'venue longitude', 
                  'venue category']
    
    return(nearby_venues)

In [None]:
# run the custom function above and print outputs
toronto_venues = getNearbyVenues(names=toronto_df['neighborhood'],
                                   latitudes=toronto_df['latitude'],
                                   longitudes=toronto_df['longitude']
                                  )

print(toronto_venues.shape)
toronto_venues.head()

In [None]:
# check venues returned fro each neighborhood
toronto_venues.groupby('neighborhood').count()

In [None]:
print('There are {} uniques categories.'.format(len(toronto_venues['venue category'].unique())))

In [None]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['venue category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['neighborhood'] = toronto_venues['neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

print(toronto_onehot.shape)
toronto_onehot.head()

In [None]:
# group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
toronto_grouped = toronto_onehot.groupby('neighborhood').mean().reset_index()
print(toronto_grouped.shape)
toronto_grouped.head()

In [None]:
# print each neighborhood along with the top 5 most common venues
num_top_venues = 5

for hood in toronto_grouped['neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

In [None]:
# create a function to sort the venues in descending order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [None]:
# create a new dataframe and display the top 10 venues for each neighborhood
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['neighborhood'] = toronto_grouped['neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

In [None]:
# import K-means library
from sklearn.cluster import KMeans

# run k-means to cluster the neighborhood into 5 clusters
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

In [None]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood
# add clustering labels
try: # use try/ except to avoid errors
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
except Exception as e:
    pass
    
toronto_merged = toronto_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('neighborhood'), on='neighborhood')

toronto_merged.head() # check the last columns!

In [None]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# visualize the resutling clusters
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['latitude'], toronto_merged['longitude'], toronto_merged['neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [None]:
# examine cluster 1
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]].head()

In [None]:
# examine cluster 2
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]].head()

In [None]:
# examine cluster 3
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]].head()

In [None]:
# examine cluster 4
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]].head()

In [None]:
# examine cluster 5
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]].head()