# Segmenting and Clustering Neighborhoods in Toronto

### Notebook by Skyler Schilke for Applied Data Science Capstone course on Coursera

## Part 1
### First, we retrieve the table contents with BeautifulSoup

In [1]:
# import necessary packages
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# download the Wikipedia page that lists the postal codes starting with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# a timeout keeps the cell from hanging forever on a stalled connection
page = requests.get(url, timeout=30)
# fail loudly on an HTTP error instead of silently parsing an error page
page.raise_for_status()
# parse the raw HTML into a searchable tree
soup = BeautifulSoup(page.content, 'html.parser')


In [3]:
# define the target table from the html
my_table = soup.find('table', {'class': 'wikitable sortable'})
# find() returns None when nothing matches; stop here with a clear message
# rather than failing later with an AttributeError on None
if my_table is None:
    raise ValueError("No table with class 'wikitable sortable' was found in the downloaded page")

In [4]:
# within the table, collect the header cells (<th>) and the data cells (<td>)
# find_all is the current name of the method; findAll is its deprecated alias
headers = my_table.find_all('th')
contents = my_table.find_all('td')

### Next, populate the contents of the table into lists

In [5]:
# text of every header cell and of every data cell, in document order
columns = [header_cell.get_text() for header_cell in headers]
all_content = [data_cell.get_text() for data_cell in contents]

In [6]:
# the flat list of cells repeats in groups of three (postcode, borough,
# neighbourhood), so every third item belongs to the same column
postcode = all_content[0::3]
borough = all_content[1::3]
neigh = all_content[2::3]

# remove the trailing '\n' from the Neighbourhood column name; rstrip only
# removes newlines, so re-running this cell cannot eat a real character
columns[2] = columns[2].rstrip('\n')

In [7]:
# sanity check: report the length and the first three values of each column list
for list_name, values in (('postcode', postcode), ('borough', borough), ('neigh', neigh)):
    print('The {} list is length: '.format(list_name), len(values),
          '.  The first three values are: ', values[0:3])

The postcode list is length:  288 .  The first three values are:  ['M1A', 'M2A', 'M3A']
The borough list is length:  288 .  The first three values are:  ['Not assigned', 'Not assigned', 'North York']
The neigh list is length:  288 .  The first three values are:  ['Not assigned\n', 'Not assigned\n', 'Parkwoods\n']


In [8]:
# pair each header name with its list of values; passing columns=columns
# keeps the column order the same as in the scraped table
column_values = dict(zip(columns, (postcode, borough, neigh)))
df = pd.DataFrame(column_values, columns=columns)

# return sample of df
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


In [9]:
# preview the first rows of the raw frame again before it is cleaned below
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


### Now time to clean up the data

In [10]:
# remove the trailing '\n' from neighbourhood; rstrip only strips newlines,
# so (unlike slicing off the last character) the cell is safe to re-run
df['Neighbourhood'] = df['Neighbourhood'].str.rstrip('\n')
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### First remove the rows with borough = 'Not assigned'

In [11]:
# ignore rows with a borough that is Not assigned; take an explicit copy so
# that later in-place edits of df act on an independent frame and do not
# trigger pandas' SettingWithCopyWarning
df = df[df['Borough'] != 'Not assigned'].copy()
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Combine rows with the same Postcode and include all neighbourhoods in same row, comma separated

In [12]:
# one row per Postcode with all of its neighbourhoods joined by ', '
# (only the Neighbourhood column is aggregated; Borough is not needed in df2)
df2 = (
    df[['Postcode', 'Neighbourhood']]
    .astype(str)
    .groupby('Postcode')['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)

# df keeps Postcode/Borough only; reassigning instead of dropping in place
# avoids mutating a filtered slice of the original frame
df = df.drop(columns='Neighbourhood')

In [13]:
# attach the borough to every aggregated postcode, keep the first row per
# Postcode, and restore the original column order
df3 = (
    df2.merge(df, on='Postcode')
    .drop_duplicates('Postcode')
    .loc[:, columns]
)

### If Neighbourhood is 'Not assigned', set it equal to the Borough for the same row

In [14]:
# list every row whose Neighbourhood is still the placeholder 'Not assigned'
df3.loc[df3['Neighbourhood'].eq('Not assigned')]

Unnamed: 0,Postcode,Borough,Neighbourhood
159,M7A,Queen's Park,Not assigned


In [15]:
# wherever Neighbourhood == 'Not assigned', copy the Borough of the same row;
# a single .loc assignment writes into df3 itself, whereas chained indexing
# (df3[col][mask] = ...) may write into a temporary copy
not_assigned = df3['Neighbourhood'] == 'Not assigned'
df3.loc[not_assigned, 'Neighbourhood'] = df3.loc[not_assigned, 'Borough']
# make sure the result is blank
df3[df3['Neighbourhood'] == 'Not assigned']

Unnamed: 0,Postcode,Borough,Neighbourhood


### Results

In [16]:
# preview the first 20 rows of the cleaned table
# (one row per Postcode, neighbourhoods joined with ', ')
df3.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
5,M1E,Scarborough,"Guildwood, Morningside, West Hill"
8,M1G,Scarborough,Woburn
9,M1H,Scarborough,Cedarbrae
10,M1J,Scarborough,Scarborough Village
11,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
14,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
17,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
20,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [17]:
# (number of rows, number of columns) of the cleaned table
df3.shape

(103, 3)

## Part 2
### Add latitude and longitude coordinates

In [18]:
# load the postal code coordinates table directly from its URL
GEOSPATIAL_CSV_URL = 'https://cocl.us/Geospatial_data'
df_coords = pd.read_csv(GEOSPATIAL_CSV_URL)

In [19]:
# use the same key column name as df3 so the two frames can be merged on it
df_coords = df_coords.rename({'Postal Code': 'Postcode'}, axis='columns')
df_coords.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [20]:
# add Latitude/Longitude to every postcode by joining on 'Postcode'
df_final = df3.merge(df_coords, on='Postcode')
df_final.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [21]:
# (number of rows, number of columns) after adding Latitude and Longitude
df_final.shape

(103, 5)

## Part 3
### Now, we will cluster the postal codes with k-means, based on the categories of the venues that the Foursquare API returns around each Postcode

In [22]:
# report how many distinct boroughs there are and how many postcodes each one has
print('There are {} unique Boroughs.'.format(len(df_final['Borough'].unique())))
df_counts = (
    df_final.groupby('Borough')[['Postcode']]
    .count()
    .rename(columns={'Postcode': 'Count'})
)
df_counts

There are 11 unique Boroughs.


Unnamed: 0_level_0,Count
Borough,Unnamed: 1_level_1
Central Toronto,9
Downtown Toronto,18
East Toronto,5
East York,5
Etobicoke,12
Mississauga,1
North York,24
Queen's Park,1
Scarborough,17
West Toronto,6


In [23]:
# The code was removed by Watson Studio for sharing.
# NOTE(review): getNearbyVenues below reads CLIENT_ID, CLIENT_SECRET and VERSION,
# none of which is defined in any visible cell -- presumably this hidden cell set
# them. Define them here (without committing the secrets) before running the
# cells that follow, otherwise they fail with a NameError.

In [24]:
# define function to explore neighborhoods in toronto
LIMIT = 100

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare explore endpoint once per location and tabulate the venues.

    Parameters
    ----------
    names : iterable
        Label of each location; stored in the 'Postcode' column of the result.
    latitudes, longitudes : iterable
        Coordinates of each location, aligned element by element with ``names``.
    radius : int, default 500
        Value sent as the ``radius`` query parameter of every request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Postcode',
        'Postcode Latitude', 'Postcode Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude' and 'Venue Category'. The frame is empty, but still
        has these columns, when no venue is returned at all.

    Raises
    ------
    requests.HTTPError
        If a request is answered with an HTTP error status.

    Notes
    -----
    Reads the module-level names CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT,
    and prints each location name as it is processed.
    """
    venue_columns = ['Postcode',
                     'Postcode Latitude',
                     'Postcode Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # let requests build and URL-encode the query string instead of
        # formatting it by hand
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        # surface HTTP errors directly rather than as a KeyError on the JSON below
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # return only relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            # a venue may come back without any category; record None instead
            # of raising IndexError on the empty list
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # naming the columns at construction also works when venue_rows is empty
    nearby_venues = pd.DataFrame(venue_rows, columns=venue_columns)

    return(nearby_venues)

In [25]:
# fetch the nearby venues for every postcode in df_final
toronto_venues = getNearbyVenues(
    names=df_final['Postcode'],
    latitudes=df_final['Latitude'],
    longitudes=df_final['Longitude'],
)

M1B
M1C
M1E
M1G
M1H
M1J
M1K
M1L
M1M
M1N
M1P
M1R
M1S
M1T
M1V
M1W
M1X
M2H
M2J
M2K
M2L
M2M
M2N
M2P
M2R
M3A
M3B
M3C
M3H
M3J
M3K
M3L
M3M
M3N
M4A
M4B
M4C
M4E
M4G
M4H
M4J
M4K
M4L
M4M
M4N
M4P
M4R
M4S
M4T
M4V
M4W
M4X
M4Y
M5A
M5B
M5C
M5E
M5G
M5H
M5J
M5K
M5L
M5M
M5N
M5P
M5R
M5S
M5T
M5V
M5W
M5X
M6A
M6B
M6C
M6E
M6G
M6H
M6J
M6K
M6L
M6M
M6N
M6P
M6R
M6S
M7A
M7R
M7Y
M8V
M8W
M8X
M8Y
M8Z
M9A
M9B
M9C
M9L
M9M
M9N
M9P
M9R
M9V
M9W


In [26]:
# (rows, columns) of the venues table, followed by a preview of its first rows
print(toronto_venues.shape)
toronto_venues.head()

(2254, 7)


Unnamed: 0,Postcode,Postcode Latitude,Postcode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M1B,43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,M1B,43.806686,-79.194353,Interprovincial Group,43.80563,-79.200378,Print Shop
2,M1C,43.784535,-79.160497,Chris Effects Painting,43.784343,-79.163742,Construction & Landscaping
3,M1C,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
4,M1E,43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place


In [27]:
# number of distinct values in the 'Venue Category' column
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 279 uniques categories.


### Next, we will one-hot encode the venue categories, group the rows by postcode, and take the mean of each category

In [28]:
# one indicator column per venue category (no prefix, so the column name is
# the category name itself)
category_dummies = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# put the postcode of each venue in front of its indicator columns
toronto_onehot = pd.concat([toronto_venues[['Postcode']], category_dummies], axis=1)

print(toronto_onehot.shape)
toronto_onehot.head()

(2254, 280)


Unnamed: 0,Postcode,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M1C,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M1C,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M1E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# one row per postcode; each category column becomes the mean of its
# indicator values, i.e. the share of that postcode's venues in the category
toronto_grouped = toronto_onehot.groupby('Postcode', as_index=False).mean()
print(toronto_grouped.shape)
toronto_grouped.head()

(100, 280)


Unnamed: 0,Postcode,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# helper that picks the highest-valued categories of one row
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the ``num_top_venues`` largest entries of ``row``.

    The first entry of ``row`` is skipped (the callers in this notebook pass
    rows whose first entry is the postcode); the remaining entries are sorted
    in descending order and the labels of the leading ones are returned as an
    array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [31]:
import numpy as np
num_top_venues = 3

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
# NOTE: this rebinds `columns`, which earlier cells used for the scraped
# table's header names
columns = ['Postcode']
for ind in range(num_top_venues):
    # the first three ranks have their own suffix, every later one uses 'th'
    # (an explicit test instead of a bare except around the list lookup)
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# collect the top categories of every postcode first, then build the frame
# in one step instead of assigning it row by row
top_venue_rows = [
    return_most_common_venues(toronto_grouped.iloc[row_pos, :], num_top_venues)
    for row_pos in range(toronto_grouped.shape[0])
]
postcodes_venues_sorted = pd.DataFrame(top_venue_rows, columns=columns[1:], index=toronto_grouped.index)
postcodes_venues_sorted.insert(0, 'Postcode', toronto_grouped['Postcode'])

postcodes_venues_sorted.head()

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,M1B,Fast Food Restaurant,Print Shop,Dessert Shop
1,M1C,Construction & Landscaping,Bar,Yoga Studio
2,M1E,Intersection,Spa,Electronics Store
3,M1G,Coffee Shop,Korean Restaurant,Convenience Store
4,M1H,Hakka Restaurant,Thai Restaurant,Caribbean Restaurant


### Now perform k means

In [33]:
from sklearn.cluster import KMeans 
# set number of clusters
kclusters = 5

# cluster on the category columns only; the postcode is a label, not a feature
# (drop's axis must be given by keyword -- pandas 2 rejects drop('Postcode', 1))
toronto_grouped_clustering = toronto_grouped.drop(columns='Postcode')

# run k-means clustering; n_init is pinned because its default differs between
# scikit-learn versions, which would otherwise change the resulting clusters
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for the first ten rows of the dataframe
kmeans.labels_[0:10]



array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

### Now we add the cluster labels to our df

In [34]:
# add clustering labels as the first column; a copy left over from a previous
# run of this cell is dropped first, because insert() refuses to add a column
# that already exists
postcodes_venues_sorted = postcodes_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
postcodes_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# attach the cluster label and the top venues to each postcode's borough,
# neighbourhood and coordinates
toronto_merged = df_final.join(postcodes_venues_sorted.set_index('Postcode'), on='Postcode')

# remove any rows that have NaN (postcodes in df_final without a match in
# postcodes_venues_sorted get NaN from the join)
toronto_merged = toronto_merged.dropna()
# the join introduced NaN, which makes the label column float; restore integers
toronto_merged = toronto_merged.astype({'Cluster Labels': 'int64'})
toronto_merged.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,0,Fast Food Restaurant,Print Shop,Dessert Shop
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,0,Construction & Landscaping,Bar,Yoga Studio
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,0,Intersection,Spa,Electronics Store
3,M1G,Scarborough,Woburn,43.770992,-79.216917,0,Coffee Shop,Korean Restaurant,Convenience Store
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0,Hakka Restaurant,Thai Restaurant,Caribbean Restaurant


### Generate a map for all the clusters

In [35]:
# !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library
import matplotlib.cm as cm
import matplotlib.colors as colors

In [36]:
# define center of toronto
latitude = 43.6532
longitude = -79.3832
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per label
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one circle marker per postcode to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Postcode'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # labels run from 0 to kclusters-1, so they index the palette directly
    # (cluster-1 would send label 0 to the last color via negative indexing)
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### Details on the clusters

In [37]:
# report how many postcodes fall in each cluster; looping over kclusters keeps
# this in step with the number of clusters chosen for k-means
for cluster_label in range(kclusters):
    cluster_size = toronto_merged.loc[toronto_merged['Cluster Labels'] == cluster_label].shape[0]
    print('Cluster {} has a length of: '.format(cluster_label), cluster_size)

Cluster 0 has a length of:  83
Cluster 1 has a length of:  1
Cluster 2 has a length of:  13
Cluster 3 has a length of:  1
Cluster 4 has a length of:  2


In [38]:
# cluster 0: show the borough (column 1) plus the cluster label and venue columns (5 onward)
toronto_merged[toronto_merged['Cluster Labels'] == 0].iloc[:, [1] + list(range(5, toronto_merged.shape[1]))]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,Scarborough,0,Fast Food Restaurant,Print Shop,Dessert Shop
1,Scarborough,0,Construction & Landscaping,Bar,Yoga Studio
2,Scarborough,0,Intersection,Spa,Electronics Store
3,Scarborough,0,Coffee Shop,Korean Restaurant,Convenience Store
4,Scarborough,0,Hakka Restaurant,Thai Restaurant,Caribbean Restaurant
5,Scarborough,0,Playground,Yoga Studio,Dumpling Restaurant
6,Scarborough,0,Department Store,Playground,Discount Store
7,Scarborough,0,Bus Line,Bakery,Fast Food Restaurant
8,Scarborough,0,Motel,American Restaurant,Yoga Studio
9,Scarborough,0,College Stadium,General Entertainment,Skating Rink


In [39]:
# cluster 1: show the borough (column 1) plus the cluster label and venue columns (5 onward)
toronto_merged[toronto_merged['Cluster Labels'] == 1].iloc[:, [1] + list(range(5, toronto_merged.shape[1]))]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
94,Etobicoke,1,Bank,Yoga Studio,Dumpling Restaurant


In [40]:
# cluster 2: show the borough (column 1) plus the cluster label and venue columns (5 onward)
toronto_merged[toronto_merged['Cluster Labels'] == 2].iloc[:, [1] + list(range(5, toronto_merged.shape[1]))]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
14,Scarborough,2,Park,Playground,Asian Restaurant
23,North York,2,Park,Convenience Store,Bank
25,North York,2,Fast Food Restaurant,Park,Food & Drink Shop
30,North York,2,Park,Airport,Other Repair Shop
40,East York,2,Park,Coffee Shop,Convenience Store
44,Central Toronto,2,Park,Bus Line,Swim School
50,Downtown Toronto,2,Park,Playground,Trail
64,Central Toronto,2,Trail,Park,Sushi Restaurant
72,North York,2,Pizza Place,Park,Japanese Restaurant
74,York,2,Park,Pharmacy,Market


In [41]:
# cluster 3: show the borough (column 1) plus the cluster label and venue columns (5 onward)
toronto_merged[toronto_merged['Cluster Labels'] == 3].iloc[:, [1] + list(range(5, toronto_merged.shape[1]))]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
20,North York,3,Cafeteria,Dumpling Restaurant,Diner


In [42]:
# cluster 4: show the borough (column 1) plus the cluster label and venue columns (5 onward)
toronto_merged[toronto_merged['Cluster Labels'] == 4].iloc[:, [1] + list(range(5, toronto_merged.shape[1]))]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
91,Etobicoke,4,Baseball Field,Yoga Studio,Discount Store
97,North York,4,Baseball Field,Yoga Studio,Discount Store
