# Segmenting and Clustering Neighborhoods in Toronto

## Part One: Scrape and Clean the data

In [43]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
df1 = []

# scrape the data using BS
website_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(website_url,'lxml')
My_table = soup.find('table',{'class':'wikitable sortable'})

# Ignore cells with a borough that is Not assigned
for tr in My_table.find_all('tr'):
    tds = tr.find_all('td')
    if not tds:
        continue
    a = [td.text.strip() for td in tds]
    if a[1] != 'Not assigned':
        df1.append(a)
        
# Replace neighborhood names Not assigned with Borough 
for i in df1:
    if i[1] != 'Not assigned' and i[2] == 'Not assigned': 
        i[2] = i[1]
df = pd.DataFrame(df1, columns = ['PostalCode', 'Borough', 'Neighborhood'])

# Merge Neighborhood with name PostalCode
cols = ['PostalCode', 'Borough']
df = df.groupby(cols).agg(', '.join).reset_index()

In [44]:
df.shape

(103, 3)

## Part Two: Get Latitude and Longtitude
### Use the given csv file

In [45]:
url = 'https://cocl.us/Geospatial_data'
df_geo = pd.read_csv(url)
df_geo.rename(columns = {'Postal Code': 'PostalCode'}, inplace = True)
df = pd.merge(df, df_geo, on = 'PostalCode')

In [46]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Part Three: Cluster the neighborhoods in Toronto

In [47]:
# get the latitude and longitude of Toronto
from geopy.geocoders import Nominatim
import folium
from pandas.io.json import json_normalize
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors
address = 'Toronto City, Ontario'
geolocator = Nominatim(user_agent="ny_explorer", timeout=5)
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

GeocoderTimedOut: Service timed out

In [None]:
# filter only borough contains Toronto
df_toronto = df[df['Borough'].str.contains('toronto', case = False)].reset_index(drop = True)

In [None]:
# Define foursquare credentials and version
CLIENT_ID = 'QQ5UMYZIDNTTVGMTJJUMO2Z4RIAVMAAFOXG55FQYYWVIADIB' # your Foursquare ID
CLIENT_SECRET = 'X2MYQDLFZYYQMROJB1JR3RX10YLOJBXNGUXF2NJE5DBEJHF5' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100
radius = 500
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

In [None]:
# verify neighborhood Queen's Park becasue neighborhoods decrease one to 38
# turns out Queen's Park Neighborhood doesn't have venues returned
neighborhood_latitude = df_toronto.loc[df_toronto['Neighborhood'] == "Queen's Park", 'Latitude'].iloc[0] # neighborhood latitude value
neighborhood_longitude = df_toronto.loc[df_toronto['Neighborhood'] == "Queen's Park", 'Longitude'].iloc[0] # neighborhood longitude value

neighborhood_name = df_toronto.loc[df_toronto['Neighborhood'] == "Queen's Park", 'Neighborhood'].iloc[0] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
venues = requests.get(url).json()
venues

In [None]:
# define a function to process venues in each neighborhood in Toronto
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100):
    venues_list=[] # each element represents a list of venues in a neighborhood
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [None]:
# get all the venues for all toronto neighborhoods
toronto_venues = getNearbyVenues(names=df_toronto['Neighborhood'],
                                   latitudes=df_toronto['Latitude'],
                                   longitudes=df_toronto['Longitude']
                                  )

In [None]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

In [None]:
# Analyze each neighborhood
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
# add neighborhood column back to dataframe
venue_columns = toronto_onehot.columns
toronto_onehot['Toronto Neighborhood'] = toronto_venues['Neighborhood'] 
# move neighborhood column to the first column
fixed_columns = ['Toronto Neighborhood'] + list(venue_columns)
toronto_onehot = toronto_onehot[fixed_columns]

In [None]:
# group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
toronto_grouped = toronto_onehot.groupby('Toronto Neighborhood').mean().reset_index()

In [None]:
# define a function to return the venues with most frequency
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Toronto Neighborhood']

for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Toronto Neighborhood'] = toronto_grouped['Toronto Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

In [None]:
# set number of clusters
kclusters = 3

toronto_grouped_clustering = toronto_grouped.drop('Toronto Neighborhood', 1)
# fit the clusters
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
# merge
toronto_merged = df_toronto
toronto_merged = pd.merge(toronto_merged.rename(columns = {'Neighborhood': 'Toronto Neighborhood'}), 
         neighborhoods_venues_sorted, on = 'Toronto Neighborhood')

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Toronto Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Conclusion
### First cluster of neighborhoods is outdoor exercise place
### Second cluster of neighborhoods is restaurant 
### Third cluster of neighborhoods is personal care

In [None]:
# Examine clusters
## cluster 1
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

In [None]:
## cluster 2
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

In [None]:
## cluster 3
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]