# Segmenting and Clustering Neighborhoods in Toronto - III

### Fetching the data from Wiki page and transforms into pandas dataframe

In [11]:
!pip install --user folium

Requirement not upgraded as not directly required: folium in /home/dsxuser/.local/lib/python3.5/site-packages
Requirement not upgraded as not directly required: jinja2 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from folium)
Requirement not upgraded as not directly required: requests in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from folium)
Requirement not upgraded as not directly required: six in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from folium)
Requirement not upgraded as not directly required: numpy in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from folium)
Requirement not upgraded as not directly required: branca>=0.3.0 in /home/dsxuser/.local/lib/python3.5/site-packages (from folium)
Requirement not upgraded as not directly required: MarkupSafe>=0.23 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from jinja2->folium)
Requirement not upgraded as not directly required: chardet<3.1.0,>=3.0.2 in /opt/co

In [81]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
import urllib
from geopy.geocoders import Nominatim
import folium
import requests
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

Scrap neighborhood information from Wiki page and transforms into dataframe

In [82]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
canada_data = requests.get(url).text
 
tabla = canada_data[canada_data.find("<table"):canada_data.find("</table>")+8]
df = pd.read_html(tabla, header = 0)[0]

The dataframe will consist of three columns: PostCode, Borough, and Neighbourhood

In [83]:
df.dtypes

Postcode         object
Borough          object
Neighbourhood    object
dtype: object

Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [84]:
df = df[df.Borough != "Not assigned"]

If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [85]:
df.Neighbourhood[df.Neighbourhood == "Not assigned"] = df.Borough[df.Neighbourhood == "Not assigned"]

Rows will be combined by Postcode to compose the name of all neighbourhoods.

In [86]:
def neighbourhood_list(grouped):    
    return ', '.join(sorted(grouped['Neighbourhood'].tolist()))
                    
grp = df.groupby(['Postcode', 'Borough'])
neighbourhoods = grp.apply(neighbourhood_list).reset_index(name='Neighbourhood')

In [87]:

neighbourhoods.shape

(103, 3)

Cleaned Dataframe

In [88]:
#neighbourhoods = neighbourhoods.rename(columns = {'Postcode':'PostalCode'})
#newDf

In [89]:
neighbourhoods.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Obtaining geospatial data

Importing the CSV file from https://cocl.us/Geospatial_data

In [91]:
geo_df = pd.read_csv('https://cocl.us/Geospatial_data')
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Perform a Join operation to match up the results, and drop the duplicate column

In [94]:
neighbourhoods = pd.merge(neighbourhoods, geo_df, how='left',left_on='Postcode', right_on='Postal Code', validate="1:1")
neighbourhoods.drop(labels='Postal Code', axis=1, inplace=True)
neighbourhoods.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code_x,Latitude_x,Longitude_x,Postal Code_y,Latitude_y,Longitude_y,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",M1B,43.806686,-79.194353,M1B,43.806686,-79.194353,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",M1C,43.784535,-79.160497,M1C,43.784535,-79.160497,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711,M1E,43.763573,-79.188711,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917,M1G,43.770992,-79.216917,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476,M1H,43.773136,-79.239476,43.773136,-79.239476


In [95]:
neighbourhoods

Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code_x,Latitude_x,Longitude_x,Postal Code_y,Latitude_y,Longitude_y,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",M1B,43.806686,-79.194353,M1B,43.806686,-79.194353,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",M1C,43.784535,-79.160497,M1C,43.784535,-79.160497,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711,M1E,43.763573,-79.188711,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917,M1G,43.770992,-79.216917,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476,M1H,43.773136,-79.239476,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,M1J,43.744734,-79.239476,M1J,43.744734,-79.239476,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",M1K,43.727929,-79.262029,M1K,43.727929,-79.262029,43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",M1L,43.711112,-79.284577,M1L,43.711112,-79.284577,43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",M1M,43.716316,-79.239476,M1M,43.716316,-79.239476,43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",M1N,43.692657,-79.264848,M1N,43.692657,-79.264848,43.692657,-79.264848


In [96]:
neighbourhoods.shape

(103, 11)

# Creating maps

Use geopy library to get the latitude and longitude values of Toronto

In [97]:
address = 'Toronto'

geolocator = Nominatim(user_agent="my_application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [98]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, borough, neighbourhood, pc in zip(neighbourhoods['Latitude'], neighbourhoods['Longitude'], neighbourhoods['Borough'], neighbourhoods['Neighbourhood'], neighbourhoods['Postcode']):
    label = '{}: {}, {}'.format(pc, neighbourhood, borough)
    label = folium.Popup(label)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

# Getting foursquare data

In [109]:
CLIENT_ID = 'YPBVFDUZOP1M24BKCWGXIYZ3RFACOE3V35WSFY4DSCMRU44L' # your Foursquare ID
CLIENT_SECRET = 'VYHYTBSRIZBPYAOCP5ZEFV3YM4C40YEQCQWCUO4NC1JTPNJM' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: YPBVFDUZOP1M24BKCWGXIYZ3RFACOE3V35WSFY4DSCMRU44L
CLIENT_SECRET:VYHYTBSRIZBPYAOCP5ZEFV3YM4C40YEQCQWCUO4NC1JTPNJM


In [110]:
def getNearbyVenues(postcode, names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for pc, names, lat, lng in zip(postcode, names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            pc,
            names, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])


    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postcode',
                  'Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [111]:
toronto_venues = getNearbyVenues(postcode=neighbourhoods['Postcode'], names=neighbourhoods['Neighbourhood'],
                                   latitudes=neighbourhoods['Latitude'],
                                   longitudes=neighbourhoods['Longitude']
                                  )
toronto_venues.head()

Unnamed: 0,Postcode,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M1B,"Malvern, Rouge",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,M1C,"Highland Creek, Port Union, Rouge Hill",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,M1E,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
3,M1E,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
4,M1E,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Big Bite Burrito,43.766299,-79.19072,Mexican Restaurant


We now analyze each neighbourhood

In [112]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Postcode'] = toronto_venues['Postcode']

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Postcode,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M1C,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M1E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M1E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M1E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


We can now group the venues by Postcode

In [113]:

toronto_grouped = toronto_onehot.groupby('Postcode').mean().reset_index()

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [114]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [115]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Postcode']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
postcode_venues_sorted = pd.DataFrame(columns=columns)
postcode_venues_sorted['Postcode'] = toronto_grouped['Postcode']

for ind in np.arange(toronto_grouped.shape[0]):
    postcode_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

postcode_venues_sorted.head()

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Fast Food Restaurant,Yoga Studio,Ethiopian Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Empanada Restaurant,Event Space
1,M1C,Bar,Yoga Studio,Ethiopian Restaurant,Drugstore,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Empanada Restaurant,Event Space,Doner Restaurant
2,M1E,Mexican Restaurant,Electronics Store,Rental Car Location,Breakfast Spot,Pizza Place,Medical Center,Donut Shop,Drugstore,Dumpling Restaurant,Eastern European Restaurant
3,M1G,Coffee Shop,Korean Restaurant,Yoga Studio,Empanada Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant
4,M1H,Caribbean Restaurant,Fried Chicken Joint,Athletics & Sports,Hakka Restaurant,Bakery,Bank,Thai Restaurant,Donut Shop,Drugstore,Dumpling Restaurant


# Cluster postcodes

In [116]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Postcode', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

Let's put everything together!

In [117]:
postcode_venues_sorted['Cluster label'] = kmeans.labels_
toronto_merged = pd.merge(postcode_venues_sorted, neighbourhoods, how='left',
        on='Postcode', validate="1:1")
toronto_merged.head()

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,...,Borough,Neighbourhood,Postal Code_x,Latitude_x,Longitude_x,Postal Code_y,Latitude_y,Longitude_y,Latitude,Longitude
0,M1B,Fast Food Restaurant,Yoga Studio,Ethiopian Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Empanada Restaurant,...,Scarborough,"Malvern, Rouge",M1B,43.806686,-79.194353,M1B,43.806686,-79.194353,43.806686,-79.194353
1,M1C,Bar,Yoga Studio,Ethiopian Restaurant,Drugstore,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Empanada Restaurant,Event Space,...,Scarborough,"Highland Creek, Port Union, Rouge Hill",M1C,43.784535,-79.160497,M1C,43.784535,-79.160497,43.784535,-79.160497
2,M1E,Mexican Restaurant,Electronics Store,Rental Car Location,Breakfast Spot,Pizza Place,Medical Center,Donut Shop,Drugstore,Dumpling Restaurant,...,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711,M1E,43.763573,-79.188711,43.763573,-79.188711
3,M1G,Coffee Shop,Korean Restaurant,Yoga Studio,Empanada Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,...,Scarborough,Woburn,M1G,43.770992,-79.216917,M1G,43.770992,-79.216917,43.770992,-79.216917
4,M1H,Caribbean Restaurant,Fried Chicken Joint,Athletics & Sports,Hakka Restaurant,Bakery,Bank,Thai Restaurant,Donut Shop,Drugstore,...,Scarborough,Cedarbrae,M1H,43.773136,-79.239476,M1H,43.773136,-79.239476,43.773136,-79.239476


#Clusters on map