<h1>Exploring,segmenting, and clustering the neighborhoods in the city of Toronto based on the postalcode and borough information</h1>
Import all required libraries

In [1]:
#Importing all the required libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

<b>Sending a Get request</b>

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(url)
html= page.text

<b>Create a BeautifulSoup object</b>

In [3]:
soup = BeautifulSoup(html,'lxml')


<b>Extract the information from website into a table as below:</b>

In [4]:
table_contents=[]
table=soup.find('table')
for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)

<b>Create a dataframe and display contents of the table</b>

In [5]:
# print(table_contents)
df=pd.DataFrame(table_contents)
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


<b>Grouping Neighborhoods by Postal Codes</b>

In [6]:
df_new = df.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(', '.join).reset_index()

df_new.head(10) 

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


<b>Assign the Latitude and Longitute to the neighborhoods using the latitude and longitude data provided </b>

In [7]:
latlog_df = pd.read_csv('https://cocl.us/Geospatial_data', index_col='Postal Code')

df_toronto = df_new.join(latlog_df, on='PostalCode')
df_toronto.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


Selecting only Boroughs with Toronto in them

In [8]:
df_toronto_only = df_toronto[df_toronto['Borough'].str.contains("Toronto") == 1].reset_index(drop=True)
df_toronto_only.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4J,East York/East Toronto,The Danforth East,43.685347,-79.338106
2,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
3,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
4,M4M,East Toronto,Studio District,43.659526,-79.340923
5,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
6,M4P,Central Toronto,Davisville North,43.712751,-79.390197
7,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
8,M4S,Central Toronto,Davisville,43.704324,-79.38879
9,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316


<b>Toronto Map visualisation based on the "df_toronto_only" data</b>

In [9]:
#!conda install -c conda-forge folium=0.5.0 --yes
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

import folium
import json
toronto_map = folium.Map(location=[43.6511, -79.3471], zoom_start=10)

for lat, lng, borough, neighborhood in zip(df_toronto_only['Latitude'], df_toronto_only['Longitude'], df_toronto_only['Borough'], df_toronto_only['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)
    
toronto_map

In [10]:
#Pick the first neighborhood in df_toronto_only
df_toronto_only.loc[0,'Neighborhood']

'The Beaches'

In [11]:
#Finding the latitude and longitude of "The Beaches"
neighborhood_lad = df_toronto_only.loc[0,'Latitude']
neighborhood_log = df_toronto_only.loc[0,'Longitude']
neighborhood_name = df_toronto_only.loc[0,'Neighborhood']

print('Latitude and Longitude values of {} are {},{}.'.format(neighborhood_name,neighborhood_lad,neighborhood_log))

Latitude and Longitude values of The Beaches are 43.67635739999999,-79.2930312.


In [12]:
CLIENT_ID = 'FUEDBSWGZV0XDYAJ25PQ450AFJ4XXDUQTTPMCCGXNT0GIGYX'
CLIENT_SECRET = 'WJDCBO3JW4X5MANWN0WPDHKWRROLVOJG4HD0OAM11ED35HRT'
VERSION = '20201001'
LIMIT = 100
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    neighborhood_lad,
    neighborhood_log,
    radius,
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?client_id=FUEDBSWGZV0XDYAJ25PQ450AFJ4XXDUQTTPMCCGXNT0GIGYX&client_secret=WJDCBO3JW4X5MANWN0WPDHKWRROLVOJG4HD0OAM11ED35HRT&v=20201001&ll=43.67635739999999,-79.2930312&radius=500&limit=100'

In [13]:
results = requests.get(url).json
results

<bound method Response.json of <Response [200]>>

In [14]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        lng,
        radius,
        LIMIT)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    venues_list.append([(
        name,
        lat,
        lng,
        v['venue']['name'],
        v['venue']['location']['lat'],
        v['venue']['location']['lng'],
        v['venue']['categories'][0]['name']) for v in results])
    
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
    
    return(nearby_venues)

In [15]:
toronto_only_venues = getNearbyVenues(names=df_toronto_only['Neighborhood'],
                                latitudes=df_toronto_only['Latitude'],
                                longitudes=df_toronto_only['Longitude'])

toronto_only_venues.shape

The Beaches
The Danforth  East
The Danforth West, Riverdale
India Bazaar, The Beaches West
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park, Summerhill East
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
Rosedale
St. James Town, Cabbagetown
Church and Wellesley
Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North & West
The Annex, North Midtown, Yorkville
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Enclave of M5E
First Canadian Place, Underground city
Christie
Dufferin, Dovercourt Village
Little Portugal, Trinity
Brockton, Parkdale Village, Exhibition Place
High Park, The Junction 

(15, 7)

In [16]:
print(toronto_only_venues.shape)
print('There are {} uniques categories.'.format(len(toronto_only_venues['Venue Category'].unique())))
toronto_only_venues

(15, 7)
There are 15 uniques categories.


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Enclave of M4L,43.662744,-79.321558,Rorschach Brewing Co.,43.663483,-79.319824,Brewery
1,Enclave of M4L,43.662744,-79.321558,Leslieville Farmers Market,43.664901,-79.319784,Farmers Market
2,Enclave of M4L,43.662744,-79.321558,The Sidekick,43.664484,-79.325162,Comic Shop
3,Enclave of M4L,43.662744,-79.321558,Chino Locos,43.664653,-79.325584,Burrito Place
4,Enclave of M4L,43.662744,-79.321558,Chick-n-Joy,43.665181,-79.321403,Fast Food Restaurant
5,Enclave of M4L,43.662744,-79.321558,Queen Margherita Pizza,43.664685,-79.324164,Pizza Place
6,Enclave of M4L,43.662744,-79.321558,East End Garden Centre & Hardware,43.664564,-79.324471,Garden Center
7,Enclave of M4L,43.662744,-79.321558,The Green Wood,43.664728,-79.324117,Restaurant
8,Enclave of M4L,43.662744,-79.321558,Amin Car Repair Garage,43.663544,-79.32013,Auto Workshop
9,Enclave of M4L,43.662744,-79.321558,The Ashbridge Estate,43.664691,-79.321805,Garden


In [17]:
toronto_only_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Enclave of M4L,15,15,15,15,15,15


In [18]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_only_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_only_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighborhood,Auto Workshop,Brewery,Burrito Place,Comic Shop,Farmers Market,Fast Food Restaurant,Garden,Garden Center,Gym / Fitness Center,Park,Pizza Place,Recording Studio,Restaurant,Skate Park,Yoga Studio
0,Enclave of M4L,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Enclave of M4L,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,Enclave of M4L,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,Enclave of M4L,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,Enclave of M4L,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [19]:
print(toronto_onehot.shape)
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

(15, 16)


Unnamed: 0,Neighborhood,Auto Workshop,Brewery,Burrito Place,Comic Shop,Farmers Market,Fast Food Restaurant,Garden,Garden Center,Gym / Fitness Center,Park,Pizza Place,Recording Studio,Restaurant,Skate Park,Yoga Studio
0,Enclave of M4L,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667


In [20]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Enclave of M4L----
            venue  freq
0   Auto Workshop  0.07
1         Brewery  0.07
2   Burrito Place  0.07
3      Comic Shop  0.07
4  Farmers Market  0.07




Defining a fucntion for returning most common venues

In [21]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [22]:
#Return the top 10 venues
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Enclave of M4L,Yoga Studio,Skate Park,Restaurant,Recording Studio,Pizza Place,Park,Gym / Fitness Center,Garden Center,Garden,Fast Food Restaurant


In [23]:
#Clustering the Neighborhoods
kmeans = KMeans(n_clusters=5, init='k-means++', max_iter=15, random_state=8)
kmeans

KMeans(max_iter=15, n_clusters=5, random_state=8)