### Notebook for Clustering and Segmenting Toronto Neighbourhoods

##### Question1

##### Scraping the tabular data with pandas from the specified URL

In [1]:
import pandas as pd
# Wikipedia page listing the Canadian postal codes that start with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns one DataFrame per HTML table found on the page.
df_list = pd.read_html(io=url)

Since the page has many tables, check the table count and access the required table by its index. Here it is index [0].

In [2]:
# len(df_list) gives the number of tables scraped from the page.
# Preview the first five rows of the first table.
df_list[0].iloc[:5]

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Size of the dataframe before cleaning the data, i.e. before dropping the rows whose Borough is "Not assigned"

In [3]:
# (rows, columns) of the first scraped table.
df_list[0].shape


(180, 3)

Delete the rows whose Borough is "Not assigned"

In [4]:
# Remove, in place, every row whose Borough is the 'Not assigned' placeholder.
df_list[0].drop(
    index=df_list[0].index[df_list[0]['Borough'] == 'Not assigned'],
    inplace=True,
)
print(df_list[0].shape)


(103, 3)


Check for neighbourhoods with the value "Not assigned" and assign the Borough value to them. Since every neighbourhood is specified here, there is nothing to assign.

In [5]:
# Rows whose neighbourhood still carries the placeholder. The placeholder is
# spelled 'Not assigned' (lower-case "a") in the scraped table, so the
# comparison has to use that exact spelling to ever match.
unassigned_neighbourhood = df_list[0]['Neighbourhood'] == 'Not assigned'

# Fall back to the borough name for those rows; a no-op when nothing matches.
df_list[0].loc[unassigned_neighbourhood, 'Neighbourhood'] = (
    df_list[0].loc[unassigned_neighbourhood, 'Borough']
)

# Show the rows that were filled in; an empty frame means none were unassigned.
df_list[0].loc[unassigned_neighbourhood]

Unnamed: 0,Postal Code,Borough,Neighbourhood


Check for duplicate postal codes, if any, and join their neighbourhood values. In this data no postal code is repeated, so merging the neighbourhood values is skipped.

In [6]:
# Keep only the rows that belong to a postal code appearing more than once.
df1 = df_list[0].groupby('Postal Code').filter(lambda group: group.shape[0] > 1)
df1

# If a postal code were repeated, its neighbourhoods could be merged like this:
#
#   df_list[0].groupby('Postal Code')['Neighbourhood'].apply(','.join).reset_index()
#   df_list[0].shape

Unnamed: 0,Postal Code,Borough,Neighbourhood


Size of the Dataframe

In [7]:
# (rows, columns) of the first table after the cleaning steps above.
df_list[0].shape

(103, 3)

#### Question 2 --- Linking the latitude and longitude

Sample code to get the latitude and longitude of a postal code. Since geocoding takes a long time, a CSV file is used instead to look up the latitude and longitude for each postal code.

In [None]:
#!pip install geocoder
import geocoder

# Upper bound on geocoding attempts, so the cell cannot spin forever when the
# service never returns coordinates.
MAX_GEOCODE_ATTEMPTS = 5

lat_lon = None
postal_code = 'M5G'

for _ in range(MAX_GEOCODE_ATTEMPTS):
    g = geocoder.google('{},Toronto,Ontario'.format(postal_code))
    lat_lon = g.latlng
    # Stop as soon as a non-empty coordinate pair comes back.
    if lat_lon:
        break

if lat_lon:
    print ('lat : {} , Long : {} '.format(lat_lon[0],lat_lon[1]))
else:
    print('No coordinates found for {} after {} attempts'.format(postal_code, MAX_GEOCODE_ATTEMPTS))


Get Geospatial_Coordinates.csv and convert it to a data frame

In [9]:
# Latitude/longitude lookup table keyed by postal code.
df_geo = pd.read_csv(
    filepath_or_buffer="https://cocl.us/Geospatial_data/Geospatial_Coordinates.csv"
)
# First five rows as a sanity check.
df_geo.iloc[:5]

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Match the postal codes in both data frames and get the relevant latitude and longitude

In [10]:
# Attach the coordinates to each postal code (default inner join on 'Postal Code').
df_result = df_list[0].merge(
    df_geo.loc[:, ['Postal Code', 'Latitude', 'Longitude']],
    on='Postal Code',
)
df_result.iloc[:10]
    

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


Select only the Boroughs containing the word Toronto 

#### Question 3

In [11]:
# Keep the boroughs whose name contains 'Toronto' and renumber the rows from 0.
df_bor_toronto = (
    df_result
    .loc[lambda frame: frame['Borough'].str.contains('Toronto')]
    .reset_index(drop=True)
)
print(df_bor_toronto.shape)
df_bor_toronto.iloc[:5]

(39, 5)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


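Set the parameters for calling the Foursquare API. The venue-search function below expects `CLIENT_ID`, `CLIENT_SECRET`, `VERSION`, `LIMIT` and `Category` to be defined; they are not set in the cell shown here.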

In [12]:
import requests



Write a function that returns a data frame of the food venues for the given neighbourhoods.

In [13]:
def getVenuesNeighbour (names, latitudes, longitudes,radius =200):
    """Collect the Foursquare venues found around each neighbourhood.

    One request is sent to the Foursquare ``venues/search`` endpoint per
    (name, latitude, longitude) triple. The request also uses the
    notebook-level settings CLIENT_ID, CLIENT_SECRET, VERSION, LIMIT and
    Category, which must be defined before this function is called.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Neighbourhood names and their coordinates; iterated together with
        ``zip``, so the shortest one determines how many are processed.
    radius : int, optional
        Search radius forwarded to the API (default 200).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but still has these columns) when there is no
        input or no venue is returned.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lon in zip(names, latitudes, longitudes):
        url ='https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}&categoryId={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lon,
            radius,
            LIMIT,
            Category
        )
        results = requests.get(url).json()["response"]['venues']
        for v in results:
            # A venue may come back without any category; record None instead
            # of failing on the missing first element.
            categories = v.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lon,
                v['name'],
                v['location']['lat'],
                v['location']['lng'],
                categories[0]['name'] if categories else None))

    # Build the frame once, after the loop, with explicit column names so that
    # empty results still yield a well-formed (empty) frame.
    return pd.DataFrame(venue_rows, columns=columns)

In [14]:
# Query the venues around every Toronto neighbourhood (default search radius).
Toronto_food_venues = getVenuesNeighbour(
    names=df_bor_toronto['Neighbourhood'],
    latitudes=df_bor_toronto['Latitude'],
    longitudes=df_bor_toronto['Longitude'],
)

In [15]:
# Preview the first rows of the venue table.
Toronto_food_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
1,"Regent Park, Harbourfront",43.65426,-79.360636,Terroni Sud Forno Produzione e Spaccio,43.653903,-79.360018,Gourmet Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Gusto 501,43.65481,-79.359595,Italian Restaurant
3,"Regent Park, Harbourfront",43.65426,-79.360636,Bar Reyna,43.653446,-79.36224,Mediterranean Restaurant
4,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery


Analyze each category of food venue available in the neighbourhoods

In [16]:
# One indicator column per venue category (no prefix on the column names).
tor_onehot = pd.get_dummies(
    Toronto_food_venues.loc[:, ['Venue Category']],
    prefix="",
    prefix_sep="",
)

# Re-attach the neighbourhood each venue belongs to.
tor_onehot['Neighborhood'] = Toronto_food_venues['Neighborhood']

# Rotate the column order so that the last column becomes the first one.
fixed_columns = list(tor_onehot.columns)
fixed_columns.insert(0, fixed_columns.pop())
tor_onehot = tor_onehot[fixed_columns]

tor_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,BBQ Joint,Bakery,Bar,Breakfast Spot,Bubble Tea Shop,Burger Joint,Cafeteria,...,Smoothie Shop,Southern / Soul Food Restaurant,Sports Bar,Steakhouse,Sushi Restaurant,Tea Room,Thai Restaurant,Theme Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Regent Park, Harbourfront",0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Mean frequency of occurrence of each food venue category in each neighbourhood

In [17]:
# Per-neighbourhood mean of every indicator column, i.e. the share of the
# neighbourhood's venues that fall into each category.
tor_grouped = (
    tor_onehot
    .groupby('Neighborhood')
    .agg('mean')
    .reset_index()
)
tor_grouped

Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,BBQ Joint,Bakery,Bar,Breakfast Spot,Bubble Tea Shop,Burger Joint,Cafeteria,...,Smoothie Shop,Southern / Soul Food Restaurant,Sports Bar,Steakhouse,Sushi Restaurant,Tea Room,Thai Restaurant,Theme Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Central Bay Street,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Christie,0.111111,0.0,0.0,0.222222,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Church and Wellesley,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.1
6,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Davisville North,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0
9,"Dufferin, Dovercourt Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0


In [18]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first element of ``row`` is skipped (the caller passes rows whose
    first entry is the neighbourhood name); the remaining entries are ordered
    from largest to smallest value and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]

Top 5 food venues in each neighbourhood

In [19]:
import numpy as np
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # Ranks 1-3 use 'st'/'nd'/'rd'; every later rank uses 'th'. An explicit
    # bounds check replaces the former bare `except`, which would also have
    # hidden unrelated errors.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per neighbourhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = tor_grouped['Neighborhood']

# fill each row with the names of that neighbourhood's most frequent categories
for ind in np.arange(tor_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(tor_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Berczy Park,Coffee Shop,Food Truck,Breakfast Spot,Creperie,French Restaurant
1,"Brockton, Parkdale Village, Exhibition Place",Café,Vietnamese Restaurant,Restaurant,Coffee Shop,Falafel Restaurant
2,"Business reply mail Processing Centre, South C...",Restaurant,Vietnamese Restaurant,Creperie,French Restaurant,Food Truck
3,Central Bay Street,Coffee Shop,Italian Restaurant,Mediterranean Restaurant,Bar,Ice Cream Shop
4,Christie,Bakery,American Restaurant,Café,Coffee Shop,Candy Store


In [20]:
# Display the table of most common venue categories per neighbourhood.
neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Berczy Park,Coffee Shop,Food Truck,Breakfast Spot,Creperie,French Restaurant
1,"Brockton, Parkdale Village, Exhibition Place",Café,Vietnamese Restaurant,Restaurant,Coffee Shop,Falafel Restaurant
2,"Business reply mail Processing Centre, South C...",Restaurant,Vietnamese Restaurant,Creperie,French Restaurant,Food Truck
3,Central Bay Street,Coffee Shop,Italian Restaurant,Mediterranean Restaurant,Bar,Ice Cream Shop
4,Christie,Bakery,American Restaurant,Café,Coffee Shop,Candy Store
5,Church and Wellesley,Coffee Shop,Ramen Restaurant,Ice Cream Shop,Poke Place,Korean Restaurant
6,"Commerce Court, Victoria Hotel",Coffee Shop,Restaurant,Café,Mediterranean Restaurant,Japanese Restaurant
7,Davisville,French Restaurant,Coffee Shop,Italian Restaurant,Café,New American Restaurant
8,Davisville North,Tea Room,Breakfast Spot,Restaurant,Vietnamese Restaurant,Dumpling Restaurant
9,"Dufferin, Dovercourt Village",Sandwich Place,Portuguese Restaurant,Sushi Restaurant,Diner,Donut Shop


Cluster the neighbourhoods into 5 clusters

In [21]:
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 5

# Feature matrix: the category frequencies without the neighbourhood name.
# `columns=` is used because pandas 2.0 no longer accepts the axis positionally.
tor_grouped_clustering = tor_grouped.drop(columns='Neighborhood')

# run k-means clustering; n_init is pinned so the number of initialisations
# does not depend on the installed scikit-learn version's default
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(tor_grouped_clustering)

# check cluster labels generated for the first rows of the dataframe
kmeans.labels_[0:10]

array([0, 0, 1, 1, 0, 0, 1, 0, 0, 0], dtype=int32)

Add the cluster label column and create the merged data frame

In [35]:
# add clustering labels; remove a previously inserted label column first so
# that re-running this cell does not fail on a duplicate column
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# join the cluster label and the most common venues onto the Toronto borough
# table, matching its 'Neighbourhood' column against the 'Neighborhood' names
tor_merged = df_bor_toronto.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

tor_merged.head() # neighbourhoods without a match get NaN in the joined columns

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0.0,Italian Restaurant,Food Truck,Breakfast Spot,Mediterranean Restaurant,Coffee Shop
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,1.0,Restaurant,Café,Coffee Shop,Thai Restaurant,Italian Restaurant
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0.0,Restaurant,Mexican Restaurant,Diner,Asian Restaurant,Thai Restaurant
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1.0,Coffee Shop,Japanese Restaurant,Restaurant,Poke Place,Fast Food Restaurant
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,,,,,,


In [36]:
# Display the merged table, including rows that have no cluster label.
tor_merged

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0.0,Italian Restaurant,Food Truck,Breakfast Spot,Mediterranean Restaurant,Coffee Shop
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,1.0,Restaurant,Café,Coffee Shop,Thai Restaurant,Italian Restaurant
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0.0,Restaurant,Mexican Restaurant,Diner,Asian Restaurant,Thai Restaurant
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1.0,Coffee Shop,Japanese Restaurant,Restaurant,Poke Place,Fast Food Restaurant
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,,,,,,
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,0.0,Coffee Shop,Food Truck,Breakfast Spot,Creperie,French Restaurant
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,1.0,Coffee Shop,Italian Restaurant,Mediterranean Restaurant,Bar,Ice Cream Shop
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564,0.0,Bakery,American Restaurant,Café,Coffee Shop,Candy Store
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568,0.0,Food Court,Sushi Restaurant,Pizza Place,Bar,Noodle House
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259,0.0,Sandwich Place,Portuguese Restaurant,Sushi Restaurant,Diner,Donut Shop


In [37]:
# Discard every row that has at least one missing value (e.g. no cluster label).
tor_merged = tor_merged.dropna(axis=0, how='any')


In [38]:
# Preview the merged table after the rows with missing values were removed.
tor_merged.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0.0,Italian Restaurant,Food Truck,Breakfast Spot,Mediterranean Restaurant,Coffee Shop
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,1.0,Restaurant,Café,Coffee Shop,Thai Restaurant,Italian Restaurant
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0.0,Restaurant,Mexican Restaurant,Diner,Asian Restaurant,Thai Restaurant
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1.0,Coffee Shop,Japanese Restaurant,Restaurant,Poke Place,Fast Food Restaurant
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,0.0,Coffee Shop,Food Truck,Breakfast Spot,Creperie,French Restaurant


Plotting the clusters 

In [42]:
#!pip install folium
import folium
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors

address = 'Toronto'

# Look up the map centre; geocode() can return None when nothing is found.
geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
if location is None:
    raise ValueError('Could not geocode address {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one circle marker per neighbourhood, coloured by its cluster label
for lat, lon, poi, cluster in zip(tor_merged['Latitude'], tor_merged['Longitude'], tor_merged['Neighbourhood'], tor_merged['Cluster Labels'].astype(int)):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Index the palette with the label itself (labels run from 0 to
    # kclusters - 1) instead of relying on negative-index wraparound.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.
