In [None]:
# The code was removed by Watson Studio for sharing.

## Week 3 assignment - Segmenting and Clustering neighborhoods in Toronto <br>

### Part 1 - Setting up neighborhoods data into a pandas dataframe 

In [1]:
# Load data from wikipedia page

import numpy as np
import pandas as pd
d = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
df_toronto = d[0]       # First element of the list is the required dataframe 
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [1]:
# Check for not assigned rows
print("Number of not assigned neighbourhoods: ", len(df_toronto[df_toronto['Neighbourhood'] =='Not assigned']))
print("Borough value for such rows:", df_toronto[df_toronto['Neighbourhood'] =='Not assigned']['Borough'].value_counts())

NameError: name 'df_toronto' is not defined

Hence we see that wherever neighborhood is not assigned, borough is also not assigned. So we drop these rows.

In [2]:
df_toronto.drop(df_toronto[df_toronto['Neighbourhood']=='Not assigned'].index, axis=0, inplace=True)
df_toronto.reset_index(drop=True, inplace=True)
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [3]:
# Merge rows by postal code
df_toronto = df_toronto.groupby('Postcode', as_index=False).aggregate({'Borough': 'first', 'Neighbourhood':', '.join})
df_toronto.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [5]:
# Display random rows from the dataframe
x = np.random.randint(0, len(df_toronto), 10)
df_toronto.loc[x,:]

Unnamed: 0,Postcode,Borough,Neighbourhood
36,M4C,East York,Woodbine Heights
60,M5K,Downtown Toronto,"Design Exchange, Toronto Dominion Centre"
19,M2K,North York,Bayview Village
18,M2J,North York,"Fairview, Henry Farm, Oriole"
79,M6L,North York,"Downsview, North Park, Upwood Park"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
50,M4W,Downtown Toronto,Rosedale
0,M1B,Scarborough,"Rouge, Malvern"
13,M1T,Scarborough,"Clarks Corners, Sullivan, Tam O'Shanter"


In [6]:
print("Number or postcodes/ rows:", df_toronto.shape[0])

Number or postcodes/ rows: 103


<br>
<br>

### Part 2 - Add latitude and longitude coordinates of neighborhoods

In [4]:
df_ll = pd.read_csv('http://cocl.us/Geospatial_data')
df_ll.rename(columns={'Postal Code':'Postcode'}, inplace=True)
df_toronto = pd.merge(df_toronto, df_ll, on='Postcode')
df_toronto.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


<br>
<br>

### Part - 3 Exploring and clustering the neighborhoods in Toronto 

<br>

In [5]:
# Install dependencies

!pip install geopy
!pip install folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 6.7MB/s eta 0:00:011
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/81/6d/31c83485189a2521a75b4130f1fee5364f772a0375f81afff619004e5237/branca-0.4.0-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.0 folium-0.10.1


In [6]:
import json
from pandas.io.json import json_normalize  # transform json file into pandas dataframe
import requests

In [10]:
# Take a look at number of boroughs and neighbourhoods
print('Toronto has {} boroughs and {} neighborhoods.'.format(
        len(df_toronto['Borough'].unique()),
        df_toronto.shape[0]
    )
)

Toronto has 10 boroughs and 103 neighborhoods.


In [11]:
# Let's check how many postcodes belong to each borough
df_toronto['Borough'].value_counts()

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
York                 5
East Toronto         5
East York            5
Mississauga          1
Name: Borough, dtype: int64

<br>
Now let us visualize the ten boroughs on a map. For this I approximate the location of each borough as the average value of the coordinates for all its neighborhoods. 

In [12]:
# Let's visualize the ten boroughs on a map
# df_map = df_toronto.drop('Neighbourhood', axis=1).groupby('Borough').mean()
df_map = df_toronto.drop('Neighbourhood', axis=1).groupby('Borough', as_index=False).mean()

In [56]:
from geopy.geocoders import Nominatim
import folium

In [57]:
# Get coordinates of Toronto

address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ttt")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [15]:
# Make a map of toronto showing all the boroughs
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

for lat, long, borough in zip(df_map['Latitude'], df_map['Longitude'], df_map['Borough']):
    folium.Marker(
    location=[lat, long],
    popup=borough).add_to(map_toronto)
    
map_toronto

<br>
Okay now we can start exploring. Let us explore the top restaurants in all neighborhoods of Central Toronto and Downtown Toronto and sort them by average rating. <br>

In [7]:
# Foursquare Credentials
CLIENT_ID = 'YQ00SZDIYWDHV53ISV2C3ICXA2IGKW5E3ISH31HKIYLVYSSZ' # your Foursquare ID
CLIENT_SECRET = 'VBVZWZHFP3BVWZMZJFIEEJIHTB0LBIY0ZYA3MBPGE532JXQF' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [20]:
# This is a test cell
# First let's just pick one particular neighbourhood/ postcode and get all the coffee shops in them

df_toronto[df_toronto['Borough']=='Central Toronto']

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
47,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
49,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049
63,M5N,Central Toronto,Roselawn,43.711695,-79.416936
64,M5P,Central Toronto,"Forest Hill North, Forest Hill West",43.696948,-79.411307
65,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.67271,-79.405678


In [8]:
# Now let me explore coffee shops in Lawrence Park/ M4N

radius = 500
lat = df_toronto.loc[45, 'Latitude']
long = df_toronto.loc[45, 'Longitude']
query = 'Restaurant'
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&v={}&ll={},{}&query={}&radius={}'.format(
    CLIENT_ID, CLIENT_SECRET, VERSION, lat, long, query, radius)
url

'https://api.foursquare.com/v2/venues/search?client_id=YQ00SZDIYWDHV53ISV2C3ICXA2IGKW5E3ISH31HKIYLVYSSZ&client_secret=VBVZWZHFP3BVWZMZJFIEEJIHTB0LBIY0ZYA3MBPGE532JXQF&v=20180605&ll=43.7127511,-79.3901975&query=Restaurant&radius=500'

In [9]:
results = requests.get(url).json()['response']['venues']
results

[{'id': '4adb2fd3f964a520c42421e3',
  'name': 'Homeway Restaurant & Brunch',
  'location': {'address': '955 Mount Pleasant',
   'crossStreet': 'Mount Pleasant & Erskine',
   'lat': 43.71264120397444,
   'lng': -79.39155655199944,
   'labeledLatLngs': [{'label': 'display',
     'lat': 43.71264120397444,
     'lng': -79.39155655199944}],
   'distance': 110,
   'postalCode': 'M4P 2L7',
   'cc': 'CA',
   'city': 'Toronto',
   'state': 'ON',
   'country': 'Canada',
   'formattedAddress': ['955 Mount Pleasant (Mount Pleasant & Erskine)',
    'Toronto ON M4P 2L7',
    'Canada']},
  'categories': [{'id': '4bf58dd8d48988d143941735',
    'name': 'Breakfast Spot',
    'pluralName': 'Breakfast Spots',
    'shortName': 'Breakfast',
    'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/breakfast_',
     'suffix': '.png'},
    'primary': True}],
  'referralId': 'v-1584338813',
  'hasPerk': False},
 {'id': '591984c1b3d8e245316549a3',
  'name': 'Maybes Restaurant',
  'location': {'address

In [16]:
print(results[2]['name'])
print(results[2]['location']['address'])
print(results[2]['location']['postalCode'])

808 Restaurant - Best western
808 Mount Pleasent Road
M4P 2L2


In [13]:
unid = results[2]['id']
# lat = results['response']['venues'][0]['location']['lat']
# long = results['response']['venues'][0]['location']['lng']
url = 'https://api.foursquare.com/v2/venues/{}?client_id={}&client_secret={}&v={}'.format(
    unid, CLIENT_ID, CLIENT_SECRET, VERSION)
url

'https://api.foursquare.com/v2/venues/56af5041498ea6e0ef5b1e27?client_id=YQ00SZDIYWDHV53ISV2C3ICXA2IGKW5E3ISH31HKIYLVYSSZ&client_secret=VBVZWZHFP3BVWZMZJFIEEJIHTB0LBIY0ZYA3MBPGE532JXQF&v=20180605'

In [14]:
resulta = requests.get(url).json()['response']['venue']
resulta

{'id': '56af5041498ea6e0ef5b1e27',
 'name': '808 Restaurant - Best western',
 'contact': {'phone': '4164875101', 'formattedPhone': '(416) 487-5101'},
 'location': {'address': '808 Mount Pleasent Road',
  'lat': 43.70940971907415,
  'lng': -79.39059614124203,
  'labeledLatLngs': [{'label': 'display',
    'lat': 43.70940971907415,
    'lng': -79.39059614124203}],
  'postalCode': 'M4P 2L2',
  'cc': 'CA',
  'city': 'Toronto',
  'state': 'ON',
  'country': 'Canada',
  'formattedAddress': ['808 Mount Pleasent Road',
   'Toronto ON M4P 2L2',
   'Canada']},
 'canonicalUrl': 'https://foursquare.com/v/808-restaurant--best-western/56af5041498ea6e0ef5b1e27',
 'categories': [{'id': '4bf58dd8d48988d143941735',
   'name': 'Breakfast Spot',
   'pluralName': 'Breakfast Spots',
   'shortName': 'Breakfast',
   'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/breakfast_',
    'suffix': '.png'},
   'primary': True}],
 'verified': False,
 'stats': {'tipCount': 0},
 'price': {'tier': 1, 'messa

In [20]:
resulta['price']['tier']

1

In [10]:
# First let us make a subset of the original dataframe containing all the boroughs having the name Toronto
df_new = df_toronto[df_toronto['Borough'].str.contains("Toronto")]
print("Shape of new dataframe", df_new.shape)
df_new.head()

Shape of new dataframe (39, 5)


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [11]:
df_new.reset_index(drop=True, inplace=True)

In [49]:
# Now we have to make a new dataframe containing postcode, borough, neighborhood, restaurant name, ratings.
df_restaurants = pd.DataFrame(columns=['Postcode', 'Borough', 'Neighborhood', 'Restaurant', 'Rating', 'Address', 'Latitude', 'Longitude'])
df_restaurants

Unnamed: 0,Postcode,Borough,Neighborhood,Restaurant,Rating,Address,Latitude,Longitude


In [17]:
def get_venues_table_print(postcodes, boroughs, neighborhoods, latitudes, longitudes, query, radius=500):
    
    venues_list=[]
    for pc, bor, n, lat, long in zip(postcodes, boroughs, neighborhoods, latitudes, longitudes):
        
        print("\n Moving to a new postcode", pc, "\n")
        
        url1 = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&v={}&ll={},{}&query={}&radius={}'.format(
            CLIENT_ID, CLIENT_SECRET, VERSION, lat, long, query, radius)
        
        result1 = requests.get(url1).json()['response']['venues']
        for v in result1:
            
            try:
                print(v['name'], v['location']['address'], v['location']['postalCode'])
            except:
                try:
                    print(v['name'], "N/A", v['location']['postalCode'])
                except:
                    try:
                        print(v['name'], v['location']['address'], "N/A")
                    except:
                        print(v['name'], "N/A", "N/A")
            
#            url2 = 'https://api.foursquare.com/v2/venues/{}?client_id={}&client_secret={}&v={}'.format(
#                v['id'], CLIENT_ID, CLIENT_SECRET, VERSION)
#            result2 = requests.get(url2).json()['response']['venue']
#            
#            try:
#                print(" Rating: ", result2['rating'])
#            except:
#                print(" Rating: ", "N/A")
    
    return 


In [10]:
def get_venues_table(postcodes, boroughs, neighborhoods, latitudes, longitudes, query, radius=500):
    
    venues_list=[]
    for pc, bor, n, lat, long in zip(postcodes, boroughs, neighborhoods, latitudes, longitudes):
        
        url1 = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&v={}&ll={},{}&query={}&radius={}'.format(
            CLIENT_ID, CLIENT_SECRET, VERSION, lat, long, query, radius)
        
        result1 = requests.get(url1).json()['response']['venues']
        
        for v in result1:
            
            v_list = []
            try:
                v_list = v_list + [pc, bor, n, v['name'], v['location']['address'], v['location']['postalCode']]
            except:
                v_list = v_list + [pc, bor, n, v['name'], "N/A", "N/A"]
    
            url2 = 'https://api.foursquare.com/v2/venues/{}?client_id={}&client_secret={}&v={}'.format(
                v['id'], CLIENT_ID, CLIENT_SECRET, VERSION)
            result2 = requests.get(url2).json()['response']['venue']
            
            try:
                v_list = v_list + [result2['rating'], result2['price']['tier'], v['location']['lat'], v['location']['lng']]
            except:
                try:
                    v_list = v_list + ["N/A", result2['price']['tier'], v['location']['lat'], v['location']['lng']]
                except:
                    try:
                        v_list = v_list + [result2['rating'], "N/A", v['location']['lat'], v['location']['lng']]
                    except:
                        v_list = v_list + ["N/A", "N/A", v['location']['lat'], v['location']['lng']]
            
            venues_list.append(v_list)
    
    return venues_list


##### Loop through df_new and send the list of postcode, neighborhood, lat, long to a function. The function generates a url to get list of venues nearby each postcode. Use the response and appropriate key in the response to get restaurant name, latlong, address. Generate another response using id to get the rating for each restaurant. Append these items to a list. Finally return the list from the function. Use the list to generate the new dataframe.  

In [21]:
# Call the func

get_venues_table_print(df_new.loc[0:3, 'Postcode'], df_new.loc[0:3,'Borough'], df_new.loc[0:3,'Neighbourhood'],
                       df_new.loc[0:3,'Latitude'], df_new.loc[0:3,'Longitude'], "Restaurant")
#df_test


 Moving to a new postcode M4E 

Seaspray Restaurant 629 Kingston Rd N/A

 Moving to a new postcode M4K 

Herby Restaurant 397 Danforth Ave M4K 1P1
Beiteddine Lebanese Restaurant N/A M4K 1P5
Osmow’s Authentic Mediterranean Restaurant 497 Danforth Avenue M4K 1P5
Megas Restaurant 402 Danforth Ave M4K
Katsu Japanese Restaurant 572 Danforth Ave M4K 1R1
Florentina's Italian Restaurant N/A N/A
Friendly Greek Restaurant 494 Danforth Ave. M4K 1P7
The Palace Restaurant 722 Pape Avenue M4K
Simone's Caribbean Restaurant 596 Danforth Avenue M4K 1R1
Dairy Queen 1040 Broadview Ave M4K 2S2
Pantheon 407 Danforth Ave. M4K 1P1

 Moving to a new postcode M4L 

Occasions Restaurant 30 Eastwood Road N/A
JP Restaurant 270 Coxwell Ave N/A
Haandi Restaurant N/A N/A
New Town Restaurant 266 Coxwell Ave. N/A

 Moving to a new postcode M4M 

Icy Spicy Fusion Restaurant 99 Pape Ave. N/A
Caribbean Sunset Restaurant and Bar 753A Queen St E M4M 1H3


In [12]:
# Call the func

df_restaurants = pd.DataFrame(data = get_venues_table(df_new.loc[0:3, 'Postcode'], df_new.loc[0:3, 'Borough'], df_new.loc[0:3, 'Neighbourhood'],
                    df_new.loc[0:3, 'Latitude'], df_new.loc[0:3, 'Longitude'], "Restaurant"), 
                       columns = ['Postcode', 'Borough', 'Neigborhood', 'Restarant', 'Address', 'Rating', 'Price Tier', 'Latitude', 'Longitude'])
df_restaurants.head(20)

Unnamed: 0,Postcode,Borough,Neigborhood,Restarant,Address,Rating,Price Tier,Latitude,Longitude
0,M4E,East Toronto,The Beaches,Seaspray Restaurant,629 Kingston Rd,,1.0,43.678888,-79.298167
1,M4K,East Toronto,"The Danforth West, Riverdale",Herby Restaurant,397 Danforth Ave,,,43.67745,-79.35129
2,M4K,East Toronto,"The Danforth West, Riverdale",Beiteddine Lebanese Restaurant,,,,43.678065,-79.348542
3,M4K,East Toronto,"The Danforth West, Riverdale",Osmow’s Authentic Mediterranean Restaurant,497 Danforth Avenue,,2.0,43.677883,-79.34925
4,M4K,East Toronto,"The Danforth West, Riverdale",Megas Restaurant,402 Danforth Ave,6.5,2.0,43.677692,-79.351522
5,M4K,East Toronto,"The Danforth West, Riverdale",Florentina's Italian Restaurant,,,2.0,43.676562,-79.355699
6,M4K,East Toronto,"The Danforth West, Riverdale",Katsu Japanese Restaurant,572 Danforth Ave,6.9,2.0,43.678619,-79.347024
7,M4K,East Toronto,"The Danforth West, Riverdale",The Palace Restaurant,722 Pape Avenue,,2.0,43.679827,-79.345331
8,M4K,East Toronto,"The Danforth West, Riverdale",Simone's Caribbean Restaurant,596 Danforth Avenue,7.6,2.0,43.678655,-79.346582
9,M4K,East Toronto,"The Danforth West, Riverdale",Friendly Greek Restaurant,494 Danforth Ave.,6.6,2.0,43.678428,-79.347642


In [16]:
df_restaurants.duplicated().value_counts() # Check for any duplicates

False    408
dtype: int64

In [17]:
# Check how many restaurants were returned across the 4 boroughs
df_restaurants.shape 

(408, 9)

In [24]:
from project_lib import Project
project = Project(project_id="d7fc9b06-7137-49bd-b806-51efcefa6262", project_access_token="p-ce4f33b4e2430cd1f9ae53b2b2a616a5ea3bab8d")
project.save_data(file_name = "toronto_restaurants.csv", data = df_restaurants.to_csv(index=False))

{'file_name': 'toronto_restaurants.csv',
 'message': 'File saved to project storage.',
 'bucket_name': 'datasciencecapstone-donotdelete-pr-9xufyyyz2be1rz',
 'asset_id': '132a9347-6bcf-4ab6-b638-0a555aa475b8'}

Wow, 408 restaurants. That's too many. Probably it is because we chose a large radius, 500m. Hence searches often criss-crossed across neighborhoods. So we have to check for duplicates and delete them.

In [38]:
df_restaurants[df_restaurants.duplicated(subset=['Restarant', 'Latitude', 'Longitude'])].sort_values(by='Restarant').head(30)

Unnamed: 0,Postcode,Borough,Neigborhood,Restarant,Address,Rating,Price Tier,Latitude,Longitude
195,M5H,Downtown Toronto,"Adelaide, King, Richmond",A&W,1 Richmond Street West,5.6,1,43.651378,-79.378986
270,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel",A&W,1 Richmond Street West,5.6,1,43.651378,-79.378986
402,M7A,Downtown Toronto,Queen's Park,A&W,496 Yonge Street,6.0,1,43.663094,-79.383971
241,M5K,Downtown Toronto,"Design Exchange, Toronto Dominion Centre",A&W,1 Richmond Street West,5.6,1,43.651378,-79.378986
363,M5X,Downtown Toronto,"First Canadian Place, Underground city",A&W,1 Richmond Street West,5.6,1,43.651378,-79.378986
142,M5G,Downtown Toronto,Central Bay Street,Adega Restaurant,33 Elm St,7.2,3,43.657519,-79.383462
163,M5G,Downtown Toronto,Central Bay Street,Akashiro Japanese Restaurant & Bar,220 Yonge St.,5.6,2,43.655965,-79.380541
160,M5G,Downtown Toronto,Central Bay Street,Alio Restaurant & Wine Bar,108 Dundas St W,5.2,4,43.655655,-79.384124
188,M5H,Downtown Toronto,"Adelaide, King, Richmond",Anoush Restaurant,250 Dundas St W,,2,43.654588,-79.389692
341,M5X,Downtown Toronto,"First Canadian Place, Underground city",Azure Restaurant & Bar,225 Front St W,5.9,4,43.644749,-79.385113


<br>
Next jobs: for my own reference - 

1. Eliminate duplicate rows. First find closest neighborhood using latitude longitude coordinates and delete other entries.
2. Plot all the restaurants on a map.
3. Delete unwanted columns from dataframe
4. Normalize columns that will be used to find euclidean distance
5. Plot clustered map.
<br>

In [8]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0,Postcode,Borough,Neigborhood,Restarant,Address,Rating,Price Tier,Latitude,Longitude
0,M4E,East Toronto,The Beaches,Seaspray Restaurant,629 Kingston Rd,,1.0,43.678888,-79.298167
1,M4K,East Toronto,"The Danforth West, Riverdale",Herby Restaurant,397 Danforth Ave,,,43.67745,-79.35129
2,M4K,East Toronto,"The Danforth West, Riverdale",Beiteddine Lebanese Restaurant,,,,43.678065,-79.348542
3,M4K,East Toronto,"The Danforth West, Riverdale",Osmow’s Authentic Mediterranean Restaurant,497 Danforth Avenue,,2.0,43.677883,-79.34925
4,M4K,East Toronto,"The Danforth West, Riverdale",Megas Restaurant,402 Danforth Ave,6.5,2.0,43.677692,-79.351522


In [12]:
### This cell is created on 16/03/2020. Do not run this cell today.

df_rest.rename(columns = {'Restarant':'Restaurant', 'Latitude':'lat_R', 'Longitude':'long_R'}, inplace=True)
df_rest = df_rest.merge(df_new[['Postcode', 'Latitude', 'Longitude']], on='Postcode')
df_rest['Distance_sq'] = np.square((df_rest['lat_R'] - df_rest['Latitude'])) + np.square((df_rest['long_R'] - df_rest['Longitude']))
df_rest.sort_values(by=['Restaurant', 'Distance_sq'], axis=0, inplace=True)
df_rest.drop_duplicates(subset=['Restaurant', 'lat_R', 'long_R'], keep='first', inplace=True)
df_rest.sort_values(by=['Postcode'], inplace=True)

# Add a statement to drop columns latitude, longitude, distance_sq
df_rest.drop(columns=['Latitude', 'Longitude', 'Distance_sq'], inplace=True)
df_rest.reset_index(drop=True, inplace=True)

### Run this cell only from 17/03/2020

In [13]:
df_rest.head()

Unnamed: 0,Postcode,Borough,Neigborhood,Restaurant,Address,Rating,Price Tier,lat_R,long_R
0,M4E,East Toronto,The Beaches,Seaspray Restaurant,629 Kingston Rd,,1.0,43.678888,-79.298167
1,M4K,East Toronto,"The Danforth West, Riverdale",Friendly Greek Restaurant,494 Danforth Ave.,6.6,2.0,43.678428,-79.347642
2,M4K,East Toronto,"The Danforth West, Riverdale",Megas Restaurant,402 Danforth Ave,6.5,2.0,43.677692,-79.351522
3,M4K,East Toronto,"The Danforth West, Riverdale",Osmow’s Authentic Mediterranean Restaurant,497 Danforth Avenue,,2.0,43.677883,-79.34925
4,M4K,East Toronto,"The Danforth West, Riverdale",Pantheon,407 Danforth Ave.,8.7,2.0,43.677621,-79.351434


In [14]:
df_rest.shape

(234, 9)

In [15]:
print("Ok, so there are a total of {} restaurnats in Toronto".format(len(df_rest)))

Ok, so there are a total of 234 restaurnats in Toronto


In [55]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ttt")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

NameError: name 'Nominatim' is not defined

In [83]:
# Now plot all the restaurants on a map
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, long, label in zip(df_rest['lat_R'], df_rest['long_R'], df_rest['Restaurant']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, long],
        radius=4,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [86]:
df_new.shape

(39, 5)

<br>
So, now let us get to the clustering part. We have to cluster the 39 neighborhoods in 4 boroughs in Toronto. The parameters that we will use for clustering are number of restaurants, average rating of restaurants, price tier/ how costly the restaurants are in a given neighborhood. As the number of parameters are less, I will only be dividing the neighborhoods into 3 clusters.
<br>

In [16]:
# Now we can drop columns that are not required
df_rest2 = df_rest.drop(columns=['Address', 'lat_R', 'long_R'])

In [17]:
df_rest2.head()

Unnamed: 0,Postcode,Borough,Neigborhood,Restaurant,Rating,Price Tier
0,M4E,East Toronto,The Beaches,Seaspray Restaurant,,1.0
1,M4K,East Toronto,"The Danforth West, Riverdale",Friendly Greek Restaurant,6.6,2.0
2,M4K,East Toronto,"The Danforth West, Riverdale",Megas Restaurant,6.5,2.0
3,M4K,East Toronto,"The Danforth West, Riverdale",Osmow’s Authentic Mediterranean Restaurant,,2.0
4,M4K,East Toronto,"The Danforth West, Riverdale",Pantheon,8.7,2.0


In [21]:
df_rest2 = df_rest2.groupby('Postcode', as_index=False).aggregate(
    {'Borough':'first', 'Neigborhood':'first', 'Restaurant':'count', 'Rating':'mean', 'Price Tier':'mean'})

#### The final steps... drop those rows which have a nan value. Apply clustering with k=3. Display on a map. Interpret the results.

In [24]:
df_rest2.dropna(inplace=True)
df_rest2.reset_index(drop=True, inplace=True)

In [25]:
df_rest2

Unnamed: 0,Postcode,Borough,Neigborhood,Restaurant,Rating,Price Tier
0,M4K,East Toronto,"The Danforth West, Riverdale",11,7.26,1.888889
1,M4L,East Toronto,"The Beaches West, India Bazaar",4,6.0,1.75
2,M4P,Central Toronto,Davisville North,4,7.1,1.75
3,M4R,Central Toronto,North Toronto West,2,6.6,2.0
4,M4S,Central Toronto,Davisville,5,6.3,1.5
5,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",4,8.0,2.333333
6,M4X,Downtown Toronto,"Cabbagetown, St. James Town",8,6.566667,1.428571
7,M4Y,Downtown Toronto,Church and Wellesley,16,6.457143,1.8
8,M5A,Downtown Toronto,Harbourfront,6,7.5,1.666667
9,M5B,Downtown Toronto,"Ryerson, Garden District",13,6.842857,1.923077


In [26]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [32]:
X = df_rest2.values[:,3:6]
cluster_dataset = StandardScaler().fit_transform(X)
cluster_dataset



array([[ 0.35648667,  0.50650807, -0.04254172],
       [-0.72485623, -1.6849412 , -0.4190453 ],
       [-0.72485623,  0.2282288 , -0.4190453 ],
       [-1.03381134, -0.64139393,  0.25866115],
       [-0.57037867, -1.16316757, -1.09675174],
       [-0.72485623,  1.79354971,  1.16226974],
       [-0.106946  , -0.69936878, -1.29038216],
       [ 1.12887445, -0.88985757, -0.28350401],
       [-0.41590111,  0.92392698, -0.64494745],
       [ 0.66544178, -0.21900575,  0.05013609],
       [ 0.97439689,  0.59540284,  1.0331828 ],
       [ 2.21021735, -0.53436344, -0.02668894],
       [ 0.20200911, -0.29354484,  1.16226974],
       [ 0.51096423,  0.94877334,  0.4845633 ],
       [-0.41590111, -0.93126817,  1.16226974],
       [ 0.35648667, -0.66624029,  0.50509985],
       [-0.72485623, -1.27911726, -1.77445819],
       [ 3.29156024, -0.23556999, -0.86305987],
       [ 0.04753156,  0.150929  , -0.04254172],
       [-0.87933378,  1.53266289,  2.06587834],
       [-0.87933378,  0.75000243,  1.162

In [35]:
df_rest2.shape

(26, 6)

In [42]:
num_clusters = 3

k_means = KMeans(init="k-means++", n_clusters=num_clusters, n_init=12)
k_means.fit(cluster_dataset)
labels = k_means.labels_

print(labels)

[0 1 1 1 1 0 1 2 0 0 0 2 0 0 0 0 1 2 0 0 0 0 1 0 1 1]


In [43]:
labels

array([0, 1, 1, 1, 1, 0, 1, 2, 0, 0, 0, 2, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0,
       1, 0, 1, 1], dtype=int32)

In [44]:
# add Labels to the original dataframe
df_rest2['Label'] = labels

In [45]:
df_rest2

Unnamed: 0,Postcode,Borough,Neigborhood,Restaurant,Rating,Price Tier,Label
0,M4K,East Toronto,"The Danforth West, Riverdale",11,7.26,1.888889,0
1,M4L,East Toronto,"The Beaches West, India Bazaar",4,6.0,1.75,1
2,M4P,Central Toronto,Davisville North,4,7.1,1.75,1
3,M4R,Central Toronto,North Toronto West,2,6.6,2.0,1
4,M4S,Central Toronto,Davisville,5,6.3,1.5,1
5,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",4,8.0,2.333333,0
6,M4X,Downtown Toronto,"Cabbagetown, St. James Town",8,6.566667,1.428571,1
7,M4Y,Downtown Toronto,Church and Wellesley,16,6.457143,1.8,2
8,M5A,Downtown Toronto,Harbourfront,6,7.5,1.666667,0
9,M5B,Downtown Toronto,"Ryerson, Garden District",13,6.842857,1.923077,0


In [47]:
df_rest2[['Restaurant', 'Rating', 'Price Tier', 'Label']].groupby('Label').mean()

Unnamed: 0_level_0,Restaurant,Rating,Price Tier
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,8.214286,7.319252,2.139463
1,4.666667,6.52963,1.587302
2,23.0,6.650672,1.760315


Interpretation of results - label 0: Neighbourhoods with expensive restaurants and moderate number of eating options. <br>                            label 1: Neighbourhoods with cheap restaurants and low number of eating options. <br>
label 2: Neighbourhoods with cheap restaurants and high number of eating options.<br>
On the whole we see that expensive restaurants class has a higher rating.
<br>

In [53]:
# Merge df_new with labels
df_new2 = df_new.merge(df_rest2[['Postcode', 'Label']], on='Postcode')

In [78]:
mark_color = df_new2['Label'].replace({0: 'purple', 1:'red', 2:'blue'})
mark_color

0     purple
1        red
2        red
3        red
4        red
5     purple
6        red
7       blue
8     purple
9     purple
10    purple
11      blue
12    purple
13    purple
14    purple
15    purple
16       red
17      blue
18    purple
19    purple
20    purple
21    purple
22       red
23    purple
24       red
25       red
Name: Label, dtype: object

In [76]:
# Let's define color codes and cluster label names
marker_color = ['darkpurple', 'red', 'blue']
marker_label = ['Expensive/ Medium', 'Cheap/ Low', 'Cheap/ High']  # Here L:low, M:medium, H:high density of restaurants in neighbourhood


In [79]:
# Let's visualize the clusters on a map
map_cluster = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, long, n, c in zip(df_new2['Latitude'], df_new2['Longitude'], df_new2['Neighbourhood'], mark_color):
    label = folium.Popup(n, parse_html=True)
    folium.CircleMarker(
        [lat, long],
        radius=7,
        popup=label,
        color=c,
        fill=True,
        fill_color=c,
        fill_opacity=0.6,
        parse_html=False).add_to(map_cluster)  
    
map_cluster

Thus we can see that neighborhoods in Downtown Toronto have a high density of restaurants which are expensive on the average. As we move inland into neighborhoods in Central Toronto there is less density of restaurants and they are cheaper. 