In [8]:
pip install lxml

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np 

<B>Importing tables from Wikipedia

In [2]:
tables = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", header=0)

print(tables[0])

    Postal Code           Borough  \
0           M1A      Not assigned   
1           M2A      Not assigned   
2           M3A        North York   
3           M4A        North York   
4           M5A  Downtown Toronto   
..          ...               ...   
175         M5Z      Not assigned   
176         M6Z      Not assigned   
177         M7Z      Not assigned   
178         M8Z         Etobicoke   
179         M9Z      Not assigned   

                                          Neighborhood  
0                                                  NaN  
1                                                  NaN  
2                                            Parkwoods  
3                                     Victoria Village  
4                            Regent Park, Harbourfront  
..                                                 ...  
175                                                NaN  
176                                                NaN  
177                                       

In [3]:
df=tables[0]

In [4]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


<B> Dropping Boroughs that have values 'Not assigned'

In [5]:
indexnames = df[ df['Borough'] == 'Not assigned' ].index
df.drop(indexnames, inplace=True)

<B> Checking Neighborhoods with 'null' values

In [6]:
df.loc[df['Neighborhood'] == 'NaN']

Unnamed: 0,Postal Code,Borough,Neighborhood


In [68]:
df.head(10)

Unnamed: 0,level_0,index,Postal Code,Borough,Neighborhood
0,0,2,M3A,North York,Parkwoods
1,1,3,M4A,North York,Victoria Village
2,2,4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,3,5,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,4,6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,5,8,M9A,Etobicoke,Islington Avenue
6,6,9,M1B,Scarborough,"Malvern, Rouge"
7,7,11,M3B,North York,Don Mills
8,8,12,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,9,13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [7]:
df.reset_index(inplace=True)
df.head()

Unnamed: 0,index,Postal Code,Borough,Neighborhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,5,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [8]:
Data=df[['Postal Code', 'Borough', 'Neighborhood']].copy()

In [9]:
Data.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


<B> Grouping cells by Postal Code and Borough

In [10]:
Data_grouped = Data.groupby(["Postal Code", "Borough"], as_index=False).agg(lambda x: ", ".join(x))

In [78]:
Data_grouped.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [11]:
Data_grouped.shape

(103, 3)

In [12]:
latlong = pd.read_csv('https://cocl.us/Geospatial_data')

In [13]:
latlong.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
Toronto_df=pd.merge(Data, latlong, on='Postal Code')

In [15]:
Toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [16]:
Toronto_df.shape

(103, 5)

<B> Creating a Map of Toronto

In [17]:
import json # library to handle JSON files

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Libraries imported.


<b> Getting Torornto's Latitude and Longitude

In [18]:
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

In [54]:
address = 'Toronto'

geolocator = Nominatim(user_agent="TO_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


<B> Creating Toronto's map

In [55]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(Toronto_df['Latitude'], Toronto_df['Longitude'], Toronto_df['Borough'], Toronto_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='Red',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

map_toronto

<b> Defining Foursquare Credentials and data before Corona Pandemic

In [56]:
CLIENT_ID = 'LIMDOOOILFVSHTR5WM1WIJOZQVM30NQCOPMSZ5PQTIG0KRIC' # your Foursquare ID
CLIENT_SECRET = 'CZC1Y0O2K5V1BQYWSBPHIBFLPLTWZ4CQ4JH44CQX3IU40PTX' # your Foursquare Secret
VERSION = '20200105' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: LIMDOOOILFVSHTR5WM1WIJOZQVM30NQCOPMSZ5PQTIG0KRIC
CLIENT_SECRET:CZC1Y0O2K5V1BQYWSBPHIBFLPLTWZ4CQ4JH44CQX3IU40PTX


<b> Get top 100 venues within 3000m radius

In [57]:
LIMIT = 100

venues = []

for lat, long, neighborhood in zip(Toronto_df['Latitude'], Toronto_df['Longitude'], Toronto_df['Neighborhood']):
    
    # create the API request URL
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    # make the GET request
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    # return only relevant information for each nearby venue
    for venue in results:
        venues.append((
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [75]:
# convert the venues list into a new DataFrame
venues_df = pd.DataFrame(venues)

# define the column names
venues_df.columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 'VenueName', 'Venue Latitude', 'Venue Longitude', 'Venue Category']

print(venues_df.shape)
venues_df.head()

(9785, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,VenueName,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Allwyn's Bakery,43.75984,-79.324719,Caribbean Restaurant
1,Parkwoods,43.753259,-79.329656,Donalda Golf & Country Club,43.752816,-79.342741,Golf Course
2,Parkwoods,43.753259,-79.329656,Island Foods,43.745866,-79.346035,Caribbean Restaurant
3,Parkwoods,43.753259,-79.329656,Galleria Supermarket,43.75352,-79.349518,Supermarket
4,Parkwoods,43.753259,-79.329656,Graydon Hall Manor,43.763923,-79.342961,Event Space


<b> Save dataframe as csv to check Indian restaurants

In [64]:
venues_df.to_csv(r'C:\Users\ravii\Desktop\venues_df.csv')

<b> Check how many venues returned by each neighborhood

In [82]:
venues_df.groupby(["Neighborhood"]).count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,VenueName,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,100,100,100,100,100,100
"Alderwood, Long Branch",100,100,100,100,100,100
"Bathurst Manor, Wilson Heights, Downsview North",100,100,100,100,100,100
Bayview Village,100,100,100,100,100,100
"Bedford Park, Lawrence Manor East",100,100,100,100,100,100
...,...,...,...,...,...,...
"Willowdale, Newtonbrook",100,100,100,100,100,100
Woburn,100,100,100,100,100,100
Woodbine Heights,100,100,100,100,100,100
York Mills West,81,81,81,81,81,81


<b> Counting unique categories

In [67]:
print('There are {} uniques categories.'.format(len(venues_df['Venue Category'].unique())))

There are 293 uniques categories.


<b> Print out the list of categories

In [69]:
venues_df['Venue Category'].unique()[:50]

array(['Caribbean Restaurant', 'Golf Course', 'Supermarket',
       'Event Space', 'Liquor Store', 'Park', 'Mediterranean Restaurant',
       'Bagel Shop', 'Seafood Restaurant', 'Greek Restaurant',
       'Gym / Fitness Center', 'Italian Restaurant',
       'Middle Eastern Restaurant', 'Frozen Yogurt Shop', 'Café',
       'Shopping Mall', 'Asian Restaurant', 'Grocery Store',
       'Mexican Restaurant', 'Cantonese Restaurant', 'Coffee Shop',
       'Movie Theater', 'Falafel Restaurant', 'Burger Joint',
       'Vietnamese Restaurant', 'Indian Restaurant',
       'Japanese Restaurant', 'Burrito Place', 'Thai Restaurant',
       'Pool Hall', 'American Restaurant', 'Sushi Restaurant',
       'New American Restaurant', 'Restaurant', 'Steakhouse',
       'Ice Cream Shop', 'Discount Store', 'Gourmet Shop',
       'Korean Restaurant', 'Bank', 'Persian Restaurant', 'Salad Place',
       'Chocolate Shop', 'Gym', 'Pizza Place', 'Toy / Game Store',
       'Chinese Restaurant', "Women's Store", 'Ha

<b> Analysing each neighborhood

In [81]:
# one hot encoding
Toronto_onehot = pd.get_dummies(venues_df[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_onehot['Neighborhoods'] = venues_df['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

print(Toronto_onehot.shape)
Toronto_onehot.head()

(9785, 294)


Unnamed: 0,Neighborhoods,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Amphitheater,Antique Shop,Arcade,Art Gallery,Arts & Crafts Store,...,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [83]:
Toronto_grouped = Toronto_onehot.groupby('Neighborhoods').mean().reset_index()
Toronto_grouped

Unnamed: 0,Neighborhoods,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Amphitheater,Antique Shop,Arcade,Art Gallery,Arts & Crafts Store,...,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,Agincourt,0.0,0.00,0.00,0.02,0.0,0.0,0.00,0.0,0.01,...,0.02,0.00,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.00,0.00,0.01,0.0,0.0,0.01,0.0,0.01,...,0.00,0.01,0.0,0.0,0.01,0.0,0.00,0.01,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.00,0.01,0.01,0.0,0.0,0.00,0.0,0.02,...,0.00,0.01,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.0
3,Bayview Village,0.0,0.00,0.00,0.00,0.0,0.0,0.00,0.0,0.00,...,0.00,0.00,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.00,0.00,0.01,0.0,0.0,0.00,0.0,0.01,...,0.00,0.01,0.0,0.0,0.01,0.0,0.00,0.01,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,"Willowdale, Newtonbrook",0.0,0.00,0.00,0.00,0.0,0.0,0.00,0.0,0.00,...,0.01,0.00,0.0,0.0,0.00,0.0,0.00,0.01,0.0,0.0
94,Woburn,0.0,0.00,0.00,0.01,0.0,0.0,0.00,0.0,0.00,...,0.01,0.00,0.0,0.0,0.01,0.0,0.01,0.01,0.0,0.0
95,Woodbine Heights,0.0,0.01,0.00,0.02,0.0,0.0,0.00,0.0,0.00,...,0.00,0.01,0.0,0.0,0.00,0.0,0.00,0.01,0.0,0.0
96,York Mills West,0.0,0.00,0.00,0.00,0.0,0.0,0.00,0.0,0.00,...,0.00,0.00,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.0


In [84]:
Toronto_grouped.shape

(98, 294)

### Combing data for Indian Restaurants only

In [None]:
Toronto_IR = Toronto_grouped[["Neighborhoods","Indian Restaurant"]]

In [86]:
Toronto_IR.head()

Unnamed: 0,Neighborhoods,Indian Restaurant
0,Agincourt,0.05
1,"Alderwood, Long Branch",0.01
2,"Bathurst Manor, Wilson Heights, Downsview North",0.01
3,Bayview Village,0.0
4,"Bedford Park, Lawrence Manor East",0.01


<b> Putting in a Pandas Dataframe

### Clustering Neighborhoods

In [110]:
# set number of clusters
kclusters = 3

Toronto_IR_clustering = Toronto_IR.drop('Neighborhoods', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_IR_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 0, 0, 0, 0, 0, 0, 0, 1, 0])

Create dataframe that includes clusters as well as neighborhood

In [111]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
Toronto_merged = Toronto_IR.copy()

# add clustering labels
Toronto_merged["Cluster Labels"] = kmeans.labels_

In [112]:
Toronto_merged.rename(columns={"Neighborhoods": "Neighborhood"}, inplace=True)

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
Toronto_merged = Toronto_merged.join(Toronto_df.set_index("Neighborhood"), on="Neighborhood")

Toronto_merged.head()

Unnamed: 0,Neighborhood,Indian Restaurant,Cluster Labels,Postal Code,Borough,Latitude,Longitude
0,Agincourt,0.05,2,M1S,Scarborough,43.7942,-79.262029
1,"Alderwood, Long Branch",0.01,0,M8W,Etobicoke,43.602414,-79.543484
2,"Bathurst Manor, Wilson Heights, Downsview North",0.01,0,M3H,North York,43.754328,-79.442259
3,Bayview Village,0.0,0,M2K,North York,43.786947,-79.385975
4,"Bedford Park, Lawrence Manor East",0.01,0,M5M,North York,43.733283,-79.41975


<b> Create a map to visualise the data

In [113]:
# create map
map_Toronto_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Neighborhood'], Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_Toronto_clusters)
       
map_Toronto_clusters

### Examining Clusters

In [114]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 0]

Unnamed: 0,Neighborhood,Indian Restaurant,Cluster Labels,Postal Code,Borough,Latitude,Longitude
1,"Alderwood, Long Branch",0.010000,0,M8W,Etobicoke,43.602414,-79.543484
2,"Bathurst Manor, Wilson Heights, Downsview North",0.010000,0,M3H,North York,43.754328,-79.442259
3,Bayview Village,0.000000,0,M2K,North York,43.786947,-79.385975
4,"Bedford Park, Lawrence Manor East",0.010000,0,M5M,North York,43.733283,-79.419750
5,Berczy Park,0.000000,0,M5E,Downtown Toronto,43.644771,-79.373306
...,...,...,...,...,...,...,...
92,Willowdale,0.000000,0,M2N,North York,43.770120,-79.408493
92,Willowdale,0.000000,0,M2R,North York,43.782736,-79.442259
93,"Willowdale, Newtonbrook",0.010000,0,M2M,North York,43.789053,-79.408493
96,York Mills West,0.012346,0,M2P,North York,43.752758,-79.400049


In [115]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 1]

Unnamed: 0,Neighborhood,Indian Restaurant,Cluster Labels,Postal Code,Borough,Latitude,Longitude
8,Business reply mail Processing Centre,0.02,1,M7Y,East Toronto,43.662744,-79.321558
10,Caledonia-Fairbanks,0.03,1,M6E,York,43.689026,-79.453512
11,Canada Post Gateway Processing Centre,0.024691,1,M7R,Mississauga,43.636966,-79.615819
17,"Cliffside, Cliffcrest, Scarborough Village West",0.02381,1,M1M,Scarborough,43.716316,-79.239476
19,Davisville,0.03,1,M4S,Central Toronto,43.704324,-79.38879
20,Davisville North,0.03,1,M4P,Central Toronto,43.712751,-79.390197
23,"Dorset Park, Wexford Heights, Scarborough Town...",0.03,1,M1P,Scarborough,43.75741,-79.273304
40,Humewood-Cedarvale,0.03,1,M6C,York,43.693781,-79.428191
41,"India Bazaar, The Beaches West",0.02,1,M4L,East Toronto,43.668999,-79.315572
47,Lawrence Park,0.02,1,M4N,Central Toronto,43.72802,-79.38879


In [116]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 2]

Unnamed: 0,Neighborhood,Indian Restaurant,Cluster Labels,Postal Code,Borough,Latitude,Longitude
0,Agincourt,0.05,2,M1S,Scarborough,43.7942,-79.262029
12,Cedarbrae,0.04,2,M1H,Scarborough,43.773136,-79.239476
25,"Dufferin, Dovercourt Village",0.04,2,M6H,West Toronto,43.669005,-79.442259
30,Forest Hill North & West,0.04,2,M5P,Central Toronto,43.696948,-79.411307
38,Humber Summit,0.057692,2,M9L,North York,43.756303,-79.565963
72,"South Steeles, Silverstone, Humbergate, Jamest...",0.059524,2,M9V,Etobicoke,43.739416,-79.588437
95,Woodbine Heights,0.06,2,M4C,East York,43.695344,-79.318389


It can be seen that Cluster 1 and C2uster 2 have moderate and high density of Indian restaurants. In cluster 0, very little restaurants are there. This data when combined with data for density of Indian diaspora in different areas can provide good insights where to open a new restaurant. Cluster 2 may have oversupply problem. Cluster 1 may be saturated as of now due to limited demand. This map gives a good insight as to which areas are potentially viable. Further research into those areas may help zero in on a location.