# Notebook created as a part of Capstone project

#### We simply use read_html method of pandas to obtain the list of tables from the wikipedia page.

In [1]:
import pandas as pd

df = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

In [2]:
df

[    Postcode           Borough  \
 0        M1A      Not assigned   
 1        M2A      Not assigned   
 2        M3A        North York   
 3        M4A        North York   
 4        M5A  Downtown Toronto   
 5        M5A  Downtown Toronto   
 6        M6A        North York   
 7        M6A        North York   
 8        M7A      Queen's Park   
 9        M8A      Not assigned   
 10       M9A         Etobicoke   
 11       M1B       Scarborough   
 12       M1B       Scarborough   
 13       M2B      Not assigned   
 14       M3B        North York   
 15       M4B         East York   
 16       M4B         East York   
 17       M5B  Downtown Toronto   
 18       M5B  Downtown Toronto   
 19       M6B        North York   
 20       M7B      Not assigned   
 21       M8B      Not assigned   
 22       M9B         Etobicoke   
 23       M9B         Etobicoke   
 24       M9B         Etobicoke   
 25       M9B         Etobicoke   
 26       M9B         Etobicoke   
 27       M1C       

#### We only require the first table in the list, so we extract it.

In [3]:
df = df[0]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Then we remove the cells in which Borough is not assigned

In [4]:
df = df[df['Borough'] != 'Not assigned']

In [5]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


#### Resetting indices

In [6]:
df.reset_index(drop=True, inplace=True)

In [7]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


#### Appending lists of neighborhoods together requires us to iterate through the whole dataframe. If the postal code is repeated, instead of inserting a new row into our new dataframe, we append the new neighbourhood value to the neighbourhood value of the last row of the new dataframe

In [8]:
df2 = pd.DataFrame({'PostalCode': [df.loc[0]['Postcode']], 'Borough': [df.loc[0]['Borough']], 'Neighbourhood': [df.loc[0]['Neighbourhood']]})

In [9]:
i = 1
while i<df.shape[0]:
    if df.loc[i]['Postcode'] != df.loc[i-1]['Postcode']:
        
        df3 = {'PostalCode': df.loc[i]['Postcode'], 'Borough': df.loc[i]['Borough'], 'Neighbourhood': df.loc[i]['Neighbourhood']}
        df2 = df2.append(df3, ignore_index=True)
    else:
        df2.loc[df2.shape[0]-1]['Neighbourhood'] += ", " + df.loc[i]['Neighbourhood']
    i += 1

#### We check if the code is successful by displaying the entire dataframe

In [10]:
df2.head(df2.shape[0])

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [11]:
df = df2

#### If the value of Neighbourhood is not assigned, we assign the value of the respective borough to it.

In [12]:
df.loc[df['Neighbourhood'] == 'Not assigned', 'Neighbourhood'] = df[df['Neighbourhood'] == 'Not assigned']['Borough']

In [13]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [14]:
df.shape

(103, 3)

#### I'm using the CSV File provided

In [15]:
df2 = pd.read_csv('Geospatial_Coordinates.csv')
df2['PostalCode'] = df2['Postal Code']
df2 = df2[['PostalCode', 'Latitude', 'Longitude']]

In [16]:
df2.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### We merge the two dataframes on the values of the column Postalcode

In [17]:
df = pd.merge(df, df2, on='PostalCode')

In [18]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [19]:
import folium
from sklearn.cluster import KMeans

#### We visualize the postal code regions first

In [20]:
map_toronto = folium.Map(location=[43.6532, -79.3832], zoom_start=10)

for lat, lng, pc, borough in zip(df['Latitude'], df['Longitude'], df['PostalCode'], df['Borough']):
    label = '{}, {}'.format(pc, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

#### First, we cluster the postal code regions into 11 clusters based on latitude, longitude

In [21]:
df_clustering = df.drop(['PostalCode', 'Borough', 'Neighbourhood'], 1)

kmeans = KMeans(n_clusters=11, random_state=0).fit(df_clustering)

In [22]:
df['Cluster'] = list(kmeans.labels_)

In [23]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude,Cluster
0,M3A,North York,Parkwoods,43.753259,-79.329656,0
1,M4A,North York,Victoria Village,43.725882,-79.315572,5
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,2
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763,1
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494,2


In [24]:
colors = [
    'red',
    'blue',
    'gray',
    'darkred',
    'orange',
    'green',
    'darkgreen',
    'cadetblue',
    'darkblue',
    'purple',
    'darkpurple'
]

map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=10)

for lat, lon, pc, cluster in zip(df['Latitude'], df['Longitude'], df['PostalCode'], df['Cluster']):
    label = folium.Popup(str(pc) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=colors[cluster],
        fill=True,
        fill_color=colors[cluster],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Explore 1st neighbourhood in our dataset

In [25]:
CLIENT_ID = 'OUDXSF2ALHUJPRH5YYLZ22BQUK3BKR1RT3YTTLVA3ZQJAKB5' # your Foursquare ID
CLIENT_SECRET = 'EWMDVFRD4GCTC5GVLSL4H0VNV4QO20CF40Q2Z0DWUMMKFYYP' # your Foursquare Secret
VERSION = '20190605' # Foursquare API version

In [27]:
lat = df[0:1]['Latitude'][0]
long = df[0:1]['Longitude'][0]

#### Now, let's get the top 100 venues that are in M3A within a radius of 500 meters.

In [28]:
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    lat, 
    long, 
    500, 
    100)

In [30]:
import requests
results = requests.get(url).json()

In [31]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [32]:
from pandas.io.json import json_normalize

venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Brookbanks Park,Park,43.751976,-79.33214
1,KFC,Fast Food Restaurant,43.754387,-79.333021
2,TTC stop #8380,Bus Stop,43.752672,-79.326351
3,Variety Store,Food & Drink Shop,43.751974,-79.333114


#### Exploring Neighbourhoods in Toronto

#### Let's create a function to repeat the same process to all the neighborhoods in Toronto

In [36]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            100)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Running this function over all neighbourhoods

In [37]:
toronto_venues = getNearbyVenues(names=df['PostalCode'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

M3A
M4A
M5A
M6A
M7A
M9A
M1B
M3B
M4B
M5B
M6B
M9B
M1C
M3C
M4C
M5C
M6C
M9C
M1E
M4E
M5E
M6E
M1G
M4G
M5G
M6G
M1H
M2H
M3H
M4H
M5H
M6H
M1J
M2J
M3J
M4J
M5J
M6J
M1K
M2K
M3K
M4K
M5K
M6K
M1L
M2L
M3L
M4L
M5L
M6L
M9L
M1M
M2M
M3M
M4M
M5M
M6M
M9M
M1N
M2N
M3N
M4N
M5N
M6N
M9N
M1P
M2P
M4P
M5P
M6P
M9P
M1R
M2R
M4R
M5R
M6R
M7R
M9R
M1S
M4S
M5S
M6S
M1T
M4T
M5T
M1V
M4V
M5V
M8V
M9V
M1W
M4W
M5W
M8W
M9W
M1X
M4X
M5X
M8X
M4Y
M7Y
M8Y
M8Z


In [39]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 278 uniques categories.


#### Analyzing each neighbourhood

In [40]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [42]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,M1B,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
1,M1C,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
2,M1E,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
3,M1G,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
4,M1H,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
5,M1J,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.333333
6,M1K,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
7,M1L,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
8,M1M,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
9,M1N,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000


#### Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [43]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [46]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in range(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in range(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Fast Food Restaurant,Women's Store,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dive Bar,Dog Run,Doner Restaurant,Drugstore
1,M1C,Bar,Women's Store,Donut Shop,Diner,Discount Store,Dive Bar,Dog Run,Doner Restaurant,Drugstore,Dessert Shop
2,M1E,Electronics Store,Spa,Rental Car Location,Mexican Restaurant,Breakfast Spot,Medical Center,Pizza Place,Intersection,Drugstore,Donut Shop
3,M1G,Coffee Shop,Korean Restaurant,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dive Bar,Dog Run,Doner Restaurant,Women's Store
4,M1H,Fried Chicken Joint,Hakka Restaurant,Athletics & Sports,Bakery,Bank,Thai Restaurant,Caribbean Restaurant,Dog Run,Diner,Discount Store


#### Clustering Neighbourhoods

In [47]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 4, 0, 0, 0, 0, 0, 0, 0, 0])

In [52]:
# add clustering labels
neighborhoods_venues_sorted['PostalCode'] = neighborhoods_venues_sorted['Neighborhood']

neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('PostalCode'), on='PostalCode')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude,Cluster,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,0,0.0,M3A,Bus Stop,Park,Fast Food Restaurant,Food & Drink Shop,Doner Restaurant,Dim Sum Restaurant,Diner,Discount Store,Dive Bar,Dog Run
1,M4A,North York,Victoria Village,43.725882,-79.315572,5,0.0,M4A,Portuguese Restaurant,Coffee Shop,French Restaurant,Hockey Arena,Intersection,Pizza Place,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Donut Shop
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,2,0.0,M5A,Coffee Shop,Bakery,Pub,Park,Breakfast Spot,Theater,Café,Restaurant,Mexican Restaurant,Gym / Fitness Center
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763,1,0.0,M6A,Furniture / Home Store,Clothing Store,Women's Store,Coffee Shop,Boutique,Miscellaneous Shop,Gift Shop,Event Space,Accessories Store,Vietnamese Restaurant
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494,2,0.0,M7A,Coffee Shop,Park,Gym,Diner,College Auditorium,Seafood Restaurant,Sandwich Place,Burger Joint,Burrito Place,Café


In [55]:
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np

In [67]:
# create map
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=10)

# set color scheme for the clusters
x = range(kclusters)
ys = [i + x + (i*x)**2 for i in np.arange(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['PostalCode'], toronto_merged['Cluster Labels']):
    if not np.isnan(cluster):
        label = folium.Popup(str(poi) + ' Cluster ' + str(int(cluster)), parse_html=True)
        folium.CircleMarker(
            [lat, lon],
            radius=5,
            popup=label,
            color=rainbow[int(cluster)-1],
            fill=True,
            fill_color=rainbow[int(cluster)-1],
            fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

As you can see, most neighbourhoods fall in 1 cluster because a common set of universal venue categories exists in all of them.