Week 3 - Toronto Clusterning

In [244]:
import pandas as pd
import numpy as np
import requests
#importing BeautifulSoup lib. (Allready istalled in JupiterLab)
from bs4 import BeautifulSoup 

### Using requests to get the content of web-page and parse it using BeautifulSoup

In [245]:
#Using requests to get the content of web-page and parse it using BeautifulSoup
link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(link)
soup = BeautifulSoup(page.content,'html.parser')

postal_df = pd.DataFrame()
i=0
end = False
#Looping <tr> tag to get rows of the table
for tr in soup.find_all('tr'):
    j=0
    #Looping <td> tag to get columns within eath row
    for tds in tr.find_all('td'):
        #Using empty value as the end criteria. There is another table at this page, with we do not whant to be appended.
        if (len(tds.text) > 1)&(not end): 
            postal_df.loc[i,j]=tds.text
            j=j+1
        else:
            end=True
    i=i+1

postal_df.columns=['PostCode','Borough','Neighborhood']

In [246]:
postal_df.shape

(288, 3)

### Cleaning the data

In [247]:
# Deleting new line tag "/n" in last column
postal_df['Neighborhood']=postal_df['Neighborhood'].str.replace('\n','')
postal_df=postal_df.reset_index(drop=True)
postal_df.head(15)


Unnamed: 0,PostCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [248]:
#Deleting rows with no Borough info. Assigning Neighborhood to Borough where is "Not assigned"
for i in range(postal_df.shape[0]):
    if postal_df.loc[i,'Borough']=='Not assigned':
        postal_df.drop(i,inplace=True)
    elif postal_df.loc[i,'Neighborhood']=='Not assigned':
        postal_df.loc[i,'Neighborhood']=postal_df.loc[i,'Borough']
postal_df=postal_df.reset_index(drop=True)

#Merging rows with the same PostCodes
for i in range(postal_df.shape[0]):
    for k in range(i+1,postal_df.shape[0]):
        if postal_df.loc[i,'PostCode']==postal_df.loc[k,'PostCode']:
            postal_df.loc[i,'Neighborhood']='{}, {}'.format(postal_df.loc[i,'Neighborhood'],postal_df.loc[k,'Neighborhood'])
postal_df.drop_duplicates(keep='first',subset=['PostCode'],inplace=True)
postal_df=postal_df.reset_index(drop=True)

In [249]:
postal_df.head(15)

Unnamed: 0,PostCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [250]:
postal_df.shape

(103, 3)

### Getting the geo coordinates for each Postal Code

In [251]:
!pip install geocoder
import geocoder
print("Geocoder is ready!")

Geocoder is ready!


In [252]:
#At this moment geocoder do not work (hang the kernel)

"""
for i in range(postal_df.shape[0]):
    lat_lng_coords = None
    while(lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_df.loc[i,'PostCode']))
        lat_lng_coords = g.latlng
    postal_df.loc[i,'Latitude'] = lat_lng_coords[0]
    postal_df.loc[i,'Longitude'] = lat_lng_coords[1]
"""

#Using csv file with coordinates
geo_df=pd.read_csv('http://cocl.us/Geospatial_data')

In [253]:
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [254]:
#Using 'left join' to merge dataframe with Postal Codes and Geo Coordinates
postal_geo_df = pd.merge(postal_df, geo_df, left_on='PostCode',right_on='Postal Code', how='left').drop('Postal Code', axis=1)

In [255]:
postal_geo_df.head(20)

Unnamed: 0,PostCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


In [256]:
postal_geo_df.shape

(103, 5)

In [257]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

print('Libraries imported.')

Libraries imported.


## It's interesting to draw all the points on the map to understand their spread.

In [258]:
address = 'Toronto, Ontario, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [259]:
# create map of Toronto
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add PostCode's coordinates to the map
for lat, lng, borough, neighborhood in zip(postal_geo_df['Latitude'], postal_geo_df['Longitude'], postal_geo_df['Borough'], postal_geo_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## The territiry is too wide.  
## Lets limit it by exploring only the city and using only boroughs that contain the word 'Toronto' 

In [260]:
toronto_df=postal_geo_df[postal_geo_df['Borough'].str.contains('Toronto')].reset_index(drop=True)
#There are two St. James Town in Downtown with different Postal Codes. Changing the second name to keep it from dropping in analyzing stage.
#toronto_df.loc[69,'Neighborhood']="{}, {}".format(toronto_df.loc[69,'Borough'],toronto_df.loc[69,'Neighborhood'])
toronto_df.shape

(38, 5)

In [261]:
# create map of Toronto
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add PostCode's coordinates to the map
for lat, lng, borough, neighborhood in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Borough'], toronto_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='cyan',
        fill=True,
        fill_color='black',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## Getting vanues for each Neighborhood from Foursquare

In [262]:
CLIENT_ID = '#################' # your Foursquare ID
CLIENT_SECRET = '#################' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT=300
radius = 2000

In [263]:
def getNearbyVenues(names, latitudes, longitudes, radius):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [264]:
toronto_venues = getNearbyVenues(names=toronto_df['Neighborhood'],
                                   latitudes=toronto_df['Latitude'],
                                   longitudes=toronto_df['Longitude'],
                                 radius=radius
                                  )
print(toronto_venues.shape)
toronto_venues.head()

Harbourfront, Regent Park
Ryerson, Garden District
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Adelaide, King, Richmond
Dovercourt Village, Dufferin
Harbourfront East, Toronto Islands, Union Station
Little Portugal, Trinity
The Danforth West, Riverdale
Design Exchange, Toronto Dominion Centre
Brockton, Exhibition Place, Parkdale Village
The Beaches West, India Bazaar
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North, Forest Hill West
High Park, The Junction South
North Toronto West
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
Harbord, University of Toronto
Runnymede, Swansea
Moore Park, Summerhill East
Chinatown, Grange Park, Kensington Market
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Rosedale
Stn A PO Boxes 25 The Esplanade
Cabbagetown, St. James Town
Fir

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Harbourfront, Regent Park",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Harbourfront, Regent Park",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Harbourfront, Regent Park",43.65426,-79.360636,The Distillery Historic District,43.650244,-79.359323,Historic Site
3,"Harbourfront, Regent Park",43.65426,-79.360636,Distillery Sunday Market,43.650075,-79.361832,Farmers Market
4,"Harbourfront, Regent Park",43.65426,-79.360636,Rooster Coffee,43.6519,-79.365609,Coffee Shop


### Analyzing the data

In [265]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

print(toronto_onehot.shape)
toronto_onehot.head()

(3787, 238)


Unnamed: 0,Zoo,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,Arcade,...,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [266]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Zoo,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,...,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.01,...,0.01,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.03,0.0,0.0,0.0,0.01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,...,0.0,0.02,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.01
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.03,0.0,0.0,0.01,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.01,0.01,0.01,0.0,0.0,0.0,0.02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.02
5,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01
6,Central Bay Street,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,...,0.0,0.02,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0
7,"Chinatown, Grange Park, Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.03,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.01
8,Christie,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,...,0.0,0.04,0.0,0.01,0.01,0.0,0.01,0.0,0.0,0.0
9,Church and Wellesley,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.01


In [267]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [268]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Theater,Hotel,Thai Restaurant,Japanese Restaurant
1,Berczy Park,Coffee Shop,Café,Hotel,Gastropub,Italian Restaurant
2,"Brockton, Exhibition Place, Parkdale Village",Café,Coffee Shop,Bar,Restaurant,Tibetan Restaurant
3,Business Reply Mail Processing Centre 969 Eastern,Coffee Shop,Park,Café,Brewery,Bakery
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Park,Café,Coffee Shop,Italian Restaurant,Hotel


In [269]:
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 9

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init = 12,init = "k-means++").fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe

import collections
collections.Counter(kmeans.labels_)

Counter({5: 6, 1: 4, 0: 5, 3: 4, 2: 5, 8: 3, 7: 5, 4: 4, 6: 2})

In [270]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
print(toronto_merged.shape)
toronto_merged.head() # check the last columns!

(38, 11)


Unnamed: 0,PostCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,8,Coffee Shop,Café,Park,Restaurant,Thai Restaurant
1,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,5,Gastropub,Coffee Shop,Café,Thai Restaurant,Japanese Restaurant
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1,Coffee Shop,Café,Gastropub,Japanese Restaurant,Thai Restaurant
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,6,Coffee Shop,Pub,Beach,Bar,Breakfast Spot
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,1,Coffee Shop,Café,Hotel,Gastropub,Italian Restaurant


### Visualizing the resulting clusters

In [271]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=13)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color="black",
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Why I choose 9 clusters ? 

It is the maximum value that make clusters contain min 2 Neighborhood. So for each Neighborhood we potencialy have min 1 choise to move. At the other hand we have maximum destinguish between clusters and maximum similarity within. 