Neighborhoods in Toronto - Segmenting and Clustering Assignment

Author: Panagiotis Sidiropoulos

In [None]:
#!conda update --all
#print('libs have been updated')

import numpy as np
import pandas as pd
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)
import json

#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim
import requests
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
#!conda install -c conda-forge folium=0.5.0 --yes
import folium
from bs4 import BeautifulSoup
import requests
print('import step done!')

Having imported all needed libraries, I utilize the BeautifulSoup library to extract the data from the given url for Toronto

In [None]:
# Download the Wikipedia list of Toronto ("M") postal codes and parse it.
# The URL carries an explicit `oldid`, i.e. it requests one fixed revision of
# the page rather than the live article (the live URL is kept for reference).
#url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
url = 'https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=1011037969'

# Use a timeout so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
r = response.text

soup = BeautifulSoup(r, 'html5lib')
tbl = soup.find_all('table')      # every <table> element on the page
tbl_rows = soup.find_all('tr')    # every table row on the page
tbl_el = soup.find_all('td')      # every table cell on the page

I create the DataFrame with the three requested columns, then drop the first row because it contained the headers.
I build a second table that excludes the rows whose Borough is "Not assigned". Likewise, where a Neighborhood is "Not assigned", I set it to the value of its Borough.

In [None]:
import pandas as pd

# Parse the scraped <table> markup and keep the first table found.
data_table = pd.read_html(str(tbl), flavor = 'bs4')[0]
data_table.columns = ['Postal Code', 'Borough', 'Neighborhood']
# Drop the first row, which the author found to contain the headers.
# NOTE(review): read_html may already consume the header row -- confirm this
# does not discard a real data row.
data_table = data_table[1:]

# Keep only postal codes that have a borough assigned. The previous
# `~A | ~B` condition only removed rows where BOTH columns were unassigned.
# `.copy()` gives an independent frame so the assignment below is safe.
tbl_tor = data_table[data_table['Borough'] != 'Not assigned'].copy()

# Row-wise fallback: a neighborhood that is not assigned takes the name of its
# borough. The previous loop overwrote the whole column as soon as a single
# unassigned neighborhood was encountered.
unassigned = tbl_tor['Neighborhood'] == 'Not assigned'
tbl_tor.loc[unassigned, 'Neighborhood'] = tbl_tor.loc[unassigned, 'Borough']

tbl_tor = tbl_tor.reset_index(drop=True)

In [None]:
# Sanity check: (number of rows, number of columns) of the cleaned table.
tbl_tor.shape

In [None]:
# Install geopandas and geopy from the conda-forge channel via shell escapes;
# `--yes` skips the interactive confirmation prompt.
!conda install -c conda-forge geopandas --yes
!conda install -c conda-forge geopy --yes

Step 2: Get coordinates with the Nominatim (OpenStreetMap) geocoder via geopy

In [None]:
import geopy
import geopandas
from geopy.extra.rate_limiter import RateLimiter

# Geocode every neighborhood name with Nominatim and print what comes back.
locator = Nominatim(user_agent="myGeocoder")
# Space the lookups at least one second apart so the loop does not flood the
# geocoding service with back-to-back requests.
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)

for i,place in enumerate(tbl_tor['Neighborhood']):
    location = geocode(place)
    # `if`, not `while`: the previous `while(location!=None)` never changed
    # `location`, so the first successful lookup printed forever.
    if location is not None:
        print(location)
    # Left disabled on purpose: the next cell merges a coordinates file that
    # supplies the Latitude/Longitude columns instead.
    #print(location.latitude, location.longitude)
    #tbl_tor.loc[i,'Latitude'] = location.latitude
    #tbl_tor.loc[i,'Longitude'] = location.longitude

The geocoder's coordinate assignments turned out to be incorrect, so I instead joined my base table with the coordinates file provided in the assignment instructions.

In [None]:
# Load the per-postal-code coordinates and attach them to the cleaned table.
# A left join keeps every row of tbl_tor, whether or not a match is found.
coor = pd.read_csv('C:/Users/sidpa/Desktop/Geospatial_Coordinates.csv')
tbl_tor = tbl_tor.merge(coor, how='left', on='Postal Code')

In [None]:
# Restrict the analysis to boroughs whose name contains the word "Toronto".
tbl_tor_cluster = tbl_tor.loc[lambda frame: frame['Borough'].str.contains('Toronto')]

In [None]:
# Install folium from the conda-forge channel via a shell escape; `--yes`
# skips the interactive confirmation prompt.
!conda install -c conda-forge folium --yes

In [None]:
# Display the Toronto-only subset. The import cell sets display.max_rows to
# None, so the whole frame is rendered rather than a truncated preview.
tbl_tor_cluster

In [None]:
import folium

# Draw one circle marker per Toronto neighborhood on an interactive map.
# NOTE(review): the initial view is a hard-coded point at a close zoom level;
# confirm the plotted neighborhoods actually fall inside it when the map opens.
initial_lat = 43.687840
initial_lng = -79.547860
toronto_map = folium.Map(location=[initial_lat,initial_lng],zoom_start=15)

for lat,lng,label in zip(tbl_tor_cluster['Latitude'],tbl_tor_cluster['Longitude'],tbl_tor_cluster['Neighborhood']):
    # parse_html=True makes the popup treat the neighborhood name as text
    # rather than as raw HTML.
    label = folium.Popup(label,parse_html=True)
    # `parse_html` is a Popup option, not a CircleMarker one, so it is no
    # longer passed to the marker.
    folium.CircleMarker([lat,lng],
                        radius=5,
                        popup=label,
                        color='blue',
                        fill=True,
                        fill_color='#3186cc',
                        fill_opacity=0.7).add_to(toronto_map)

# Last expression of the cell: renders the map inline.
toronto_map


Get Nearby Venues from Foursquare.

In [None]:
import json
import os

# Foursquare API settings used by the venue lookup.
# Credentials are read from the environment so real keys never need to be
# saved in the notebook; the original placeholders remain as the fallback.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '***')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '***')
VERSION = '20190425' #'20180605'  # value sent as the API's `v` parameter
LIMIT = '100'  # value sent as the API's `limit` parameter

In [None]:
def getVenues(names, latitudes, longitudes, radius=500):
    """Collect the venues Foursquare reports around each neighborhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences: neighborhood name and its coordinates.
    radius : int, default 500
        Value sent as the `radius` query parameter of the explore request.

    Returns
    -------
    pandas.DataFrame
        One row per (neighborhood, venue) pair with the columns
        'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category'.
        The frame is empty (but has these columns) when nothing is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
               'Venue Latitude', 'Venue Longitude', 'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Query around THIS neighborhood's coordinates. The previous version
        # formatted the whole latitude/longitude inputs into the URL and left
        # a stray comma after the `ll` value.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()
        results = response.json()['response']['groups'][0]['items']

        for v in results:
            venue = v['venue']
            # A venue can come back without any category; record None
            # instead of raising IndexError.
            categories = venue.get('categories') or []
            rows.append((name, lat, lng,
                         venue['name'],
                         venue['location']['lat'],
                         venue['location']['lng'],
                         categories[0]['name'] if categories else None))

    # Build the frame once, AFTER the loop. The previous version returned from
    # inside the loop, so only the first neighborhood was ever processed.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Look up the venues around every neighborhood of the Toronto subset.
Toronto_venues = getVenues(
    tbl_tor_cluster['Neighborhood'],
    tbl_tor_cluster['Latitude'],
    tbl_tor_cluster['Longitude'],
)

In [None]:
# Display the venue table returned by getVenues. The import cell sets
# display.max_rows to None, so every row is rendered.
Toronto_venues


Analyze the Venues in Each Neighborhood

In [None]:
# One-hot encode the venue categories: one indicator column per category.
toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']],prefix = "", prefix_sep="")
# Fixed NameError: the venue table is called `Toronto_venues` (capital T).
toronto_onehot['Neighborhood'] = Toronto_venues['Neighborhood']

# Put 'Neighborhood' first by name. Relying on it being the last column breaks
# when a venue category is itself called "Neighborhood": the assignment above
# then overwrites that existing column in place instead of appending one.
fixed_columns = ['Neighborhood'] + [col for col in toronto_onehot.columns if col != 'Neighborhood']
toronto_onehot = toronto_onehot[fixed_columns]

# Per neighborhood, the share of its venues that falls in each category.
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

Cluster Analysis

In [None]:
# Number of clusters to split the neighborhoods into.
kclusters = 5

# Feature matrix: the per-category frequencies without the label column.
# `columns=` replaces the positional axis argument (`drop('Neighborhood', 1)`),
# which newer pandas versions no longer accept.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering
# random_state fixes the seed; n_init is spelled out so the number of restarts
# does not depend on the installed scikit-learn's default.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

In [None]:
# Attach each neighborhood's cluster label to its coordinates.
# `neighborhoods_venues_sorted` and `toronto_data` were never defined earlier
# in this notebook, so the label table is built here from `toronto_grouped`
# (one row per clustered neighborhood, in the same order as kmeans.labels_)
# and the coordinates come from `tbl_tor_cluster`.
# NOTE(review): if a richer `neighborhoods_venues_sorted` (e.g. a top-venues
# table) was intended, define it before this cell and drop the next line.
neighborhoods_venues_sorted = toronto_grouped[['Neighborhood']].copy()
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = tbl_tor_cluster.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

# Neighborhoods with no venues were never clustered and get NaN from the
# join; drop them and restore integer labels so they can index a color list.
toronto_merged = toronto_merged.dropna(subset=['Cluster Labels'])
toronto_merged = toronto_merged.astype({'Cluster Labels': int})


Visualize the clusters on a folium map

In [None]:
# create map
# `latitude` / `longitude` were never defined earlier in this notebook, so the
# map is centered on the mean position of the neighborhoods being plotted.
latitude = toronto_merged['Latitude'].mean()
longitude = toronto_merged['Longitude'].mean()
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color each
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    # A neighborhood without a cluster label cannot be colored; skip it.
    if pd.isna(cluster):
        continue
    # Labels may arrive as floats after a join; list indexing needs an int.
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # Kept from the original: index cluster-1, so label 0 maps to the
        # last color in the list.
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

# Last expression of the cell: renders the map inline.
map_clusters