Let's import all the libraries needed for the project

In [1]:
!pip install bs4
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.cluster import KMeans
import requests

!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# set number of clusters
import numpy as np
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# definition of foursquare details in order to use its API
CLIENT_ID = 'WRPJCDJVVHL1LU3U10LE4SOJPZO1HVJSXK00R2GNO150W3PU' # your Foursquare ID
CLIENT_SECRET = 'UESUV112ILZANAYVRP3ORJNZLKGZWANTUZJTLGE41GSVUVCE' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value


Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



Let's read the page of Toronto's neighborhoods

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
req = requests.get(url)
soup = BeautifulSoup(req.content, 'html.parser')

Let's parse the url with the soup object
Assumption: Based on the page - the main table appears in the first table of the page.

In [3]:
tables = soup.findAll("table")
torontos_negihboarhoods_table = tables[0]
col_names = [header.get_text()[:-1] for header in torontos_negihboarhoods_table.findAll("th")] #take all the content without the '\n'
table_content = torontos_negihboarhoods_table.findAll("tr")[1:] 
table_data = list()
for line in table_content:
    line_content = line.findAll("td")
    table_data = table_data + [[cell.get_text()[:-1] for cell in line_content]] #take all the content without the '\n'
    
neighborhoods_df = pd.DataFrame(columns = col_names, data= table_data)
neighborhoods_df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


Let's clean the table according to the intructions give in the assignment

In [4]:
neighborhoods_df = neighborhoods_df[neighborhoods_df['Borough']!='Not assigned']
neighborhoods_df

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


Let's show the shape of the df

In [5]:
neighborhoods_df = neighborhoods_df.reset_index(drop=True)
neighborhoods_df.shape

(103, 3)

In [6]:
loc_df = pd.read_csv('Geospatial_Coordinates.csv')

In [7]:
neighborhoods_df = loc_df.join(neighborhoods_df, lsuffix='_caller', rsuffix='_other').reset_index(drop=True)

In [8]:
neighborhoods_df = neighborhoods_df.drop(['Postal Code_other'],axis=1)
neighborhoods_df = neighborhoods_df.rename(columns={"Postal Code_caller": "Postal Code"})
neighborhoods_df

Unnamed: 0,Postal Code,Latitude,Longitude,Borough,Neighbourhood
0,M1B,43.806686,-79.194353,North York,Parkwoods
1,M1C,43.784535,-79.160497,North York,Victoria Village
2,M1E,43.763573,-79.188711,Downtown Toronto,"Regent Park, Harbourfront"
3,M1G,43.770992,-79.216917,North York,"Lawrence Manor, Lawrence Heights"
4,M1H,43.773136,-79.239476,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...,...,...
98,M9N,43.706876,-79.518188,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M9P,43.696319,-79.532242,Downtown Toronto,Church and Wellesley
100,M9R,43.688905,-79.554724,East Toronto,"Business reply mail Processing Centre, South C..."
101,M9V,43.739416,-79.588437,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [9]:
address = 'toronto canada'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of toronto canada are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of toronto canada are 43.6534817, -79.3839347.


In [10]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods_df['Latitude'], neighborhoods_df['Longitude'], neighborhoods_df['Borough'], neighborhoods_df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

We want to cluster the neighboarhoods according to the categories of the venues located in.
The first function 'getNearbyVenues' will get all the venues located inside the neighboarhoods with their categories foreach neighborhood and returns it as a dataframe object.

In [11]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [12]:
toronto_venues = getNearbyVenues(names=neighborhoods_df['Neighbourhood'],
                                   latitudes=neighborhoods_df['Latitude'],
                                   longitudes=neighborhoods_df['Longitude']
                                  )
toronto_venues.head(5)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
1,Victoria Village,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,Victoria Village,43.784535,-79.160497,SEBS Engineering Inc. (Sustainable Energy and ...,43.782371,-79.15682,Construction & Landscaping
3,"Regent Park, Harbourfront",43.763573,-79.188711,RBC Royal Bank,43.76679,-79.191151,Bank
4,"Regent Park, Harbourfront",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


Let's encode the categories of the venues with one hot vector encoding, count and normalize the frequencies of each category foreach neighboarhood

In [13]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 


In [14]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000
1,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000
2,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.041667,0.000000,0.0,0.0,0.041667
3,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.035294,...,0.0,0.0,0.011765,0.000000,0.000000,0.000000,0.011765,0.0,0.0,0.000000
4,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,"Willowdale, Willowdale West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000
92,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.030303,0.000000,0.000000,0.0,0.0,0.000000
93,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000
94,York Mills West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.029412,0.000000,0.000000,0.000000,0.0,0.0,0.029412


Let's run knn clustering algorithm

In [15]:
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# add clustering labels
toronto_grouped_clustering.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_grouped_clustering = toronto_grouped_clustering.join(neighborhoods_df)

Showing the clustering on the map

In [16]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_grouped_clustering['Latitude'], toronto_grouped_clustering['Longitude'], toronto_grouped_clustering['Neighbourhood'], toronto_grouped_clustering['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters