## Installing BeautifulSoup and other necessary packages 

In [57]:
!pip install beautifulsoup4

!pip install lxml

!pip install html5lib

!pip install requests



## Importing initial packages

In [1]:
from bs4 import BeautifulSoup
import requests 
import pandas as pd
import numpy as np
import csv 

## Scraping raw data from the Wiki source and transforming it using the soup

In [9]:
raw_data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(raw_data, 'lxml')

## Extracting data from the Postal Codes table and appending it into lists 

In [None]:
table = soup.find('table', class_='wikitable') # Identifies table containing the postal codes
rows = table.find_all('tr') # Extracts all rows from the table 

# Initializes lists 
postalcodes = []
boroughs = []
neighborhoods = []

for row in rows:    
    columns = row.find_all('td')
    try:
        if columns[1].text.split('\n')[0] != 'Not assigned': # ignores boroughs named "Not Assignmend"
            
            postalcode = columns[0].text.split('\n')[0] # ignores newline character at the end
            postalcodes.append(postalcode)
             
        
            borough = columns[1].text.split('\n')[0]  # ignores newline character at the end
            boroughs.append(borough)
        
            neighborhood = columns[2].text.replace('/',',').split('\n')[0] # replaces / with , for visual reasons
            neighborhoods.append(neighborhood)
            
    except Exception as e:
        pass 
        
        

## Join the above 3 lists into a 'df' Dataframe

In [11]:
df = pd.DataFrame(list(zip(postalcodes, boroughs,neighborhoods)), columns =['PostalCode', 'Borough','Neighborhood']) 
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


## Checks for the shape

In [12]:
df.shape

(103, 3)

## Extracting the 'PostalCode' column from df 

In [59]:
df_postals = df['PostalCode'].values
df_postal

NameError: name 'df_postal' is not defined

## Geocoder API was very unreliable in this exercise, therefore the free OpenCage Geocoder was used 

### OpenCage API Credentials

In [14]:
API_KEY = '5e232949c48244eb888c484b236c9534'

In [16]:
import json

latitudes = [] #initializes empty list to append latitude data
longitudes = [] #initializes empty list to append longitude data

for postal in df_postals:
    location_input = postal + " Toronto, Canada" # correct location input format 
    url = 'https://api.opencagedata.com/geocode/v1/json?q={}&key={}'.format(location_input, API_KEY) # correct url format
    obj = json.loads(requests.get(url).text)  # Gets the json file, stores in a dictionary
    
    results = obj['results'] # gets the 'results' key
    latitude = results[0]['geometry']['lat'] # gets the 'latitude' data 
    longitude = results[0]['geometry']['lng'] #gets the 'longitude' data
    
    latitudes.append(latitude) # Appends data to the list of latitudes
    longitudes.append(longitude) # Appends data to the list of longitudes


KeyboardInterrupt: 

## Appends these lists into the 'df' DataFrame

In [None]:
df['Latitude'] = latitudes 
df['Longitude'] = longitudes
df.head()

## A free OpenCage account is limited to 2500 API calls/day. Therefore, need to save the results into a csv file as backup

In [13]:
df.to_csv('Toronto_Neighborhood.csv')

## Creates a pandas Dataframe from the backup csv file

In [2]:
df = pd.read_csv('Toronto_Neighborhood.csv')
df.drop(['Unnamed: 0'], axis = 1, inplace = True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.653482,-79.383935
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.7223,-79.4504
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.653482,-79.383935


## List number of Boroughs and Neighborhoods

In [3]:
print('The dataframe has {} unique Postal Codes, {} boroughs and {} neighborhoods.'.format(len(df['PostalCode'].unique()), 
        len(df['Borough'].unique()),
        len(df['Neighborhood'].unique())
    )
)

The dataframe has 103 unique Postal Codes, 10 boroughs and 98 neighborhoods.


## Installing and importing Geopy and Folium

In [5]:
!pip install geopy
from geopy.geocoders import Nominatim

from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors

!pip install folium
import folium



## Visualizing a map of Toronto with all Neighborhoods and Boroughs

In [6]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
to_latitude = location.latitude
to_longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(to_latitude, to_longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [7]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[to_latitude, to_longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## Exploring Boroughs that contain the word "Etobicoke"

In [8]:
etobicoke_data = df[df['Borough'].str.contains("Etobicoke")].reset_index(drop=True) # searches for entries containing 'Etobicoke' only and resets the index
etobicoke_data

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M9A,Etobicoke,Islington Avenue,43.6662,-79.5282
1,M9B,Etobicoke,"West Deane Park , Princess Gardens , Martin Gr...",43.640741,-79.541902
2,M9C,Etobicoke,"Eringate , Bloordale Gardens , Old Burnhamthor...",43.64411,-79.588907
3,M9P,Etobicoke,Westmount,43.6949,-79.5323
4,M9R,Etobicoke,"Kingsview Village , St. Phillips , Martin Grov...",43.695166,-79.55089
5,M8V,Etobicoke,"New Toronto , Mimico South , Humber Bay Shores",43.6075,-79.5013
6,M9V,Etobicoke,"South Steeles , Silverstone , Humbergate , Jam...",43.7432,-79.5876
7,M8W,Etobicoke,"Alderwood , Long Branch",43.6021,-79.5402
8,M9W,Etobicoke,Northwest,43.7144,-79.5909
9,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North",43.6518,-79.5076


In [9]:
# create map of Etobicoke using latitude and longitude values
map_etobicoke = folium.Map(location=[to_latitude, to_longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(etobicoke_data['Latitude'], etobicoke_data['Longitude'], etobicoke_data['Borough'], etobicoke_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_etobicoke)  
    
map_etobicoke

## Foursquare API Credentials

In [10]:
CLIENT_ID = 'NVNSTQUGEPDCCEKZ3FKLOWND1FYPVIR3YDLAPQJPV5CYDWWK' # your Foursquare ID
CLIENT_SECRET = 'XG233VXSPMLJOQS40H23GWCGGWKNBGJWT1NLSNLDBZYPB4PQ' # your Foursquare Secret
VERSION = '20200401' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: NVNSTQUGEPDCCEKZ3FKLOWND1FYPVIR3YDLAPQJPV5CYDWWK
CLIENT_SECRET:XG233VXSPMLJOQS40H23GWCGGWKNBGJWT1NLSNLDBZYPB4PQ


## Explores the first neighborhood in Etobicoke

In [11]:
etobicoke_data.loc[0, 'Neighborhood']

'Islington Avenue'

In [12]:
neighborhood_latitude = etobicoke_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = etobicoke_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = etobicoke_data.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Islington Avenue are 43.6662, -79.5282.


In [13]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=NVNSTQUGEPDCCEKZ3FKLOWND1FYPVIR3YDLAPQJPV5CYDWWK&client_secret=XG233VXSPMLJOQS40H23GWCGGWKNBGJWT1NLSNLDBZYPB4PQ&v=20200401&ll=43.6662,-79.5282&radius=500&limit=100'

## Makes the 'get' request and acquires the results

In [14]:
results = requests.get(url).json()

## Defines function to acquire the categories and venues of the reults 

In [15]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

## Cleans and prepares the json object and transforms it into a pandas dataframe

In [16]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,TD Canada Trust,Bank,43.662545,-79.531749
1,Shoppers Drug Mart,Pharmacy,43.663067,-79.531753
2,Humber Valley Park,Park,43.664825,-79.524999
3,Humber Valley Rink,Skating Rink,43.664826,-79.524873
4,Thorncrest Drug Store,Pharmacy,43.662988,-79.531817


## Prints the number of unique venues found by FourSquare

In [17]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

6 venues were returned by Foursquare.


## Creates functions to repeat this process for other neighborhoods in Etobicoke

In [18]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

## Uses the code on each neighborhood to create a new pandas dataframe 

In [19]:
etobicoke_venues = getNearbyVenues(names=etobicoke_data['Neighborhood'],
                                   latitudes=etobicoke_data['Latitude'],
                                   longitudes=etobicoke_data['Longitude']
                                  )

Islington Avenue
West Deane Park , Princess Gardens , Martin Grove , Islington , Cloverdale
Eringate , Bloordale Gardens , Old Burnhamthorpe , Markland Wood
Westmount
Kingsview Village , St. Phillips , Martin Grove Gardens , Richview Gardens
New Toronto , Mimico South , Humber Bay Shores
South Steeles , Silverstone , Humbergate , Jamestown , Mount Olive , Beaumond Heights , Thistletown , Albion Gardens
Alderwood , Long Branch
Northwest
The Kingsway , Montgomery Road  , Old Mill North
Old Mill South , King's Mill Park , Sunnylea , Humber Bay , Mimico NE , The Queensway East , Royal York South East , Kingsway Park South East
Mimico NW , The Queensway West , South of Bloor , Kingsway Park South West , Royal York South West


In [20]:
print(etobicoke_venues.shape)
etobicoke_venues.head()

(98, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Islington Avenue,43.6662,-79.5282,TD Canada Trust,43.662545,-79.531749,Bank
1,Islington Avenue,43.6662,-79.5282,Shoppers Drug Mart,43.663067,-79.531753,Pharmacy
2,Islington Avenue,43.6662,-79.5282,Humber Valley Park,43.664825,-79.524999,Park
3,Islington Avenue,43.6662,-79.5282,Humber Valley Rink,43.664826,-79.524873,Skating Rink
4,Islington Avenue,43.6662,-79.5282,Thorncrest Drug Store,43.662988,-79.531817,Pharmacy


## Returns number of neighborhoods with venues found by FourSquare

In [21]:
etobicoke_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Alderwood , Long Branch",7,7,7,7,7,7
Islington Avenue,6,6,6,6,6,6
"Kingsview Village , St. Phillips , Martin Grove Gardens , Richview Gardens",4,4,4,4,4,4
"Mimico NW , The Queensway West , South of Bloor , Kingsway Park South West , Royal York South West",15,15,15,15,15,15
Northwest,5,5,5,5,5,5
"Old Mill South , King's Mill Park , Sunnylea , Humber Bay , Mimico NE , The Queensway East , Royal York South East , Kingsway Park South East",2,2,2,2,2,2
"South Steeles , Silverstone , Humbergate , Jamestown , Mount Olive , Beaumond Heights , Thistletown , Albion Gardens",12,12,12,12,12,12
"The Kingsway , Montgomery Road , Old Mill North",17,17,17,17,17,17
"West Deane Park , Princess Gardens , Martin Grove , Islington , Cloverdale",20,20,20,20,20,20
Westmount,10,10,10,10,10,10


In [60]:
# finds number of unique categories in the dataframe
print('There are {} uniques categories.'.format(len(etobicoke_venues['Venue Category'].unique())))

There are 50 uniques categories.


## Analyzes the avaialble venues in each neighborhood

In [23]:
# one hot encoding
etobicoke_onehot = pd.get_dummies(etobicoke_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
etobicoke_onehot['Neighborhood'] = etobicoke_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [etobicoke_onehot.columns[-1]] + list(etobicoke_onehot.columns[:-1])
etobicoke_onehot = etobicoke_onehot[fixed_columns]

etobicoke_onehot.head()

Unnamed: 0,Neighborhood,Bakery,Bank,Bar,Baseball Field,Beer Store,Breakfast Spot,Buffet,Burger Joint,Burrito Place,...,Restaurant,Sandwich Place,Skating Rink,Smoke Shop,Sporting Goods Shop,Supermarket,Sushi Restaurant,Thai Restaurant,Vietnamese Restaurant,Yoga Studio
0,Islington Avenue,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Islington Avenue,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Islington Avenue,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Islington Avenue,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,Islington Avenue,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
etobicoke_onehot.shape

(98, 51)

## Finds the mean of frequency of occurence of each neighborhood

In [25]:
etobicoke_grouped = etobicoke_onehot.groupby('Neighborhood').mean().reset_index()
etobicoke_grouped

Unnamed: 0,Neighborhood,Bakery,Bank,Bar,Baseball Field,Beer Store,Breakfast Spot,Buffet,Burger Joint,Burrito Place,...,Restaurant,Sandwich Place,Skating Rink,Smoke Shop,Sporting Goods Shop,Supermarket,Sushi Restaurant,Thai Restaurant,Vietnamese Restaurant,Yoga Studio
0,"Alderwood , Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Islington Avenue,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Kingsview Village , St. Phillips , Martin Grov...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Mimico NW , The Queensway West , South of Bloo...",0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.066667,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.066667,0.0,0.066667
4,Northwest,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Old Mill South , King's Mill Park , Sunnylea ,...",0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,"South Steeles , Silverstone , Humbergate , Jam...",0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,...,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"The Kingsway , Montgomery Road , Old Mill North",0.117647,0.117647,0.058824,0.0,0.0,0.117647,0.0,0.058824,0.0,...,0.058824,0.0,0.0,0.117647,0.0,0.0,0.117647,0.0,0.0,0.0
8,"West Deane Park , Princess Gardens , Martin Gr...",0.05,0.05,0.0,0.0,0.0,0.0,0.0,0.05,0.0,...,0.05,0.0,0.0,0.0,0.05,0.05,0.05,0.0,0.05,0.0
9,Westmount,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.1,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0


In [26]:
etobicoke_grouped.shape

(10, 51)

## Prints out the top 5 venues of each neighborhood by frequency of occurence 

In [27]:
num_top_venues = 5

for hood in etobicoke_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = etobicoke_grouped[etobicoke_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Alderwood , Long Branch----
            venue  freq
0             Gym  0.14
1  Sandwich Place  0.14
2        Pharmacy  0.14
3     Pizza Place  0.14
4             Pub  0.14


----Islington Avenue----
           venue  freq
0       Pharmacy  0.33
1           Bank  0.17
2  Grocery Store  0.17
3   Skating Rink  0.17
4           Park  0.17


----Kingsview Village , St. Phillips , Martin Grove Gardens , Richview Gardens----
           venue  freq
0    Coffee Shop  0.50
1  Grocery Store  0.25
2       Bus Line  0.25
3         Bakery  0.00
4            Pub  0.00


----Mimico NW , The Queensway West , South of Bloor , Kingsway Park South West , Royal York South West----
              venue  freq
0     Burrito Place  0.20
1               Gym  0.07
2       Coffee Shop  0.07
3   Thai Restaurant  0.07
4  Sushi Restaurant  0.07


----Northwest----
                      venue  freq
0                     Hotel   0.4
1               Coffee Shop   0.2
2  Mediterranean Restaurant   0.2
3       Rental 

## Sort venues in descending order

In [28]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

## Creates new dataframe containing the sorted neighborhoods

In [29]:
um_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = etobicoke_grouped['Neighborhood']

for ind in np.arange(etobicoke_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(etobicoke_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Alderwood , Long Branch",Pharmacy,Gym,Sandwich Place,Convenience Store,Pub
1,Islington Avenue,Pharmacy,Skating Rink,Park,Grocery Store,Bank
2,"Kingsview Village , St. Phillips , Martin Grov...",Coffee Shop,Grocery Store,Bus Line,Flea Market,Fish & Chips Shop
3,"Mimico NW , The Queensway West , South of Bloo...",Burrito Place,Yoga Studio,Burger Joint,Gym,Gym / Fitness Center
4,Northwest,Hotel,Coffee Shop,Rental Car Location,Mediterranean Restaurant,Yoga Studio
5,"Old Mill South , King's Mill Park , Sunnylea ,...",Baseball Field,Park,Yoga Studio,Coffee Shop,Fish & Chips Shop
6,"South Steeles , Silverstone , Humbergate , Jam...",Grocery Store,Sandwich Place,Fried Chicken Joint,Discount Store,Hardware Store
7,"The Kingsway , Montgomery Road , Old Mill North",Bakery,Bank,Sushi Restaurant,Smoke Shop,Breakfast Spot
8,"West Deane Park , Princess Gardens , Martin Gr...",Coffee Shop,Grocery Store,Pharmacy,Bank,Burger Joint
9,Westmount,Pizza Place,Flea Market,Chinese Restaurant,Supermarket,Discount Store


## Installs scikit-learn ML package, gets KMeans clustering, and generates clusters

In [30]:
!pip install -U scikit-learn

Requirement already up-to-date: scikit-learn in c:\users\sonle\anaconda3\lib\site-packages (0.22.2.post1)


In [39]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

etobicoke_grouped_clustering =etobicoke_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(etobicoke_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 0, 2, 1, 4, 3, 1, 1, 1, 1])

## Adds 'Cluster Labels' column into 'etobicoke_merged' dataframe

In [41]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

etobicoke_merged = etobicoke_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
etobicoke_merged = etobicoke_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

etobicoke_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M9A,Etobicoke,Islington Avenue,43.6662,-79.5282,0.0,Pharmacy,Skating Rink,Park,Grocery Store,Bank
1,M9B,Etobicoke,"West Deane Park , Princess Gardens , Martin Gr...",43.640741,-79.541902,1.0,Coffee Shop,Grocery Store,Pharmacy,Bank,Burger Joint
2,M9C,Etobicoke,"Eringate , Bloordale Gardens , Old Burnhamthor...",43.64411,-79.588907,,,,,,
3,M9P,Etobicoke,Westmount,43.6949,-79.5323,1.0,Pizza Place,Flea Market,Chinese Restaurant,Supermarket,Discount Store
4,M9R,Etobicoke,"Kingsview Village , St. Phillips , Martin Grov...",43.695166,-79.55089,2.0,Coffee Shop,Grocery Store,Bus Line,Flea Market,Fish & Chips Shop


## Since FourSquares could not find any data on two neighorhoods, drop from dataframe
## Formats the dataframe

In [61]:
# Drops the two unknown neighborhood
etobicoke_merged.drop(index = 2, inplace = True)
etobicoke_merged.drop(index = 5, inplace = True)

# DRops the 'PostalCode' column
etobicoke_merged.drop('PostalCode', axis = 1, inplace = True)

# Resets the index
etobicoke_merged.reset_index(drop=True)


KeyError: '[2] not found in axis'

## Changes the Cluster Labels to intergers

In [44]:
etobicoke_merged['Cluster Labels'] = etobicoke_merged['Cluster Labels'].astype(int)
etobicoke_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M9A,Etobicoke,Islington Avenue,43.6662,-79.5282,0,Pharmacy,Skating Rink,Park,Grocery Store,Bank
1,M9B,Etobicoke,"West Deane Park , Princess Gardens , Martin Gr...",43.640741,-79.541902,1,Coffee Shop,Grocery Store,Pharmacy,Bank,Burger Joint
3,M9P,Etobicoke,Westmount,43.6949,-79.5323,1,Pizza Place,Flea Market,Chinese Restaurant,Supermarket,Discount Store
4,M9R,Etobicoke,"Kingsview Village , St. Phillips , Martin Grov...",43.695166,-79.55089,2,Coffee Shop,Grocery Store,Bus Line,Flea Market,Fish & Chips Shop
6,M9V,Etobicoke,"South Steeles , Silverstone , Humbergate , Jam...",43.7432,-79.5876,1,Grocery Store,Sandwich Place,Fried Chicken Joint,Discount Store,Hardware Store


## Visualizes the clusters

In [51]:
# create map
map_clusters = folium.Map(location=[to_latitude, to_longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(etobicoke_merged['Latitude'], etobicoke_merged['Longitude'], etobicoke_merged['Neighborhood'], etobicoke_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Analyzes the first cluster

In [52]:
etobicoke_merged.loc[etobicoke_merged['Cluster Labels'] == 0, etobicoke_merged.columns[[1] + list(range(5, etobicoke_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Islington Avenue,Pharmacy,Skating Rink,Park,Grocery Store,Bank


Analyzes the second cluster

In [53]:
etobicoke_merged.loc[etobicoke_merged['Cluster Labels'] == 1, etobicoke_merged.columns[[1] + list(range(5, etobicoke_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,"West Deane Park , Princess Gardens , Martin Gr...",Coffee Shop,Grocery Store,Pharmacy,Bank,Burger Joint
3,Westmount,Pizza Place,Flea Market,Chinese Restaurant,Supermarket,Discount Store
6,"South Steeles , Silverstone , Humbergate , Jam...",Grocery Store,Sandwich Place,Fried Chicken Joint,Discount Store,Hardware Store
7,"Alderwood , Long Branch",Pharmacy,Gym,Sandwich Place,Convenience Store,Pub
9,"The Kingsway , Montgomery Road , Old Mill North",Bakery,Bank,Sushi Restaurant,Smoke Shop,Breakfast Spot
11,"Mimico NW , The Queensway West , South of Bloo...",Burrito Place,Yoga Studio,Burger Joint,Gym,Gym / Fitness Center


## Analyzes the third cluster

In [54]:
etobicoke_merged.loc[etobicoke_merged['Cluster Labels'] == 2, etobicoke_merged.columns[[1] + list(range(5, etobicoke_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
4,"Kingsview Village , St. Phillips , Martin Grov...",Coffee Shop,Grocery Store,Bus Line,Flea Market,Fish & Chips Shop


## Analyzes the fourth cluster

In [55]:
etobicoke_merged.loc[etobicoke_merged['Cluster Labels'] == 3, etobicoke_merged.columns[[1] + list(range(5, etobicoke_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
10,"Old Mill South , King's Mill Park , Sunnylea ,...",Baseball Field,Park,Yoga Studio,Coffee Shop,Fish & Chips Shop


## Analyzes the fifth cluster

In [56]:
etobicoke_merged.loc[etobicoke_merged['Cluster Labels'] == 4, etobicoke_merged.columns[[1] + list(range(5, etobicoke_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
8,Northwest,Hotel,Coffee Shop,Rental Car Location,Mediterranean Restaurant,Yoga Studio
