# This notebook will be mainly used for this Capstone project 

### import packages

In [5]:
import numpy as np
import pandas as pd
import json
from urllib.request import urlopen
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans

### scrape wikipedia page : List of postal codes of Canada:_M

In [6]:
html = urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
content = BeautifulSoup(html, 'html.parser')
table = content.find('table')
print(table.prettify())

<table class="wikitable sortable">
 <tbody>
  <tr>
   <th>
    Postal Code
   </th>
   <th>
    Borough
   </th>
   <th>
    Neighbourhood
   </th>
  </tr>
  <tr>
   <td>
    M1A
   </td>
   <td>
    Not assigned
   </td>
   <td>
    Not assigned
   </td>
  </tr>
  <tr>
   <td>
    M2A
   </td>
   <td>
    Not assigned
   </td>
   <td>
    Not assigned
   </td>
  </tr>
  <tr>
   <td>
    M3A
   </td>
   <td>
    North York
   </td>
   <td>
    Parkwoods
   </td>
  </tr>
  <tr>
   <td>
    M4A
   </td>
   <td>
    North York
   </td>
   <td>
    Victoria Village
   </td>
  </tr>
  <tr>
   <td>
    M5A
   </td>
   <td>
    Downtown Toronto
   </td>
   <td>
    Regent Park, Harbourfront
   </td>
  </tr>
  <tr>
   <td>
    M6A
   </td>
   <td>
    North York
   </td>
   <td>
    Lawrence Manor, Lawrence Heights
   </td>
  </tr>
  <tr>
   <td>
    M7A
   </td>
   <td>
    Downtown Toronto
   </td>
   <td>
    Queen's Park, Ontario Provincial Government
   </td>
  </tr>
  <tr>
   <td>
    M8

### Extract table contents and save as dataframe

In [7]:
data = []
for tr in table.find_all('tr')[1:]:
    row_data = tr.find_all('td')
    data.append([cell.text for cell in row_data])
dataset = pd.DataFrame(data, columns = ['PostalCode', 'Borough', 'Neighborhood'])
dataset.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


### preprocess \n in each columns before dropping rows where borough is "Not assigned" and reset index

In [8]:
# before dropping, preprocess \n in each columns
dataset['Neighborhood'] = dataset['Neighborhood'].str.split('\n', expand = True)[0]
dataset['Borough'] = dataset['Borough'].str.split('\n', expand = True)[0]
dataset['PostalCode'] = dataset['PostalCode'].str.split('\n', expand = True)[0]

In [9]:
dataset

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


### dropping rows where borough is "Not assigned" and reset index

In [10]:
# dropping rows where borough is "Not assigned" and reset index
dataset = dataset[dataset['Borough'] != 'Not assigned']
dataset.reset_index(drop = True, inplace = True)

In [11]:
dataset

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [12]:
# groupby postal code and borough
dataset = dataset.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(', '.join).reset_index()

In [13]:
dataset 

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


In [14]:
# save to csv
dataset.to_csv('Toronto_dataset.csv', index = False)

### open csv file with longitude and latitude data 

In [15]:
coord_data = pd.read_csv('https://cocl.us/Geospatial_data')
coord_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge Toronto postal code data and coordinate data
* Before merging, need to change column name 

In [16]:
coord_data.rename(columns={'Postal Code':'PostalCode'},inplace=True)

In [17]:
geo_merged = pd.merge(dataset,coord_data, on='PostalCode')
geo_merged

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


In [47]:
import time
from geopy.geocoders import Nominatim
import json 

import requests 
from pandas.io.json import json_normalize 

import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# Importing to use the Foursquare API lab
!conda install -c conda-forge folium=0.5.0 --yes  #Uncomment if not installed
import folium

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



### Using Foursquatre API

* setting to use Foursquare API 

In [48]:
CLIENT_ID = 'PLABBRFUVEWY4GXRWZ1D251HNMSZ01NHTTF3SDSMTFA2V45D' 
CLIENT_SECRET = 'E1M2DPREQDBVGXGHBMDEJFBOYHDSHFQUV3AI54CETR3X2LU0' 
VERSION = '20210212' # Foursquare API version

* defining coordinates before searching venues using Foursquare API

In [20]:
row_num = 77
geo_merged.loc[8]
latitude = np.float(geo_merged.loc[row_num,['Latitude']].values)
longitude =  np.float(geo_merged.loc[row_num,['Longitude']].values)

In [21]:
print(geo_merged.loc[row_num ])

PostalCode                           M6J
Borough                     West Toronto
Neighborhood    Little Portugal, Trinity
Latitude                         43.6479
Longitude                       -79.4197
Name: 77, dtype: object


####  get the max 100 venues within predefined radius, logitude and latitude

In [22]:
LIMIT = 100 # number of venues returned by Foursquare API

radius = 300 # radius


In [23]:

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    latitude, 
    longitude, 
    radius, 
    LIMIT)

In [24]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '60265a38cfd3f91592da1a26'},
 'response': {'headerLocation': 'Trinity Bellwoods',
  'headerFullLocation': 'Trinity Bellwoods, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 28,
  'suggestedBounds': {'ne': {'lat': 43.65062670270001,
    'lng': -79.4160252928462},
   'sw': {'lat': 43.645226697300004, 'lng': -79.4234741071538}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4f7891c7e4b0b9643b73e08d',
       'name': 'Bellwoods Brewery',
       'location': {'address': '124 Ossington Ave',
        'crossStreet': 'at Argyle St',
        'lat': 43.647097254598236,
        'lng': -79.41995537873463,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.647097254598236,
          'lng': -79.41995537873

* Extract category of the venue

In [25]:
# define function
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

* convert to dataframe

In [26]:
venues = results['response']['groups'][0]['items']
venues 

[{'reasons': {'count': 0,
   'items': [{'summary': 'This spot is popular',
     'type': 'general',
     'reasonName': 'globalInteractionReason'}]},
  'venue': {'id': '4f7891c7e4b0b9643b73e08d',
   'name': 'Bellwoods Brewery',
   'location': {'address': '124 Ossington Ave',
    'crossStreet': 'at Argyle St',
    'lat': 43.647097254598236,
    'lng': -79.41995537873463,
    'labeledLatLngs': [{'label': 'display',
      'lat': 43.647097254598236,
      'lng': -79.41995537873463}],
    'distance': 93,
    'postalCode': 'M6J 2Z5',
    'cc': 'CA',
    'city': 'Toronto',
    'state': 'ON',
    'country': 'Canada',
    'formattedAddress': ['124 Ossington Ave (at Argyle St)',
     'Toronto ON M6J 2Z5',
     'Canada']},
   'categories': [{'id': '50327c8591d4c4b30a586d5d',
     'name': 'Brewery',
     'pluralName': 'Breweries',
     'shortName': 'Brewery',
     'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/brewery_',
      'suffix': '.png'},
     'primary': True}],
   'photos': 

In [27]:
# flatten JSON
nearby_venues = pd.json_normalize(venues) 
nearby_venues

Unnamed: 0,referralId,reasons.count,reasons.items,venue.id,venue.name,venue.location.address,venue.location.crossStreet,venue.location.lat,venue.location.lng,venue.location.labeledLatLngs,...,venue.location.cc,venue.location.city,venue.location.state,venue.location.country,venue.location.formattedAddress,venue.categories,venue.photos.count,venue.photos.groups,venue.venuePage.id,venue.location.neighborhood
0,e-0-4f7891c7e4b0b9643b73e08d-0,0,"[{'summary': 'This spot is popular', 'type': '...",4f7891c7e4b0b9643b73e08d,Bellwoods Brewery,124 Ossington Ave,at Argyle St,43.647097,-79.419955,"[{'label': 'display', 'lat': 43.64709725459823...",...,CA,Toronto,ON,Canada,"[124 Ossington Ave (at Argyle St), Toronto ON ...","[{'id': '50327c8591d4c4b30a586d5d', 'name': 'B...",0,[],,
1,e-0-4ada6d36f964a520802221e3-1,0,"[{'summary': 'This spot is popular', 'type': '...",4ada6d36f964a520802221e3,Pizzeria Libretto,221 Ossington Ave,at Dundas St W,43.648979,-79.420604,"[{'label': 'display', 'lat': 43.64897862710277...",...,CA,Toronto,ON,Canada,"[221 Ossington Ave (at Dundas St W), Toronto O...","[{'id': '4bf58dd8d48988d1ca941735', 'name': 'P...",0,[],,
2,e-0-4af369d0f964a52060ed21e3-2,0,"[{'summary': 'This spot is popular', 'type': '...",4af369d0f964a52060ed21e3,Foxley Bistro,207 Ossington Ave,Dundas,43.648643,-79.420495,"[{'label': 'display', 'lat': 43.64864274628383...",...,CA,Toronto,ON,Canada,"[207 Ossington Ave (Dundas), Toronto ON M6J 2Z...","[{'id': '4bf58dd8d48988d142941735', 'name': 'A...",0,[],,
3,e-0-50bafbe4e4b0e225612debb6-3,0,"[{'summary': 'This spot is popular', 'type': '...",50bafbe4e4b0e225612debb6,OddSeoul,90 Ossington Ave.,at Humbert St,43.646192,-79.419601,"[{'label': 'display', 'lat': 43.64619218474247...",...,CA,Toronto,ON,Canada,"[90 Ossington Ave. (at Humbert St), Toronto ON...","[{'id': '4bf58dd8d48988d113941735', 'name': 'K...",0,[],,
4,e-0-54e000c9498e4adcc4e449be-4,0,"[{'summary': 'This spot is popular', 'type': '...",54e000c9498e4adcc4e449be,La Cubana,92 Ossington Ave,at Humber St,43.64623,-79.419636,"[{'label': 'display', 'lat': 43.64623003878161...",...,CA,Toronto,ON,Canada,"[92 Ossington Ave (at Humber St), Toronto ON, ...","[{'id': '4bf58dd8d48988d154941735', 'name': 'C...",0,[],,
5,e-0-5a91e9e7e4c4590ae9eb0a5c-5,0,"[{'summary': 'This spot is popular', 'type': '...",5a91e9e7e4c4590ae9eb0a5c,Paris Paris Bar,1161 Dundas St W.,,43.649237,-79.421436,"[{'label': 'display', 'lat': 43.649237, 'lng':...",...,CA,Toronto,ON,Canada,"[1161 Dundas St W., Toronto ON M6J 1X3, Canada]","[{'id': '4bf58dd8d48988d123941735', 'name': 'W...",0,[],,
6,e-0-585c96058d8e995f7316745c-6,0,"[{'summary': 'This spot is popular', 'type': '...",585c96058d8e995f7316745c,Gift Shop,89 Ossington Ave,Queen & Ossington,43.646149,-79.419481,"[{'label': 'display', 'lat': 43.64614871682715...",...,CA,Toronto,ON,Canada,"[89 Ossington Ave (Queen & Ossington), Toronto...","[{'id': '4bf58dd8d48988d11e941735', 'name': 'C...",0,[],,
7,e-0-4ae662e8f964a520cfa621e3-7,0,"[{'summary': 'This spot is popular', 'type': '...",4ae662e8f964a520cfa621e3,Reposado,136 Ossington Ave.,btwn Argyle & Foxley,43.647321,-79.420032,"[{'label': 'display', 'lat': 43.64732078900138...",...,CA,Toronto,ON,Canada,"[136 Ossington Ave. (btwn Argyle & Foxley), To...","[{'id': '4bf58dd8d48988d116941735', 'name': 'B...",0,[],,
8,e-0-537bd61f498e0dae73728800-8,0,"[{'summary': 'This spot is popular', 'type': '...",537bd61f498e0dae73728800,Bang Bang Ice Cream & Bakery,93a Ossington Ave,btwn Queen and Argyle,43.646246,-79.419553,"[{'label': 'display', 'lat': 43.64624619920086...",...,CA,Toronto,ON,Canada,"[93a Ossington Ave (btwn Queen and Argyle), To...","[{'id': '4bf58dd8d48988d1c9941735', 'name': 'I...",0,[],,
9,e-0-4ae791a7f964a52079ac21e3-9,0,"[{'summary': 'This spot is popular', 'type': '...",4ae791a7f964a52079ac21e3,Lower Ossington Theatre,100A Ossington Ave.,between Queen and Dundas,43.646389,-79.419781,"[{'label': 'display', 'lat': 43.64638945106542...",...,CA,Toronto,ON,Canada,[100A Ossington Ave. (between Queen and Dundas...,"[{'id': '4bf58dd8d48988d137941735', 'name': 'T...",0,[],52485753.0,


#### Select data columns

In [28]:
columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']

nearby_venues =nearby_venues.loc[:, columns]
nearby_venues

Unnamed: 0,venue.name,venue.categories,venue.location.lat,venue.location.lng
0,Bellwoods Brewery,"[{'id': '50327c8591d4c4b30a586d5d', 'name': 'B...",43.647097,-79.419955
1,Pizzeria Libretto,"[{'id': '4bf58dd8d48988d1ca941735', 'name': 'P...",43.648979,-79.420604
2,Foxley Bistro,"[{'id': '4bf58dd8d48988d142941735', 'name': 'A...",43.648643,-79.420495
3,OddSeoul,"[{'id': '4bf58dd8d48988d113941735', 'name': 'K...",43.646192,-79.419601
4,La Cubana,"[{'id': '4bf58dd8d48988d154941735', 'name': 'C...",43.64623,-79.419636
5,Paris Paris Bar,"[{'id': '4bf58dd8d48988d123941735', 'name': 'W...",43.649237,-79.421436
6,Gift Shop,"[{'id': '4bf58dd8d48988d11e941735', 'name': 'C...",43.646149,-79.419481
7,Reposado,"[{'id': '4bf58dd8d48988d116941735', 'name': 'B...",43.647321,-79.420032
8,Bang Bang Ice Cream & Bakery,"[{'id': '4bf58dd8d48988d1c9941735', 'name': 'I...",43.646246,-79.419553
9,Lower Ossington Theatre,"[{'id': '4bf58dd8d48988d137941735', 'name': 'T...",43.646389,-79.419781


#### Category column processing

In [29]:
# get category for each row  - predefined function
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# preprocess column
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

In [30]:
# check shape
print(nearby_venues.shape)

# check first five rows
nearby_venues.head()

(28, 4)


Unnamed: 0,name,categories,lat,lng
0,Bellwoods Brewery,Brewery,43.647097,-79.419955
1,Pizzeria Libretto,Pizza Place,43.648979,-79.420604
2,Foxley Bistro,Asian Restaurant,43.648643,-79.420495
3,OddSeoul,Korean Restaurant,43.646192,-79.419601
4,La Cubana,Cuban Restaurant,43.64623,-79.419636


### EDA 

### define function to retrieve nearby venues for multiple coordinates in the dataset

In [31]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### retrieving data...

In [32]:
toronto_venues = getNearbyVenues(names=geo_merged['Neighborhood'],
                                   latitudes=geo_merged['Latitude'],
                                   longitudes=geo_merged['Longitude']
                                  )

Malvern, Rouge
Rouge Hill, Port Union, Highland Creek
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
Kennedy Park, Ionview, East Birchmount Park
Golden Mile, Clairlea, Oakridge
Cliffside, Cliffcrest, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Wexford Heights, Scarborough Town Centre
Wexford, Maryvale
Agincourt
Clarks Corners, Tam O'Shanter, Sullivan
Milliken, Agincourt North, Steeles East, L'Amoreaux East
Steeles West, L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
York Mills, Silver Hills
Willowdale, Newtonbrook
Willowdale, Willowdale East
York Mills West
Willowdale, Willowdale West
Parkwoods
Don Mills
Don Mills
Bathurst Manor, Wilson Heights, Downsview North
Northwood Park, York University
Downsview
Downsview
Downsview
Downsview
Victoria Village
Parkview Hill, Woodbine Gardens
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto, Broadview North (Old East York)
The Danforth West, 

In [33]:
# check for shape
print(toronto_venues.shape)
toronto_venues.head()

(2123, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Malvern, Rouge",43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
1,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,Scarborough Historical Society,43.788755,-79.162438,History Museum
3,"Guildwood, Morningside, West Hill",43.763573,-79.188711,RBC Royal Bank,43.76679,-79.191151,Bank
4,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


#### check how many venues were retieved for each neighborhood

In [34]:
toronto_venues_g = toronto_venues.groupby('Neighborhood').count()
toronto_venues_g 

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,4,4,4,4,4,4
"Alderwood, Long Branch",8,8,8,8,8,8
"Bathurst Manor, Wilson Heights, Downsview North",23,23,23,23,23,23
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",24,24,24,24,24,24
...,...,...,...,...,...,...
"Willowdale, Willowdale West",5,5,5,5,5,5
Woburn,4,4,4,4,4,4
Woodbine Heights,6,6,6,6,6,6
York Mills West,2,2,2,2,2,2


### One hot encoding based on venue category

In [35]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()


Unnamed: 0,Yoga Studio,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Truck Stop,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### groupby neighborhood , venue categories

In [36]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Trail,Train Station,Truck Stop,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,"Willowdale, Willowdale West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
92,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
94,York Mills West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


* check for top frequent venues

In [37]:
Top_venues = 3

for Nhood in toronto_grouped['Neighborhood']:
    print("----"+Nhood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == Nhood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(Top_venues))
    print('\n')

----Agincourt----
                       venue  freq
0             Breakfast Spot  0.25
1  Latin American Restaurant  0.25
2               Skating Rink  0.25


----Alderwood, Long Branch----
         venue  freq
0  Pizza Place  0.25
1     Pharmacy  0.12
2          Pub  0.12


----Bathurst Manor, Wilson Heights, Downsview North----
         venue  freq
0         Bank  0.09
1  Coffee Shop  0.09
2  Pizza Place  0.04


----Bayview Village----
                 venue  freq
0  Japanese Restaurant  0.25
1                 Café  0.25
2                 Bank  0.25


----Bedford Park, Lawrence Manor East----
                venue  freq
0      Sandwich Place  0.08
1  Italian Restaurant  0.08
2     Thai Restaurant  0.08


----Berczy Park----
          venue  freq
0   Coffee Shop  0.09
1  Cocktail Bar  0.07
2      Beer Bar  0.04


----Birch Cliff, Cliffside West----
             venue  freq
0  College Stadium   0.2
1             Farm   0.2
2             Café   0.2


----Brockton, Parkdale Village, Exh

2     Yoga Studio   0.0


----Runnymede, Swansea----
              venue  freq
0  Sushi Restaurant  0.08
1              Café  0.08
2       Coffee Shop  0.08


----Runnymede, The Junction North----
               venue  freq
0            Brewery  0.25
1           Bus Line  0.25
2  Convenience Store  0.25


----Scarborough Village----
              venue  freq
0  Business Service   0.5
1        Playground   0.5
2     Metro Station   0.0


----South Steeles, Silverstone, Humbergate, Jamestown, Mount Olive, Beaumond Heights, Thistletown, Albion Gardens----
                 venue  freq
0        Grocery Store  0.22
1             Pharmacy  0.11
2  Fried Chicken Joint  0.11


----St. James Town----
          venue  freq
0   Coffee Shop  0.06
1          Café  0.06
2  Cocktail Bar  0.05


----St. James Town, Cabbagetown----
         venue  freq
0  Coffee Shop  0.07
1         Park  0.04
2    Pet Store  0.04


----Steeles West, L'Amoreaux West----
                  venue  freq
0  Fast Food Restaur

In [38]:
# define function to find most common venue categories
def return_most_common_venues(row, Top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:Top_venues]

In [39]:
Top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(Top_venues):
    try:
        columns.append('{}{} Popular Venues'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Popular Venues'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], Top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Popular Venues,2nd Popular Venues,3rd Popular Venues,4th Popular Venues,5th Popular Venues
0,Agincourt,Latin American Restaurant,Lounge,Skating Rink,Breakfast Spot,Women's Store
1,"Alderwood, Long Branch",Pizza Place,Gym,Coffee Shop,Sandwich Place,Pub
2,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Gift Shop,Mobile Phone Shop,Sandwich Place
3,Bayview Village,Chinese Restaurant,Café,Bank,Japanese Restaurant,Department Store
4,"Bedford Park, Lawrence Manor East",Thai Restaurant,Coffee Shop,Sandwich Place,Italian Restaurant,Juice Bar
...,...,...,...,...,...,...
91,"Willowdale, Willowdale West",Coffee Shop,Discount Store,Pharmacy,Pizza Place,Grocery Store
92,Woburn,Coffee Shop,Indian Restaurant,Korean BBQ Restaurant,Dog Run,Dessert Shop
93,Woodbine Heights,Park,Curling Ice,Beer Store,Skating Rink,Bus Stop
94,York Mills West,Park,Convenience Store,Distribution Center,Department Store,Dessert Shop


### K-means clustering
* cluster neighborhoods with similar venue distribution

In [40]:
merged = pd.merge(toronto_grouped, geo_merged, on='Neighborhood')
merged 

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,PostalCode,Borough,Latitude,Longitude
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,M1S,Scarborough,43.794200,-79.262029
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,M8W,Etobicoke,43.602414,-79.543484
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,M3H,North York,43.754328,-79.442259
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,M2K,North York,43.786947,-79.385975
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,M5M,North York,43.733283,-79.419750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,"Willowdale, Willowdale West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,M2R,North York,43.782736,-79.442259
96,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,M1G,Scarborough,43.770992,-79.216917
97,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,M4C,East York,43.695344,-79.318389
98,York Mills West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,M2P,North York,43.752758,-79.400049


In [41]:
# set number of clusters
k_num = 4

kmeans_data = merged .drop(['Neighborhood','PostalCode','Borough','Latitude','Longitude'],1)

# run k-means clustering
kmeans = KMeans(init = "k-means++", n_clusters=k_num, random_state=0).fit(kmeans_data)

# check cluster labels generated for each row in the dataframe
labels = kmeans.labels_
print(labels)

[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 3 0 0 1 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 1]


In [42]:
print(labels.shape)

(100,)


In [43]:
toronto_merged = merged
print(toronto_merged.shape)
print(labels.shape)

(100, 271)
(100,)


In [44]:
# add clustering labels
toronto_merged['Cluster Labels'] = labels.tolist()
toronto_merged # check the last columns!

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,PostalCode,Borough,Latitude,Longitude,Cluster Labels
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,M1S,Scarborough,43.794200,-79.262029,0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,M8W,Etobicoke,43.602414,-79.543484,0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,M3H,North York,43.754328,-79.442259,0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,M2K,North York,43.786947,-79.385975,0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,M5M,North York,43.733283,-79.419750,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,"Willowdale, Willowdale West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,M2R,North York,43.782736,-79.442259,0
96,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,M1G,Scarborough,43.770992,-79.216917,0
97,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,M4C,East York,43.695344,-79.318389,0
98,York Mills West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,M2P,North York,43.752758,-79.400049,1


In [45]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(k_num)
ys = [i+x+(i*x)**2 for i in range(k_num)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [46]:
### check the first cluster
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Yoga Studio,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,...,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,PostalCode,Borough,Latitude,Longitude,Cluster Labels
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,M1S,Scarborough,43.794200,-79.262029,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,M8W,Etobicoke,43.602414,-79.543484,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,M3H,North York,43.754328,-79.442259,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,M2K,North York,43.786947,-79.385975,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,M5M,North York,43.733283,-79.419750,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,M1R,Scarborough,43.750072,-79.295849,0
94,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.029412,0.0,0.0,0.0,0.0,M2N,North York,43.770120,-79.408493,0
95,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,M2R,North York,43.782736,-79.442259,0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,M1G,Scarborough,43.770992,-79.216917,0
