## Scraping table data from webpage

In [78]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

data_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(data_url)
soup = BeautifulSoup(page.content,'lxml')
table = soup.find('table')
table_cells = table.find_all('td')

tempdf1=[]
for td in table_cells:
    b = td.find_all('b')
    row = [td.text.strip() for td in b]
    tempdf1.append(row)
df1 = pd.DataFrame(tempdf1,columns=["Postal Code"])
df1['Postal Code']=df1['Postal Code'].astype(str)

tempdf2=[]
for td in table_cells:
    span = td.find_all('span')
    row = [td.text.strip() for td in span]
    tempdf2.append(row)

df2 = pd.DataFrame(tempdf2,columns=["Borough"])
df2['Borough']=df2['Borough'].astype(str)

tempdf3=[]
for text in df2['Borough']:
    start = text.find("(")+1
    end = text.find(")")
    temp = text[start:end]
    tempdf3.append(temp)

df3 = pd.DataFrame(tempdf3,columns=["Neighborhood"])
df3['Neighborhood']=df3['Neighborhood'].astype(str)
df3["Neighborhood"]=df3["Neighborhood"].str.replace('/',',')

tempdf4=[]
for text in df2["Borough"]:
    end = text.find("(")
    temp=text[0:end]
    tempdf4.append(temp)
df4 = pd.DataFrame(tempdf4,columns=["Borough"])
df4['Borough']=df4['Borough'].astype(str)

df = pd.concat([df1,df4,df3],axis=1,sort=False)

df.drop(df[df["Borough"]=="Not assigne"].index,axis=0, inplace=True)
df.reset_index(drop=True,inplace=True)
df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Queen's Park / Ontario Provincial Governmen,"Queen's Park , Ontario Provincial Governmen"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern , Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill , Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [79]:
df.shape

(103, 3)

## Downloading required libraries

In [80]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json

!pip install geopy
#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim 
import requests
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans


!conda install -c conda-forge folium=0.5.0 --yes
import folium

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.1.1e             |       h516909a_0         2.1 MB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    altair-4.0.1               |             py_0         575 KB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    certifi-2019.11.28         |   py36h9f0ad1d_1         149 KB  conda-forge
    branca-0.4.0               |             py_0          26 KB  conda-forge
    ------------------------------------------------------------
                       

## Splitting rows with multiple neighbourhoods in the last column into separate rows

In [81]:
df_split = pd.DataFrame(df.set_index(['Postal Code', 'Borough'])
   .stack()
   .str.split(',', expand=True)
   .stack()
   .unstack(-2)
   .reset_index(-1, drop=True)
   .reset_index()
)
df_split

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,Malvern
1,M1B,Scarborough,Rouge
2,M1C,Scarborough,Rouge Hill
3,M1C,Scarborough,Port Union
4,M1C,Scarborough,Highland Creek
5,M1E,Scarborough,Guildwood
6,M1E,Scarborough,Morningside
7,M1E,Scarborough,West Hill
8,M1G,Scarborough,Woburn
9,M1H,Scarborough,Cedarbrae


## Further cleaning up the Bourough names after examining the data

In [82]:
df_split['Borough'].replace('Downtown TorontoStn A PO Boxes25 The Esplanade','Downtown Toronto',inplace=True)
df_split['Borough'].replace('EtobicokeNorthwest','Etobicoke Northwest',inplace=True)
df_split['Borough'].replace('East YorkEast Toronto','East York',inplace=True)
df_split['Borough'].replace('MississaugaCanada Post Gateway Processing Centre','Mississauga Canada Post Gateway Processing Centre',inplace=True)
df_split['Borough'][159]='East Toronto'
df_split['Borough'][156,157]='Queens Park'
                            
df_split

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,Malvern
1,M1B,Scarborough,Rouge
2,M1C,Scarborough,Rouge Hill
3,M1C,Scarborough,Port Union
4,M1C,Scarborough,Highland Creek
5,M1E,Scarborough,Guildwood
6,M1E,Scarborough,Morningside
7,M1E,Scarborough,West Hill
8,M1G,Scarborough,Woburn
9,M1H,Scarborough,Cedarbrae


In [83]:
df_boroughs = pd.DataFrame(df_split['Borough'].unique())
df_boroughs.columns = ['Borough']
df_boroughs

Unnamed: 0,Borough
0,Scarborough
1,North York
2,East York
3,East Toronto
4,Central Toronto
5,Downtown Toronto
6,York
7,West Toronto
8,Queens Park
9,Mississauga Canada Post Gateway Processing Centre


## Populating Lat / Lon data for all neighborhood - was able to process only 10 rows at a time with geopy - so ran the below code block until all rows were populated (last cell 131)

In [84]:
latitudelist = []
longitudelist = []
geolocator = Nominatim(user_agent="toronto_explorer")

for index in range(0,10):
    neighlocation = geolocator.geocode(df_split['Neighborhood'][index]+", Toronto")
    if neighlocation is None:
        nlattemp = None
        nlontemp = None
    else:
        nlattemp = neighlocation.latitude
        nlontemp = neighlocation.longitude                                
    latitudelist.append(nlattemp)
    longitudelist.append(nlontemp)
len(latitudelist)

10

In [85]:
for index in range(10,20):
    neighlocation = geolocator.geocode(df_split['Neighborhood'][index]+", Toronto")
    if neighlocation is None:
        nlattemp = None
        nlontemp = None
    else:
        nlattemp = neighlocation.latitude
        nlontemp = neighlocation.longitude                                
    latitudelist.append(nlattemp)
    longitudelist.append(nlontemp)
len(latitudelist)

20

In [86]:
for index in range(20,30):
    neighlocation = geolocator.geocode(df_split['Neighborhood'][index]+", Toronto")
    if neighlocation is None:
        nlattemp = None
        nlontemp = None
    else:
        nlattemp = neighlocation.latitude
        nlontemp = neighlocation.longitude                                
    latitudelist.append(nlattemp)
    longitudelist.append(nlontemp)
len(latitudelist)

30

In [87]:
for index in range(30,40):
    neighlocation = geolocator.geocode(df_split['Neighborhood'][index]+", Toronto")
    if neighlocation is None:
        nlattemp = None
        nlontemp = None
    else:
        nlattemp = neighlocation.latitude
        nlontemp = neighlocation.longitude                                
    latitudelist.append(nlattemp)
    longitudelist.append(nlontemp)
len(latitudelist)

40

In [88]:
for index in range(40,50):
    neighlocation = geolocator.geocode(df_split['Neighborhood'][index]+", Toronto")
    if neighlocation is None:
        nlattemp = None
        nlontemp = None
    else:
        nlattemp = neighlocation.latitude
        nlontemp = neighlocation.longitude                                
    latitudelist.append(nlattemp)
    longitudelist.append(nlontemp)
len(latitudelist)

50

In [89]:
for index in range(50,60):
    neighlocation = geolocator.geocode(df_split['Neighborhood'][index]+", Toronto")
    if neighlocation is None:
        nlattemp = None
        nlontemp = None
    else:
        nlattemp = neighlocation.latitude
        nlontemp = neighlocation.longitude                                
    latitudelist.append(nlattemp)
    longitudelist.append(nlontemp)
len(latitudelist)

60

In [90]:
for index in range(60,70):
    neighlocation = geolocator.geocode(df_split['Neighborhood'][index]+", Toronto")
    if neighlocation is None:
        nlattemp = None
        nlontemp = None
    else:
        nlattemp = neighlocation.latitude
        nlontemp = neighlocation.longitude                                
    latitudelist.append(nlattemp)
    longitudelist.append(nlontemp)
len(latitudelist)

70

In [91]:
for index in range(70,80):
    neighlocation = geolocator.geocode(df_split['Neighborhood'][index]+", Toronto")
    if neighlocation is None:
        nlattemp = None
        nlontemp = None
    else:
        nlattemp = neighlocation.latitude
        nlontemp = neighlocation.longitude                                
    latitudelist.append(nlattemp)
    longitudelist.append(nlontemp)
len(latitudelist)

80

In [92]:
for index in range(80,90):
    neighlocation = geolocator.geocode(df_split['Neighborhood'][index]+", Toronto")
    if neighlocation is None:
        nlattemp = None
        nlontemp = None
    else:
        nlattemp = neighlocation.latitude
        nlontemp = neighlocation.longitude                                
    latitudelist.append(nlattemp)
    longitudelist.append(nlontemp)
len(latitudelist)

90

In [93]:
for index in range(90,100):
    neighlocation = geolocator.geocode(df_split['Neighborhood'][index]+", Toronto")
    if neighlocation is None:
        nlattemp = None
        nlontemp = None
    else:
        nlattemp = neighlocation.latitude
        nlontemp = neighlocation.longitude                                
    latitudelist.append(nlattemp)
    longitudelist.append(nlontemp)
len(latitudelist)

100

In [94]:
for index in range(100,110):
    neighlocation = geolocator.geocode(df_split['Neighborhood'][index]+", Toronto")
    if neighlocation is None:
        nlattemp = None
        nlontemp = None
    else:
        nlattemp = neighlocation.latitude
        nlontemp = neighlocation.longitude                                
    latitudelist.append(nlattemp)
    longitudelist.append(nlontemp)
len(latitudelist)

110

In [95]:
for index in range(110,120):
    neighlocation = geolocator.geocode(df_split['Neighborhood'][index]+", Toronto")
    if neighlocation is None:
        nlattemp = None
        nlontemp = None
    else:
        nlattemp = neighlocation.latitude
        nlontemp = neighlocation.longitude                                
    latitudelist.append(nlattemp)
    longitudelist.append(nlontemp)
len(latitudelist)

120

In [96]:
for index in range(120,130):
    neighlocation = geolocator.geocode(df_split['Neighborhood'][index]+", Toronto")
    if neighlocation is None:
        nlattemp = None
        nlontemp = None
    else:
        nlattemp = neighlocation.latitude
        nlontemp = neighlocation.longitude                                
    latitudelist.append(nlattemp)
    longitudelist.append(nlontemp)
len(latitudelist)

130

In [112]:
for index in range(130,140):
    neighlocation = geolocator.geocode(df_split['Neighborhood'][index]+", Toronto")
    if neighlocation is None:
        nlattemp = None
        nlontemp = None
    else:
        nlattemp = neighlocation.latitude
        nlontemp = neighlocation.longitude                                
    latitudelist.append(nlattemp)
    longitudelist.append(nlontemp)
len(latitudelist)

140

In [123]:
for index in range(140,150):
    neighlocation = geolocator.geocode(df_split['Neighborhood'][index]+", Toronto")
    if neighlocation is None:
        nlattemp = None
        nlontemp = None
    else:
        nlattemp = neighlocation.latitude
        nlontemp = neighlocation.longitude                                
    latitudelist.append(nlattemp)
    longitudelist.append(nlontemp)
len(latitudelist)

150

In [124]:
for index in range(150,160):
    neighlocation = geolocator.geocode(df_split['Neighborhood'][index]+", Toronto")
    if neighlocation is None:
        nlattemp = None
        nlontemp = None
    else:
        nlattemp = neighlocation.latitude
        nlontemp = neighlocation.longitude                                
    latitudelist.append(nlattemp)
    longitudelist.append(nlontemp)
len(latitudelist)

160

In [125]:
for index in range(160,170):
    neighlocation = geolocator.geocode(df_split['Neighborhood'][index]+", Toronto")
    if neighlocation is None:
        nlattemp = None
        nlontemp = None
    else:
        nlattemp = neighlocation.latitude
        nlontemp = neighlocation.longitude                                
    latitudelist.append(nlattemp)
    longitudelist.append(nlontemp)
len(latitudelist)

170

In [126]:
for index in range(170,180):
    neighlocation = geolocator.geocode(df_split['Neighborhood'][index]+", Toronto")
    if neighlocation is None:
        nlattemp = None
        nlontemp = None
    else:
        nlattemp = neighlocation.latitude
        nlontemp = neighlocation.longitude                                
    latitudelist.append(nlattemp)
    longitudelist.append(nlontemp)
len(latitudelist)

180

In [129]:
for index in range(180,190):
    neighlocation = geolocator.geocode(df_split['Neighborhood'][index]+", Toronto")
    if neighlocation is None:
        nlattemp = None
        nlontemp = None
    else:
        nlattemp = neighlocation.latitude
        nlontemp = neighlocation.longitude                                
    latitudelist.append(nlattemp)
    longitudelist.append(nlontemp)
len(latitudelist)

190

In [130]:
for index in range(190,200):
    neighlocation = geolocator.geocode(df_split['Neighborhood'][index]+", Toronto")
    if neighlocation is None:
        nlattemp = None
        nlontemp = None
    else:
        nlattemp = neighlocation.latitude
        nlontemp = neighlocation.longitude                                
    latitudelist.append(nlattemp)
    longitudelist.append(nlontemp)
len(latitudelist)

200

In [131]:
for index in range(200,217):
    neighlocation = geolocator.geocode(df_split['Neighborhood'][index]+", Toronto")
    if neighlocation is None:
        nlattemp = None
        nlontemp = None
    else:
        nlattemp = neighlocation.latitude
        nlontemp = neighlocation.longitude                                
    latitudelist.append(nlattemp)
    longitudelist.append(nlontemp)
len(latitudelist)

217

In [132]:
df_split["Latitude"]=latitudelist
df_split["Longitude"]=longitudelist
df_split.shape

(217, 5)

## Drop null values where lat / lon data is unavailable

In [133]:
df_split.dropna(subset=["Latitude"],inplace=True)
df_split.reset_index(drop=True,inplace=True)
df_split

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern,43.809196,-79.221701
1,M1B,Scarborough,Rouge,43.80493,-79.165837
2,M1C,Scarborough,Rouge Hill,43.780271,-79.130499
3,M1C,Scarborough,Port Union,43.775504,-79.134976
4,M1C,Scarborough,Highland Creek,43.790117,-79.173334
5,M1E,Scarborough,Guildwood,43.755225,-79.198229
6,M1E,Scarborough,Morningside,43.782601,-79.204958
7,M1E,Scarborough,West Hill,43.768914,-79.187291
8,M1G,Scarborough,Woburn,43.759824,-79.225291
9,M1H,Scarborough,Cedarbrae,43.756467,-79.226692


In [134]:
CLIENT_ID = 'IWRKAIP3Q0VIX5I5EUBUZLACPFA4KRT3WBNQUFDN5KZF0VZJ'
CLIENT_SECRET = 'J0TESU0SUOELM5QRXQTLIZFCRDNAJIIJJCGG04NHQSE15LX2' 
VERSION = '20180605'
LIMIT = 100

## Use Foursquare API to retrieve venue data for each neighbourhood

In [135]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [136]:
toronto_venues = getNearbyVenues(names=df_split['Neighborhood'],
                                   latitudes=df_split['Latitude'],
                                   longitudes=df_split['Longitude']
                                  )

Malvern 
 Rouge
Rouge Hill 
 Port Union 
 Highland Creek
Guildwood 
 Morningside 
 West Hill
Woburn
Cedarbrae
Scarborough Village
Kennedy Park 
 Ionview 
 East Birchmount Park
Golden Mile 
 Clairlea 
 Oakridge
Cliffside 
 Cliffcrest 
 Scarborough Village West
Birch Cliff 
 Cliffside West
Dorset Park 
 Wexford Heights 
 Scarborough Town Centre
Wexford 
 Maryvale
Agincourt
Clarks Corners 
 Tam O'Shanter 
 Sullivan
Milliken 
 Agincourt North 
 Steeles East 
 L'Amoreaux East
Steeles West 
 L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview 
 Henry Farm 
 Oriole
Bayview Village
York Mills 
 Silver Hills
Willowdale 
 Newtonbrook
Willowdale
York Mills West
Willowdale
Parkwoods
Don Mills
Don Mills
Bathurst Manor 
 Wilson Heights 
 Downsview North
Northwood Park 
 York University
Downsview
Downsview
Downsview
Downsview
Victoria Village
Parkview Hill 
 Woodbine Gardens
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
The Danforth  East
The Danforth West 
 Riverdale
India Bazaar 
 The Be

In [137]:
print(toronto_venues.shape)
toronto_venues.head()

(5890, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Malvern,43.809196,-79.221701,Shoppers Drug Mart,43.80961,-79.222729,Pharmacy
1,Malvern,43.809196,-79.221701,Subway,43.806961,-79.221476,Sandwich Place
2,Malvern,43.809196,-79.221701,Malvern Arena,43.808594,-79.216634,Skating Rink
3,Malvern,43.809196,-79.221701,Pizza Hut,43.808326,-79.220616,Pizza Place
4,Malvern,43.809196,-79.221701,Pizza Pizza,43.806613,-79.221243,Pizza Place


In [138]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Adelaide,100,100,100,100,100,100
Agincourt North,26,26,26,26,26,26
Albion Gardens,12,12,12,12,12,12
Bathurst Quay,23,23,23,23,23,23
Bloordale Gardens,7,7,7,7,7,7
Cabbagetown,46,46,46,46,46,46
Chinatown,91,91,91,91,91,91
Clairlea,7,7,7,7,7,7
Cliffcrest,6,6,6,6,6,6
Cliffside West,8,8,8,8,8,8


In [139]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 335 uniques categories.


In [140]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.shape

(197, 335)

## Printing top 5 venues for each neighborhood

In [141]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

---- Adelaide ----
                venue  freq
0         Coffee Shop  0.09
1          Restaurant  0.06
2                Café  0.05
3               Hotel  0.04
4  Italian Restaurant  0.04


---- Agincourt North ----
                venue  freq
0                Bank  0.08
1  Chinese Restaurant  0.08
2              Bakery  0.08
3        Liquor Store  0.04
4          Beer Store  0.04


---- Albion Gardens----
                  venue  freq
0         Grocery Store  0.17
1          Liquor Store  0.08
2  Caribbean Restaurant  0.08
3        Sandwich Place  0.08
4        Hardware Store  0.08


---- Bathurst Quay ----
                  venue  freq
0           Coffee Shop  0.17
1                  Café  0.13
2                  Park  0.09
3                   Gym  0.04
4  Caribbean Restaurant  0.04


---- Bloordale Gardens ----
               venue  freq
0  Convenience Store  0.29
1     Sandwich Place  0.14
2       Intersection  0.14
3     Shipping Store  0.14
4        Coffee Shop  0.14


---- Cabbag

In [181]:
import numpy as np
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Restaurant,Café,Italian Restaurant,Hotel,Gastropub,American Restaurant,Seafood Restaurant,Japanese Restaurant,Gym
1,Agincourt North,Bank,Chinese Restaurant,Bakery,Fast Food Restaurant,Beer Store,Liquor Store,Frozen Yogurt Shop,Fried Chicken Joint,Sporting Goods Shop,Spa
2,Albion Gardens,Grocery Store,Caribbean Restaurant,Pizza Place,Pharmacy,Fast Food Restaurant,Fried Chicken Joint,Liquor Store,Sandwich Place,Gym Pool,Hardware Store
3,Bathurst Quay,Coffee Shop,Café,Park,Sculpture Garden,Sushi Restaurant,Bank,Gym,Ramen Restaurant,Garden,Diner
4,Bloordale Gardens,Convenience Store,Intersection,Sandwich Place,Shipping Store,Coffee Shop,Bank,Filipino Restaurant,Empanada Restaurant,Flea Market,Dumpling Restaurant


## K-Means CLUSTERING ANALYSIS

In [182]:
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)
kmeans.labels_[0:40]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], dtype=int32)

In [183]:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
neighborhoods_venues_sorted.dtypes
toronto_merged = df_split
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
toronto_merged.dtypes
toronto_merged.head()

Postal Code                object
Borough                    object
Neighborhood               object
Latitude                  float64
Longitude                 float64
Cluster Labels            float64
1st Most Common Venue      object
2nd Most Common Venue      object
3rd Most Common Venue      object
4th Most Common Venue      object
5th Most Common Venue      object
6th Most Common Venue      object
7th Most Common Venue      object
8th Most Common Venue      object
9th Most Common Venue      object
10th Most Common Venue     object
dtype: object

In [187]:
toronto_merged.dropna(subset=["Cluster Labels"],inplace=True)
toronto_merged.reset_index(drop=True,inplace=True)
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].apply(np.int64)
toronto_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,Malvern,43.809196,-79.221701,0,Pizza Place,Fast Food Restaurant,Pharmacy,Gym / Fitness Center,Park,Bubble Tea Shop,Skating Rink,Sandwich Place,Grocery Store,Convenience Store
1,M1B,Scarborough,Rouge,43.80493,-79.165837,1,Park,Fast Food Restaurant,Women's Store,Falafel Restaurant,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store,Empanada Restaurant,Ethiopian Restaurant
2,M1C,Scarborough,Rouge Hill,43.780271,-79.130499,3,Train Station,Women's Store,Falafel Restaurant,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store,Empanada Restaurant,Ethiopian Restaurant,Event Space
3,M1C,Scarborough,Port Union,43.775504,-79.134976,1,Park,Women's Store,Falafel Restaurant,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store,Empanada Restaurant,Ethiopian Restaurant,Event Space
4,M1C,Scarborough,Highland Creek,43.790117,-79.173334,1,Pharmacy,Park,Women's Store,Falafel Restaurant,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store,Empanada Restaurant,Ethiopian Restaurant


## Display Map with Clusters

In [188]:
# create map
address = 'Toronto, Canada'
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude  

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[(cluster-1)],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters