# Segmenting and Clustering Neighborhoods in Toronto - Part 3

### Actual code of Part 3 starts from row 21

## Importing required Libraries

In [2]:
from bs4 import BeautifulSoup   #Python package for parsing HTML and XML documents

import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

# !conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

%matplotlib inline

# import k-means from clustering stage
from sklearn.cluster import KMeans

# !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


### Assigning wikipedia Article to __*url*__

In [3]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

### Prasing table using *requests* and *BeautifulSoup*

In [4]:
req=requests.get(url)
data=req.text
    
soup = BeautifulSoup(data,"html.parser")
table = soup.table

### __*table*__ now has all the table data in wikipedia article

In [5]:
table_rows=[] # creating an empty list

#### Iterating over *'tr'* and *'td'* tags in __'table'__

When in __*'td'*__ tag, search for __*'a'*__ tag and then get the title by doing this we can filter the Grayed Out and Not Assigned cells

Grayed Out and Not Assigned cells will be replaced by __*'None'*__

Extracting table data into __*table_rows*__

In [6]:
trs=table.find_all('tr')
for tr in trs:
    td = tr.find_all('td')
    if len(td)==0:
        continue
    postal_code = td[0].getText()
    district = td[1].find('a')
    if district is None:
        dist_name = 'None'
    else:
        dist_name = district.getText() 
    Neighborhood = td[2].find('a')
    if Neighborhood is None:
        Neig_hood = 'None'
    else:
        Neig_hood = Neighborhood.getText()
    table_rows.append([postal_code,dist_name,Neig_hood])

#### Creating Dataframe from __*table_rows*__ and assigning column names

In [7]:
df=pd.DataFrame(table_rows,columns=['PostalCode','Borough','Neighborhood'])

#### Filtering the rows which has 'None'. 
'None' indicates grayed out and Not assigned cells

In [8]:
df1=df[df['Neighborhood'] != 'None']

In [9]:
df2=df1[df1['Borough'] != 'None']

In [10]:
df3=df2.reset_index()

In [11]:
df4=df3.drop('index',axis=1)

#### Below step joins values in Neighborhood column based on postalCode and Borough

In [12]:
p_codes=df4.groupby(['PostalCode','Borough'])['Neighborhood'].apply(','.join).reset_index()

In [13]:
p_codes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [14]:
p_codes.shape

(84, 3)

In [15]:
df5=p_codes  # copying Dataframe 'P-codes' to df5

### Creating 2 functions to get Latitude and Longitude

I'm using "Geolocator" tool insted of geocoder which was given to us for sample 

I'm using Neighboorhod and Borough to get the latitude and longitude. 

In [16]:
# Function for getting Latitude
def lat(elem):
    n,b=elem
    n1=list(n.split(","))
    a='{},{}'.format(n1[0],b)
    geolocator = Nominatim()
    location = geolocator.geocode(a)
    if location == None:
        location = geolocator.geocode(b)
        latitude = location.latitude
        longitude = location.longitude
    else:
        latitude = location.latitude
        longitude = location.longitude
    return latitude

In [17]:
# Function for getting Longitude
def lng(elem):
    n,b=elem
    n1=list(n.split(","))
    a='{},{}'.format(n1[0],b)
    #print(a)
    geolocator = Nominatim()
    location = geolocator.geocode(a)
    if location == None:
        location = geolocator.geocode(b)
        latitude = location.latitude
        longitude = location.longitude
    else:
        latitude = location.latitude
        longitude = location.longitude
    return longitude

#### Applying functions and getting latittude and Longitude

In [18]:
df5['latitude'] = df5[['Neighborhood','Borough']].apply(lat,axis=1)



In [19]:
df5['longitude'] = df5[['Neighborhood','Borough']].apply(lng,axis=1)



In [20]:
df5.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,latitude,longitude
0,M1B,Scarborough,"Rouge,Malvern",43.80493,-79.165837
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.790117,-79.173334
2,M1E,Scarborough,"Morningside,West Hill",43.782601,-79.204958
3,M1G,Scarborough,Woburn,43.759824,-79.225291
4,M1H,Scarborough,Cedarbrae,43.756467,-79.226692


<a id="Third_Section"></a>

#### Third part of the Question starts from Here

In [21]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(df5['Borough'].unique()),
        len(df5['Neighborhood'].unique())
    )
)

The dataframe has 9 boroughs and 84 neighborhoods.


In [22]:
neighborhoods =df5

#### Using geopy library to get the latitude and longitude values of __Toronto__.

In [23]:
address = 'Toronto, Ontario'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Toronto City are 43.653963, -79.387207.


#### Creating a map of __Toronto__ with neighborhoods superimposed on top.

In [24]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)
# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['latitude'], neighborhoods['longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label,parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

I am taking __Etobicoke__ as my sample cluster

Note : I am taking this because, when using __Foursquare API__ we can reduce number of calls for API as Etobicoke has less neighboors.
Beacuse we only have 950 calls, I am using Etobicoke

In [25]:
Etobicoke_data = neighborhoods[neighborhoods['Borough'] == 'Etobicoke'].reset_index(drop=True)
Etobicoke_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,latitude,longitude
0,M8V,Etobicoke,New Toronto,43.600763,-79.505264
1,M8W,Etobicoke,"Alderwood,Long Branch",43.601717,-79.545232
2,M8X,Etobicoke,The Kingsway,43.647381,-79.511333
3,M8Y,Etobicoke,"Humber Bay,King's Mill Park,Mimico NE,Old Mill...",43.640046,-79.495028
4,M8Z,Etobicoke,"Mimico NW,The Queensway West,South of Bloor",43.603656,-79.493178


#### Get Latitude and Longitude for Etobicoke

In [26]:
address = 'Etobicoke, Toronto'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Etobicoke are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Etobicoke are 43.6435559, -79.5656326.


#### Creating a map of __Etobicoke__ with neighborhoods superimposed on top.

In [27]:
# create map of New York using latitude and longitude values
map_Etobicoke = folium.Map(location=[latitude, longitude], zoom_start=12)
# add markers to map
for lat, lng, borough, neighborhood in zip(Etobicoke_data['latitude'], Etobicoke_data['longitude'], Etobicoke_data['Borough'], Etobicoke_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label,parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_Etobicoke)  
    
map_Etobicoke



Next, we are going to start utilizing the Foursquare API to explore the neighborhoods and segment them.

#### Defining Foursquare Credentials and Version

In [28]:
CLIENT_ID = 'BHY0KKGDDRBJJZ0M5VPISG5INZ432PUC0G44P0GQLHBFAHT5' # your Foursquare ID
CLIENT_SECRET = '4XHL1P1TRIEPAXZHINM23Y0HOMN5UOHAYEEVOBOHVLPRKFJH' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: BHY0KKGDDRBJJZ0M5VPISG5INZ432PUC0G44P0GQLHBFAHT5
CLIENT_SECRET:4XHL1P1TRIEPAXZHINM23Y0HOMN5UOHAYEEVOBOHVLPRKFJH


#### Here I'm taking __Kingsview Village__ for exploring 

In [30]:
Etobicoke_data.loc[9, 'Neighborhood']

'Kingsview Village'

Get the neighborhood's latitude and longitude values.

In [31]:
neighborhood_latitude = Etobicoke_data.loc[9, 'latitude'] # neighborhood latitude value
neighborhood_longitude = Etobicoke_data.loc[9, 'longitude'] # neighborhood longitude value

neighborhood_name = Etobicoke_data.loc[9, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Kingsview Village are 43.6995391, -79.5563459.


#### Getting top 50 venues that are in Kingsview Village within a radius of 700 meters.

In [32]:
#Limiting to 50 to save number of calls

LIMIT = 50 # limit of number of venues returned by Foursquare API

radius = 700 # define radius

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=BHY0KKGDDRBJJZ0M5VPISG5INZ432PUC0G44P0GQLHBFAHT5&client_secret=4XHL1P1TRIEPAXZHINM23Y0HOMN5UOHAYEEVOBOHVLPRKFJH&v=20180605&ll=43.6995391,-79.5563459&radius=700&limit=50'

In [33]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5b9e7a631ed2192cac3ca112'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 8,
  'suggestedBounds': {'ne': {'lat': 43.70583910630001,
    'lng': -79.54764813943873},
   'sw': {'lat': 43.693239093699994, 'lng': -79.56504366056126}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4b462be3f964a520ec1826e3',
       'name': 'Fitness 365',
       'location': {'address': '40 Ronson Drive, Unit 2',
        'crossStreet': 'Kipling Ave.',
        'lat': 43.6984228592084,
        'lng': -79.56421141496305,
        'labeledLatLngs': [{'label': 'display',
 

From the Foursquare lab in the previous module, we know that all the information is in the *items* key. Before we proceed, let's borrow the **get_category_type** function from the Foursquare lab.

In [35]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Now we are ready to clean the json and structure it into a *pandas* dataframe.

In [36]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Fitness 365,Gym,43.698423,-79.564211
1,401 Diner,Breakfast Spot,43.70094,-79.562443
2,TD Canada Trust,Bank,43.693932,-79.557227
3,The Beer Store,Beer Store,43.693314,-79.557283
4,Shoppers Drug Mart,Pharmacy,43.693296,-79.557144


In [37]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

8 venues were returned by Foursquare.


## 2. Explore Neighborhoods in Etobicoke

#### Creating a function to repeat the same process to all the neighborhoods in Etobicoke

In [38]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

##### Geeting venues in Etobicoke

In [39]:
Etobicoke_venues = getNearbyVenues(names=Etobicoke_data['Neighborhood'],
                                   latitudes=Etobicoke_data['latitude'],
                                   longitudes=Etobicoke_data['longitude']
                                  )

New Toronto
Alderwood,Long Branch
The Kingsway
Humber Bay,King's Mill Park,Mimico NE,Old Mill South,The Queensway East,Royal York South East,Sunnylea
Mimico NW,The Queensway West,South of Bloor
Islington Avenue
Islington,Princess Gardens,West Deane Park
Markland Wood
Westmount
Kingsview Village
Beaumond Heights,Humbergate,Jamestown,Mount Olive,Silverstone,South Steeles,Thistletown
Northwest


In [40]:
print(Etobicoke_venues.shape)
Etobicoke_venues.head()

(137, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,New Toronto,43.600763,-79.505264,Huevos Gourmet,43.601188,-79.503717,Mexican Restaurant
1,New Toronto,43.600763,-79.505264,Cellar Door Restaurant,43.600221,-79.507638,Italian Restaurant
2,New Toronto,43.600763,-79.505264,Sweet Olenka's,43.601099,-79.500325,Dessert Shop
3,New Toronto,43.600763,-79.505264,Bombay on the Lake,43.600157,-79.507992,Indian Restaurant
4,New Toronto,43.600763,-79.505264,New Toronto Fish & Chips,43.601849,-79.503281,Restaurant


Checking how many venues were returned for each neighborhood

In [41]:
Etobicoke_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Alderwood,Long Branch",7,7,7,7,7,7
"Beaumond Heights,Humbergate,Jamestown,Mount Olive,Silverstone,South Steeles,Thistletown",11,11,11,11,11,11
"Humber Bay,King's Mill Park,Mimico NE,Old Mill South,The Queensway East,Royal York South East,Sunnylea",3,3,3,3,3,3
Islington Avenue,22,22,22,22,22,22
"Islington,Princess Gardens,West Deane Park",24,24,24,24,24,24
Kingsview Village,1,1,1,1,1,1
Markland Wood,4,4,4,4,4,4
"Mimico NW,The Queensway West,South of Bloor",4,4,4,4,4,4
New Toronto,15,15,15,15,15,15
Northwest,11,11,11,11,11,11


In [42]:
print('There are {} uniques categories.'.format(len(Etobicoke_venues['Venue Category'].unique())))

There are 53 uniques categories.


## 3. Analyze Each Neighborhood

#### One Hot encoding

In [43]:
# one hot encoding
Etobicoke_onehot = pd.get_dummies(Etobicoke_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Etobicoke_onehot['Neighborhood'] = Etobicoke_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [Etobicoke_onehot.columns[-1]] + list(Etobicoke_onehot.columns[:-1])
Etobicoke_onehot = Etobicoke_onehot[fixed_columns]

Etobicoke_onehot.tail()

Unnamed: 0,Neighborhood,American Restaurant,BBQ Joint,Bakery,Bank,Baseball Field,Beer Store,Breakfast Spot,Burger Joint,Bus Stop,Café,Caribbean Restaurant,Clothing Store,Coffee Shop,Construction & Landscaping,Convenience Store,Dessert Shop,Discount Store,Donut Shop,Farmers Market,Fast Food Restaurant,Flower Shop,French Restaurant,Fried Chicken Joint,Gastropub,Golf Course,Greek Restaurant,Grocery Store,Gym,Hotel,Indian Restaurant,Indie Movie Theater,Italian Restaurant,Japanese Restaurant,Korean Restaurant,Liquor Store,Mexican Restaurant,Mobile Phone Shop,Music Store,Park,Pharmacy,Piano Bar,Pizza Place,Pool,Pool Hall,Pub,Restaurant,Sandwich Place,Scenic Lookout,Seafood Restaurant,Steakhouse,Sushi Restaurant,Tapas Restaurant,Thai Restaurant
132,Northwest,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
133,Northwest,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
134,Northwest,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
135,Northwest,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
136,Northwest,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [44]:
Etobicoke_onehot.shape

(137, 54)

#### Grouping rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [46]:
Etobicoke_grouped = Etobicoke_onehot.groupby('Neighborhood').mean().reset_index()
Etobicoke_grouped

Unnamed: 0,Neighborhood,American Restaurant,BBQ Joint,Bakery,Bank,Baseball Field,Beer Store,Breakfast Spot,Burger Joint,Bus Stop,Café,Caribbean Restaurant,Clothing Store,Coffee Shop,Construction & Landscaping,Convenience Store,Dessert Shop,Discount Store,Donut Shop,Farmers Market,Fast Food Restaurant,Flower Shop,French Restaurant,Fried Chicken Joint,Gastropub,Golf Course,Greek Restaurant,Grocery Store,Gym,Hotel,Indian Restaurant,Indie Movie Theater,Italian Restaurant,Japanese Restaurant,Korean Restaurant,Liquor Store,Mexican Restaurant,Mobile Phone Shop,Music Store,Park,Pharmacy,Piano Bar,Pizza Place,Pool,Pool Hall,Pub,Restaurant,Sandwich Place,Scenic Lookout,Seafood Restaurant,Steakhouse,Sushi Restaurant,Tapas Restaurant,Thai Restaurant
0,"Alderwood,Long Branch",0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285714,0.0,0.0,0.142857,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0
1,"Beaumond Heights,Humbergate,Jamestown,Mount Ol...",0.090909,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.090909,0.090909,0.0,0.090909,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.090909,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0
2,"Humber Bay,King's Mill Park,Mimico NE,Old Mill...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Islington Avenue,0.0,0.045455,0.045455,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.045455,0.0,0.136364,0.0,0.0,0.0,0.045455,0.045455,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.045455,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.0,0.090909,0.0,0.090909,0.0,0.0,0.045455,0.045455,0.0,0.090909
4,"Islington,Princess Gardens,West Deane Park",0.0,0.041667,0.041667,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.041667,0.0,0.125,0.0,0.0,0.0,0.041667,0.041667,0.0,0.041667,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.041667,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.083333,0.0,0.083333,0.0,0.0,0.0,0.083333,0.0,0.083333
5,Kingsview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Markland Wood,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Mimico NW,The Queensway West,South of Bloor",0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0
8,New Toronto,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.066667,0.0,0.066667,0.0,0.0,0.0,0.0,0.066667,0.0,0.066667,0.0,0.066667,0.0,0.0,0.0,0.133333,0.0,0.0,0.066667,0.066667,0.0,0.0,0.0,0.0,0.066667,0.066667,0.0,0.066667,0.0,0.0,0.0,0.0,0.0
9,Northwest,0.090909,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.090909,0.090909,0.0,0.090909,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.090909,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
Etobicoke_grouped.shape

(12, 54)

#### Printing each neighborhood along with the top 5 most common venues

In [48]:
num_top_venues = 5

for hood in Etobicoke_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = Etobicoke_grouped[Etobicoke_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Alderwood,Long Branch----
            venue  freq
0     Pizza Place  0.29
1  Sandwich Place  0.14
2     Coffee Shop  0.14
3             Pub  0.14
4             Gym  0.14


----Beaumond Heights,Humbergate,Jamestown,Mount Olive,Silverstone,South Steeles,Thistletown----
                 venue  freq
0  American Restaurant  0.09
1                 Café  0.09
2       Sandwich Place  0.09
3                Hotel  0.09
4                  Gym  0.09


----Humber Bay,King's Mill Park,Mimico NE,Old Mill South,The Queensway East,Royal York South East,Sunnylea----
                        venue  freq
0  Construction & Landscaping  0.33
1                        Pool  0.33
2                        Park  0.33
3                   Piano Bar  0.00
4           Indian Restaurant  0.00


----Islington Avenue----
               venue  freq
0        Coffee Shop  0.14
1    Thai Restaurant  0.09
2     Sandwich Place  0.09
3                Pub  0.09
4  Korean Restaurant  0.05


----Islington,Princess Gardens,Wes

Function to sort the venues in descending order.

In [50]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Creating a new dataframe and displaying the top 10 venues for each neighborhood.

In [51]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Etobicoke_grouped['Neighborhood']

for ind in np.arange(Etobicoke_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Etobicoke_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Alderwood,Long Branch",Pizza Place,Coffee Shop,Gym,Bank,Sandwich Place,Pub,Thai Restaurant,French Restaurant,Flower Shop,Fast Food Restaurant
1,"Beaumond Heights,Humbergate,Jamestown,Mount Ol...",Grocery Store,Hotel,Bank,Café,Clothing Store,Coffee Shop,Convenience Store,Farmers Market,Gym,American Restaurant
2,"Humber Bay,King's Mill Park,Mimico NE,Old Mill...",Park,Pool,Construction & Landscaping,Thai Restaurant,Fried Chicken Joint,French Restaurant,Flower Shop,Fast Food Restaurant,Farmers Market,Donut Shop
3,Islington Avenue,Coffee Shop,Thai Restaurant,Sandwich Place,Pub,Pizza Place,Korean Restaurant,Caribbean Restaurant,Discount Store,Gym,Donut Shop
4,"Islington,Princess Gardens,West Deane Park",Coffee Shop,Thai Restaurant,Sushi Restaurant,Sandwich Place,Pub,Pizza Place,Flower Shop,Fast Food Restaurant,Gym,Donut Shop
5,Kingsview Village,Park,Thai Restaurant,Gastropub,Fried Chicken Joint,French Restaurant,Flower Shop,Fast Food Restaurant,Farmers Market,Donut Shop,Discount Store
6,Markland Wood,Baseball Field,Piano Bar,Golf Course,Park,Thai Restaurant,Construction & Landscaping,Fried Chicken Joint,French Restaurant,Flower Shop,Fast Food Restaurant
7,"Mimico NW,The Queensway West,South of Bloor",American Restaurant,Sandwich Place,Fast Food Restaurant,Bus Stop,Construction & Landscaping,Gastropub,Fried Chicken Joint,French Restaurant,Flower Shop,Farmers Market
8,New Toronto,Mexican Restaurant,Café,Pharmacy,Gym,Dessert Shop,Indian Restaurant,Italian Restaurant,Flower Shop,Park,Fried Chicken Joint
9,Northwest,Grocery Store,Hotel,Bank,Café,Clothing Store,Coffee Shop,Convenience Store,Farmers Market,Gym,American Restaurant


## 4. Cluster Neighborhoods

Run *k*-means to cluster the neighborhood into 5 clusters.

In [52]:
# set number of clusters
kclusters = 5

Etobicoke_grouped_clustering = Etobicoke_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Etobicoke_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 2, 1, 1, 0, 3, 1, 1, 1], dtype=int32)

Creating a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [53]:
Etobicoke_merged = Etobicoke_data

# add clustering labels
Etobicoke_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
Etobicoke_merged = Etobicoke_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

Etobicoke_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M8V,Etobicoke,New Toronto,43.600763,-79.505264,1,Mexican Restaurant,Café,Pharmacy,Gym,Dessert Shop,Indian Restaurant,Italian Restaurant,Flower Shop,Park,Fried Chicken Joint
1,M8W,Etobicoke,"Alderwood,Long Branch",43.601717,-79.545232,1,Pizza Place,Coffee Shop,Gym,Bank,Sandwich Place,Pub,Thai Restaurant,French Restaurant,Flower Shop,Fast Food Restaurant
2,M8X,Etobicoke,The Kingsway,43.647381,-79.511333,2,Coffee Shop,Breakfast Spot,Dessert Shop,Pub,Italian Restaurant,Sushi Restaurant,Bank,Burger Joint,Café,Bakery
3,M8Y,Etobicoke,"Humber Bay,King's Mill Park,Mimico NE,Old Mill...",43.640046,-79.495028,1,Park,Pool,Construction & Landscaping,Thai Restaurant,Fried Chicken Joint,French Restaurant,Flower Shop,Fast Food Restaurant,Farmers Market,Donut Shop
4,M8Z,Etobicoke,"Mimico NW,The Queensway West,South of Bloor",43.603656,-79.493178,1,American Restaurant,Sandwich Place,Fast Food Restaurant,Bus Stop,Construction & Landscaping,Gastropub,Fried Chicken Joint,French Restaurant,Flower Shop,Farmers Market


## Finally, let's visualize the resulting clusters

In [54]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Etobicoke_merged['latitude'], Etobicoke_merged['longitude'], Etobicoke_merged['Neighborhood'], Etobicoke_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters