#### First install Beautiful Soup package, used for performing "web-scraping" operations...

#### (Note that we install the latest Beautiful Soup package, version 4, using LXML parser) 

In [1]:
!conda install -c conda-forge  beautifulsoup4  --yes  

Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
beautifulsoup4            4.6.3                    py35_0    conda-forge


In [2]:
!conda install -c conda-forge  lxml  --yes

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following packages will be UPDATED:

    libxml2: 2.9.4-h6b072ca_5     --> 2.9.8-h422b904_2     conda-forge
    libxslt: 1.1.29-hcf9102b_5    --> 1.1.32-h88dbc4e_2    conda-forge
    lxml:    4.1.0-py35ha401a81_0 --> 4.2.5-py35hc9114bc_0 conda-forge

libxml2-2.9.8- 100% |################################| Time: 0:00:00   2.85 MB/s
libxslt-1.1.32 100% |################################| Time: 0:00:00   8.44 MB/s
lxml-4.2.5-py3 100% |################################| Time: 0:00:00  12.97 MB/s


#### Now import the necessary Python Libraries...

In [3]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

In [4]:
# Create Pandas dataframe to store Toronto neighborhood data
# Only have three columns: PostalCode, Borough, and Neighborhood

# define the dataframe columns
column_names = ['PostalCode', 'Borough', 'Neighborhood']

# instantiate the dataframe
df_neighborhoods = pd.DataFrame(columns=column_names)

# take a look at the empty dataframe, to check that columns are correctly named
df_neighborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood


# 1. Perform Web-Scraping and Explore Resulting Dataset

#### Now use the Python requests library to read the contents of the Wikipedia web site as a string of HTML code

#### This HTML code string will then be parsed using the Beautiful Soup library (with XML parse module)

In [5]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
#print(source)
soup = BeautifulSoup(source, 'lxml')
#print(soup.prettify())


#### We now use the structure of the HTML code to find the postal code, borough and neighborhood data.

#### Note that it is necessary to examine HTML code to see how it should be parsed to find this data...

In [6]:
# now search for the PostalCode, Borough and Neighborhood data in the HTML data
body = soup.find('body')
#print(body.prettify())
table = body.find('table', class_='wikitable sortable')
#print(table.prettify())
table_data = table.tbody.find_all('tr')
# skip first occurrence, as that is just header data
for i in range(1, len(table_data)):
    data = table_data[i].text.split('\n')
    postcode = data[1]
    borough = data[2]
    neighborhood = data[3]

#### Now that we have parsed this data from the web-site HTML code, it will be used to populate the Pandas dataframe.

#### Note that if a borough is not assigned, then data is skipped; if a neighborhood is not assigned, then it gets borough name.

In [7]:
# Now read through this table data, to assign data to dataframe
# Note that need to use dictionary to build up list of neighborhoods for each postal code;
# the dictionary key is the postal code and the dictionary value is list of neighborhoods
neighborhood_dict = {}
borough_dict = {}
for i in range(1, len(table_data)):
    data = table_data[i].text.split('\n')
    postcode = str(data[1])
    borough = str(data[2])
    neighborhood = str(data[3])
    if borough == 'Not assigned':
        continue
    elif  neighborhood == 'Not assigned':
        neighborhood = borough
    if not(postcode in neighborhood_dict.keys()):
        neighborhood_dict[postcode] = []
    if not(neighborhood in neighborhood_dict[postcode]):
        neighborhood_dict[postcode].append(neighborhood)
    if not(postcode in borough_dict.keys()):
        borough_dict[postcode] = ""
    if len(borough) > 0:
        borough_dict[postcode] = borough   

#### Now add this information to the pandas dataframe, converting list of neighborood names into comma-separated strings

In [8]:
# now add this data to the dataframe
key_list = list(neighborhood_dict.keys())
data_list = []
for i in range(len(key_list)):
    data_dict = {}
    data_dict['PostalCode'] = key_list[i]
    data_dict['Borough'] = borough_dict[key_list[i]]
    # need to convert list of strings into a single comma-separated string
    if len(neighborhood_dict[key_list[i]]) > 1:
       data_dict['Neighborhood'] = ", ".join(neighborhood_dict[key_list[i]]) 
    else:     
       data_dict['Neighborhood'] = neighborhood_dict[key_list[i]][0]
    data_list.append(data_dict)
    
df_neighborhoods = pd.DataFrame(data_list)
df_neighborhoods = df_neighborhoods[['PostalCode', 'Borough', 'Neighborhood']]
df_neighborhoods.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
1,M4X,Downtown Toronto,"Cabbagetown, St. James Town"
2,M9C,Etobicoke,"Bloordale Gardens, Eringate, Markland Wood, Ol..."
3,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi..."
4,M3J,North York,"Northwood Park, York University"
5,M8Z,Etobicoke,"Kingsway Park South West, Mimico NW, The Queen..."
6,M8V,Etobicoke,"Humber Bay Shores, Mimico South, New Toronto"
7,M5W,Downtown Toronto,Stn A PO Boxes 25 The Esplanade
8,M5E,Downtown Toronto,Berczy Park
9,M6P,West Toronto,"High Park, The Junction South"


#### Finally, display the number of rows in our pandas dataframe...

In [9]:
# let's find the number of rows in our pandas dataframe
print("The number of rows in our pandas dataframe is:", df_neighborhoods.shape[0])

The number of rows in our pandas dataframe is: 103


#### Now let's add Latitude and Longitude columns into the pandas dataframe...

In [10]:
df_neighborhoods.insert(3, 'Latitude', '')
df_neighborhoods.insert(4, 'Longitude', '')
df_neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",,
1,M4X,Downtown Toronto,"Cabbagetown, St. James Town",,
2,M9C,Etobicoke,"Bloordale Gardens, Eringate, Markland Wood, Ol...",,
3,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",,
4,M3J,North York,"Northwood Park, York University",,


#### Note that we are forced to use CSV file containing Latitude and Longitude values for Toronto area

#### Although much time was spent working with Geocoder, it could not provide this data for us... 

In [11]:
# read in latitude and longitude values from CSV file
!wget  -q -O 'Toronto_Lat_Long.csv'  https://cocl.us/Geospatial_data
print("Geospatial Data Successfully downloaded...")


Geospatial Data Successfully downloaded...


#### Now read the Latitude/Longitude data from the CSV file into our Pandas dataframe...

In [12]:
import csv

df_neighborhoods.set_index('PostalCode', inplace=True)

with open('Toronto_Lat_Long.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter = ',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            # skip first row, since it is just header information
            pass
        else:
            postalCode = str(row[0])
            df_neighborhoods.loc[[postalCode], ['Latitude']]  = float(row[1])
            df_neighborhoods.loc[[postalCode], ['Longitude']] = float(row[2])
        line_count = line_count + 1
csv_file.close()
            
        

#### Look at the first few rows of our dataframe, to check that the Latitude/Longitude data is available...

In [13]:
df_neighborhoods.head(10)

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.7279,-79.262
M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.668,-79.3677
M9C,Etobicoke,"Bloordale Gardens, Eringate, Markland Wood, Ol...",43.6435,-79.5772
M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.6864,-79.4
M3J,North York,"Northwood Park, York University",43.768,-79.4873
M8Z,Etobicoke,"Kingsway Park South West, Mimico NW, The Queen...",43.6288,-79.521
M8V,Etobicoke,"Humber Bay Shores, Mimico South, New Toronto",43.6056,-79.5013
M5W,Downtown Toronto,Stn A PO Boxes 25 The Esplanade,43.6464,-79.3748
M5E,Downtown Toronto,Berczy Park,43.6448,-79.3733
M6P,West Toronto,"High Park, The Junction South",43.6616,-79.4648


In [14]:
# check number of boroughs and neighborhoods in dataframe
print('The dataframe has {} Boroughs and {} Neighborhoods'.format(
       len(df_neighborhoods['Borough'].unique()), df_neighborhoods['Neighborhood'].shape[0]))


The dataframe has 11 Boroughs and 103 Neighborhoods


In [15]:
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
geopy                     1.17.0                     py_0    conda-forge
Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
folium                    0.5.0                      py_0    conda-forge
Libraries imported.


In [16]:
address = 'Toronto, Canada'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of Toronto are {}, {}.'.format(latitude, longitude))



The geographical coordinates of Toronto are 43.653963, -79.387207.


#### Create a map of Toronto with neighborhoods superimposed on top.

In [17]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_neighborhoods['Latitude'],df_neighborhoods['Longitude'],df_neighborhoods['Borough'], df_neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

**Folium** is a great visualization library. Feel free to zoom into the above map, and click on each circle mark to reveal the name of the neighborhood and its respective borough.

However, for illustration purposes, let's simplify the above map and segment and cluster only the Toronto neighborhoods. So let's slice the original dataframe and create a new dataframe of the Toronto data.

In [18]:
toronto_data = df_neighborhoods[df_neighborhoods['Borough'].str.find('Toronto') > 0].reset_index(drop=True)
toronto_data.head(20)

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Downtown Toronto,"Cabbagetown, St. James Town",43.668,-79.3677
1,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.6864,-79.4
2,Downtown Toronto,Stn A PO Boxes 25 The Esplanade,43.6464,-79.3748
3,Downtown Toronto,Berczy Park,43.6448,-79.3733
4,West Toronto,"High Park, The Junction South",43.6616,-79.4648
5,Downtown Toronto,St. James Town,43.6515,-79.3754
6,Central Toronto,North Toronto West,43.7154,-79.4057
7,West Toronto,"Parkdale, Roncesvalles",43.649,-79.4563
8,Central Toronto,Lawrence Park,43.728,-79.3888
9,Downtown Toronto,"Harbord, University of Toronto",43.6627,-79.4


In [19]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Next, we are going to start utilizing the Foursquare API to explore the neighborhoods and segment them.

#### Define FourSquare Credentials and Version

In [20]:
CLIENT_ID = '1LY5CN30M1GCZLENIORYFQNEUNRGCCXHI4AEWLWJR5UBQXEN' # your Foursquare ID
CLIENT_SECRET = 'DLSGIUEZUXKSQBNNLOGGL1JIUUFGGYKDLYQSW5XFB5XCSNJN' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentials:
CLIENT_ID: 1LY5CN30M1GCZLENIORYFQNEUNRGCCXHI4AEWLWJR5UBQXEN
CLIENT_SECRET:DLSGIUEZUXKSQBNNLOGGL1JIUUFGGYKDLYQSW5XFB5XCSNJN


#### Let's explore the neighborhood in our dataframe around the University of Toronto, as that may have some interesting venues to be explored...

In [22]:
toronto_data.loc[9]

Borough                       Downtown Toronto
Neighborhood    Harbord, University of Toronto
Latitude                               43.6627
Longitude                                -79.4
Name: 9, dtype: object

Get the neighborhood's latitude and longitude values.

In [24]:
neighborhood_latitude = toronto_data.loc[9, 'Latitude']   # neighborhood latitude value
neighborhood_longitude = toronto_data.loc[9, 'Longitude'] # neighborhood longitude value

neighborhood_name = toronto_data.loc[9, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Harbord, University of Toronto are 43.6626956, -79.4000493.


#### Now, let's get the top 100 venues that are in Marble Hill within a radius of 500 meters.

First, let's create the GET request URL. Name your URL **url**.

In [25]:
# type your answer here
LIMIT=100
radius = 500
url= 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    neighborhood_latitude,
    neighborhood_longitude,
    radius,
    LIMIT)
# display URL
url

'https://api.foursquare.com/v2/venues/explore?&client_id=1LY5CN30M1GCZLENIORYFQNEUNRGCCXHI4AEWLWJR5UBQXEN&client_secret=DLSGIUEZUXKSQBNNLOGGL1JIUUFGGYKDLYQSW5XFB5XCSNJN&v=20180605&ll=43.6626956,-79.4000493&radius=500&limit=100'

In [None]:
Send the GET request and examine the results

In [26]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5bf4a283db04f52cb8d31623'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-5362c366498e602fbe1db395-0',
      'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/japanese_',
          'suffix': '.png'},
         'id': '4bf58dd8d48988d111941735',
         'name': 'Japanese Restaurant',
         'pluralName': 'Japanese Restaurants',
         'primary': True,
         'shortName': 'Japanese'}],
       'id': '5362c366498e602fbe1db395',
       'location': {'address': '81 Harbord St.',
        'cc': 'CA',
        'city': 'Toronto',
        'country': 'Canada',
        'distance': 255,
        'formattedAddress': ['81 Harbord St.', 'Toronto ON M5S 1G4', 'Canada'],
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.66283719650635,
  

From the Foursquare lab in the previous module, we know that all the information is in the *items* key. Before we proceed, let's borrow the **get_category_type** function from the Foursquare lab.

In [27]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Now we are ready to clean the json and structure it into a *pandas* dataframe.

In [28]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head(10)

Unnamed: 0,name,categories,lat,lng
0,Yasu,Japanese Restaurant,43.662837,-79.403217
1,Rasa,Restaurant,43.662757,-79.403988
2,Piano Piano,Italian Restaurant,43.662949,-79.402898
3,Cafe Cancan,French Restaurant,43.662735,-79.403447
4,Almond Butterfly,Bakery,43.662836,-79.403365
5,Athletic Centre,College Gym,43.662487,-79.400657
6,Her Father's Cider Bar + Kitchen,Beer Bar,43.662448,-79.404703
7,Bakka Phoenix Books,Bookstore,43.662959,-79.402601
8,Harbord House,Bar,43.662466,-79.40541
9,Akai Sushi,Sushi Restaurant,43.66247,-79.404946


And how many venues were returned by Foursquare?

In [29]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

33 venues were returned by Foursquare.


## 2. Explore Neighborhoods in Toronto

#### Let's create a function to repeat the same process to all the neighborhoods in Toronto

In [30]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Now write the code to run the above function on each neighborhood and create a new dataframe called *toronto_venues*.

In [31]:
toronto_venues = getNearbyVenues(names = toronto_data['Neighborhood'],
                                  latitudes = toronto_data['Latitude'],
                                  longitudes = toronto_data['Longitude'])

Cabbagetown, St. James Town
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
Stn A PO Boxes 25 The Esplanade
Berczy Park
High Park, The Junction South
St. James Town
North Toronto West
Parkdale, Roncesvalles
Lawrence Park
Harbord, University of Toronto
The Beaches
Roselawn
The Beaches West, India Bazaar
First Canadian Place, Underground city
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Davisville North
Rosedale
Runnymede, Swansea
Central Bay Street
Studio District
Harbourfront East, Toronto Islands, Union Station
Brockton, Exhibition Place, Parkdale Village
The Danforth West, Riverdale
Church and Wellesley
Forest Hill North, Forest Hill West
Business reply mail Processing Centre969 Eastern
Little Portugal, Trinity
Design Exchange, Toronto Dominion Centre
Dovercourt Village, Dufferin
Adelaide, King, Richmond
Ryerson, Garden District
Chinatown, Grange Park, Kensington Market
Davisville
Harbourfront, Regent Par

#### Let's check the size of the resulting dataframe

In [32]:
print(toronto_venues.shape)
toronto_venues.head()

(1710, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Cabbagetown, St. James Town",43.667967,-79.367675,Cranberries,43.667843,-79.369407,Diner
1,"Cabbagetown, St. James Town",43.667967,-79.367675,F'Amelia,43.667536,-79.368613,Italian Restaurant
2,"Cabbagetown, St. James Town",43.667967,-79.367675,Butter Chicken Factory,43.667072,-79.369184,Indian Restaurant
3,"Cabbagetown, St. James Town",43.667967,-79.367675,Kingyo Toronto,43.665895,-79.368415,Japanese Restaurant
4,"Cabbagetown, St. James Town",43.667967,-79.367675,Rashnaa Restaurant,43.668183,-79.369066,Indian Restaurant


Let's check how many venues were returned for each neighborhood

In [33]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Berczy Park,56,56,56,56,56,56
"Brockton, Exhibition Place, Parkdale Village",20,20,20,20,20,20
Business reply mail Processing Centre969 Eastern,19,19,19,19,19,19
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",14,14,14,14,14,14
"Cabbagetown, St. James Town",46,46,46,46,46,46
Central Bay Street,87,87,87,87,87,87
"Chinatown, Grange Park, Kensington Market",100,100,100,100,100,100
Christie,15,15,15,15,15,15
Church and Wellesley,89,89,89,89,89,89


#### Let's find out how many unique categories can be curated from all the returned venues

In [34]:
print('There are {} unique categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 239 unique categories.


## 3. Analyze Each Neighborhood

In [35]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


And let's examine the new dataframe size.

In [36]:
toronto_onehot.shape

(1710, 239)

#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [37]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,"Adelaide, King, Richmond",0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.01
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business reply mail Processing Centre969 Eastern,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.0,0.0,0.071429,0.071429,0.071429,0.142857,0.142857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Central Bay Street,0.011494,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.011494,0.0,0.0,0.0,0.011494,0.0,0.0,0.0
7,"Chinatown, Grange Park, Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.05,0.0,0.04,0.0,0.01,0.0,0.0,0.0
8,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Church and Wellesley,0.011236,0.0,0.011236,0.011236,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.011236,0.011236,0.011236,0.0,0.0,0.011236,0.011236,0.0


#### Let's confirm the new size

In [38]:
toronto_grouped.shape

(38, 239)

#### Let's print each neighborhood along with the top 5 most common venues

In [39]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
                 venue  freq
0          Coffee Shop  0.07
1                 Café  0.05
2           Steakhouse  0.04
3  American Restaurant  0.04
4      Thai Restaurant  0.04


----Berczy Park----
            venue  freq
0     Coffee Shop  0.09
1      Restaurant  0.05
2    Cocktail Bar  0.05
3            Café  0.04
4  Farmers Market  0.04


----Brockton, Exhibition Place, Parkdale Village----
                   venue  freq
0            Coffee Shop  0.15
1                   Café  0.10
2         Breakfast Spot  0.10
3          Burrito Place  0.05
4  Performing Arts Venue  0.05


----Business reply mail Processing Centre969 Eastern----
                venue  freq
0  Light Rail Station  0.11
1         Yoga Studio  0.05
2             Brewery  0.05
3       Auto Workshop  0.05
4                 Spa  0.05


----CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara----
              venue  freq
0    Airport Lou

#### Let's put that into a *pandas* dataframe

First, let's write a function to sort the venues in descending order.

In [40]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [41]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Steakhouse,American Restaurant,Thai Restaurant,Breakfast Spot,Hotel,Gym,Cosmetics Shop,Clothing Store
1,Berczy Park,Coffee Shop,Restaurant,Cocktail Bar,Pub,Cheese Shop,Steakhouse,Seafood Restaurant,Farmers Market,Café,Bakery
2,"Brockton, Exhibition Place, Parkdale Village",Coffee Shop,Café,Breakfast Spot,Gym / Fitness Center,Stadium,Gym,Furniture / Home Store,Italian Restaurant,Convenience Store,Performing Arts Venue
3,Business reply mail Processing Centre969 Eastern,Light Rail Station,Auto Workshop,Comic Shop,Pizza Place,Butcher,Recording Studio,Restaurant,Burrito Place,Brewery,Skate Park
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Lounge,Airport Service,Airport Terminal,Boat or Ferry,Harbor / Marina,Airport,Airport Food Court,Airport Gate,Boutique,Plane
5,"Cabbagetown, St. James Town",Coffee Shop,Restaurant,Park,Bakery,Pizza Place,Italian Restaurant,Chinese Restaurant,Café,Pub,Indian Restaurant
6,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Bar,Bubble Tea Shop,Burger Joint,Japanese Restaurant,Ice Cream Shop,Sandwich Place,Sushi Restaurant
7,"Chinatown, Grange Park, Kensington Market",Café,Bar,Vegetarian / Vegan Restaurant,Chinese Restaurant,Bakery,Coffee Shop,Mexican Restaurant,Vietnamese Restaurant,Dumpling Restaurant,Burger Joint
8,Christie,Grocery Store,Café,Park,Coffee Shop,Nightclub,Baby Store,Restaurant,Diner,Italian Restaurant,Convenience Store
9,Church and Wellesley,Japanese Restaurant,Coffee Shop,Gay Bar,Sushi Restaurant,Burger Joint,Restaurant,Gastropub,Fast Food Restaurant,Pub,Nightclub


## 4. Cluster Neighborhoods

Run *k*-means to cluster the neighborhood into 5 clusters.

In [42]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 2, 1, 2, 2, 2, 2, 2, 2], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [43]:
toronto_merged = toronto_data

# add clustering labels
toronto_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,"Cabbagetown, St. James Town",43.668,-79.3677,2,Coffee Shop,Restaurant,Park,Bakery,Pizza Place,Italian Restaurant,Chinese Restaurant,Café,Pub,Indian Restaurant
1,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.6864,-79.4,2,Pub,Coffee Shop,Light Rail Station,Convenience Store,Bagel Shop,Sports Bar,Fried Chicken Joint,American Restaurant,Supermarket,Sushi Restaurant
2,Downtown Toronto,Stn A PO Boxes 25 The Esplanade,43.6464,-79.3748,2,Coffee Shop,Café,Restaurant,Hotel,Cocktail Bar,Pub,Seafood Restaurant,Art Gallery,Creperie,Cheese Shop
3,Downtown Toronto,Berczy Park,43.6448,-79.3733,1,Coffee Shop,Restaurant,Cocktail Bar,Pub,Cheese Shop,Steakhouse,Seafood Restaurant,Farmers Market,Café,Bakery
4,West Toronto,"High Park, The Junction South",43.6616,-79.4648,2,Café,Mexican Restaurant,Park,Antique Shop,Flea Market,Bar,Cajun / Creole Restaurant,Arts & Crafts Store,Speakeasy,Fried Chicken Joint


Finally, let's visualize the resulting clusters

In [44]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 5. Examine Clusters

Now, you can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, you can then assign a name to each cluster. I will leave this exercise to you.

#### Cluster 1

In [45]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
24,"Forest Hill North, Forest Hill West",Jewelry Store,Trail,Mexican Restaurant,Sushi Restaurant,Women's Store,Design Studio,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop
27,"Design Exchange, Toronto Dominion Centre",Coffee Shop,Café,Hotel,Restaurant,American Restaurant,Italian Restaurant,Gastropub,Sports Bar,Gym,Deli / Bodega


#### Cluster 2

In [46]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Berczy Park,Coffee Shop,Restaurant,Cocktail Bar,Pub,Cheese Shop,Steakhouse,Seafood Restaurant,Farmers Market,Café,Bakery
12,"The Beaches West, India Bazaar",Park,Sandwich Place,Ice Cream Shop,Steakhouse,Sushi Restaurant,Italian Restaurant,Fast Food Restaurant,Food & Drink Shop,Pub,Burrito Place
22,"The Danforth West, Riverdale",Greek Restaurant,Ice Cream Shop,Coffee Shop,Italian Restaurant,Bookstore,Yoga Studio,Indian Restaurant,Bakery,Pub,Dessert Shop
36,Christie,Grocery Store,Café,Park,Coffee Shop,Nightclub,Baby Store,Restaurant,Diner,Italian Restaurant,Convenience Store


#### Cluster 3

In [47]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Cabbagetown, St. James Town",Coffee Shop,Restaurant,Park,Bakery,Pizza Place,Italian Restaurant,Chinese Restaurant,Café,Pub,Indian Restaurant
1,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",Pub,Coffee Shop,Light Rail Station,Convenience Store,Bagel Shop,Sports Bar,Fried Chicken Joint,American Restaurant,Supermarket,Sushi Restaurant
2,Stn A PO Boxes 25 The Esplanade,Coffee Shop,Café,Restaurant,Hotel,Cocktail Bar,Pub,Seafood Restaurant,Art Gallery,Creperie,Cheese Shop
4,"High Park, The Junction South",Café,Mexican Restaurant,Park,Antique Shop,Flea Market,Bar,Cajun / Creole Restaurant,Arts & Crafts Store,Speakeasy,Fried Chicken Joint
5,St. James Town,Coffee Shop,Café,Restaurant,Hotel,Gastropub,Cocktail Bar,Bakery,Clothing Store,Cosmetics Shop,Beer Bar
6,North Toronto West,Sporting Goods Shop,Clothing Store,Coffee Shop,Gym / Fitness Center,Chinese Restaurant,Dessert Shop,Diner,Fast Food Restaurant,Mexican Restaurant,Park
7,"Parkdale, Roncesvalles",Breakfast Spot,Gift Shop,Coffee Shop,Cuban Restaurant,Bookstore,Bar,Dog Run,Burger Joint,Italian Restaurant,Dessert Shop
8,Lawrence Park,Bus Line,Lake,Park,Dim Sum Restaurant,Swim School,Deli / Bodega,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant
9,"Harbord, University of Toronto",Café,Restaurant,Bar,Japanese Restaurant,Bookstore,Bakery,Coffee Shop,Chinese Restaurant,Poutine Place,Pub
10,The Beaches,Gym / Fitness Center,Pub,Trail,Coffee Shop,Comfort Food Restaurant,Department Store,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop


#### Cluster 4

In [48]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
17,"Runnymede, Swansea",Coffee Shop,Pizza Place,Café,Italian Restaurant,Sushi Restaurant,Bookstore,Latin American Restaurant,Indie Movie Theater,Falafel Restaurant,Bar


#### Cluster 5

In [49]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
28,"Dovercourt Village, Dufferin",Supermarket,Pharmacy,Discount Store,Bakery,Gym / Fitness Center,Music Venue,Café,Bus Stop,Middle Eastern Restaurant,Brewery


### This lab is now completed!