## STEP 1- Scraping WikiPedia and Creating Data Frame

In [1]:
# install and import libraries

!pip install BS4
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
print('DONE!')

Collecting BS4
  Downloading https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz
Collecting beautifulsoup4 (from BS4)
[?25l  Downloading https://files.pythonhosted.org/packages/66/25/ff030e2437265616a1e9b25ccc864e0371a0bc3adb7c5a404fd661c6f4f6/beautifulsoup4-4.9.1-py3-none-any.whl (115kB)
[K     |████████████████████████████████| 122kB 6.4MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2 (from beautifulsoup4->BS4)
  Downloading https://files.pythonhosted.org/packages/6f/8f/457f4a5390eeae1cc3aeab89deb7724c965be841ffca6cfca9197482e470/soupsieve-2.0.1-py3-none-any.whl
Building wheels for collected packages: BS4
  Building wheel for BS4 (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/jupyterlab/.cache/pip/wheels/a0/b0/b2/4f80b9456b87abedbc0bf2d52235414c3467d8889be38dd472
Successfully built BS4
Installing collected packages: soupsieve, beautifulsoup4, BS4
Successfully installed BS4-0.0.1 beautifulsoup4-4.9.

### Step 1.A- Getting the data and preprocessing 

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
source = requests.get(url)
soup = BeautifulSoup(source.text, 'html')

#using soup object, iterate the wikitable to get the data from the HTML page and store it into a list
data = []
columns = []
table = soup.find(class_='wikitable')
for index, tr in enumerate(table.find_all('tr')):
    section = []
    for td in tr.find_all(['th','td']):
        section.append(td.text.rstrip())
    
    #First row of data is the header
    if (index == 0):
        columns = section
    else:
        data.append(section)

#convert list into Pandas DataFrame
Toronto_df = pd.DataFrame(data = data,columns = columns)
Toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [3]:
Toronto_df.shape

(180, 3)

### Step 1.B. Cleaning the Data

In [4]:
# Removing rows w/o any borough assigned!
Toronto_df = Toronto_df[Toronto_df['Borough'] != 'Not assigned']
Toronto_df.reset_index(inplace=True)
Toronto_df.drop('index', axis=1, inplace=True)
print('This table has {} rows and {} columns.'.format(Toronto_df.shape[0], Toronto_df.shape[1]))

This table has 103 rows and 3 columns.


In [5]:
Toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## STEP 2- Getting the Coordinates for each Neighborhood

I used pgeocode package to get the coordinates since the geocoder did nor work for me!

In [6]:
# instal and import pgeocode
!pip install pgeocode
import pgeocode

# getting the coordinates in a new data frame
pcode = Toronto_df['Postal Code'].tolist()
nomi = pgeocode.Nominatim('ca')
New_df = nomi.query_postal_code(pcode)
New_df

Collecting pgeocode
  Downloading https://files.pythonhosted.org/packages/86/44/519e3db3db84acdeb29e24f2e65991960f13464279b61bde5e9e96909c9d/pgeocode-0.2.1-py2.py3-none-any.whl
Installing collected packages: pgeocode
Successfully installed pgeocode-0.2.1


Unnamed: 0,postal_code,country code,place_name,state_name,state_code,county_name,county_code,community_name,community_code,latitude,longitude,accuracy
0,M3A,CA,North York (York Heights / Victoria Village / ...,Ontario,ON,North York,,,,43.7545,-79.3300,1.0
1,M4A,CA,North York (Sweeney Park / Wigmore Park),Ontario,ON,,,,,43.7276,-79.3148,6.0
2,M5A,CA,Downtown Toronto (Regent Park / Port of Toronto),Ontario,ON,Toronto,8133394.0,,,43.6555,-79.3626,6.0
3,M6A,CA,North York (Lawrence Manor / Lawrence Heights),Ontario,ON,North York,,,,43.7223,-79.4504,6.0
4,M7A,CA,Queen's Park Ontario Provincial Government,Ontario,ON,,,,,43.6641,-79.3889,
...,...,...,...,...,...,...,...,...,...,...,...,...
98,M8X,CA,Etobicoke (The Kingsway / Montgomery Road / Ol...,Ontario,ON,Etobicoke,,,,43.6518,-79.5076,6.0
99,M4Y,CA,Downtown Toronto (Church and Wellesley),Ontario,ON,Toronto,8133394.0,,,43.6656,-79.3830,6.0
100,M7Y,CA,East Toronto Business Reply Mail Processing Ce...,Ontario,ON,Toronto,8133394.0,,,43.7804,-79.2505,
101,M8Y,CA,Etobicoke (Old Mill South / King's Mill Park /...,Ontario,ON,Etobicoke,,,,43.6325,-79.4939,6.0


In [8]:
# adding coordinate from New_df to the Toronto_df 
Toronto_df['Latitude'] = New_df['latitude']
Toronto_df['Longitude'] = New_df['longitude']
print (Toronto_df.shape)
Toronto_df.head()

(103, 5)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889


pgeocode did not get the coordinates for two of the Postal Codes correctly!

In [9]:
#coorecting the coordinates for rows 76 and 100. 
Toronto_df.loc[76,['Latitude', 'Longitude']] = 43.6364,-79.6157
Toronto_df.loc[100,['Latitude', 'Longitude']] = 43.6645,-79.3214


## STEP 3- Clustering Neighborhood

In [12]:
import json # library to handle JSON files

!pip install geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values


# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Collecting geopy
[?25l  Downloading https://files.pythonhosted.org/packages/ab/97/25def417bf5db4cc6b89b47a56961b893d4ee4fec0c335f5b9476a8ff153/geopy-1.22.0-py2.py3-none-any.whl (113kB)
[K     |████████████████████████████████| 122kB 7.0MB/s eta 0:00:01
[?25hCollecting geographiclib<2,>=1.49 (from geopy)
  Downloading https://files.pythonhosted.org/packages/8b/62/26ec95a98ba64299163199e95ad1b0e34ad3f4e176e221c40245f211e425/geographiclib-1.50-py3-none-any.whl
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.50 geopy-1.22.0
Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - folium=0.5.0


The following packages will be downloaded:

    package                    |      

In [13]:
# Get coordinates of Toronto 
address = 'Toronto, ON, CA'

geolocator = Nominatim(user_agent="TR_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))


The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


In [14]:
#making a list of boroughs in Toronto to be used in visualization 
boroughs = list(set(Toronto_df['Borough']))
boroughs

['Etobicoke',
 'Scarborough',
 'East York',
 'Downtown Toronto',
 'West Toronto',
 'East Toronto',
 'North York',
 'Mississauga',
 'York',
 'Central Toronto']

In [15]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)
colors= ['#0000FF','#FF4040', '#66CD00', '#00CDCD', '#CAFF70', '#9932CC', '#EEC900', '#808080', '#FF69B4', '#FFFFE0']

# add markers to map
for lat, lng, borough, neighborhood in zip(Toronto_df['Latitude'], Toronto_df['Longitude'], Toronto_df['Borough'], Toronto_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    
    for i in range(0, len(boroughs)) : 
        if boroughs[i] == borough : 
            c = colors[i]
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color= c,
        fill=True,
        fill_color= c,
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)
    
    
map_toronto

### Let's go with North York and cluster its neighborhood

In [16]:
nyork_data = Toronto_df[Toronto_df['Borough'] == 'North York'].reset_index(drop=True)
nyork_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
3,M3B,North York,Don Mills,43.745,-79.359
4,M6B,North York,Glencairn,43.7081,-79.4479


In [17]:
# Getting coordinates of North York
address = 'North York, ON, CA'

geolocator = Nominatim(user_agent="nyork_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of North York are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of North York are 43.7543263, -79.44911696639593.


In [18]:
# create map of North York using latitude and longitude values
map_nyork = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, label in zip(nyork_data['Latitude'], nyork_data['Longitude'], nyork_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.7,
        parse_html=False).add_to(map_nyork)  
    
map_nyork

In [19]:
# defining foursquare credentials
CLIENT_ID = 'J0GPZTC4NA5Q12YWL101RVQRFYYTJWP10W1BZEH020ZUENMS'
CLIENT_SECRET = 'Q1ZMN0FASRPSBW4BGPXDTUJRWHMVRLHN2O1DIBUBRGY4JYCW' 
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: J0GPZTC4NA5Q12YWL101RVQRFYYTJWP10W1BZEH020ZUENMS
CLIENT_SECRET:Q1ZMN0FASRPSBW4BGPXDTUJRWHMVRLHN2O1DIBUBRGY4JYCW


In [20]:
#Getting the name of first neighboorhood
nyork_data.loc[0, 'Neighborhood']

'Parkwoods'

In [21]:
neighborhood_latitude = nyork_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = nyork_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = nyork_data.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Parkwoods are 43.7545, -79.33.


#### Now, let's get the top 100 venues that are in Parkwoods within a radius of 500 meters.

In [22]:
#First, let's create the GET request URL. 
LIMIT = 100 
radius = 500
Latitude = neighborhood_latitude
Longitude = neighborhood_longitude

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

url

'https://api.foursquare.com/v2/venues/explore?&client_id=J0GPZTC4NA5Q12YWL101RVQRFYYTJWP10W1BZEH020ZUENMS&client_secret=Q1ZMN0FASRPSBW4BGPXDTUJRWHMVRLHN2O1DIBUBRGY4JYCW&v=20180605&ll=43.7545,-79.33&radius=500&limit=100'

In [23]:
#Send the GET request and examine the resutls
results = requests.get(url).json()
results


{'meta': {'code': 200, 'requestId': '5ee3dad2c6fd9436f69ef672'},
  'headerLocation': 'Parkwoods - Donalda',
  'headerFullLocation': 'Parkwoods - Donalda, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 3,
  'suggestedBounds': {'ne': {'lat': 43.7590000045, 'lng': -79.32378161085641},
   'sw': {'lat': 43.7499999955, 'lng': -79.33621838914358}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4e8d9dcdd5fbbbb6b3003c7b',
       'name': 'Brookbanks Park',
       'location': {'address': 'Toronto',
        'lat': 43.751976046055574,
        'lng': -79.33214044722958,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.751976046055574,
          'lng': -79.33214044722958}],
        'distance': 329,
        'cc': 'CA',
        'city': 'To

In [24]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [25]:
#clean the json and structure it into a *pandas* dataframe.

venues = results['response']['groups'][0]['items']
    
nearby_venues = pd.json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Brookbanks Park,Park,43.751976,-79.33214
1,GTA Restoration,Fireworks Store,43.753396,-79.333477
2,Variety Store,Food & Drink Shop,43.751974,-79.333114


#### Let's create a function to repeat the same process to all the neighborhoods in North York

In [26]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [27]:
#run the function
nyork_venues = getNearbyVenues(names=nyork_data['Neighborhood'],
                                   latitudes=nyork_data['Latitude'],
                                   longitudes=nyork_data['Longitude'])

Parkwoods
Victoria Village
Lawrence Manor, Lawrence Heights
Don Mills
Glencairn
Don Mills
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Fairview, Henry Farm, Oriole
Northwood Park, York University
Bayview Village
Downsview
York Mills, Silver Hills
Downsview
North Park, Maple Leaf Park, Upwood Park
Humber Summit
Willowdale, Newtonbrook
Downsview
Bedford Park, Lawrence Manor East
Humberlea, Emery
Willowdale, Willowdale East
Downsview
York Mills West
Willowdale, Willowdale West


#### Let's check the size of the resulting dataframe

In [28]:
print(nyork_venues.shape)
nyork_venues.head()

(319, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.7545,-79.33,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.7545,-79.33,GTA Restoration,43.753396,-79.333477,Fireworks Store
2,Parkwoods,43.7545,-79.33,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,Victoria Village,43.7276,-79.3148,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.7276,-79.3148,Tim Hortons,43.725517,-79.313103,Coffee Shop


In [29]:
#check to see how many venues were returned for each neighborhood
nyork_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Bathurst Manor, Wilson Heights, Downsview North",8,8,8,8,8,8
Bayview Village,3,3,3,3,3,3
"Bedford Park, Lawrence Manor East",24,24,24,24,24,24
Don Mills,9,9,9,9,9,9
Downsview,28,28,28,28,28,28
"Fairview, Henry Farm, Oriole",61,61,61,61,61,61
Glencairn,14,14,14,14,14,14
Hillcrest Village,2,2,2,2,2,2
Humber Summit,2,2,2,2,2,2
"Humberlea, Emery",6,6,6,6,6,6


In [30]:
# Let's find out how many unique categories can be curated from all the returned venues
print('There are {} uniques categories.'.format(len(nyork_venues['Venue Category'].unique())))

There are 115 uniques categories.


###  Analyze Each Neighborhood

In [31]:
# one hot encoding
nyork_onehot = pd.get_dummies(nyork_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
nyork_onehot['Neighborhood'] = nyork_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [nyork_onehot.columns[-1]] + list(nyork_onehot.columns[:-1])
nyork_onehot = nyork_onehot[fixed_columns]

nyork_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Bakery,Bank,Bar,Baseball Field,...,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Trail,Video Game Store,Vietnamese Restaurant,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [32]:
nyork_grouped = nyork_onehot.groupby('Neighborhood').mean().reset_index()
nyork_grouped

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Bakery,Bank,Bar,Baseball Field,...,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Trail,Video Game Store,Vietnamese Restaurant,Women's Store,Yoga Studio
0,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0
2,"Bedford Park, Lawrence Manor East",0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,...,0.041667,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Don Mills,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111
4,Downsview,0.0,0.035714,0.0,0.0,0.0,0.0,0.0,0.0,0.035714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0
5,"Fairview, Henry Farm, Oriole",0.0,0.0,0.016393,0.0,0.016393,0.016393,0.032787,0.016393,0.032787,...,0.0,0.016393,0.0,0.016393,0.016393,0.0,0.016393,0.0,0.032787,0.0
6,Glencairn,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,...,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Hillcrest Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Humber Summit,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Humberlea, Emery",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Let's print each neighborhood along with the top 5 most common venues

In [33]:
num_top_venues = 5

for hood in nyork_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = nyork_grouped[nyork_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Bathurst Manor, Wilson Heights, Downsview North----
                       venue  freq
0                        Spa  0.12
1              Grocery Store  0.12
2                Pizza Place  0.12
3                Coffee Shop  0.12
4  Middle Eastern Restaurant  0.12


----Bayview Village----
               venue  freq
0        Gas Station  0.33
1               Park  0.33
2              Trail  0.33
3  Accessories Store  0.00
4          Pet Store  0.00


----Bedford Park, Lawrence Manor East----
                venue  freq
0         Coffee Shop  0.08
1  Italian Restaurant  0.08
2          Restaurant  0.08
3      Sandwich Place  0.08
4             Butcher  0.04


----Don Mills----
                        venue  freq
0                        Park  0.22
1                Home Service  0.22
2                 Yoga Studio  0.11
3      Furniture / Home Store  0.11
4  Construction & Landscaping  0.11


----Downsview----
                   venue  freq
0            Pizza Place  0.07
1  Vietnamese Re

#### Let's put that into a *pandas* dataframe

In [34]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [35]:
#Now let's create the new dataframe and display the top 10 venues for each neighborhood.
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = nyork_grouped['Neighborhood']

for ind in np.arange(nyork_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(nyork_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor, Wilson Heights, Downsview North",Pizza Place,Deli / Bodega,Coffee Shop,Grocery Store,Middle Eastern Restaurant,Fried Chicken Joint,Mediterranean Restaurant,Spa,Fish Market,Department Store
1,Bayview Village,Park,Trail,Gas Station,Yoga Studio,Food Court,Deli / Bodega,Department Store,Dessert Shop,Discount Store,Distribution Center
2,"Bedford Park, Lawrence Manor East",Italian Restaurant,Coffee Shop,Sandwich Place,Restaurant,Grocery Store,Café,Pizza Place,Butcher,Pharmacy,Hobby Shop
3,Don Mills,Home Service,Park,Yoga Studio,Pool,Gym,Furniture / Home Store,Construction & Landscaping,Bank,Fried Chicken Joint,Dessert Shop
4,Downsview,Shopping Mall,Vietnamese Restaurant,Coffee Shop,Discount Store,Pizza Place,Grocery Store,Pharmacy,Home Service,Caribbean Restaurant,Fast Food Restaurant


## 4. Cluster Neighborhoods

In [36]:
# set number of clusters
kclusters = 5

nyork_grouped_clustering = nyork_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(nyork_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 1, 1, 1, 1, 1, 0, 2, 1], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [37]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

nyork_merged = nyork_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
nyork_merged = nyork_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

nyork_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.7545,-79.33,0,Park,Fireworks Store,Food & Drink Shop,Yoga Studio,French Restaurant,Deli / Bodega,Department Store,Dessert Shop,Discount Store,Distribution Center
1,M4A,North York,Victoria Village,43.7276,-79.3148,1,Intersection,Portuguese Restaurant,Hockey Arena,French Restaurant,Park,Coffee Shop,Pizza Place,Bank,Bar,Department Store
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504,1,Clothing Store,Coffee Shop,Restaurant,Cosmetics Shop,Women's Store,Sushi Restaurant,Men's Store,Electronics Store,Bakery,Sandwich Place
3,M3B,North York,Don Mills,43.745,-79.359,1,Home Service,Park,Yoga Studio,Pool,Gym,Furniture / Home Store,Construction & Landscaping,Bank,Fried Chicken Joint,Dessert Shop
4,M6B,North York,Glencairn,43.7081,-79.4479,1,Pizza Place,Ice Cream Shop,Grocery Store,Mediterranean Restaurant,Gas Station,Latin American Restaurant,Fast Food Restaurant,Sushi Restaurant,Japanese Restaurant,Italian Restaurant


Finally, let's visualize the resulting clusters

In [39]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(nyork_merged['Latitude'], nyork_merged['Longitude'], nyork_merged['Neighborhood'], nyork_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters