In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
!pip install geocoder



# Question 1 Code

In [3]:
#Screaping list of postal codes from wikipedia
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
table = pd.read_html(source, header=0, attrs={"class":"wikitable sortable"})[0]
table

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [4]:
#Renaming the column headers and removing all those rows where Borough is not assigned
table = table.rename(columns={"Postal Code":"PostalCode", "Borough":"Borough", "Neighbourhood":"Neighborhood"})
table = table.drop(table.index[table['Borough']=='Not assigned'])

#Remove more than one neighborhood in one postal code area and format it
table=table.groupby('PostalCode', sort=False, as_index=False).agg(lambda x: ', '.join(set(x)))
table

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [5]:
#Displaying the shape of the table
print(type(table))
table.shape

<class 'pandas.core.frame.DataFrame'>


(103, 3)

# Question 2 Code

In [6]:
#build the dataframe with the csv file that has the geographical coordinates of each postal code

geo_coord_table=pd.read_csv("https://cocl.us/Geospatial_data")
geo_coord_table.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [7]:
print('The table dataframe has {} dimension and geo_coord_table dataframe also has the same {} .'.format(table.shape, geo_coord_table.shape))

The table dataframe has (103, 3) dimension and geo_coord_table dataframe also has the same (103, 3) .


In [8]:
#Change the column name 
geo_coord_table.rename(columns={"Postal Code":"PostalCode"}, inplace=True)
geo_coord_table.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
# Merge both data frames
Toronto_df = table.merge(geo_coord_table, left_on='PostalCode', right_on='PostalCode')
Toronto_df.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [10]:
print(Toronto_df.shape)
print('The Toronto_df dataframe has {} boroughs and {} neighborhoods.'.format(
        len(Toronto_df['Borough'].unique()),
        Toronto_df.shape[0])
     )

(103, 5)
The Toronto_df dataframe has 10 boroughs and 103 neighborhoods.


# Question 3 Code

In [11]:
import numpy as np 
import json
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

!pip install geopy
!pip install folium



In [12]:
import folium

In [13]:
#Getting longitude and latitude of toronto using geopy

address = 'Toronto, Canada'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [14]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(Toronto_df['Latitude'], Toronto_df['Longitude'], Toronto_df['Borough'], Toronto_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [15]:
print(Toronto_df['Borough'].unique())

['North York' 'Downtown Toronto' 'Etobicoke' 'Scarborough' 'East York'
 'York' 'East Toronto' 'West Toronto' 'Central Toronto' 'Mississauga']


In [16]:
north_y_data = Toronto_df[Toronto_df['Borough'] == "North York"].reset_index(drop=True)
north_y_data.head()
print(north_y_data.shape)

(24, 5)


In [17]:
address = 'North York, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

# create map of Manhattan using latitude and longitude values
map_north_y = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(north_y_data['Latitude'], north_y_data['Longitude'], north_y_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_north_y)  
    
map_north_y

In [18]:
#Get the neighborhood's name.

north_y_data.loc[12, 'Neighborhood']

'York Mills, Silver Hills'

In [19]:
neighborhood_latitude = north_y_data.loc[12, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = north_y_data.loc[12, 'Longitude'] # neighborhood longitude value

neighborhood_name = north_y_data.loc[12, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of York Mills, Silver Hills are 43.7574902, -79.37471409999999.


In [21]:
# get the top 100 venues that are in York Mills, Silver Hills within a radius of 500 meters

LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 500 # define radius


CLIENT_ID = 'VAH25GVAB0FYVVECSXU2TTQ1YPAO4MS1LU1ZMWHCBREJ2J1U' # your Foursquare ID
CLIENT_SECRET = 'U11TK4UIPXXCDFFYBQW4R042T4NMIWASVZJJPTHDQCYLUJYP' # your Foursquare Secret
VERSION = '20200629' # Foursquare API version

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

url

'https://api.foursquare.com/v2/venues/explore?&client_id=VAH25GVAB0FYVVECSXU2TTQ1YPAO4MS1LU1ZMWHCBREJ2J1U&client_secret=U11TK4UIPXXCDFFYBQW4R042T4NMIWASVZJJPTHDQCYLUJYP&v=20200629&ll=43.7574902,-79.37471409999999&radius=500&limit=100'

In [22]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5efadeb1353f133543635be6'},
  'headerLocation': 'St. Andrew - Windfields',
  'headerFullLocation': 'St. Andrew - Windfields, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 0,
  'suggestedBounds': {'ne': {'lat': 43.7619902045, 'lng': -79.3684954001132},
   'sw': {'lat': 43.7529901955, 'lng': -79.38093279988678}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': []}]}}

In [23]:
LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 1000 # define radius


CLIENT_ID = 'VAH25GVAB0FYVVECSXU2TTQ1YPAO4MS1LU1ZMWHCBREJ2J1U' # your Foursquare ID
CLIENT_SECRET = 'U11TK4UIPXXCDFFYBQW4R042T4NMIWASVZJJPTHDQCYLUJYP' # your Foursquare Secret
VERSION = '20200630' # Foursquare API version

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

url

'https://api.foursquare.com/v2/venues/explore?&client_id=VAH25GVAB0FYVVECSXU2TTQ1YPAO4MS1LU1ZMWHCBREJ2J1U&client_secret=U11TK4UIPXXCDFFYBQW4R042T4NMIWASVZJJPTHDQCYLUJYP&v=20200630&ll=43.7574902,-79.37471409999999&radius=1000&limit=100'

In [24]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5efadfb04cb2e80ec60de68b'},
 'response': {'headerLocation': 'St. Andrew - Windfields',
  'headerFullLocation': 'St. Andrew - Windfields, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.76649020900001,
    'lng': -79.36227670022642},
   'sw': {'lat': 43.74849019099999, 'lng': -79.38715149977357}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '54c93d4c498e80ef9b327c09',
       'name': 'Swimming Pool',
       'location': {'address': '490 York Mills Rd',
        'crossStreet': 'Banbury Rd',
        'lat': 43.75099421720474,
        'lng': -79.37436489349645,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.75099421720474,
          'lng': -79.3743648

In [25]:
# defining a function to retrieve the category of venues

def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [26]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Swimming Pool,Pool,43.750994,-79.374365
1,Talara Park,Park,43.765457,-79.377301
2,St. Andrews Park,Park,43.757309,-79.386616
3,Ames Park,Park,43.751868,-79.365473


In [27]:
# create a function to repeat the same process to all the neighborhoods in North York

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)


In [28]:
# run the above function on each neighborhood and create a new dataframe called north_y_venues.

north_y_venues = getNearbyVenues(names=north_y_data['Neighborhood'],
                                   latitudes=north_y_data['Latitude'],
                                   longitudes=north_y_data['Longitude']
                                  )

Parkwoods
Victoria Village
Lawrence Manor, Lawrence Heights
Don Mills
Glencairn
Don Mills
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Fairview, Henry Farm, Oriole
Northwood Park, York University
Bayview Village
Downsview
York Mills, Silver Hills
Downsview
North Park, Maple Leaf Park, Upwood Park
Humber Summit
Willowdale, Newtonbrook
Downsview
Bedford Park, Lawrence Manor East
Humberlea, Emery
Willowdale, Willowdale East
Downsview
York Mills West
Willowdale, Willowdale West


In [29]:
print(north_y_venues.shape)
north_y_venues.head()

(244, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant
4,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop


In [30]:
# check number of venues returned for each neighborhood

north_y_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Bathurst Manor, Wilson Heights, Downsview North",21,21,21,21,21,21
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",24,24,24,24,24,24
Don Mills,27,27,27,27,27,27
Downsview,15,15,15,15,15,15
"Fairview, Henry Farm, Oriole",67,67,67,67,67,67
Glencairn,4,4,4,4,4,4
Hillcrest Village,4,4,4,4,4,4
Humber Summit,4,4,4,4,4,4
"Humberlea, Emery",1,1,1,1,1,1


In [31]:
print('There are {} uniques categories.'.format(len(north_y_venues['Venue Category'].unique())))

There are 106 uniques categories.


In [32]:
#Analysing each neighbor hood

# one hot encoding
north_y_onehot = pd.get_dummies(north_y_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
north_y_onehot['Neighborhood'] = north_y_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [north_y_onehot.columns[-1]] + list(north_y_onehot.columns[:-1])
north_y_onehot = north_y_onehot[fixed_columns]

north_y_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bank,...,Steakhouse,Supermarket,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Vietnamese Restaurant,Women's Store
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
north_y_onehot.shape

(244, 107)

In [34]:
# group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

north_y_grouped = north_y_onehot.groupby('Neighborhood').mean().reset_index()
north_y_grouped

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bank,...,Steakhouse,Supermarket,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Vietnamese Restaurant,Women's Store
0,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095238,...,0.0,0.047619,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bedford Park, Lawrence Manor East",0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.041667,0.0,0.041667,0.0,0.0,0.0,0.0,0.0
3,Don Mills,0.0,0.0,0.0,0.037037,0.0,0.074074,0.037037,0.0,0.0,...,0.0,0.037037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Downsview,0.0,0.066667,0.0,0.0,0.0,0.0,0.066667,0.0,0.066667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Fairview, Henry Farm, Oriole",0.0,0.0,0.014925,0.0,0.0,0.014925,0.0,0.029851,0.029851,...,0.0,0.0,0.0,0.029851,0.0,0.014925,0.029851,0.014925,0.0,0.014925
6,Glencairn,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,...,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Hillcrest Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Humber Summit,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Humberlea, Emery",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
north_y_grouped.shape

(18, 107)

In [36]:
#print each neighborhood along with the top 5 most common venues

num_top_venues = 5

for hood in north_y_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = north_y_grouped[north_y_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Bathurst Manor, Wilson Heights, Downsview North----
               venue  freq
0        Coffee Shop  0.10
1               Bank  0.10
2      Shopping Mall  0.05
3  Mobile Phone Shop  0.05
4               Park  0.05


----Bayview Village----
                 venue  freq
0   Chinese Restaurant  0.25
1                 Café  0.25
2                 Bank  0.25
3  Japanese Restaurant  0.25
4    Accessories Store  0.00


----Bedford Park, Lawrence Manor East----
                venue  freq
0         Coffee Shop  0.08
1      Sandwich Place  0.08
2  Italian Restaurant  0.08
3          Restaurant  0.08
4       Grocery Store  0.04


----Don Mills----
                 venue  freq
0          Coffee Shop  0.07
1           Restaurant  0.07
2     Asian Restaurant  0.07
3                  Gym  0.07
4  Japanese Restaurant  0.07


----Downsview----
               venue  freq
0               Park  0.13
1      Grocery Store  0.13
2     Baseball Field  0.07
3     Discount Store  0.07
4  Electronics Store 

In [42]:
# create a function to sort the venues in descending order.

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

# create the new dataframe and display the top 10 venues for each neighborhood.

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = north_y_grouped['Neighborhood']

for ind in np.arange(north_y_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(north_y_grouped.iloc[ind, :], num_top_venues)


    
neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Middle Eastern Restaurant,Shopping Mall,Mobile Phone Shop,Park,Ice Cream Shop,Pharmacy,Pizza Place,Bridal Shop
1,Bayview Village,Chinese Restaurant,Café,Bank,Japanese Restaurant,Discount Store,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega
2,"Bedford Park, Lawrence Manor East",Restaurant,Sandwich Place,Coffee Shop,Italian Restaurant,Juice Bar,Comfort Food Restaurant,Butcher,Indian Restaurant,Pharmacy,Pizza Place
3,Don Mills,Restaurant,Coffee Shop,Japanese Restaurant,Asian Restaurant,Gym,Beer Store,Sandwich Place,Clothing Store,Chinese Restaurant,Italian Restaurant
4,Downsview,Grocery Store,Park,Gym / Fitness Center,Electronics Store,Airport,Food Truck,Discount Store,Athletics & Sports,Liquor Store,Bank
5,"Fairview, Henry Farm, Oriole",Clothing Store,Coffee Shop,Fast Food Restaurant,Restaurant,Japanese Restaurant,Toy / Game Store,Tea Room,Bank,Bakery,Shoe Store
6,Glencairn,Asian Restaurant,Sushi Restaurant,Pub,Japanese Restaurant,Women's Store,Diner,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store
7,Hillcrest Village,Golf Course,Mediterranean Restaurant,Pool,Dog Run,Diner,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop
8,Humber Summit,Furniture / Home Store,Gym,Pizza Place,Home Service,Diner,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop
9,"Humberlea, Emery",Baseball Field,Women's Store,Distribution Center,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store


In [43]:
 # USe k-means to cluster the neighborhood into 5 clusters.

# set number of clusters
kclusters = 5

north_y_grouped_clustering = north_y_grouped.drop('Neighborhood', 1)

# run k-means clustering
#kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(north_y_grouped_clustering)
kmeans = KMeans(init = "k-means++", n_clusters = kclusters, n_init = 10).fit(north_y_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 4, 0, 0, 0, 3], dtype=int32)

In [44]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

north_y_merged = north_y_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
north_y_merged = north_y_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood', how = 'inner')

north_y_merged 

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,4,Park,Food & Drink Shop,Diner,Clothing Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega
1,M4A,North York,Victoria Village,43.725882,-79.315572,0,Intersection,Coffee Shop,Hockey Arena,Portuguese Restaurant,Discount Store,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,0,Clothing Store,Furniture / Home Store,Accessories Store,Boutique,Event Space,Coffee Shop,Miscellaneous Shop,Carpet Store,Vietnamese Restaurant,Bank
3,M3B,North York,Don Mills,43.745906,-79.352188,0,Restaurant,Coffee Shop,Japanese Restaurant,Asian Restaurant,Gym,Beer Store,Sandwich Place,Clothing Store,Chinese Restaurant,Italian Restaurant
5,M3C,North York,Don Mills,43.7259,-79.340923,0,Restaurant,Coffee Shop,Japanese Restaurant,Asian Restaurant,Gym,Beer Store,Sandwich Place,Clothing Store,Chinese Restaurant,Italian Restaurant
4,M6B,North York,Glencairn,43.709577,-79.445073,0,Asian Restaurant,Sushi Restaurant,Pub,Japanese Restaurant,Women's Store,Diner,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store
6,M2H,North York,Hillcrest Village,43.803762,-79.363452,2,Golf Course,Mediterranean Restaurant,Pool,Dog Run,Diner,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop
7,M3H,North York,"Bathurst Manor, Wilson Heights, Downsview North",43.754328,-79.442259,0,Coffee Shop,Bank,Middle Eastern Restaurant,Shopping Mall,Mobile Phone Shop,Park,Ice Cream Shop,Pharmacy,Pizza Place,Bridal Shop
8,M2J,North York,"Fairview, Henry Farm, Oriole",43.778517,-79.346556,0,Clothing Store,Coffee Shop,Fast Food Restaurant,Restaurant,Japanese Restaurant,Toy / Game Store,Tea Room,Bank,Bakery,Shoe Store
9,M3J,North York,"Northwood Park, York University",43.76798,-79.487262,0,Coffee Shop,Furniture / Home Store,Caribbean Restaurant,Massage Studio,Falafel Restaurant,Bar,Diner,Comfort Food Restaurant,Construction & Landscaping,Convenience Store


In [45]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(north_y_merged['Latitude'], north_y_merged['Longitude'], north_y_merged['Neighborhood'], north_y_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    print(cluster)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

4
0
0
0
0
0
2
0
0
0
0
0
0
0
0
0
0
0
1
0
3
0


In [46]:
# Cluster 1
north_y_merged.loc[north_y_merged['Cluster Labels'] == 0, north_y_merged.columns[[1] + list(range(5, north_y_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,North York,0,Intersection,Coffee Shop,Hockey Arena,Portuguese Restaurant,Discount Store,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega
2,North York,0,Clothing Store,Furniture / Home Store,Accessories Store,Boutique,Event Space,Coffee Shop,Miscellaneous Shop,Carpet Store,Vietnamese Restaurant,Bank
3,North York,0,Restaurant,Coffee Shop,Japanese Restaurant,Asian Restaurant,Gym,Beer Store,Sandwich Place,Clothing Store,Chinese Restaurant,Italian Restaurant
5,North York,0,Restaurant,Coffee Shop,Japanese Restaurant,Asian Restaurant,Gym,Beer Store,Sandwich Place,Clothing Store,Chinese Restaurant,Italian Restaurant
4,North York,0,Asian Restaurant,Sushi Restaurant,Pub,Japanese Restaurant,Women's Store,Diner,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store
7,North York,0,Coffee Shop,Bank,Middle Eastern Restaurant,Shopping Mall,Mobile Phone Shop,Park,Ice Cream Shop,Pharmacy,Pizza Place,Bridal Shop
8,North York,0,Clothing Store,Coffee Shop,Fast Food Restaurant,Restaurant,Japanese Restaurant,Toy / Game Store,Tea Room,Bank,Bakery,Shoe Store
9,North York,0,Coffee Shop,Furniture / Home Store,Caribbean Restaurant,Massage Studio,Falafel Restaurant,Bar,Diner,Comfort Food Restaurant,Construction & Landscaping,Convenience Store
10,North York,0,Chinese Restaurant,Café,Bank,Japanese Restaurant,Discount Store,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega
11,North York,0,Grocery Store,Park,Gym / Fitness Center,Electronics Store,Airport,Food Truck,Discount Store,Athletics & Sports,Liquor Store,Bank


In [51]:
# Cluster 2
north_y_merged.loc[north_y_merged['Cluster Labels'] == 1, north_y_merged.columns[[1] + list(range(5, north_y_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
19,North York,1,Baseball Field,Women's Store,Distribution Center,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store


In [52]:
#  Cluster 3

north_y_merged.loc[north_y_merged['Cluster Labels'] == 2, north_y_merged.columns[[1] + list(range(5, north_y_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,North York,2,Golf Course,Mediterranean Restaurant,Pool,Dog Run,Diner,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop


In [53]:
#Cluster 4

north_y_merged.loc[north_y_merged['Cluster Labels'] == 3, north_y_merged.columns[[1] + list(range(5, north_y_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,North York,3,Park,Convenience Store,Chinese Restaurant,Clothing Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Cosmetics Shop,Deli / Bodega,Department Store


In [54]:
#Cluster 5

north_y_merged.loc[north_y_merged['Cluster Labels'] == 4, north_y_merged.columns[[1] + list(range(5, north_y_merged.shape[1]))]]


Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,North York,4,Park,Food & Drink Shop,Diner,Clothing Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega
