# Section 1

Import the libraries

In [116]:
import requests
import pandas as pd
import numpy as np
import geocoder
import folium
import json
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

Retrieve the Wikipedia page and load the HTML body into a BeautifulSoup object

In [117]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
html = requests.get(url).text
soup = BeautifulSoup(html)

Use BeautifulSoup to parse the table contents into a Pandas dataframe. Ignore cells that contain **Not assigned**.

In [118]:
table_contents = []
table = soup.find('table')
for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)
df = pd.DataFrame(table_contents)

Fix the Borough and Neighborhood data and display a sample. 

In [119]:
df['Borough']=df['Borough'].replace(
    {
        'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
        'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
        'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
        'MississaugaCanada Post Gateway Processing Centre':'Mississauga'
    }
)
df.reset_index
df[0:12]

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


Display the number of rows and columns.

In [120]:
df.shape

(103, 3)

# Section 2

Grab the CSV data and create a Pandas dataframe from it.

In [121]:
geo_df = pd.read_csv('https://cocl.us/Geospatial_data', index_col='Postal Code')
geo_df

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476
...,...,...
M9N,43.706876,-79.518188
M9P,43.696319,-79.532242
M9R,43.688905,-79.554724
M9V,43.739416,-79.588437


Merge the two dataframes on common **Postal Code** values.

In [122]:
toronto_df = df.join(geo_df, on='PostalCode')
toronto_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto Business,Enclave of M4L,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


# Section 3

## Exploration

Calculate the geographic coordinates of Toronto, Ontario

In [123]:
city = 'Toronto, Ontario'

location = geocoder.arcgis(city)
latitude = location.latlng[0]
longitude = location.latlng[1]
print('The geograpical coordinates of {} are ({:.6f}, {:.6f}).'.format(city, latitude, longitude))

The geograpical coordinates of Toronto, Ontario are (43.648690, -79.385440).


Create a map of Toronto using latitude and longitude values. Then add values from our geocoordinates dataframe.

In [124]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Borough'], toronto_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

map_toronto

Get geographic coordinates for [The Danforth Music Hall](http://thedanforth.com).

In [125]:
venue_address = '147 Danforth Ave, Toronto, Ontario, Canada'
loc = geocoder.arcgis(venue_address)

venue_latitude = loc.latlng[0]
venue_longitude = loc.latlng[1]

print(
    'The geographic coordinates of {} are ({:.6f}, {:.6f}).'.format(
        venue_address,
        venue_latitude, 
        venue_longitude
    )
)

The geographic coordinates of 147 Danforth Ave, Toronto, Ontario, Canada are (43.676200, -79.356980).


A function to calculate distance using the Haversine formula.

In [126]:
def haversine(lat1, lon1, lat2, lon2):
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    km = 6367 * c
    return km

Calculate the distances from `venue_address` to the neighborhoods in `city`.

In [127]:
# Initialize the DataFrame
distances = pd.DataFrame(columns=['Neighborhood', 'Distance'])

# Calculate the neighborhood distances
for n in toronto_df.itertuples():
    d = haversine(venue_latitude, venue_longitude, n[4], n[5])
    distances = distances.append({'Neighborhood':n[3], 'Distance':d}, ignore_index=True)

distances.sort_values(by='Distance')

Unnamed: 0,Neighborhood,Distance
41,"The Danforth West, Riverdale",0.536189
96,"St. James Town, Cabbagetown",1.255405
91,Rosedale,1.693289
35,The Danforth East,1.825874
54,Studio District,2.258178
...,...,...
6,"Malvern, Rouge",19.512225
12,"Rouge Hill, Port Union, Highland Creek",19.845843
89,"South Steeles, Silverstone, Humbergate, Jamest...",19.875633
76,Enclave of L4W,21.261937


Determine the neighborhood in which the venue is located.

In [128]:
# Get the name of the nearest neighborhood
venue_geo = distances.loc[distances['Distance'].idxmin()]
venue_neighborhood = venue_geo['Neighborhood']
venue_distance = venue_geo['Distance']

# Get the geographic coordinates of the nearest neighborhood
n_geo = toronto_df.loc[toronto_df['Neighborhood'] == venue_neighborhood]


# Print the results
print('Venue: ({:.7f}, {:.7f})'.format(venue_latitude, venue_longitude))
print('Neighborhood: ({}, {})'.format(n_geo.iloc[0]['Latitude'], n_geo.iloc[0]['Longitude']))
print('Distance: {:.2f} km'.format(venue_distance))
print('\n')
print('The venue at {} is located {:.2f} km from the center of the {} neighborhood.'.format(venue_address, venue_distance, venue_neighborhood))


Venue: (43.6762000, -79.3569800)
Neighborhood: (43.6795571, -79.352188)
Distance: 0.54 km


The venue at 147 Danforth Ave, Toronto, Ontario, Canada is located 0.54 km from the center of the The Danforth West, Riverdale neighborhood.


Establish my Foursquare credentials and request params.

In [129]:
CLIENT_ID = 'WGUXITLMCSKLZC1QV3TCM3MGE3CYT0AMBFNGMBKGTSE40OMX' # your Foursquare ID
CLIENT_SECRET = 'H3HWJFJVHLWK05GHOVWCDSS02SEMNGP41FYHZ5JWMR024X0X' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

Get surrounding venues data from Foursquare.

In [130]:
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, venue_latitude, venue_longitude, VERSION, 1000, 100)
url
results = requests.get(url).json()


A function that extracts the category of the venue

In [131]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Parse and display the Foursquare results.

In [132]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = pd.json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Tapas at Embrujo,Tapas Restaurant,43.676175,-79.358411
1,The Danforth Music Hall,Concert Hall,43.676338,-79.357071
2,Urban Nails,Spa,43.676668,-79.356602
3,Don Valley Trail,Trail,43.676331,-79.353923
4,7 Numbers,Italian Restaurant,43.677062,-79.353934


Display the total number of venues found.

In [133]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

80 venues were returned by Foursquare.


Display each of the unique categories of the venues.

In [134]:
nearby_venues['categories'].unique()

array(['Tapas Restaurant', 'Concert Hall', 'Spa', 'Trail',
       'Italian Restaurant', 'Pub', 'Cocktail Bar', 'Dance Studio',
       'Bakery', 'Grocery Store', 'Burger Joint', 'Café',
       'Cuban Restaurant', 'Greek Restaurant', 'Restaurant',
       'Cosmetics Shop', 'Ice Cream Shop', 'Yoga Studio', 'Brewery',
       'Park', 'Juice Bar', 'Pool', 'Pet Store', 'Fish & Chips Shop',
       'Flower Shop', 'Fruit & Vegetable Store', 'Breakfast Spot',
       'Pizza Place', 'Ramen Restaurant', 'Bookstore',
       'Furniture / Home Store', 'Coffee Shop', 'Scenic Lookout',
       'Dog Run', 'New American Restaurant', 'Dessert Shop',
       'American Restaurant', 'Indian Restaurant', 'Bubble Tea Shop',
       'Tibetan Restaurant', 'Caribbean Restaurant', 'Churrascaria',
       'Lounge', 'Frozen Yogurt Shop', 'Bank', 'Falafel Restaurant',
       'Fast Food Restaurant', 'Rental Car Location',
       'Mediterranean Restaurant'], dtype=object)

Let's find all the bars that are within ~~stumbling~~ walking distance of the venue.

In [135]:
# Where are the nearby bars?
nearby_bars = nearby_venues.loc[nearby_venues['categories'].isin(['Pub','Brewery','Cocktail Bar'])]

# Calculate the distance from the venue to the bars
for n in nearby_bars.itertuples():
    d2 = haversine(venue_latitude, venue_longitude, n[3], n[4])
    nearby_bars.loc[n[0],'Distance'] = d2

nearby_bars.sort_values(by=['Distance'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


Unnamed: 0,name,categories,lat,lng,Distance
6,Sidebar,Cocktail Bar,43.676441,-79.356745,0.032772
34,Dora Keogh,Pub,43.676418,-79.35738,0.040254
5,The Old Nick,Pub,43.676344,-79.357775,0.06585
55,The Edmund Burke,Pub,43.676336,-79.358131,0.093717
16,The Auld Spot Pub,Pub,43.677335,-79.35313,0.334175
20,Louis Cifer Brew Works,Brewery,43.677663,-79.351313,0.483629


In [136]:
nearby_bars = nearby_venues.loc[nearby_venues['categories'].isin(['Pub','Brewery','Cocktail Bar'])]

nearby_bars

Unnamed: 0,name,categories,lat,lng
5,The Old Nick,Pub,43.676344,-79.357775
6,Sidebar,Cocktail Bar,43.676441,-79.356745
16,The Auld Spot Pub,Pub,43.677335,-79.35313
20,Louis Cifer Brew Works,Brewery,43.677663,-79.351313
34,Dora Keogh,Pub,43.676418,-79.35738
55,The Edmund Burke,Pub,43.676336,-79.358131


In [137]:
map_bars = folium.Map(location=[venue_latitude, venue_longitude], zoom_start=25)

for lat, lng, name, category in zip(nearby_bars['lat'], nearby_bars['lng'], nearby_bars['name'], nearby_bars['categories']):
    label = '{} ({})'.format(name, category)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_bars)

folium.RegularPolygonMarker(
    [venue_latitude, venue_longitude],
    radius=5,
    popup=venue_address,
    color='red',
    fill=True,
    fill_color='red',
    fill_opacity=0.7,
    parse_html=False).add_to(map_bars)


map_bars

## Clustering

A function for retrieving venues.

In [138]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Run the previous function on all neighborhoods in Toronto.

In [139]:
toronto_venues = getNearbyVenues(names=toronto_df['Neighborhood'],
                                   latitudes=toronto_df['Latitude'],
                                   longitudes=toronto_df['Longitude']
                                  )

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Ontario Provincial Government
Islington Avenue
Malvern, Rouge
Don Mills North
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills South
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
The Danforth  East
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmount Park
Bayview Village
Downsview East
The Danforth

In [140]:
print(toronto_venues.shape)
toronto_venues.head()

(2111, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Brookbanks Pool,43.751389,-79.332184,Pool
2,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


In [141]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,4,4,4,4,4,4
"Alderwood, Long Branch",9,9,9,9,9,9
"Bathurst Manor, Wilson Heights, Downsview North",22,22,22,22,22,22
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",24,24,24,24,24,24
...,...,...,...,...,...,...
Willowdale South,34,34,34,34,34,34
Willowdale West,4,4,4,4,4,4
Woburn,4,4,4,4,4,4
Woodbine Heights,4,4,4,4,4,4


In [142]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 270 uniques categories.


In [143]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot

Unnamed: 0,Yoga Studio,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Truck Stop,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2107,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2108,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2109,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [144]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Truck Stop,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.041667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,Willowdale South,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.029412,0.0,0.0,0.0,0.000000
95,Willowdale West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
96,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
97,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000


In [145]:
toronto_grouped.shape

(99, 270)

In [146]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

 0.08
3     Coffee Shop  0.08
4         Stadium  0.04


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
                 venue  freq
0      Airport Service  0.14
1      Harbor / Marina  0.07
2     Airport Terminal  0.07
3  Rental Car Location  0.07
4          Coffee Shop  0.07


----Caledonia-Fairbanks----
                       venue  freq
0                       Park  0.50
1              Women's Store  0.25
2                        Bar  0.25
3  Middle Eastern Restaurant  0.00
4        Monument / Landmark  0.00


----Cedarbrae----
                  venue  freq
0   Fried Chicken Joint  0.12
1                Bakery  0.12
2           Gas Station  0.12
3       Thai Restaurant  0.12
4  Caribbean Restaurant  0.12


----Central Bay Street----
                venue  freq
0         Coffee Shop  0.17
1      Sandwich Place  0.06
2                Café  0.05
3  Italian Restaurant  0.05
4     Thai Restaurant  0.03


----Christie----

In [147]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [148]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Skating Rink,Lounge,Breakfast Spot,Latin American Restaurant,Yoga Studio,Molecular Gastronomy Restaurant,Modern European Restaurant,Mobile Phone Shop,Miscellaneous Shop,Middle Eastern Restaurant
1,"Alderwood, Long Branch",Pizza Place,Pharmacy,Coffee Shop,Pub,Sandwich Place,Skating Rink,Playground,Gym,Home Service,Museum
2,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Park,Middle Eastern Restaurant,Shopping Mall,Mobile Phone Shop,Sandwich Place,Fried Chicken Joint,Supermarket,Frozen Yogurt Shop
3,Bayview Village,Japanese Restaurant,Café,Chinese Restaurant,Bank,Movie Theater,Motel,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Yoga Studio
4,"Bedford Park, Lawrence Manor East",Sandwich Place,Italian Restaurant,Coffee Shop,Women's Store,Pizza Place,Butcher,Café,Restaurant,Pub,Pharmacy


K-Means clustering

In [149]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 2], dtype=int32)

In [150]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_df

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood', how='right')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
78,M1S,Scarborough,Agincourt,43.7942,-79.262029,0,Skating Rink,Lounge,Breakfast Spot,Latin American Restaurant,Yoga Studio,Molecular Gastronomy Restaurant,Modern European Restaurant,Mobile Phone Shop,Miscellaneous Shop,Middle Eastern Restaurant
93,M8W,Etobicoke,"Alderwood, Long Branch",43.602414,-79.543484,0,Pizza Place,Pharmacy,Coffee Shop,Pub,Sandwich Place,Skating Rink,Playground,Gym,Home Service,Museum
28,M3H,North York,"Bathurst Manor, Wilson Heights, Downsview North",43.754328,-79.442259,0,Coffee Shop,Bank,Park,Middle Eastern Restaurant,Shopping Mall,Mobile Phone Shop,Sandwich Place,Fried Chicken Joint,Supermarket,Frozen Yogurt Shop
39,M2K,North York,Bayview Village,43.786947,-79.385975,0,Japanese Restaurant,Café,Chinese Restaurant,Bank,Movie Theater,Motel,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Yoga Studio
55,M5M,North York,"Bedford Park, Lawrence Manor East",43.733283,-79.41975,0,Sandwich Place,Italian Restaurant,Coffee Shop,Women's Store,Pizza Place,Butcher,Café,Restaurant,Pub,Pharmacy


In [151]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters