# Toronto Neighborhood Clustering

Import necessary libraries and web page.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

r = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(r.content,"lxml")

Save table as variable

In [2]:
table = soup.find_all('table')[0]

Construct a dictionary from the table containing the postal codes, boroughs, and neighborhoods.

In [3]:
postal_code = []
borough = []
neighborhood = []
for row in table.find_all('tr'):
    i = 0
    for col in row.find_all('td'):
        if i == 0:
            postal_code += [col.get_text()]
        elif i == 1:
            borough += [col.get_text()]
        else:
            neighborhood += [col.get_text()[:-1]]
        i += 1

d = {'PostalCode': postal_code, 'Borough': borough, 'Neighborhood': neighborhood}

Create data frame.

In [4]:
df = pd.DataFrame.from_dict(d)

Remove rows with a missing borough.

In [5]:
df = df[df['Borough'] != 'Not assigned']

Reindex data frame

In [6]:
df.index = range(df.shape[0])

Create new dictionary combining neighborhoods with the same postal code. This loop also fixes any neighborhoods that are not assigned.

In [7]:
d2 = {'PostalCode': [], 'Borough': [], 'Neighborhood': []}
for i in range(df.shape[0]):
    if df.loc[i,'PostalCode'] in d2['PostalCode']:
        d2['Neighborhood'][-1] += ', ' + df.loc[i,'Neighborhood']
    else:
        d2['PostalCode'] += [df.loc[i,'PostalCode']]
        d2['Borough'] += [df.loc[i,'Borough']]
        if df.loc[i,'Neighborhood'] == 'Not assigned':
            d2['Neighborhood'] += [df.loc[i,'Borough']]
        else:
            d2['Neighborhood'] += [df.loc[i,'Neighborhood']]

New data frame based on new dictionary.

In [8]:
df2 = pd.DataFrame.from_dict(d2)
df2.index = range(df2.shape[0])

Reorder columns.

In [9]:
df2 = df2[['PostalCode','Borough','Neighborhood']]

Importing geocoder.

In [10]:
import geocoder # import geocoder

Loop to get lat/lng coordinates for each postal code.

In [11]:
latitude = []
longitude = []

for m in df2['PostalCode']:
    # initialize your variable to None
    lat_lng_coords = None

    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.bing('{}, Toronto, Ontario'.format(m), key='')
        lat_lng_coords = g.latlng

    latitude.append(lat_lng_coords[0])
    longitude.append(lat_lng_coords[1])

Adding lat/lng columns to data frame.

In [12]:
df2['Latitude'] = latitude
df2['Longitude'] = longitude

## Final Data Frame

Note that the data frame is not in the same order as the one in the assignment, but the values match. My data frame is ordered by postal code.

In [13]:
df2.head(n=10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.751255,-79.329895
1,M4A,North York,Victoria Village,43.729958,-79.314201
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65522,-79.361969
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.722801,-79.450691
4,M7A,Queen's Park,Queen's Park,43.664486,-79.393021
5,M9A,Etobicoke,Islington Avenue,43.662743,-79.528427
6,M1B,Scarborough,"Rouge, Malvern",43.810154,-79.194603
7,M3B,North York,Don Mills North,43.749134,-79.362007
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.707577,-79.310913
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657467,-79.377708


In [14]:
df2.shape

(103, 5)

In what follows we duplicate the neighborhood analysis done on the New York dataset.

Data frame for boroughs with the word 'Toronto' in them.

In [15]:
df_toronto = df2[df2['Borough'].str.contains('Toronto')]

In [16]:
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65522,-79.361969
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657467,-79.377708
15,M5C,Downtown Toronto,St. James Town,43.651211,-79.375481
19,M4E,East Toronto,The Beaches,43.679611,-79.295692
20,M5E,Downtown Toronto,Berczy Park,43.64333,-79.372223


Foursquare credentials

In [17]:
CLIENT_ID = '' # your Foursquare ID
CLIENT_SECRET = '' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
radius = 500
LIMIT = 100

Function for nearby venues for neighborhoods.

In [18]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Application of previous function to new data frame.

In [19]:
toronto_venues = getNearbyVenues(names=df_toronto['Neighborhood'],
                                 latitudes=df_toronto['Latitude'],
                                 longitudes=df_toronto['Longitude'])

Harbourfront, Regent Park
Ryerson, Garden District
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Adelaide, King, Richmond
Dovercourt Village, Dufferin
Harbourfront East, Toronto Islands, Union Station
Little Portugal, Trinity
The Danforth West, Riverdale
Design Exchange, Toronto Dominion Centre
Brockton, Exhibition Place, Parkdale Village
The Beaches West, India Bazaar
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North, Forest Hill West
High Park, The Junction South
North Toronto West
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
Harbord, University of Toronto
Runnymede, Swansea
Moore Park, Summerhill East
Chinatown, Grange Park, Kensington Market
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Rosedale
Stn A PO Boxes 25 The Esplanade
Cabbagetown, St. James Town
Fir

In [20]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 208 uniques categories.


In [21]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = ['Neighborhood'] + list(toronto_onehot.columns.drop('Neighborhood'))
toronto_onehot = toronto_onehot[fixed_columns]

Grouping neighborhoods and computing the frequency of each kind of location type.

In [22]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

In [23]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Produces top ten locations types per neighborhood.

In [24]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

In [26]:
toronto_grouped.head()

Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,American Restaurant,Arepa Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.01,0.0,0.04,0.0,0.01,0.0,0.0,0.02,0.0,...,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.01,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.014706,0.0,0.014706,0.0,0.0,...,0.0,0.0,0.029412,0.0,0.014706,0.0,0.0,0.0,0.0,0.0
3,Business reply mail Processing Centre969 Eastern,0.0,0.0,0.04,0.0,0.01,0.0,0.0,0.02,0.0,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.015152,0.0,0.0,0.0,0.0,0.015152,0.0,...,0.0,0.015152,0.015152,0.0,0.0,0.0,0.0,0.0,0.0,0.015152


K-means algorithm to group neighborhoods by popular location type in each. Assumed 5 clusters.

In [27]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

In [28]:
toronto_merged = df_toronto

# add clustering labels
toronto_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [30]:
toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65522,-79.361969,0,Coffee Shop,Café,Park,Breakfast Spot,Bakery,Yoga Studio,Bank,Gym / Fitness Center,Pub,Restaurant
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657467,-79.377708,0,Coffee Shop,Clothing Store,Café,Japanese Restaurant,Cosmetics Shop,Bar,Middle Eastern Restaurant,Sandwich Place,Italian Restaurant,Movie Theater
15,M5C,Downtown Toronto,St. James Town,43.651211,-79.375481,0,Restaurant,Coffee Shop,Café,Bakery,Clothing Store,Hotel,Seafood Restaurant,Cosmetics Shop,Japanese Restaurant,Gastropub
19,M4E,East Toronto,The Beaches,43.679611,-79.295692,0,Coffee Shop,Bakery,Pub,Trail,Asian Restaurant,Park,Pizza Place,Farm,Falafel Restaurant,Donut Shop
20,M5E,Downtown Toronto,Berczy Park,43.64333,-79.372223,0,Steakhouse,Beach,Concert Hall,Belgian Restaurant,Restaurant,Japanese Restaurant,Beer Bar,Italian Restaurant,Bar,Movie Theater


Producing the map with neighborhoods color-coded by cluster label.

In [32]:
import folium

In [33]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[43.657162,-79.378937], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## A quick look at cluster 1 - Coffee shops

In [35]:
toronto_merged[toronto_merged['Cluster Labels'] == 1]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
79,M4S,Central Toronto,Davisville,43.703163,-79.385895,1,Coffee Shop,Dessert Shop,Café,Pizza Place,Seafood Restaurant,Sandwich Place,Italian Restaurant,Fast Food Restaurant,Thai Restaurant,Farmers Market
81,M6S,West Toronto,"Runnymede, Swansea",43.649441,-79.475655,1,Coffee Shop,Café,Bakery,Pizza Place,Gourmet Shop,Bookstore,Flower Shop,Burger Joint,Latin American Restaurant,Bus Line


# The map will not render on github. To see my map, go to the [map.png](https://github.com/jathiel/Coursera_Capstone/blob/master/map.png) file on my github page.