# Project: Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import pandas as pd
### Import Folium for map rendering
# !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium 

# Library to handle JSON files
import json 

# Import Nominatim
# Converts an address into latitude and longitude values
# !conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim 

# Library to handle requests
import requests

### Use pandas read_html to get the table from the Wikipedia page

In [2]:
# Parse every HTML table on the page; header=0 takes the column names from row 0
table = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', header=0)

# The first parsed table becomes the working dataframe
df = pd.DataFrame(table[0])

# Report the dimensions, then preview the first rows
print('The shape of the Raw Inital Datafram is: ', df.shape)
df.head()



The shape of the Raw Inital Datafram is:  (288, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Handle rows where Borough is set but Neighbourhood is Not assigned
Some of the rows have Borough set but Neighbourhood is Not assigned. In this situation the Neighbourhood is to be set to the same value as the Borough.

In [3]:
# List the rows that carry a Borough but whose Neighbourhood is still 'Not assigned'
df.query("Borough != 'Not assigned' and Neighbourhood == 'Not assigned'")


Unnamed: 0,Postcode,Borough,Neighbourhood
8,M7A,Queen's Park,Not assigned


In [4]:
# Rows that have a Borough but no Neighbourhood inherit the Borough name.
# Selecting them with a mask (instead of hard-coding "Queen's Park") leaves
# neighbourhoods that are already assigned untouched and keeps working if the
# source table ever contains more than one such row.
needs_neighbourhood = (df.Borough != 'Not assigned') & (df.Neighbourhood == 'Not assigned')
df.loc[needs_neighbourhood, 'Neighbourhood'] = df.loc[needs_neighbourhood, 'Borough']

In [5]:
# Sets Neighbourhood to "Queen's Park" on every row whose Borough is "Queen's Park";
# the assignment is idempotent, so running it more than once is harmless
df.loc[df.Borough == "Queen's Park", 'Neighbourhood'] = "Queen's Park"

In [6]:
# Repeat the check; an empty frame means no Borough row is left without a Neighbourhood
df.loc[df.Borough.ne('Not assigned') & df.Neighbourhood.eq('Not assigned')]

Unnamed: 0,Postcode,Borough,Neighbourhood


### Remove rows where Borough & Neighbourhood are Not assigned

In [7]:

# Keep a row when at least one of Borough / Neighbourhood is assigned, i.e. drop
# the rows where both are 'Not assigned'.
# .copy() detaches the result from the unfiltered frame so that later in-place
# changes and column assignments do not trigger SettingWithCopyWarning.
df = df[(df.Borough != 'Not assigned') | (df.Neighbourhood != 'Not assigned')].copy()

# Print the shape
print('The shape of the Raw Inital Datafram is: ', df.shape)

# Output the Head of the Table
df.head()

The shape of the Raw Inital Datafram is:  (211, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Count the distinct values in each column

In [8]:
# Number of distinct values per column of the cleaned table
print('There are %d unique Postal Codes in the table' % df['Postcode'].nunique())
print('There are %d unique Boroughs in the table' % df['Borough'].nunique())
print('There are %d unique Neighbourhoods in the table' % df['Neighbourhood'].nunique())

There are 103 unique Postal Codes in the table
There are 11 unique Boroughs in the table
There are 209 unique Neighbourhoods in the table


In [9]:
# Renumber the rows 0..n-1 after filtering; the old index is discarded
df = df.reset_index(drop=True)

### Some of the Neighbourhood values need to be cleaned up
There are issues with some of the Neighbourhood values containing the ] character

In [10]:
# Rows whose Neighbourhood contains a literal ']' character
df[df['Neighbourhood'].str.contains(']', regex=False)]

Unnamed: 0,Postcode,Borough,Neighbourhood


In [11]:
# Remove stray ']' characters. regex=False makes the literal match explicit:
# the default of `regex` differs between pandas versions, and ']' is a regex
# metacharacter, so relying on the default is version-dependent.
df['Neighbourhood'] = df['Neighbourhood'].str.replace(']', '', regex=False)

In [12]:
# Confirm that no Neighbourhood still contains a literal ']' character
df[df['Neighbourhood'].str.contains(']', regex=False)]

Unnamed: 0,Postcode,Borough,Neighbourhood


### Group by Postal Code and Borough
The final task is to group by Postal Code and Borough and produce a list of all Neighbourhoods in each.

In [13]:

# One row per (Postcode, Borough); the neighbourhood names of each group are
# concatenated into a single ', '-separated string
part_01 = (
    df.groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)

In [14]:

# Preview the first rows of the grouped table
part_01.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [15]:
# Preview the last rows of the grouped table
part_01.tail()

Unnamed: 0,Postcode,Borough,Neighbourhood
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."
102,M9W,Etobicoke,Northwest


In [16]:
# (rows, columns) of the grouped table
part_01.shape

(103, 3)

# Get Latitude & Longitude for each Postal Code

Now that you have built a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name, in order to utilize the Foursquare location data, we need to get the latitude and the longitude coordinates of each neighborhood.

The steps are:

Install geocoder if required

Create a list of Postal Codes

Search for the Latitude and Longitude of each postal code

Update the DataFrame with the Latitude and Longitude

In [None]:
# Install geocoder from the conda-forge channel if it is not already available;
# --yes answers the confirmation prompt so the cell does not block
!conda install -c conda-forge geocoder --yes

In [None]:
# Only run once to get the Latitude / Longitude; the result is pickled afterwards
import time

import geocoder

# Upper bound on lookups per postal code, so a provider that never answers
# (missing API key, exhausted quota, no network) cannot hang the cell forever
MAX_GEOCODE_ATTEMPTS = 5
# Seconds to wait between two failed attempts for the same postal code
GEOCODE_RETRY_DELAY = 1

# Create a copy of the part_01 dataframe
part_02 = part_01.copy()

# Create a list of all the Postal Codes
postal_codes = part_01['Postcode'].tolist()

# Search for the Latitude and Longitude of each postal code
for postal_code in postal_codes:
    query = '{}, Toronto, Ontario'.format(postal_code)
    print(query)

    lat_lng_coords = None
    for attempt in range(1, MAX_GEOCODE_ATTEMPTS + 1):
        # Alternative provider; supply the key from outside the notebook
        # rather than pasting it into the source:
        # g = geocoder.opencage(query, key=<your OpenCage API key>)
        g = geocoder.google(query)

        lat_lng_coords = g.latlng
        # Treat both None and an empty list as "no result"
        if lat_lng_coords:
            break
        if attempt < MAX_GEOCODE_ATTEMPTS:
            time.sleep(GEOCODE_RETRY_DELAY)

    if not lat_lng_coords:
        raise RuntimeError(
            'No coordinates returned for "{}" after {} attempts; '
            'check the geocoding provider, its API key and the network connection'.format(
                query, MAX_GEOCODE_ATTEMPTS))

    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]

    part_02.loc[part_02.Postcode == postal_code, 'Latitude'] = latitude
    part_02.loc[part_02.Postcode == postal_code, 'Longitude'] = longitude

M1B, Toronto, Ontario


In [None]:
# Pickle the DataFrame to ./toronto.pkl so the geocoding calls do not have to be repeated
part_02.to_pickle('./toronto.pkl')

In [None]:
# Read the DataFrame back from ./toronto.pkl
# NOTE(review): unpickling can execute arbitrary code; only load a file this notebook wrote itself
part_02 = pd.read_pickle('./toronto.pkl')

In [None]:
# Show the new DataFrame for Part 02 (displays the whole frame, not just a preview)
part_02

# Segmenting and Clustering Neighborhoods in Toronto: Part 3
Explore and cluster the neighborhoods in Toronto. You can decide to work with only boroughs that contain the word Toronto and then replicate the same analysis we did to the New York City data. It is up to you.

Just make sure:

Add enough Markdown cells to explain what you decided to do and to report any observations you make.
Generate maps to visualize your neighborhoods and how they cluster together.
### Import Required Libraries and set up Foursquare Variables

In [None]:

# Standard library: used to read the API credentials from the environment
import os

# Library to handle data in a vectorized manner
import numpy as np

# Pandas library for data analysis
import pandas as pd

# Transform JSON into a flat pandas dataframe.
# pandas.io.json.json_normalize was deprecated in pandas 1.0 in favour of the
# top-level pandas.json_normalize; the old location is kept as a fallback so
# the cell still runs on older installs.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Show every column / row when a dataframe is displayed
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

### Define Foursquare Credentials and Version
# Credentials are read from the environment instead of being committed with the
# notebook. Any key pair that was previously stored in this cell should be
# considered exposed and rotated.
try:
    CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID']
    CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET']
except KeyError as missing:
    raise RuntimeError(
        'Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET '
        'environment variables before running this notebook') from missing
VERSION = '20180605'

# Limit of number of venues returned by Foursquare API
LIMIT = 100
# Define radius of search for Foursquare API
radius = 1000


In [22]:
def get_category_type(row):
    """Return the name of the first category listed for a venue record.

    Parameters
    ----------
    row : mapping-like (e.g. dict or pandas Series)
        Must provide the category list under 'categories' or, failing that,
        under 'venue.categories'.

    Returns
    -------
    The 'name' of the first category, or None when the category list is
    missing (None) or empty.

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error propagates
        categories_list = row['venue.categories']

    if categories_list is None or len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [23]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare explore endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location; they are consumed in parallel.
    radius : number, default 500
        Search radius passed to the API as the `radius` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but keeps these columns) when nothing is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.

    Uses the notebook-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    column_names = ['Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,  # do not hang the notebook on an unresponsive server
        )
        # Surface quota / authentication problems as an HTTP error instead of
        # a confusing KeyError on the JSON payload
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without any category must not abort the whole run
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the schema intact even
    # when no venue was returned at all
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return nearby_venues

In [24]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    The first element of `row` is skipped; the remaining entries are ordered
    from largest to smallest value and the index labels of the first
    `num_top_venues` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index[:num_top_venues].values

### Use the getNearbyVenues Function to get a table of venues

In [26]:
# Fetch the venues around every postal code. The postal code is used as the
# 'Neighborhood' label. The configured search radius is passed explicitly;
# without it the function silently falls back to its own default of 500.
toronto_venues = getNearbyVenues(names=part_02['Postcode'],
                                 latitudes=part_02['Latitude'],
                                 longitudes=part_02['Longitude'],
                                 radius=radius
                                )

KeyError: 'Latitude'

In [27]:
# Pickle the Venues DataFrame to ./toronto_venues.pkl as well
toronto_venues.to_pickle('./toronto_venues.pkl')

NameError: name 'toronto_venues' is not defined

In [28]:
# Reload the venues table from ./toronto_venues.pkl
# NOTE(review): unpickling can execute arbitrary code; only load a file this notebook wrote itself
toronto_venues = pd.read_pickle('./toronto_venues.pkl')

FileNotFoundError: [Errno 2] No such file or directory: './toronto_venues.pkl'

In [29]:
# Report the dimensions of the venues table, then preview its first rows
print(toronto_venues.shape)
toronto_venues.head()

NameError: name 'toronto_venues' is not defined

In [30]:

# Non-null entries per column for each 'Neighborhood' value
toronto_venues.groupby('Neighborhood').count()

NameError: name 'toronto_venues' is not defined

In [31]:
# Number of distinct values in 'Venue Category' (a missing value counts as one)
print('There are {} uniques categories.'.format(toronto_venues['Venue Category'].nunique(dropna=False)))

NameError: name 'toronto_venues' is not defined

In [32]:
# Venue count per 'Neighborhood' value
size = toronto_venues['Neighborhood'].value_counts()
# Labels that have at least 10 venues
idx = size.loc[size >= 10].index
# Keep only the venues that belong to those labels
toronto_venues_filtered = toronto_venues.loc[toronto_venues['Neighborhood'].isin(idx)]
toronto_venues_filtered.shape

NameError: name 'toronto_venues' is not defined

In [33]:
# One indicator column per venue category
toronto_onehot = pd.get_dummies(toronto_venues_filtered[['Venue Category']], prefix="", prefix_sep="")

# A venue category that is itself called 'Neighborhood' would collide with the
# key column; it is replaced by the key column, which goes first
toronto_onehot = toronto_onehot.drop(columns='Neighborhood', errors='ignore')
toronto_onehot.insert(0, 'Neighborhood', toronto_venues_filtered['Neighborhood'])

NameError: name 'toronto_venues_filtered' is not defined

In [34]:
# Preview the first rows of the one-hot encoded table
toronto_onehot.head()

NameError: name 'toronto_onehot' is not defined

In [35]:
# (rows, columns) of the one-hot encoded table
toronto_onehot.shape

NameError: name 'toronto_onehot' is not defined

In [36]:
# Mean of every indicator column per 'Neighborhood', i.e. the share of each
# venue category; 'Neighborhood' stays a regular column
toronto_grouped = toronto_onehot.groupby('Neighborhood', as_index=False).mean()
toronto_grouped

NameError: name 'toronto_onehot' is not defined

In [37]:
num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# Column headers: '1st', '2nd', '3rd', then '<n>th' for every later rank.
# The suffix is chosen with an explicit bounds check rather than by catching
# whatever exception an out-of-range lookup happens to raise.
columns = ['Neighborhood']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# One record per row of toronto_grouped: its label followed by its top
# categories; the dataframe is then built in a single step
top_venue_rows = [
    [grouped_row['Neighborhood']] + list(return_most_common_venues(grouped_row, num_top_venues))
    for _, grouped_row in toronto_grouped.iterrows()
]
neighborhoods_venues_sorted = pd.DataFrame(top_venue_rows, columns=columns)

print(neighborhoods_venues_sorted.shape)
neighborhoods_venues_sorted


NameError: name 'toronto_grouped' is not defined

In [38]:
# Set number of clusters
kclusters = 5
# Feature matrix: everything except the label column. The column is dropped by
# keyword because the positional `axis` argument of drop() is no longer
# accepted by recent pandas releases.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# Run k-means clustering. n_init is pinned because its default differs between
# scikit-learn versions; random_state keeps the run reproducible.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for the first rows of the feature matrix
kmeans.labels_[0:10]

NameError: name 'toronto_grouped' is not defined

In [39]:
# Coordinates of the postal codes that were kept for clustering, keyed like the
# venue tables ('Neighborhood' holds the postal code)
toronto_merged = part_02[part_02.Postcode.isin(idx)].rename(columns={'Postcode': 'Neighborhood'})

# Add clustering labels. kmeans.labels_ follows the row order of
# toronto_grouped, so the labels are attached by key rather than by position;
# a positional assignment is only correct while both tables happen to be
# sorted identically and contain exactly the same postal codes.
cluster_labels = pd.DataFrame({
    'Neighborhood': toronto_grouped['Neighborhood'].values,
    'Cluster Labels': kmeans.labels_,
})
toronto_merged = toronto_merged.join(cluster_labels.set_index('Neighborhood'), on='Neighborhood')

# Add the most common venue categories of each postal code
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

print(toronto_merged.shape)
toronto_merged

NameError: name 'idx' is not defined

NameError: name 'toronto_merged' is not defined