# Segmenting and Clustering Neighborhoods in Toronto: Parts 2 and 3

### Part 2: Leveraging the geographical coordinates

In [2]:
# Import required libraries
import random # library for random number generation
import numpy as np # library for vectorized computation
import pandas as pd # library to process data as dataframes
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt # plotting library

# Data collection
import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents

# Map
!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Import k-means from clustering stage
from sklearn.cluster import KMeans

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Libraries imported.


In [7]:
# Collect the data: download the Wikipedia page listing the "M" postal codes.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of silently
# parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
content = response.text
# Parse the HTML into a BeautifulSoup object
data = BeautifulSoup(content, 'html.parser')

In [8]:
# Process data
# Accumulators for the values read from the table, one list per column:
# PostalCode, Borough, and Neighborhood
postalCodeList, boroughList, neighborhoodList = [], [], []

In [9]:
# Loop through the first table on the page; store the PostalCode, Borough, and
# Neighborhood cell of every data row in its list. A data row looks like:
# <tr><td>M9B</td><td><a href="/wiki/Etobicoke" title="Etobicoke">Etobicoke</a></td><td><a class="mw-redirect" href="/wiki/Islington,_Toronto" title="Islington, Toronto">Islington</a></td></tr>

# Start from empty lists so that re-running this cell does not append
# duplicate rows to the previous run's results.
postalCodeList, boroughList, neighborhoodList = [], [], []

for row in data.find('table').find_all('tr'):
    cells = row.find_all('td')
    # Skip rows without <td> cells (e.g. a header row) and any row too short
    # to hold all three columns, which would otherwise raise IndexError.
    if len(cells) >= 3:
        # strip() removes the surrounding whitespace/newlines of the cell text
        postalCodeList.append(cells[0].text.strip())
        boroughList.append(cells[1].text.strip())
        neighborhoodList.append(cells[2].text.strip())

In [10]:
# Assemble the three scraped lists into one dataframe with the columns
# PostalCode, Borough, and Neighborhood
df_toronto = pd.DataFrame(dict(
    PostalCode=postalCodeList,
    Borough=boroughList,
    Neighborhood=neighborhoodList,
))

df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [11]:
# Keep only the rows whose borough is assigned, then renumber the index from 0
has_borough = df_toronto["Borough"].ne("Not assigned")
df_toronto_dropna = df_toronto.loc[has_borough].reset_index(drop=True)
df_toronto_dropna.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [15]:
# Collapse rows that share a postal code and borough into a single row,
# joining their neighborhood names with ", "
df_toronto_grouped = (
    df_toronto_dropna
    .groupby(["PostalCode", "Borough"], as_index=False)
    .agg(", ".join)
)
df_toronto_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [39]:
# Where the neighborhood is "Not assigned", use the borough name instead.
# The assignment goes through .loc with a boolean mask: the rows yielded by
# iterrows() may be copies, so writing to them is not guaranteed to change
# the dataframe itself.
unassigned = df_toronto_grouped["Neighborhood"] == "Not assigned"
df_toronto_grouped.loc[unassigned, "Neighborhood"] = df_toronto_grouped.loc[unassigned, "Borough"]

df_toronto_grouped.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [17]:
# Check the dimensions (rows, columns) of the grouped dataframe
df_toronto_grouped.shape

(103, 3)

In [20]:
# Load the CSV file that has the geographical coordinates of each postal code
coordinates = pd.read_csv("Geospatial_Coordinates.csv")
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [21]:
# Rename "Postal Code" to "PostalCode" so the key column carries the same
# name as in the neighborhood dataframe
coordinates = coordinates.rename(columns={"Postal Code": "PostalCode"})
coordinates.head(10)

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [22]:
# Left-join the coordinates onto the grouped neighborhoods by postal code,
# keeping every row of the grouped dataframe
df_toronto_coordinates = pd.merge(
    df_toronto_grouped,
    coordinates,
    how="left",
    on="PostalCode",
)
df_toronto_coordinates.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


### Part 3: Generate maps to visualize Toronto neighborhoods

In [24]:
# Use the geopy library to get the latitude and longitude values of Toronto
address = 'Toronto, Ontario'
# User agent string that identifies this application to the geocoding service
geolocator = Nominatim(user_agent="tor_explorer")

# Get the geographical coordinates of Toronto. geocode() returns None when the
# address cannot be resolved, so fail with a clear message instead of an
# AttributeError on the attribute reads below.
location = geolocator.geocode(address)
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [25]:
# Create a folium map centered on the Toronto coordinates (latitude, longitude)
# with an initial zoom level of 11
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

In [26]:
# Add one circle marker per row of df_toronto_coordinates (built in Part 2)
# to the map; each marker's popup reads "<neighborhood>, <borough>".
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
for lat, lng, borough, neighborhood in df_toronto_coordinates[marker_columns].itertuples(index=False, name=None):
    popup_text = '{}, {}'.format(neighborhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

In [27]:
# Leverage the Foursquare API to explore the neighborhoods.
# Define the Foursquare constants. The credentials are read from the
# FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET environment variables rather
# than hard-coded, so they never end up in the saved notebook. A missing
# variable raises KeyError naming the variable that has to be set.
import os  # imported here because only this cell needs it

CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# Confirm the credentials are available without echoing their values into the
# notebook output
print('Foursquare credentials loaded.')

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


In [28]:
# Look up the neighborhood name in the first row (index label 0) of the dataframe
df_toronto_coordinates.loc[0, 'Neighborhood']

'Rouge, Malvern'

In [29]:
# Build the Foursquare "explore" request for up to 100 venues within a radius
# of 500 meters of the Toronto coordinates (latitude, longitude) obtained from
# the geocoder above.
# Create the GET request URL. Name your URL url
radius = 500
LIMIT = 100
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, radius, LIMIT)

# Display the URL with the credentials masked so that they do not leak into
# the saved notebook output; `url` itself still holds the real values.
url.replace(CLIENT_ID, '<CLIENT_ID>').replace(CLIENT_SECRET, '<CLIENT_SECRET>')

'https://api.foursquare.com/v2/venues/explore?client_id=<CLIENT_ID>&client_secret=<CLIENT_SECRET>&ll=43.653963,-79.387207&v=20180605&radius=500&limit=100'

In [40]:
# Send the GET request and pull the venue items out of the JSON response.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() reports an HTTP error directly instead of letting it
# show up as a KeyError on the lookups below.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()
venues = results['response']['groups'][0]['items']

In [31]:
# Flatten the nested venue JSON into a table and keep only the name,
# categories, and coordinate columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = json_normalize(venues).loc[:, filtered_columns]

In [32]:
# Add category filter
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']   
        if len(categories_list) == 0:
            return None
        else:
            return categories_list[0]['name']

In [33]:
# Replace the 'venue.categories' value of every row with the result of
# get_category_type for that row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# Keep only the part of each column name after its last dot,
# e.g. 'venue.location.lat' -> 'lat'
nearby_venues.columns = [label.rsplit(".", 1)[-1] for label in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Downtown Toronto,Neighborhood,43.653232,-79.385296
1,Japango,Sushi Restaurant,43.655268,-79.385165
2,Sansotei Ramen 三草亭,Ramen Restaurant,43.655157,-79.386501
3,Cafe Plenty,Café,43.654571,-79.38945
4,Poke Guys,Poke Place,43.654895,-79.385052


In [34]:
# Collect the venues around every postal code: one tuple per venue, carrying
# the postal code, borough, neighborhood and coordinates of the row it was
# found for, followed by the venue's name, coordinates and category.
venues_list = []

for lat, long, post, borough, neighborhood in zip(df_toronto_coordinates['Latitude'], df_toronto_coordinates['Longitude'], df_toronto_coordinates['PostalCode'], df_toronto_coordinates['Borough'], df_toronto_coordinates['Neighborhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)

    # The timeout keeps one stalled request from blocking the whole loop, and
    # raise_for_status() reports an HTTP error directly instead of letting it
    # show up as a KeyError on the lookups below.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    items = response.json()["response"]['groups'][0]['items']

    for venue in items:
        details = venue['venue']
        # Record None for a venue with an empty category list (the same
        # convention get_category_type uses) instead of raising IndexError.
        categories = details['categories']
        category_name = categories[0]['name'] if categories else None
        venues_list.append((
            post,
            borough,
            neighborhood,
            lat,
            long,
            details['name'],
            details['location']['lat'],
            details['location']['lng'],
            category_name))
        

In [37]:
# Create a new dataframe from the venues list. The column names are passed to
# the constructor so that an empty venues_list still yields an (empty) frame
# with the expected columns instead of failing on a later column assignment.
# Note: BoroughLatitude / BoroughLongitude hold the Latitude / Longitude of
# the df_toronto_coordinates row the venue was requested for.
venues_df = pd.DataFrame(
    venues_list,
    columns=['PostalCode', 'Borough', 'Neighborhood', 'BoroughLatitude', 'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory'],
)

venues_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,RIGHT WAY TO GOLF,43.785177,-79.161108,Golf Course
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
3,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


In [38]:
# Check the size (rows, columns) of the resulting dataframe
print(venues_df.shape)

(2272, 9)
