# Segmenting and Clustering Neighborhoods in Toronto

### IBM Applied Datascience Capstone Week 3 Assignment

### Part 1 : dataframe of the postal code of each neighborhood along

In [134]:

import pandas as pd
import numpy as np

In [148]:
# Get the table from this Wikipedia https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, scrape the  Wikipedia page
!wget -O 'postal_code_canada.htm' https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

--2020-04-18 04:26:03--  https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M
Resolving en.wikipedia.org (en.wikipedia.org)... 208.80.154.224, 2620:0:861:ed1a::1
Connecting to en.wikipedia.org (en.wikipedia.org)|208.80.154.224|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 51448 (50K) [text/html]
Saving to: ‘postal_code_canada.htm’


2020-04-18 04:26:03 (833 KB/s) - ‘postal_code_canada.htm’ saved [51448/51448]



In [149]:
# read firt table into DF
postal_df = pd.read_html('postal_code_canada.htm')[0]


In [150]:
#Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
postal_df = postal_df[postal_df['Borough'] != 'Not assigned'] 

In [151]:
# More than one neighborhood can exist in one postal code area. Combine these rows  into one row with the neighborhoods separated with a comma.
# Replace '/' in Neighborhood with ',' 
postal_df['Neighborhood'] = postal_df['Neighborhood'].str.replace("/",",")
#concatenate neighborhoods under same postal code
postal_df['Neighborhood']= postal_df.groupby(['Postal code'])['Neighborhood'].transform(lambda x: ','.join(x))

In [152]:
#If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
postal_df['Neighborhood'] = np.where(postal_df['Neighborhood'].isna() , postal_df['Borough'],postal_df['Neighborhood'])
postal_df['Neighborhood'] = np.where(postal_df['Neighborhood'] == 'Not assigned' , postal_df['Borough'],postal_df['Neighborhood'])

In [153]:
# Rename column postal code
postal_df.rename(columns={"Postal code": "Postal Code"}, inplace=True)
postal_df.head(20)

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"
5,M6A,North York,"Lawrence Manor , Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,"Malvern , Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill , Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


### Part 2:  Get Geolocation of Neighborhood

In [154]:
#!conda install -c conda-forge geocoder

In [155]:
import geocoder # import geocoder

def get_geo_location (postal_code) :
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
      g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
      lat_lng_coords = g.latlng
    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    return latitude, longitude

In [156]:
# lat, long = get_geo_location('M6A')
#print("Postal = {}, Lat= {}, Long = {}", 'M6A', lat,long)

In [157]:
# Read geolocation csv
geo_df = pd.read_csv('https://cocl.us/Geospatial_data')

In [158]:
geo_df.head()
geo_df.set_index('Postal Code')
postal_df.reset_index()
postal_df =postal_df.set_index('Postal Code')
postal_df
# Select duplicate rows except first occurrence based on all columns
#duplicateRowsDF = postal_df[postal_df.duplicated()]
 
#print("Duplicate Rows except first occurrence based on all columns are :")
#print(duplicateRowsDF)
#postal_df.set_index('Postal Code')
postal_df = pd.merge(postal_df, geo_df, on='Postal Code')


## Part 3 : Clustering and Analysis

In [129]:
#import libraries
#!conda install -c conda-forge geopy --yes # To install Foursquare API lab
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans


import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          92 KB

The following NEW packages will be INSTALLED:

    geographiclib: 1.50-py_0   conda-forge
    geopy:         1.21.0-py_0 conda-forge


Downloading and Extracting Packages
geographiclib-1.50   | 34 KB     | ##################################### | 100% 
geopy-1.21.0         | 58 KB     | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Solving environ

In [130]:
address = 'Toronto'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [159]:
## Map of toronto
# create map of Tononto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(postal_df['Latitude'], postal_df['Longitude'], postal_df['Borough'], postal_df['Neighborhood']):
    label = '{}, {}'.format(postal_df, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [160]:
# Check Downtown Toronto data
Downtown_Toronto_data = postal_df[postal_df['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
Downtown_Toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [161]:
address = 'Downtown Toronto, TO'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Downtown Toronto are 43.6541737, -79.38081164513409.


In [162]:
# map of downtown toronto
# create map of Manhattan using latitude and longitude values
map_downtown_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(Downtown_Toronto_data['Latitude'], Downtown_Toronto_data['Longitude'], Downtown_Toronto_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_downtown_toronto)  
    
map_downtown_toronto

### Define Foursquare Credentials and Version

In [163]:
CLIENT_ID = 'IZ5ZKQHXHASMYOOZKZSNKZ0FJBNUZ5EHBGBGLYMZQYWNKNZI' # your Foursquare ID
CLIENT_SECRET = 'HK0NWKHNPYLHPA5DNH4FA5ZDSQWM5VKCRE115LL1NF4U4UE1' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: IZ5ZKQHXHASMYOOZKZSNKZ0FJBNUZ5EHBGBGLYMZQYWNKNZI
CLIENT_SECRET:HK0NWKHNPYLHPA5DNH4FA5ZDSQWM5VKCRE115LL1NF4U4UE1


In [164]:
# exlpore Downtown_Toronto 
Downtown_Toronto_data.loc[0, 'Neighborhood']

'Regent Park , Harbourfront'

In [165]:
neighborhood_latitude = Downtown_Toronto_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = Downtown_Toronto_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = Downtown_Toronto_data.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Regent Park , Harbourfront are 43.6542599, -79.3606359.


In [166]:
# Build url to get 100 venues near Regent Park
# type your answer here
search_query = "Regent Park"
radius=500
LIMIT=100
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, neighborhood_latitude, neighborhood_longitude, VERSION, radius, LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?client_id=IZ5ZKQHXHASMYOOZKZSNKZ0FJBNUZ5EHBGBGLYMZQYWNKNZI&client_secret=HK0NWKHNPYLHPA5DNH4FA5ZDSQWM5VKCRE115LL1NF4U4UE1&ll=43.6542599,-79.3606359&v=20180605&radius=500&limit=100'

In [167]:
# make FourSquare API request to get venues around Regent Park
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e9a864fdf2774001bddc192'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Corktown',
  'headerFullLocation': 'Corktown, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 48,
  'suggestedBounds': {'ne': {'lat': 43.6587599045, 'lng': -79.3544279001486},
   'sw': {'lat': 43.6497598955, 'lng': -79.36684389985142}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '54ea41ad498e9a11e9e13308',
       'name': 'Roselle Desserts',
       'location': {'address': '362 King St E',
        'crossStreet': 'Trinity St',
        'lat': 43.653446723052674,
        'lng': -79.3620167174383,
        'labeledLatLngs': [{'label': 'display',
 

In [168]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [169]:
venues = results['response']['groups'][0]['items']
#venues = results['response']['venues']    
nearby_venues = json_normalize(venues) # flatten JSON
nearby_venues

Unnamed: 0,reasons.count,reasons.items,referralId,venue.categories,venue.id,venue.location.address,venue.location.cc,venue.location.city,venue.location.country,venue.location.crossStreet,...,venue.location.labeledLatLngs,venue.location.lat,venue.location.lng,venue.location.neighborhood,venue.location.postalCode,venue.location.state,venue.name,venue.photos.count,venue.photos.groups,venue.venuePage.id
0,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-54ea41ad498e9a11e9e13308-0,"[{'id': '4bf58dd8d48988d16a941735', 'name': 'B...",54ea41ad498e9a11e9e13308,362 King St E,CA,Toronto,Canada,Trinity St,...,"[{'label': 'display', 'lat': 43.65344672305267...",43.653447,-79.362017,,M5A 1K9,ON,Roselle Desserts,0,[],
1,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-53b8466a498e83df908c3f21-1,"[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",53b8466a498e83df908c3f21,368 King St E,CA,Toronto,Canada,at Trinity St,...,"[{'label': 'display', 'lat': 43.65355870959944...",43.653559,-79.361809,,,ON,Tandem Coffee,0,[],
2,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-574c229e498ebb5c6b257902-2,"[{'id': '52e81612bcbc57f1066b7a37', 'name': 'D...",574c229e498ebb5c6b257902,461 Cherry St,CA,Toronto,Canada,,...,"[{'label': 'display', 'lat': 43.65324910177244...",43.653249,-79.358008,,M5A 0H7,ON,Cooper Koo Family YMCA,0,[],
3,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-50760559e4b0e8c7babe2497-3,"[{'id': '4bf58dd8d48988d1ed941735', 'name': 'S...",50760559e4b0e8c7babe2497,497 King Street East,CA,Toronto,Canada,btwn Sackville St and Sumach St,...,"[{'label': 'display', 'lat': 43.65473505045365...",43.654735,-79.359874,,M5A 1L9,ON,Body Blitz Spa East,0,[],
4,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4ae5b91ff964a520a6a121e3-4,"[{'id': '4bf58dd8d48988d143941735', 'name': 'B...",4ae5b91ff964a520a6a121e3,457 King St. E,CA,Toronto,Canada,Gilead Place,...,"[{'label': 'display', 'lat': 43.65394694263529...",43.653947,-79.361149,,M5A 1L6,ON,Morning Glory Cafe,0,[],39686393.0
5,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-5612b1cc498e3dd742af0dc8-5,"[{'id': '4bf58dd8d48988d1c4941735', 'name': 'R...",5612b1cc498e3dd742af0dc8,573 King St E,CA,Toronto,Canada,at St Lawrence St,...,"[{'label': 'display', 'lat': 43.65636850543279...",43.656369,-79.35698,,M5A 4L3,ON,Impact Kitchen,0,[],
6,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-51ccc048498ec7792efc955e-6,"[{'id': '4bf58dd8d48988d163941735', 'name': 'P...",51ccc048498ec7792efc955e,,CA,,Canada,,...,"[{'label': 'display', 'lat': 43.65561779974973...",43.655618,-79.356211,,,,Corktown Common,0,[],
7,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4af59046f964a520e0f921e3-7,"[{'id': '4bf58dd8d48988d143941735', 'name': 'B...",4af59046f964a520e0f921e3,344 Queen St. E.,CA,Toronto,Canada,at Parliament St.,...,"[{'label': 'display', 'lat': 43.65567455427388...",43.655675,-79.364503,,M5A 1S8,ON,Figs Breakfast & Lunch,0,[],
8,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4ad4c05ef964a520bff620e3-8,"[{'id': '4deefb944765f83613cdba6e', 'name': 'H...",4ad4c05ef964a520bff620e3,"btwn Front, Cherry, Gardiner & Parliament",CA,Toronto,Canada,,...,"[{'label': 'display', 'lat': 43.65024435658077...",43.650244,-79.359323,,M5A 3C4,ON,The Distillery Historic District,0,[],
9,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-566e1294498e3f6629006bc3-9,"[{'id': '4bf58dd8d48988d11b941735', 'name': 'P...",566e1294498e3f6629006bc3,500 Queen Street East,CA,Toronto,Canada,,...,"[{'label': 'display', 'lat': 43.65691857501867...",43.656919,-79.358967,,M5A 1T9,ON,Dominion Pub and Kitchen,0,[],


In [170]:
# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
#filtered_columns = ['name', 'categories', 'location.lat', 'location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Roselle Desserts,Bakery,43.653447,-79.362017
1,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,Cooper Koo Family YMCA,Distribution Center,43.653249,-79.358008
3,Body Blitz Spa East,Spa,43.654735,-79.359874
4,Morning Glory Cafe,Breakfast Spot,43.653947,-79.361149


In [171]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

48 venues were returned by Foursquare.


In [173]:
# Explore venues in Regent Park
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [174]:
# Get Venues near Regent Park

regent_park_venues = getNearbyVenues(names=Downtown_Toronto_data['Neighborhood'],
                                   latitudes=Downtown_Toronto_data['Latitude'],
                                   longitudes=Downtown_Toronto_data['Longitude']
                                  )

Regent Park , Harbourfront
Queen's Park , Ontario Provincial Government
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Christie
Richmond , Adelaide , King
Harbourfront East , Union Station , Toronto Islands
Toronto Dominion Centre , Design Exchange
Commerce Court , Victoria Hotel
University of Toronto , Harbord
Kensington Market , Chinatown , Grange Park
CN Tower , King and Spadina , Railway Lands , Harbourfront West , Bathurst Quay , South Niagara , Island airport
Rosedale
Stn A PO Boxes
St. James Town , Cabbagetown
First Canadian Place , Underground city
Church and Wellesley


In [175]:
print(regent_park_venues.shape)
regent_park_venues.head()

(1215, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park , Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park , Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park , Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park , Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park , Harbourfront",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot


In [176]:
#Let's check how many venues were returned for each neighborhood
regent_park_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,56,56,56,56,56,56
"CN Tower , King and Spadina , Railway Lands , Harbourfront West , Bathurst Quay , South Niagara , Island airport",18,18,18,18,18,18
Central Bay Street,59,59,59,59,59,59
Christie,18,18,18,18,18,18
Church and Wellesley,73,73,73,73,73,73
"Commerce Court , Victoria Hotel",100,100,100,100,100,100
"First Canadian Place , Underground city",100,100,100,100,100,100
"Garden District, Ryerson",100,100,100,100,100,100
"Harbourfront East , Union Station , Toronto Islands",100,100,100,100,100,100
"Kensington Market , Chinatown , Grange Park",57,57,57,57,57,57


In [178]:
#Let's find out how many unique categories can be curated from all the returned venues
print('There are {} uniques categories.'.format(len(regent_park_venues['Venue Category'].unique())))

There are 205 uniques categories.
