# Segmenting and Clustering Neighborhoods in Toronto

### Introduction
##### Web scraping the following Wikipedia page, 
###### https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M,
##### in order to obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe

Before we start,let's download all the dependencies that we will need.

In [28]:
#pip install requests BeautifulSoup4
import random # library for random number generation
import numpy as np # library for vectorized computation
import pandas as pd # library to process data as dataframes
import requests

import matplotlib.pyplot as plt # plotting library
# backend for rendering plots within the browser
%matplotlib inline 

from pandas.io.json import json_normalize 
from sklearn.cluster import KMeans 
from sklearn.datasets.samples_generator import make_blobs
from geopy.geocoders import Nominatim
from selenium import webdriver
from bs4 import BeautifulSoup

print('Libraries imported.')

Libraries imported.


In [2]:
results = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
#print(results.status_code)
#print(results.headers)
src=results.content
soup = BeautifulSoup(src,'lxml')
table = soup.find("table",attrs={"class":"wikitable"})
#table_header = table.find_all("th")
table_body = table.find_all("td")
#print(table_header)
print(len(table_body))

540


In [3]:
pc=[]
bor=[]
nb=[]
for i in range(0,len(table_body),3):
    pc.append((table_body[i].text).strip())
    bor.append((table_body[i+1].text).strip())
    nb.append((table_body[i+2].text).strip())
    #print(table_body[i].text)
#print(pc)
#print(bor)
#print(nb)

Toronto = (pd.DataFrame({'Postal Code':pc,'Borough':bor,'Neighborhood':nb})).reset_index(drop=True)
Toronto.head()


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


**Removing Not Assigned Boroughs**

In [4]:
Toronto = Toronto[Toronto.Borough != 'Not assigned']
Toronto.head()
Toronto.describe(include='all')

Unnamed: 0,Postal Code,Borough,Neighborhood
count,103,103,103
unique,103,10,99
top,M5G,North York,Downsview
freq,1,24,4


**Setting Not Assigned Neighborhoods to same Borough**

In [5]:
Toronto['Neighborhood'] = np.where(Toronto['Neighborhood'] == 'Not assigned', Toronto['Borough'], Toronto['Neighborhood'])
Toronto.describe(include='all')

Unnamed: 0,Postal Code,Borough,Neighborhood
count,103,103,103
unique,103,10,99
top,M5G,North York,Downsview
freq,1,24,4


**Created DataFrame Shape**

In [6]:
Toronto.shape

(103, 3)

In [7]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(Toronto['Borough'].unique()),
        Toronto.shape[0]
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


In [8]:
geo = pd.read_csv('Geospatial_Coordinates.csv')
geo.head()

Toronto = pd.merge(left=Toronto, right=geo, left_on='Postal Code', right_on='Postal Code')

#Toronto = Toronto.merge(geo, on='Postal Code')
#Toronto
#print('finished getting longitude and lattitude')

In [9]:
Toronto.dtypes

Postal Code      object
Borough          object
Neighborhood     object
Latitude        float64
Longitude       float64
dtype: object

In [10]:
address = 'Toronto, Ontario, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto, Ontario are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto, Ontario are 43.6534817, -79.3839347.


In [15]:
# create map of using latitude and longitude values
import folium 
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(Toronto['Latitude'], Toronto['Longitude'], Toronto['Borough'], Toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  

map_toronto

#### Define Foursquare Credentials and Version

In [16]:
CLIENT_ID = 'TJQFNQLD13APOGBQ0IZUQEHVLUX4UQVIGYWRO4CYRVHOWBV4' # your Foursquare ID
CLIENT_SECRET = 'PECTIYU42AT2IKKGL4DG2KZGCKW5LC1VPTLVGMNAFUVJQCYT' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: TJQFNQLD13APOGBQ0IZUQEHVLUX4UQVIGYWRO4CYRVHOWBV4
CLIENT_SECRET:PECTIYU42AT2IKKGL4DG2KZGCKW5LC1VPTLVGMNAFUVJQCYT


In [18]:
Toronto.loc[0, 'Neighborhood']

'Parkwoods'

In [31]:
LIMIT = 100 
radius = 500
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [34]:
toronto_venues = getNearbyVenues(names=Toronto['Neighborhood'],
                                   latitudes=Toronto['Latitude'],
                                   longitudes=Toronto['Longitude']
                                  )


Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Queen's Park, Ontario Provincial Government
Islington Avenue, Humber Valley Village
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto, Broadview North (Old East York)
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmo

Number of venues were returned for each neighborhood

In [37]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,4,4,4,4,4,4
"Alderwood, Long Branch",8,8,8,8,8,8
"Bathurst Manor, Wilson Heights, Downsview North",22,22,22,22,22,22
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",27,27,27,27,27,27
...,...,...,...,...,...,...
"Willowdale, Willowdale East",34,34,34,34,34,34
"Willowdale, Willowdale West",6,6,6,6,6,6
Woburn,5,5,5,5,5,5
Woodbine Heights,5,5,5,5,5,5


In [38]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 267 uniques categories.
