# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
#imports all necessary libraries
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup #for scraping website

### Part 1: Scraping Wikipedia using BeautifulSoup

In [2]:
#website for scraping
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#fetches the page; the timeout keeps the cell from hanging on a stalled connection
response = requests.get(url, timeout=30)
#stops here on an HTTP error status instead of parsing an error page as if it were the article
response.raise_for_status()

#gets code from website as text
code = response.text
soup = BeautifulSoup(code,'html.parser')

#print(soup.prettify()) #un-comment to check code

### Extracts table from code

In [3]:
#takes the first 'table' element found in the parsed page
table = soup.find('table')
#find() returns None when nothing matches; stop here with a clear message
#rather than failing later with an AttributeError on table.find_all
if table is None:
    raise ValueError("No 'table' element was found in the downloaded page")
#print(table) #un-comment to check table

### Extracts postalcode, borough and neighborhood into lists

In [4]:
#sets up the column variables
postalcode = []
borough = []
neighborhood = []

#loop to extract postalcode, borough, neighborhood into the lists above
for tr in table.find_all('tr'): #look for 'tr' from the table
    cells = tr.find_all('td') #look for 'td' within 'tr' cell of the table
    #skips rows that do not have all three cells (such as rows made only of 'th' header cells);
    #appending just some of the values would leave the three lists with different lengths
    if len(cells) < 3:
        continue
    #only the first three cells are used, in order: postalcode, borough, neighborhood
    postalcode.append(cells[0].text.replace('\n',''))
    borough.append(cells[1].text.replace('\n',''))
    neighborhood.append(cells[2].text.replace('\n',''))

### Sets up the Toronto DataFrame

In [5]:
col = ['PostalCode', 'Borough', 'Neighborhood']
#pairs each column name with its scraped list
Toronto = pd.DataFrame(dict(zip(col, (postalcode, borough, neighborhood))))

#keeps only the rows whose borough is not 'Not assigned'
Toronto = Toronto[Toronto['Borough'] != 'Not assigned']

#a 'Not assigned' neighborhood takes the name of its borough
has_neighborhood = Toronto['Neighborhood'] != 'Not assigned'
Toronto = Toronto.assign(
    Neighborhood=Toronto['Neighborhood'].where(has_neighborhood, Toronto['Borough']))

#renumbers the rows from 0 after the filtering
Toronto = Toronto.reset_index(drop=True)

#shows the first 12 rows
Toronto.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [6]:
#outputs the shape as (number of rows, number of columns)
Toronto.shape

(103, 3)

### Part 2: Downloads and saves the geo info

In [7]:
#downloads the coordinates file and saves it as 'Geospatial_Coordinates.csv';
#uses requests (imported at the top) so the cell does not depend on a wget binary being installed,
#and raise_for_status() so a failed download stops here instead of going unnoticed
geo_response = requests.get('https://cocl.us/Geospatial_data', timeout=30)
geo_response.raise_for_status()
with open('Geospatial_Coordinates.csv', 'wb') as geo_file:
    geo_file.write(geo_response.content)

In [8]:
#loads the downloaded coordinates file into a DataFrame
df = pd.read_csv('Geospatial_Coordinates.csv')
#previews the first rows
df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
#outputs the shape as (number of rows, number of columns)
df.shape

(103, 3)

### Merges Toronto DataFrame with latitude and longitude

In [10]:
#gives df's key column the same name as Toronto's, so the merge result carries a single
#'PostalCode' column and there is no duplicate 'Postal Code' column to drop afterwards
geo = df.rename(columns={'Postal Code': 'PostalCode'})
Toronto = Toronto.merge(geo, on='PostalCode') #merges based on matching Postal Code

In [11]:
#previews the merged frame, which now includes the Latitude and Longitude columns
Toronto.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


### Part 3: Mapping and clustering

In [12]:
!pip install geopy
from geopy.geocoders import Nominatim
!pip install folium
import folium



### Gets coordinates of Toronto

In [13]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="to_explorer")
#an explicit timeout gives the geocoding service more time to answer than the library default
location = geolocator.geocode(address, timeout=10)
#geocode() returns None when no match is found; fail with a clear message
#instead of an AttributeError on location.latitude
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


### Shows map of Toronto

In [14]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
# the loop names differ on purpose from the `borough` / `neighborhood` lists built while scraping:
# reusing those names here would replace the lists with the last row's strings
for lat, lng, boro, hood in zip(Toronto['Latitude'], Toronto['Longitude'], Toronto['Borough'], Toronto['Neighborhood']):
    label = '{}, {}'.format(hood, boro)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

### Uses KMeans to cluster

In [15]:
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

#sets k as 10, can be adjusted based on number of clusters
k = 10

#clusters on the coordinates only; selecting the two columns by name (instead of dropping the
#others) keeps a 'Cluster' column left by an earlier run out of the features, and avoids the
#positional axis argument of DataFrame.drop, which pandas 2.0 no longer accepts
Toronto_cluster = Toronto[['Latitude', 'Longitude']]
#n_init is set explicitly because the library default changed between scikit-learn releases
kmeans = KMeans(n_clusters = k, n_init=10, random_state=0).fit(Toronto_cluster)

#removes the labels of a previous run so the cell can be re-run:
#insert() raises a ValueError when the column already exists
if 'Cluster' in Toronto.columns:
    Toronto = Toronto.drop(columns='Cluster')
Toronto.insert(0,'Cluster', kmeans.labels_)

In [16]:
#result of clustering: displays the frame, whose first column 'Cluster' holds each row's KMeans label
Toronto

Unnamed: 0,Cluster,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,4,M3A,North York,Parkwoods,43.753259,-79.329656
1,8,M4A,North York,Victoria Village,43.725882,-79.315572
2,1,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,7,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,4,M3B,North York,Don Mills,43.745906,-79.352188
8,8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


### Shows clustered map

In [17]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

#sets colour for clusters: k evenly spaced colours from the rainbow colormap, as hex strings
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

#adds markers to map
#only the columns that are used are iterated, so the loop does not overwrite the
#`borough` / `neighborhood` lists built while scraping
for lat, lng, cluster in zip(Toronto['Latitude'], Toronto['Longitude'], Toronto['Cluster']):
    label = folium.Popup('Cluster ' + str(cluster), parse_html=True)
    #KMeans labels run from 0 to k-1, so the label indexes the colour list directly
    cluster_colour = rainbow[cluster]
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=cluster_colour,
        fill=True,
        fill_color=cluster_colour,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters