# Segmenting and Clustering Neighborhoods in Toronto


*@author : __Mouad Choukhairi__*

### The code below is used for the Toronto neighborhood clustering exercise


In [1]:
import numpy as np
import pandas as pd

import requests 
from bs4 import BeautifulSoup 

In [2]:
# Fetch the Wikipedia page that lists the Canadian postal codes starting with "M".
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# A timeout keeps the request from hanging indefinitely, and raise_for_status()
# surfaces HTTP errors here instead of letting an error page be parsed below.
response = requests.get(URL, timeout=30)
response.raise_for_status()
SourceData = response.text
DataC = BeautifulSoup(SourceData, 'lxml')

# Extract only the table with class "wikitable sortable".
Table = DataC.find('table', {'class': 'wikitable sortable'})
if Table is None:
    # find() returns None when nothing matches; fail now with a clear message
    # rather than with an AttributeError in a later cell.
    raise ValueError("No 'wikitable sortable' table found at " + URL)

print('Page Scrapped.')

Page Scrapped.


In [3]:
# Walk every <tr> of the table and collect [postal code, borough, neighborhood]
# from its first three <td> cells, skipping rows whose borough is unassigned.
dataC = []
for tr in Table.find_all('tr'):
    cells = tr.find_all('td')
    if not cells:
        # Rows without any <td> cells contribute no data.
        continue

    postalcode, borough, neighborhood = (
        cells[i].text.rstrip() for i in range(3)
    )
    if borough == 'Not assigned':
        continue
    if neighborhood == 'Not assigned':
        # A borough without a named neighborhood uses the borough name instead.
        neighborhood = borough
    dataC.append([postalcode, borough, neighborhood])

# Column headings, read from the <th> cells of the table's first row.
col_head = [th.text.strip() for th in Table.tr.find_all('th')]

print('Cleaning. Part-1')

Cleaning. Part-1


In [4]:
# Convert the scraped rows into a DataFrame, naming the columns up front.
# Passing `columns=` to the constructor also works when dataC is empty;
# assigning df.columns afterwards would raise a length-mismatch ValueError
# because an empty list produces a frame with zero columns.
df = pd.DataFrame(dataC, columns=['Postalcode', 'Borough', 'Neighborhood'])

print('Dataframe Created.')

Dataframe Created.


In [5]:
# Find the index labels of rows whose Borough is "Not assigned" and drop
# those rows from the dataframe in place.
empty = 'Not assigned'
indexNames = df[df['Borough'] == empty].index
df.drop(indexNames, inplace=True)

# If a row has a borough but a "Not assigned" neighborhood, the neighborhood
# becomes the same as the borough (the assignment aligns on the row index).
df.loc[df['Neighborhood'] == empty, 'Neighborhood'] = df['Borough']

# Rows that share the same postal code and borough are combined into one row,
# with their neighborhoods joined by ", ". sort=False keeps the groups in
# order of first appearance instead of sorting by key.
result = df.groupby(['Postalcode', 'Borough'], sort=False).agg(', '.join)
df_new = result.reset_index()

print('Cleaning. Part-2')

Cleaning. Part-2


In [6]:
# Preview up to the first 12 rows of the cleaned, grouped dataframe.
df_new.head(12)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [7]:
# Use the .shape attribute, a (rows, columns) tuple, to print the dimensions of the dataframe
print('The DataFrame shape is', df_new.shape)

The DataFrame shape is (103, 3)


In [8]:
# Load the CSV file holding the geographical coordinates of each postal code
# (source: http://cocl.us/Geospatial_data). The relative path means the file
# is looked up in the current working directory.
df_coord = pd.read_csv('Geospatial_Coordinates.csv')
df_coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
# Rename 'Postal Code' to 'Postalcode' in place so the column name matches the
# 'Postalcode' column of df_new.
df_coord.rename(columns={'Postal Code': 'Postalcode'}, inplace=True)
df_coord.head()

Unnamed: 0,Postalcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
# Attach the coordinates to each postal-code row by joining on 'Postalcode'.
# pd.merge defaults to an inner join, so a postal code that is missing from
# either dataframe does not appear in the result.
DFtmp = pd.merge(df_new , df_coord , on='Postalcode' )
DFtmp.head(12)

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
