### Segmenting and Clustering Neighborhoods in Toronto

In [1]:
#import necessary libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

# get target url and make soup
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# a timeout keeps the cell from hanging indefinitely on a stalled connection
req = requests.get(url, timeout=30)
# stop here on an HTTP error instead of parsing an error page as if it were the article
req.raise_for_status()
soup = BeautifulSoup(req.content,'html.parser')

In [2]:
# locate the <table> whose class attribute is "wikitable sortable"
my_table = soup.find('table', class_='wikitable sortable')

In [3]:
# one list per table column; the next cell turns these into dataframe columns
PostalCode = []
Borough = []
Neighborhood = []

# rows start with 'tr', so use find_all to extract all rows
rows = my_table.find_all('tr')

# cycle through all rows to populate the lists created above
for row in rows:
    # cells start with 'td', so use find_all to extract cells within each row
    cells = row.find_all('td')
    # three cells are read below, so require all three; the earlier `> 1` check
    # let a two-cell row through and failed with IndexError on cells[2]
    if len(cells) >= 3:
        # in order: postal code, borough, neighborhood
        # .text.strip() removes the surrounding whitespace from the cell text
        PostalCode.append(cells[0].text.strip())
        Borough.append(cells[1].text.strip())
        Neighborhood.append(cells[2].text.strip())

#test the populated list
print(Neighborhood)

['Not assigned', 'Not assigned', 'Parkwoods', 'Victoria Village', 'Regent Park, Harbourfront', 'Lawrence Manor, Lawrence Heights', "Queen's Park, Ontario Provincial Government", 'Not assigned', 'Islington Avenue, Humber Valley Village', 'Malvern, Rouge', 'Not assigned', 'Don Mills', 'Parkview Hill, Woodbine Gardens', 'Garden District, Ryerson', 'Glencairn', 'Not assigned', 'Not assigned', 'West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale', 'Rouge Hill, Port Union, Highland Creek', 'Not assigned', 'Don Mills', 'Woodbine Heights', 'St. James Town', 'Humewood-Cedarvale', 'Not assigned', 'Not assigned', 'Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood', 'Guildwood, Morningside, West Hill', 'Not assigned', 'Not assigned', 'The Beaches', 'Berczy Park', 'Caledonia-Fairbanks', 'Not assigned', 'Not assigned', 'Not assigned', 'Woburn', 'Not assigned', 'Not assigned', 'Leaside', 'Central Bay Street', 'Christie', 'Not assigned', 'Not assigned', 'Not assigned', '

In [19]:
# transform the lists into a Pandas dataframe, one named column per scraped list
df = pd.DataFrame({
    'PostalCode': PostalCode,
    'Borough': Borough,
    'Neighborhood': Neighborhood,
})

# filter out rows with 'Not assigned' Borough data, then renumber the remaining rows from 0
df = df[df.Borough != 'Not assigned'].reset_index(drop=True)

# for rows whose Neighborhood is 'Not assigned', use the Borough value instead.
# This has to go through .loc: the chained form `df[mask].Neighborhood = ...`
# assigns to a temporary copy and leaves df itself unchanged.
unnamed = df.Neighborhood == 'Not assigned'
df.loc[unnamed, 'Neighborhood'] = df.loc[unnamed, 'Borough']

df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [20]:
# get the shape of the dataframe as a (number of rows, number of columns) tuple
df.shape

(103, 3)

In [21]:
# load the coordinates file; the path is relative to the notebook's working directory
coordinates = pd.read_csv(filepath_or_buffer='../data/Geospatial_Coordinates.csv')
coordinates

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [31]:
# left join: keep every row of df and attach the coordinates row whose
# 'Postal Code' equals df's 'PostalCode'; both key columns appear in the result
dfc = pd.merge(
    df,
    coordinates,
    how='left',
    left_on='PostalCode',
    right_on='Postal Code',
)
dfc

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M3A,43.753259,-79.329656
1,M4A,North York,Victoria Village,M4A,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",M5A,43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",M6A,43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",M7A,43.662301,-79.389494
...,...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",M8X,43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,M4Y,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",M7Y,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",M8Y,43.636258,-79.498509


In [32]:
# show the merged frame without the duplicated key column.
# NOTE: drop() returns a new frame and the result is only displayed here,
# so dfc itself still contains 'Postal Code' after this cell.
dfc.drop('Postal Code', axis=1)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509
