# Segmenting and Clustering Neighborhoods in Toronto

### Notebook by Skyler Schilke for Applied Data Science Capstone course on Coursera

### First, retrieve the table contents with BeautifulSoup

In [1]:
# import necessary packages
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# download the Wikipedia page that lists the Toronto ("M") postal codes
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# a timeout keeps the cell from hanging forever on a stalled connection
page = requests.get(url, timeout=30)
# fail loudly on an HTTP error instead of silently parsing an error page
page.raise_for_status()
# parse the raw HTML so the table can be located in the following cells
soup = BeautifulSoup(page.content, 'html.parser')


In [3]:
# locate the postal-code table in the parsed html
my_table = soup.find('table', {'class': 'wikitable sortable'})
# find() returns None when nothing matches; stop here with a clear message
# rather than hitting an AttributeError on None in the next cell
if my_table is None:
    raise ValueError("no table with class 'wikitable sortable' was found in the page")

In [4]:
# collect the header cells (<th>) and the data cells (<td>) of the table
# find_all is the current method name; findAll is only kept as a legacy alias
headers = my_table.find_all('th')
contents = my_table.find_all('td')

### Next, populate the contents of the table into lists

In [5]:
# pull the text out of every header cell and every data cell
columns = [th.get_text() for th in headers]
all_content = [td.get_text() for td in contents]

In [6]:
# the cells were collected row by row from a three-column table,
# so every third entry belongs to the same column
postcode = all_content[0::3]
borough = all_content[1::3]
neigh = all_content[2::3]

# drop the trailing '\n' from the Neighbourhood header; rstrip (unlike [:-1])
# leaves the name untouched if this cell is run a second time
columns[2] = columns[2].rstrip('\n')

In [7]:
# sanity check: report the length and the first three entries of each column list
for label, values in (('postcode', postcode), ('borough', borough), ('neigh', neigh)):
    print('The {} list is length: '.format(label), len(values),
          '.  The first three values are: ', values[0:3])

The postcode list is length:  288 .  The first three values are:  ['M1A', 'M2A', 'M3A']
The borough list is length:  288 .  The first three values are:  ['Not assigned', 'Not assigned', 'North York']
The neigh list is length:  288 .  The first three values are:  ['Not assigned\n', 'Not assigned\n', 'Parkwoods\n']


In [8]:
# map each column name to its list of values
table_data = {
    columns[0]: postcode,
    columns[1]: borough,
    columns[2]: neigh,
}

# passing columns=columns keeps the columns in the same order as the source table
df = pd.DataFrame(table_data, columns=columns)

# preview the first rows
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


In [9]:
# show the first rows of df again (same call as at the end of the previous cell)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


### Now it is time to clean up the data

In [10]:
# remove the trailing \n from every neighbourhood name
# rstrip('\n') only removes newlines, so re-running this cell is harmless;
# slicing with [:-1] would cut off a real character on every extra run
df['Neighbourhood'] = df['Neighbourhood'].str.rstrip('\n')
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### First remove the rows with borough = 'Not assigned'

In [11]:
# keep only the rows whose borough has been assigned
# .copy() makes the result an independent frame, so modifying it in later cells
# does not act on a slice of the original (no SettingWithCopyWarning)
df = df[df['Borough'] != 'Not assigned'].copy()
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Combine rows with the same Postcode and include all neighbourhoods in same row, comma separated

In [12]:
# df2: one row per Postcode with all of its neighbourhoods joined by ', '
# only the Neighbourhood column is aggregated, so there is no joined Borough
# column to throw away afterwards
df2 = (
    df['Neighbourhood']
    .astype(str)
    .groupby(df['Postcode'])
    .agg(', '.join)
    .reset_index()
)

# df keeps only Postcode and Borough for the merge in the next cell
# rebinding (instead of inplace=True) avoids mutating a filtered frame in place
# NOTE: df no longer has a Neighbourhood column after this line, so df has to be
# rebuilt before this cell can be run again
df = df.drop(columns='Neighbourhood')

In [13]:
# attach the borough to each aggregated postcode row, keep the first row
# per Postcode, and put the columns back in the original table order
df3 = (
    df2.merge(df, on='Postcode')
       .drop_duplicates('Postcode')
)[columns]

### If Neighbourhood is 'Not assigned', set it equal to the Borough for the same row

In [14]:
# list the rows whose Neighbourhood is still the placeholder 'Not assigned'
df3.loc[df3['Neighbourhood'].eq('Not assigned')]

Unnamed: 0,Postcode,Borough,Neighbourhood
159,M7A,Queen's Park,Not assigned


In [15]:
# wherever Neighbourhood == 'Not assigned', copy the Borough of the same row into it
# a single .loc assignment writes to df3 itself; the chained form
# df3['Neighbourhood'][mask] = ... may write to a temporary object instead
unassigned = df3['Neighbourhood'] == 'Not assigned'
df3.loc[unassigned, 'Neighbourhood'] = df3.loc[unassigned, 'Borough']
# make sure the result is empty
df3[df3['Neighbourhood'] == 'Not assigned']

Unnamed: 0,Postcode,Borough,Neighbourhood


### Results

In [16]:
# display up to the first 20 rows of the final frame
# (one row per Postcode after the merge and drop_duplicates above)
df3.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
5,M1E,Scarborough,"Guildwood, Morningside, West Hill"
8,M1G,Scarborough,Woburn
9,M1H,Scarborough,Cedarbrae
10,M1J,Scarborough,Scarborough Village
11,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
14,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
17,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
20,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [17]:
# shape of the final frame as (number of rows, number of columns)
df3.shape

(103, 3)