# Segmenting and Clustering Neighborhoods in Toronto

This notebook is part of the capstone project for the [IBM Data Science Professional Certificate](https://www.coursera.org/professional-certificates/ibm-data-science) course.
In this project I’m exploring and clustering the neighborhoods in Toronto.

In [3]:
import re

import lxml.html as html
import pandas as pd
import requests

In [4]:
# Download the Wikipedia list of Canadian postal codes starting with "M"
# and parse the HTML into an lxml document tree.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on an unresponsive server.
wiki_page = requests.get(wiki_url, timeout=30)
# Fail loudly on HTTP errors (4xx/5xx) instead of silently parsing an error page.
wiki_page.raise_for_status()
wiki_doc = html.fromstring(wiki_page.content)

In [33]:
# Navigating to the table
# NOTE: Changes to the page will require changes in this code.
wiki_table = wiki_doc.xpath('//*[@id="mw-content-text"]/div/table[1]')

# Let's make sure we found the right table: its text content must contain at
# least one line that consists solely of something shaped like "M3A"
# (the letter M, a digit, an uppercase letter).
re_postal_code = r'\nM\d[A-Z]\n'
re_postal_code_flags = re.MULTILINE | re.UNICODE
if not wiki_table or re.search(re_postal_code, wiki_table[0].text_content(), re_postal_code_flags) is None:
    raise Exception('Could not find the table of postal codes. Consider updating the XPath.')

# The XPath query returns a list of matches; keep the first (validated) one.
postal_codes_table = wiki_table[0]
postal_codes_table

<Element table at 0x2693bef30e8>

In [56]:
# Parse the table into a dataframe with one row per postal code.
# Rows whose borough is 'Not assigned' are dropped; a missing neighborhood
# falls back to the borough name; multiple neighborhoods are joined with ', '.

rows = postal_codes_table.findall('.//tr')
if len(rows) == 0:
    raise Exception('Could not find any rows in the table')

EXPECTED_COLS_NUM = 3
# Placeholder text the table uses for cells that have no value
NOT_ASSIGNED = 'Not assigned'
table_dict = { 'PostalCode': [], 'Borough': [], 'Neighborhood': [] }

for row in rows:
    cols = row.findall('.//td')
    num_cols = len(cols)

    # Skip rows without td elements (like the header)
    if num_cols == 0:
        continue

    # Make sure we always get the expected number of columns
    if num_cols != EXPECTED_COLS_NUM:
        raise Exception('Expected exactly {} columns but got {}.'.format(EXPECTED_COLS_NUM, num_cols))

    postal_code = cols[0].text_content().strip()
    borough = cols[1].text_content().strip()
    neighborhood_text = cols[2].text_content().strip()

    # Ignore rows without borough as per the task description
    if borough == NOT_ASSIGNED:
        continue

    # Several neighborhoods sharing one postal code are separated by ' / '.
    # str.split never returns an empty list (''.split(' / ') == ['']), so strip
    # each name and drop blanks to detect an empty cell reliably.
    neighborhoods = [name.strip() for name in neighborhood_text.split(' / ')]
    neighborhoods = [name for name in neighborhoods if name]

    # Make neighborhood same as borough if the former isn't specified
    if not neighborhoods or neighborhoods[0] == NOT_ASSIGNED:
        neighborhoods = [borough]

    table_dict['PostalCode'].append(postal_code)
    table_dict['Borough'].append(borough)
    table_dict['Neighborhood'].append(', '.join(neighborhoods))

df = pd.DataFrame.from_dict(table_dict)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [58]:
# Dimensions of the parsed dataframe as (number of rows, number of columns)
df.shape

(103, 3)