## Segmenting and Clustering Neighborhoods in Toronto

### Part 1

#### Importing the libraries

In [14]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

#### Scraping the Wikipedia page and using Beautiful Soup to find our table

In [15]:
# Download the raw HTML of the Wikipedia page given by wiki_url.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# The timeout keeps this cell from hanging forever on a stalled connection,
# and raise_for_status() fails loudly on an HTTP error instead of handing an
# error page to the parser in the next step.
response = requests.get(wiki_url, timeout=30)
response.raise_for_status()
html_text = response.text

#### Converting the table into a pandas dataframe

In [26]:
# Using Beautiful Soup to extract the table data.
# The parser is named explicitly so the parse does not depend on which
# optional parsers happen to be installed, and so bs4 does not warn that
# no parser was specified.
soup = BeautifulSoup(html_text, 'html.parser')
table = soup.find('table', attrs={'class':'wikitable sortable'})
if table is None:
    # Fail with a clear message rather than an AttributeError on None below.
    raise ValueError("No 'wikitable sortable' table found in the downloaded page")
trs = table.find_all('tr')

# Extracting the stripped text of every <td> cell, one list per table row
rows = []
for tr in trs:
    td = tr.find_all('td')
    row = [ele.text.strip() for ele in td]
    if row:
        # Ignore empty rows with no 'td',
        # applicable for the column headers row.
        rows.append(row)

#### Creating the pandas dataframe

In [27]:
# Build the dataframe from the scraped rows, naming the three columns.
df = pd.DataFrame(
    data=rows,
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


#### Removing the rows whose borough is not assigned

In [28]:
# Keep only the rows whose borough is assigned, then renumber the index
# from 0 (the old index is discarded rather than kept as a column).
df = df[df['Borough'] != 'Not assigned'].reset_index(drop=True)
df.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


#### As there are no rows with an assigned borough and an unassigned neighborhood, let's just replace "/" with "," in the data

In [29]:
# Swap every "/" for "," across the whole dataframe. With regex=True this is
# a substring substitution, so the spaces around the separator are kept.
df = df.replace(to_replace='/', value=',', regex=True)

In [30]:
# Preview the first five rows of the dataframe.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


In [31]:
# Display the dataframe's dimensions as a (rows, columns) tuple.
df.shape

(103, 3)