# Segmenting and Clustering Neighborhoods in Toronto

## Acquiring Data

#### First we will install the required libraries for this project

In [1]:
# Use the %pip magic rather than !pip so the package is installed into the
# environment of the running kernel, not whichever pip is first on PATH.
%pip install beautifulsoup4



In [1]:
# Use the %pip magic rather than !pip so the package is installed into the
# environment of the running kernel, not whichever pip is first on PATH.
%pip install lxml



#### Importing all the required libraries

In [153]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd

#### Using `requests` to fetch the page and parsing it with Beautiful Soup and the lxml parser

In [154]:
# Download the Wikipedia page. A timeout prevents the cell from hanging
# forever on a stalled connection, and raise_for_status() makes an HTTP error
# fail here instead of silently parsing an error page further down.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
response.raise_for_status()

# NOTE: despite its name, `website_url` holds the page's HTML text, not a URL.
# The name is kept so that any other cell referring to it keeps working.
website_url = response.text

# Parse the markup with the lxml parser.
soup = BeautifulSoup(website_url, 'lxml')

#### Extracting the table data from the HTML page

In [155]:
# Find the first <table> whose class attribute is "wikitable sortable".
My_table = soup.find('table', attrs={'class': 'wikitable sortable'})

# Column names: the .string of every header (<th>) cell in that table.
cnames = [th.string for th in My_table.find_all('th')]

Extracting column values from the object My_table

In [156]:
# c1, c2 and c3 collect the text of the first, second and third <td> cell of
# each table row, as three parallel lists.
c1, c2, c3 = [], [], []

# [1:] skips the first <tr> (presumably the header row, which has no <td>).
for tr in My_table.find_all('tr')[1:]:
    tds = tr.find_all('td')
    if len(tds) < 3:
        # A row with fewer than three data cells would raise IndexError on
        # tds[2]; skip it so one malformed row does not abort the extraction.
        continue
    c1.append(tds[0].text)
    c2.append(tds[1].text)
    c3.append(tds[2].text)

#### Converting these 3 lists into a data frame

In [157]:
# Build the frame in one step from the three parallel lists. pandas is already
# imported in the notebook's import cell, so it is not re-imported here.
df = pd.DataFrame({
    'PostalCode': c1,
    'Borough': c2,
    'Neighborhood': c3,
})

Cleaning the table

In [158]:
# Remove every newline character found inside the Neighborhood values
# (regex=True makes this a substring replacement rather than a whole-value match).
df['Neighborhood'] = df['Neighborhood'].replace(to_replace='\n', value='', regex=True)

In [159]:
# Preview the first five rows after removing the newline characters.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Ignoring cells with a borough that is Not assigned

In [160]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
df = df.loc[df['Borough'] != 'Not assigned']

In [161]:
# Renumber the remaining rows from 0 after the filtering step.
df = df.reset_index(
    drop=True,  # discard the old labels rather than keeping them as a column
)

In [162]:
# Preview the first five rows now that unassigned boroughs are gone.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


#### Combining two rows into one row with the neighborhoods separated with a comma in the case where more than one neighborhood can exist in one postal code area

In [163]:
# Collect the Neighborhood values of every (PostalCode, Borough) pair...
neighborhoods_by_code = df.groupby(['PostalCode', 'Borough'])['Neighborhood']

# ...and collapse each group into a single comma-separated string, turning the
# group keys back into ordinary columns.
df = neighborhoods_by_code.agg(', '.join).reset_index()

In [164]:
# Preview the first five rows of the grouped frame.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### Fulfilling the condition that, if a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough

In [165]:
# Turn the 'Not assigned' placeholder into a real missing value (NaN).
# The result is assigned back to the column: calling replace(..., inplace=True)
# on `df.Neighborhood` is chained assignment, which raises a FutureWarning on
# recent pandas and leaves `df` unchanged once Copy-on-Write is enabled.
df['Neighborhood'] = df['Neighborhood'].replace('Not assigned', np.nan)

In [166]:
# Where Neighborhood is missing, use the Borough value from the same row.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on `df.Neighborhood` is chained assignment, which raises a FutureWarning on
# recent pandas and leaves `df` unchanged once Copy-on-Write is enabled.
df['Neighborhood'] = df['Neighborhood'].fillna(df['Borough'])

In [167]:
# Show the final dimensions of the frame as a (rows, columns) tuple.
df.shape

(103, 3)