# Applied Data Science Capstone - Segment & Cluster

# Scraping Data From Wikipedia

In [249]:
import pandas
import requests
from bs4 import BeautifulSoup
website_text = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(website_text,'xml')

table = soup.find('table',{'class':'wikitable sortable'})
table_rows = table.find_all('tr')
#table_rows

# Converting Data Into a Pandas Dataframe

In [250]:
data = []
for row in table_rows:
    data.append([t.text.strip() for t in row.find_all('td')])

df = pandas.DataFrame(data, columns=['PostalCode', 'Borough', 'Neighbourhood'])
df = df[~df['PostalCode'].isnull()]  # to filter out bad rows

In [251]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


# Removing Boroughs That Are Not Assigned

In [252]:
df = df[df.Borough != 'Not assigned']
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


In [253]:
df[df.Borough == 'Not assigned']

Unnamed: 0,PostalCode,Borough,Neighbourhood


# Replacing Neighbourhoods That Are Not Assigned To Equal Borough

In [254]:
df[df.Neighbourhood == 'Not assigned']

Unnamed: 0,PostalCode,Borough,Neighbourhood
9,M7A,Queen's Park,Not assigned


In [255]:
df = df.replace(to_replace ="Not assigned", 
                 value ="Queen\'s Park")

# Check That All Boroughs and Neighbourhoods Are Assigned

In [256]:
df[df.Borough == 'Not assigned']

Unnamed: 0,PostalCode,Borough,Neighbourhood


In [257]:
df[df.Neighbourhood == 'Not assigned']

Unnamed: 0,PostalCode,Borough,Neighbourhood


# Merging Neighbourhoods With The Same PostalCode

In [258]:
df[df.PostalCode == 'M5A']

Unnamed: 0,PostalCode,Borough,Neighbourhood
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park


In [259]:
df[df.Neighbourhood == 'Harbourfront']

Unnamed: 0,PostalCode,Borough,Neighbourhood
5,M5A,Downtown Toronto,Harbourfront


In [260]:
df[df.Neighbourhood == 'Regent Park']

Unnamed: 0,PostalCode,Borough,Neighbourhood
6,M5A,Downtown Toronto,Regent Park


In [261]:
df = df[df.Neighbourhood != 'Harbourfront']

In [262]:
df = df.replace(to_replace ="Regent Park", 
                 value ="Regent Park, Harbourfront")

In [263]:
df[df.PostalCode == 'M5A']

Unnamed: 0,PostalCode,Borough,Neighbourhood
6,M5A,Downtown Toronto,"Regent Park, Harbourfront"


# Displaying The Shape and Final Dataframe

In [264]:
df.shape

(210, 3)

In [265]:
df

Unnamed: 0,PostalCode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
6,M5A,Downtown Toronto,"Regent Park, Harbourfront"
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Queen's Park
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern
15,M3B,North York,Don Mills North
