<h1> Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto</h1>

<h2>Extracting Toronto Data Using Web Scraping</h2>

In [1]:
from urllib.request import urlretrieve

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Using Web Scraping to Extract Toronto Data

In [2]:
# Wikipedia page whose table of "M" postal codes is scraped below.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page. The timeout stops the cell from hanging forever on a
# stalled connection, and raise_for_status() surfaces HTTP errors here instead
# of letting an error page be parsed as if it were the real content.
response = requests.get(url, timeout=30)
response.raise_for_status()

html_data = response.text

In [3]:
# Build a navigable parse tree from the downloaded HTML with the html5lib parser.
soup = BeautifulSoup(html_data, features="html5lib")

In [4]:
# List the target of every hyperlink on the page.
# Only <a> tags that actually carry an href attribute are selected.
for anchor in soup.find_all('a', href=True):
    print(anchor['href'])

/wiki/Wikipedia:Protection_policy#semi
#mw-head
#searchInput
/wiki/Postal_codes_in_Canada
/wiki/Toronto
/wiki/Ontario
/wiki/Canada_Post
#cite_note-1
/wiki/Mobile_app
/wiki/Smartphones
/wiki/IPhone
/wiki/BlackBerry
#cite_note-2
/wiki/CD-ROM
/wiki/Toronto
/wiki/Postal_codes_in_Canada#Forward_sortation_areas
/wiki/Amazon_(company)
/wiki/North_York
/wiki/Parkwoods
/wiki/North_York
/wiki/Victoria_Village
/wiki/Downtown_Toronto
/wiki/Regent_Park
/wiki/Harbourfront,_Toronto
/wiki/North_York
/wiki/Lawrence_Manor
/wiki/Lawrence_Heights
/wiki/Queen%27s_Park_(Toronto)
/wiki/Etobicoke
/wiki/Islington_Avenue
/wiki/Scarborough,_Toronto
/wiki/Malvern,_Toronto
/wiki/Rouge,_Toronto
/wiki/North_York
/wiki/Don_Mills
/wiki/East_York
/wiki/Parkview_Hill
/wiki/Woodbine_Gardens
/wiki/Downtown_Toronto
/wiki/Garden_District,_Toronto
/wiki/Ryerson_University
/wiki/North_York
/wiki/Etobicoke
/wiki/West_Deane_Park
/wiki/Princess_Gardens
/wiki/Martin_Grove_Road
/wiki/Islington,_Toronto
/wiki/Scarborough,_Toronto
/

In [5]:
# Accumulator for the scraped rows: the parsing loop appends one dict per
# assigned postal-code cell, and the list is later turned into a DataFrame.
table_contents = []

In [6]:
# Grab the first <table> element in the document (tag-name navigation
# returns the first matching tag, or None when the page has no table).
table = soup.table

In [7]:
# Turn every assigned postal-code cell of the table into a record.
# The list is recreated here so that re-running this cell cannot append
# duplicate rows to a list left over from a previous run.
table_contents = []

# find_all matches the spelling used earlier in the notebook (findAll is a legacy alias).
for td in table.find_all('td'):
    # Skip cells that lack the <p> (postal code) or <span> (borough and
    # neighborhoods) child instead of failing with an AttributeError.
    if td.p is None or td.span is None:
        continue

    label = td.span.text
    if label == 'Not assigned':
        continue

    # The span text is expected to look like "Borough(Neighborhood / Neighborhood)".
    pieces = label.split('(')
    # A label without a parenthesised part yields an empty neighborhood
    # rather than raising an IndexError.
    neighborhood_text = pieces[1] if len(pieces) > 1 else ''

    table_contents.append({
        'PostalCode': td.p.text[:3],  # first three characters of the <p> text
        'Borough': pieces[0],
        # Drop the closing parenthesis and turn " /" separators into commas.
        'Neighborhood': (
            neighborhood_text
            .strip(')')
            .replace(' /', ',')
            .replace(')', ' ')
            .strip(' ')
        ),
    })


In [8]:
# Borough strings that the scrape produced with several labels run together,
# mapped to the readable name that should be used instead.
borough_fixes = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}

# Build the frame from the scraped records and apply the borough name fixes.
df = pd.DataFrame(table_contents).assign(
    Borough=lambda frame: frame['Borough'].replace(borough_fixes)
)

In [9]:
# Preview the first rows of the scraped postal-code table.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [10]:
# Summarize the columns: dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   PostalCode    103 non-null    object
 1   Borough       103 non-null    object
 2   Neighborhood  103 non-null    object
dtypes: object(3)
memory usage: 2.5+ KB


In [11]:
# Count missing values per column to decide whether any cleaning is needed.
df.isna().sum()

PostalCode      0
Borough         0
Neighborhood    0
dtype: int64

There is no need to clean the data.

In [12]:
# (rows, columns) of the scraped table.
df.shape

(103, 3)

### Latitude and Longitude

In [13]:
# CSV with the latitude and longitude of each postal code.
url2 = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv'

# Save a local copy of the file.
# (urlretrieve is imported with the other imports at the top of the notebook.)
urlretrieve(url2, 'latitude-longitude.csv')

# Read the local copy into a DataFrame; comma is already read_csv's default separator.
df2 = pd.read_csv('latitude-longitude.csv')

# Preview the first rows. A bare last expression gets the notebook's rich
# table display, which print() would flatten to plain text.
df2.head()

  Postal Code   Latitude  Longitude
0         M1B  43.806686 -79.194353
1         M1C  43.784535 -79.160497
2         M1E  43.763573 -79.188711
3         M1G  43.770992 -79.216917
4         M1H  43.773136 -79.239476


In [14]:
# (rows, columns) of the coordinates table.
df2.shape

(103, 3)

In [15]:
# Inspect the column names; the postal-code column is spelled "Postal Code" here.
df2.columns

Index(['Postal Code', 'Latitude', 'Longitude'], dtype='object')

In [19]:
# Rename "Postal Code" to "PostalCode" so the column matches the key used in
# the scraped table and the two frames can be merged on it.
# Reassigning the result (instead of inplace=True) avoids mutating a frame that
# earlier cells already displayed; re-running is harmless because renaming a
# column that no longer exists is a no-op.
df2 = df2.rename(columns={'Postal Code': 'PostalCode'})


In [20]:
# Preview the coordinates table to confirm the column is now named "PostalCode".
df2.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [21]:
# Attach latitude/longitude to each scraped row by joining on the postal code.
# An inner join keeps only the codes present in both frames.
df3 = pd.merge(df, df2, on='PostalCode', how='inner')

df3.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


In [22]:
# (rows, columns) of the merged table.
df3.shape

(103, 5)