# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto (Part 2)

In [4]:
from urllib.request import urlretrieve

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [5]:
# Download the Wikipedia page of Toronto ("M") postal codes and parse it with Beautiful Soup.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(WIKI_URL, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were the data.
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

# The first <table> of the page is used as the data table.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found at ' + WIKI_URL)

# Collect the header cell texts into column_names.
table_header_th = table.find_all('th')
column_names = [th.text.strip() for th in table_header_th]

# Add each row of the table to list_of_rows.
# A <tr> without any <td> cells (such as a header row made of <th> cells)
# contributes an empty list.
table_body = table.find('tbody')
table_rows = table_body.find_all('tr')

list_of_rows = []
for tr in table_rows:
    td = tr.find_all('td')
    # Cell text is kept as-is (including any trailing newline).
    row = [cell.text for cell in td]
    list_of_rows.append(row)


In [6]:
# Build the DataFrame from the scraped rows, discard its first row and
# renumber the remaining rows from 0.
df = (
    pd.DataFrame(data=list_of_rows, columns=column_names)
    .iloc[1:]
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


In [7]:
# Standardise the column labels: 'Postcode' -> 'PostalCode', 'Neighbourhood' -> 'Neighborhood'.
column_renames = {'Postcode': 'PostalCode', 'Neighbourhood': 'Neighborhood'}
df = df.rename(columns=column_renames)


In [8]:
# Remove trailing whitespace (e.g. the '\n' left over from the HTML cells)
# from each of the three text columns.
for column in ('PostalCode', 'Neighborhood', 'Borough'):
    df[column] = df[column].str.rstrip()

In [9]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
has_borough = df['Borough'].ne('Not assigned')
df = df[has_borough]
df.shape

(211, 3)

In [10]:
# Replace 'Not assigned' neighborhoods with their borough name.
# This is done row by row *before* grouping, so the placeholder can never
# end up inside a joined string such as 'Not assigned, Some Neighborhood'.
unassigned = df['Neighborhood'] == 'Not assigned'
df = df.assign(Neighborhood=df['Neighborhood'].mask(unassigned, df['Borough']))

# Data Frame A holds, for each Postal Code, its Neighborhoods joined with ', '.
A = df.groupby('PostalCode')['Neighborhood'].agg(', '.join).reset_index()

# Reduce df to the distinct (PostalCode, Borough) pairs; the Neighborhood
# column is re-attached from A in the merge below.
df = df.drop(columns=['Neighborhood']).drop_duplicates()

# Join both Data Frames into df_new (the inner join keeps df's row order).
df_new = pd.merge(df, A, on='PostalCode', how='inner')

df_new.shape

(103, 3)

In [12]:
# Part 2
# Use the linked CSV file that contains the geographical coordinates of each Toronto postal code.

In [13]:
# Download the coordinates CSV with the standard library instead of shelling
# out to wget: this works on platforms without wget, and urlretrieve raises
# on a failed download, so the message below is only printed on success.
urlretrieve('https://cocl.us/Geospatial_data', 'Geospatial_Coordinates.csv')
print('Data downloaded!')

Data downloaded!


In [14]:
# Load the downloaded CSV (columns: Postal Code, Latitude, Longitude) and preview it.
GeoData_df = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')
GeoData_df.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [15]:
# Show (rows, columns) of the coordinates table; the recorded run gives (103, 3),
# the same number of rows as df_new.
GeoData_df.shape

(103, 3)

In [16]:
# Rename 'Postal Code' to 'PostalCode' so both frames share the same join key.
GeoData_df = GeoData_df.rename(columns={'Postal Code': 'PostalCode'})

# Attach Latitude/Longitude to every row of df_new in df_GeoComplete.
# A left join keeps exactly df_new's rows in df_new's order; an outer join
# would also add NaN-filled rows for postal codes that exist only in
# GeoData_df and is documented to sort the keys lexicographically.
df_GeoComplete = pd.merge(df_new, GeoData_df, on='PostalCode', how='left')

df_GeoComplete.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
