#Toronto Neighborhood Clustering - Part II of Week 3

By Rashmitha Varma Pandati

**For this part of assignment we need to geocode the neighborhood data**

In [0]:
# import necessarry libraries

from requests import get
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

In [0]:
# Web scrape the wiki page using beautiful soup so that we can load the table into a dataframe

wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
wiki_page = urlopen(wiki_url)
b_soup = BeautifulSoup(wiki_page,'html.parser')

In [0]:
# Find the table

wiki_table = b_soup.body.table.tbody

# Create a dataframe to store the table

table_df = pd.DataFrame(columns=['Postal Code','Borough','Neighborhood'])

In [0]:
# Extract the table values and append into dataframe

for tr_cell in wiki_table.find_all('tr'):
    row_data=[]
    for td_cell in tr_cell.find_all('td'):
        row_data.append(td_cell.text.strip())
    if len(row_data)==3:
        table_df.loc[len(table_df)] = row_data

In [0]:
# Check initial dataframe shape

table_df.shape

(288, 3)

**Data Cleaning**

In [0]:
# remove rows where Borough is 'Not assigned'

table_df=table_df[table_df['Borough']!='Not assigned']

In [0]:
# assign Neighbourhood=Borough where Neighbourhood is 'Not assigned'

table_df[table_df['Neighborhood']=='Not assigned']=table_df['Borough']

In [0]:
# Check the Table

table_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [0]:
# group multiple Neighbourhood under one Postcode

temp_df=table_df.groupby('Postal Code')['Neighborhood'].apply(lambda x: "%s" % ', '.join(x))
temp_df=temp_df.reset_index(drop=False)
temp_df.rename(columns={'Neighborhood':'Neighborhood_joined'},inplace=True)

In [0]:
# join the newly constructed joined data frame

df_merged = pd.merge(table_df, temp_df, on='Postal Code')

In [0]:
# drop the Neighbourhood column

df_merged.drop(['Neighborhood'],axis=1,inplace=True)

In [0]:
# drop duplicates from the data frame

df_merged.drop_duplicates(inplace=True)

In [0]:
# rename Neighbourhood_joined back to Neighbourhood

df_merged.rename(columns={'Neighborhood_joined':'Neighborhood'},inplace=True)
df_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
4,M6A,North York,"Lawrence Heights, Lawrence Manor"
6,Queen's Park,Queen's Park,Queen's Park


In [0]:
# The postalcode for Queen's park is missing and has to be replaced with its code M7A

df_merged["Postal Code"].replace(to_replace="Queen's Park", value="M7A", inplace=True)
df_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
4,M6A,North York,"Lawrence Heights, Lawrence Manor"
6,M7A,Queen's Park,Queen's Park


In [0]:
# The postalcode for Queen's park is missing and has to be replaced with its code M7A

df_merged["Postal Code"].replace(to_replace="Queen's Park", value="M7A", inplace=True)
df_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
4,M6A,North York,"Lawrence Heights, Lawrence Manor"
6,M7A,Queen's Park,Queen's Park


# Part II - Geocoding of the above Dataframe

In [0]:
# Since it takes long time to read from the geocode package we will use the csv file provided

geo_df=pd.read_csv('http://cocl.us/Geospatial_data')
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


**Since the above dataframe is sorted on the basis of postal code we need to sort the neighborhood data frame in a similar manner**

In [0]:
# Before we merge the Neighborhood data frame with the Geospatial dataframe we need to sort the Neighborhood data frame by the Postal Code first

df_merged_sorted = df_merged.sort_values("Postal Code")
df_merged_sorted.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
8,M1B,Scarborough,"Rouge, Malvern"
21,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
33,M1E,Scarborough,"Guildwood, Morningside, West Hill"
39,M1G,Scarborough,Woburn
43,M1H,Scarborough,Cedarbrae


In [0]:
# Merge the two dataframes

geo_merged_df = pd.merge(df_merged_sorted, geo_df, on='Postal Code')
geo_merged_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [0]:
geo_merged_df.shape

(103, 5)

**End of Part II**