# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
# Task: explore, segment, and cluster the neighborhoods in the city of Toronto.
# As there is no readily available Toronto neighborhood data, it must be scraped from Wikipedia and wrangled
# into a dataframe using either Python pandas or BeautifulSoup; I have used the pandas library.
# 
# Once the data is in a structured format, explore and cluster the neighborhoods in the city of Toronto.

# Store the notebook in Github repository.

In [2]:
# import libraries 
import requests
import pandas as pd

In [3]:
# Obtain the HTML of the wiki page, then convert its first table into a dataframe using read_html
url_wiki_toronto = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout (in seconds) keeps the cell from hanging indefinitely if the server does not respond.
wiki_page_toronto = requests.get(url_wiki_toronto, timeout=30)
# Fail here on an HTTP error (4xx/5xx) instead of parsing an error page as if it were the data.
wiki_page_toronto.raise_for_status()

# convert to dataframe: header=0 takes the column names from the first row,
# and [0] selects the first table that read_html finds on the page
df_toronto_preprocess = pd.read_html(wiki_page_toronto.content, header = 0)[0]
df_toronto_preprocess.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [4]:
# Keep only the rows whose Borough is something other than 'Not assigned'
df_toronto_preprocess = df_toronto_preprocess.loc[
    df_toronto_preprocess['Borough'] != 'Not assigned'
]
df_toronto_preprocess.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [5]:
# Find any rows whose Neighbourhood is 'Not assigned' and give them the Borough name instead.
# The page is scraped live, so the replacement is applied even when no such row exists today
# (in that case it changes nothing).
neighbourhood_missing = df_toronto_preprocess['Neighbourhood'] == 'Not assigned'

# Series.where keeps the existing value where the condition is True and takes the
# replacement (the Borough) where it is False; assign returns a new frame, so the
# filtered frame from the previous step is not modified in place.
df_toronto_preprocess = df_toronto_preprocess.assign(
    Neighbourhood=df_toronto_preprocess['Neighbourhood'].where(
        ~neighbourhood_missing, df_toronto_preprocess['Borough']
    )
)

# Show the rows that needed the fix; an empty frame means every neighbourhood was already assigned.
df_toronto_preprocess.loc[neighbourhood_missing]

Unnamed: 0,Postal Code,Borough,Neighbourhood


In [6]:
# There are no 'Not assigned' neighbourhoods in the table, so the dataframe is fully formatted.

In [7]:
# Continue under the name df_toronto_formatted.
# .copy() makes this an independent dataframe rather than a second name for the same
# object, so later changes to one cannot silently show up in the other.
df_toronto_formatted = df_toronto_preprocess.copy()
df_toronto_formatted.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [8]:
# Group the neighbourhoods that share a postal code (and borough) into one row.
# Multiple neighbourhoods are joined with ', ' so the result uses the same
# comma-separated format as the values already present in the table.
df_toronto_formatted = (
    df_toronto_formatted
    .groupby(['Postal Code', 'Borough'])['Neighbourhood']
    .apply(', '.join)
)

# reset the index so 'Postal Code' and 'Borough' become ordinary columns again
df_toronto_formatted = df_toronto_formatted.reset_index()
df_toronto_formatted.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [9]:
# Show the (rows, columns) shape of the grouped dataframe
df_toronto_formatted.shape


(103, 3)

# This answers the first part of the submission.

In [10]:
# Load the latitude and longitude coordinates from a CSV file; as the preview below shows,
# the file holds one row per postal code.

url = 'http://cocl.us/Geospatial_data'
df_toronto_geo=pd.read_csv(url)
df_toronto_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
# Check the (rows, columns) shape of the dataframe loaded from the CSV file
df_toronto_geo.shape

(103, 3)

In [12]:
# Both tables have the same number of rows and columns, so the latitude and longitude
# can be joined onto df_toronto_formatted as new columns.

# No column needs renaming first: with no keys given, merge joins on the column names
# the two dataframes have in common.


df_toronto_formatted = df_toronto_formatted.merge(df_toronto_geo)

df_toronto_formatted.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# This answers the second part of the submission.