# Segmenting and Clustering Neighborhoods in Toronto

<b>Before we get the data and start exploring it, let's download all the dependencies that we will need.</b>

In [144]:
#conda install -c conda-forge geopy

In [145]:
#conda install -c anaconda beautifulsoup4

In [146]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

# transform JSON data into a flat pandas dataframe.
# json_normalize lives at the top level of pandas since 1.0; the old
# pandas.io.json location is kept only as a fallback for older installs.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

import folium # map rendering library

print('Libraries imported.')

Libraries imported.


# Question 1

<b>Use pandas, or the BeautifulSoup package, or any other way you are comfortable with to transform the data in the table on the Wikipedia page into the above pandas dataframe.</b>

In [148]:
from bs4 import BeautifulSoup

# Source page for the Toronto ("M") postal codes.
# NOTE(review): this is the live article, so its markup may change over time;
# consider pinning a specific revision if reproducibility matters.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Use a timeout so a stalled connection cannot hang the notebook, and raise on
# HTTP errors so an error page is never parsed as if it were the article.
response = requests.get(WIKI_URL, timeout=30)
response.raise_for_status()

# Despite its name, `website_url` holds the HTML text of the page
# (the name is kept so any cell that refers to it keeps working).
website_url = response.text

soup = BeautifulSoup(website_url, 'html.parser')

<b>Inspecting the page shows that the data we need is in a table with class "wikitable sortable", so we extract that table.</b>

In [149]:
# Locate the first table whose class attribute is "wikitable sortable".
Toronto_table = soup.find('table', {'class': 'wikitable sortable'})

# find() returns None when nothing matches; fail here with a clear message
# instead of with an AttributeError in a later cell.
if Toronto_table is None:
    raise ValueError("No table with class 'wikitable sortable' was found on the page")

In [150]:
# Show the text of the table's first row to check the column headings.
print(Toronto_table.find('tr').text)


Postal Code

Borough

Neighborhood



In [151]:
# Comma-separated labels for the three columns of the postal-code table.
# The dataframe's column names are assigned separately in a later cell.
headers = "Postcode,Borough,Neighbourhood"

<b>Storing the table in a list</b>

In [152]:
# Flatten every table row into one ";"-delimited record with newlines removed,
# e.g. "M3A;North York;Parkwoods".
# A row without any <td> cells (such as a header row) produces an empty
# record "", exactly one entry per <tr>, so row positions are preserved.
Toronto_data = [
    ";".join(td.text for td in tr.find_all('td')).replace("\n", "")
    for tr in Toronto_table.find_all('tr')
]

<b>Now converting the list to Pandas dataframe and assigning column names</b>

In [153]:
# Break each ";"-delimited record into its fields; every record becomes one row.
df = pd.DataFrame(data=[record.split(";") for record in Toronto_data])
df.head(10)

Unnamed: 0,0,1,2
0,,,
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M8A,Not assigned,
9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"


In [154]:
# Replace the default positional labels (0, 1, 2) with descriptive names.
column_names = ["PostalCode", "Borough", "Neighbourhood"]
df.columns = column_names
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,,,
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M8A,Not assigned,
9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"


<b>Dropping the rows where Borough is Not assigned</b>

In [155]:
# Flag the rows whose borough is the 'Not assigned' placeholder, collect their
# index labels, and remove those rows from df in place.
is_unassigned = df['Borough'].eq('Not assigned')
indexNames = df.loc[is_unassigned].index
df.drop(indexNames, inplace=True)

df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,,,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
10,M1B,Scarborough,"Malvern, Rouge"
12,M3B,North York,Don Mills
13,M4B,East York,"Parkview Hill, Woodbine Gardens"


<b>Dropping the rows in which Borough has NaN values</b>

In [156]:
# Remove, in place, every row that has no Borough value at all
# (for example the blank row produced by the table's header line).
df.dropna(axis=0, subset=["Borough"], inplace=True)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
10,M1B,Scarborough,"Malvern, Rouge"
12,M3B,North York,Don Mills
13,M4B,East York,"Parkview Hill, Woodbine Gardens"
14,M5B,Downtown Toronto,"Garden District, Ryerson"


<b>If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough</b>


In [157]:
# Where the neighbourhood is the 'Not assigned' placeholder, reuse the
# borough name from the same row.
needs_borough_name = df['Neighbourhood'] == 'Not assigned'
df.loc[needs_borough_name, 'Neighbourhood'] = df.loc[needs_borough_name, 'Borough']
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
10,M1B,Scarborough,"Malvern, Rouge"
12,M3B,North York,Don Mills
13,M4B,East York,"Parkview Hill, Woodbine Gardens"
14,M5B,Downtown Toronto,"Garden District, Ryerson"


<b>Multiple rows with the same PostalCode will be merged into a single row, with the Neighborhoods separated by commas</b>

In [158]:
# Group by postal code and borough, keeping first-seen order (sort=False),
# and join the neighbourhood names of each group with ", ".
grouped_by_code = df.groupby(['PostalCode', 'Borough'], sort=False)
result = grouped_by_code.agg(', '.join)

# Turn the group keys back into ordinary columns.
df_new = result.reset_index()
df_new.head(20)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [159]:
# Display the (rows, columns) dimensions of the merged dataframe.
df_new.shape


(103, 3)