## Segmenting and Clustering Neighborhoods in Toronto


### Part 2


#### Recalling Part 1:

In [4]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [24]:
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fail fast on HTTP errors (and on a hung connection) so that an error page
# is never parsed as if it were the article.
response = requests.get(wiki_url, timeout=30)
response.raise_for_status()
html_text = response.text

# Using Beautiful Soup to extract the table data.
# The parser is named explicitly: without it Beautiful Soup uses whichever
# parser happens to be installed and emits a GuessedAtParserWarning.
soup = BeautifulSoup(html_text, 'html.parser')
table = soup.find('table', attrs={'class':'wikitable sortable'})
if table is None:
    # Without this guard a changed page layout surfaces as an opaque
    # AttributeError on `None.find_all`.
    raise ValueError("No 'wikitable sortable' table found at " + wiki_url)
trs = table.find_all('tr')

# Extracting the text from the table cells
rows = []
for tr in trs:
    td = tr.find_all('td')
    row = [ele.text.strip() for ele in td]
    if row:
        # Ignore empty rows with no 'td',
        # applicable for the column headers row.
        rows.append(row)

df = pd.DataFrame(rows, columns=['PostalCode', 'Borough', 'Neighborhood'])

# Drop postal codes that have no borough assigned and renumber the rows from 0.
# Reassigning (instead of inplace=True) avoids mutating a filtered frame in place.
df = df[df.Borough != 'Not assigned'].reset_index(drop=True)

# Replace every '/' with ',' in all columns. Whitespace around the original
# slash is left untouched, hence values such as 'Regent Park , Harbourfront'.
df = df.replace('/', ',', regex=True)

print(df.head(10))
print(df.shape)

  PostalCode           Borough                                  Neighborhood
0        M3A        North York                                     Parkwoods
1        M4A        North York                              Victoria Village
2        M5A  Downtown Toronto                    Regent Park , Harbourfront
3        M6A        North York             Lawrence Manor , Lawrence Heights
4        M7A  Downtown Toronto  Queen's Park , Ontario Provincial Government
5        M9A         Etobicoke                              Islington Avenue
6        M1B       Scarborough                               Malvern , Rouge
7        M3B        North York                                     Don Mills
8        M4B         East York              Parkview Hill , Woodbine Gardens
9        M5B  Downtown Toronto                      Garden District, Ryerson
(103, 3)


### Starting Part 2 - Getting the coordinates for each postal code

#### Reading the CSV file containing the coordinates

In [25]:
# Load the postal-code coordinates table directly from the hosted CSV file.
geospatial_url = 'https://cocl.us/Geospatial_data'
geospatial = pd.read_csv(geospatial_url)
geospatial.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Creating a new dataframe by merging the dataframe from Part 1 with the coordinates dataframe

In [26]:
# Index both frames by postal code so they can be aligned on it.
neighborhoods_by_code = df.set_index('PostalCode')
coordinates_by_code = geospatial.set_index('Postal Code')

# Column-wise concatenation with an inner join keeps only the postal codes
# that are present in both frames.
df = pd.concat(
    [neighborhoods_by_code, coordinates_by_code],
    axis=1,
    join='inner',
)
df.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
M3A,North York,Parkwoods,43.753259,-79.329656
M4A,North York,Victoria Village,43.725882,-79.315572
M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494


#### As the postal code is now the index, let's reset the index and rename the resulting column to PostalCode

In [27]:
# Move the postal codes out of the index and back into an ordinary column.
df = df.reset_index()
df.head()

Unnamed: 0,index,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494


In [29]:
# Give the column produced by the index reset its proper name.
df = df.rename(columns={'index':'PostalCode'})
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill , Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [32]:
# Sanity check: (number of rows, number of columns) of the merged dataframe.
df.shape

(103, 5)