## Segmenting and Clustering Neighborhoods in Toronto - Part 2


**In this notebook, we will retrieve the latitude and longitude coordinates for each neighborhood from part 1**

In [3]:
# Import necessary modules

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

In [9]:
# Code from part 1 to retrieve the table with neighborhoods and postal codes

# Load the HTML text from the Wikipedia URL.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# surfaces an HTTP error here instead of as a confusing parsing failure below.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url, timeout=30)
response.raise_for_status()
html_txt = response.text

# Parse the page with an explicitly named parser so the result does not depend
# on which optional parsers happen to be installed in the environment.
soup = BeautifulSoup(html_txt, 'html.parser')
table = soup.find('table', attrs={'class':'wikitable sortable'})
if table is None:
    # Fail with a clear message rather than an AttributeError on None.
    raise ValueError("No 'wikitable sortable' table found at " + url)
places = table.find_all('tr')

# Extract the stripped text of every <td> cell, one list per table row.
# Rows without any <td> cells produce an empty list and are skipped.
rows = list()
for tr in places:
    post = tr.find_all('td')
    row = [ele.text.strip() for ele in post]
    if row:
        rows.append(row)

df = pd.DataFrame(rows, columns=['PostalCode', 'Borough', 'Neighborhood'])

# Drop rows whose borough is 'Not assigned' and renumber the index.
df = df[df.Borough != 'Not assigned'].reset_index(drop=True)

# Where the neighborhood is 'Not assigned', use the borough name instead.
df['Neighborhood'] = df['Neighborhood'].mask(
    df['Neighborhood'] == 'Not assigned', df['Borough'])

# Collapse to one row per (PostalCode, Borough), joining the neighborhood
# names of each group into a single comma-separated string.
df = (
    df.groupby(['PostalCode', 'Borough'])['Neighborhood']
    .apply(', '.join)
    .reset_index()
)
df.head(15)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


## Get Latitude and Longitude Data

We will use the CSV file at http://cocl.us/Geospatial_data to retrieve the coordinates.

In [14]:
# Load the coordinates CSV directly from its URL into a DataFrame
geo_csv_url = 'https://cocl.us/Geospatial_data'
latlongs = pd.read_csv(geo_csv_url)
latlongs.head(n=10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [15]:
# Display the (rows, columns) shape of the coordinates table so its row count
# can be compared against the size of the neighborhood table
latlongs.shape

(103, 3)

Merge the neighborhood and geospatial datasets

In [16]:
# Combine the neighborhood table with the coordinates table by postal code.
# Each frame is indexed by its postal-code column and the two are concatenated
# column-wise; join='inner' keeps only the postal codes present in both.
# The coordinates frame is `latlongs` (read from the CSV above); the name
# `geospatial_data` used here previously is never defined in this notebook and
# raised a NameError on a fresh kernel.
df = pd.concat(
    [df.set_index('PostalCode'), latlongs.set_index('Postal Code')],
    axis=1, join='inner')
df.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
M1G,Scarborough,Woburn,43.770992,-79.216917
M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [17]:
# Move the postal codes out of the index and back into a regular column:
# reset_index() emits them as a column named 'index', which is then renamed
# to 'PostalCode'.
df = df.reset_index().rename(columns={'index': 'PostalCode'})
df.head(12)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [18]:
# Display the (rows, columns) shape of the merged table
df.shape

(103, 5)