### <b> Segmenting and Clustering Neighborhoods in Toronto - Part 2 </b>

Assigning geographical coordinates (latitude and longitude) to the neighborhoods of each postal code.

In [114]:
import pandas as pd
import wikipedia as wp
import numpy as np
from bs4 import BeautifulSoup
import requests
import io

In [115]:
# Fetch the rendered HTML of the Wikipedia article and keep it as UTF-8 bytes
html = (
    wp.page(' List of postal codes of Canada: M')
    .html()
    .encode('UTF-8')
)
# read_html returns one DataFrame per table it finds; take the first one,
# using its first row (header=0) as the column names
df = pd.read_html(html, header=0)[0]

In [116]:
# Collapse to one row per (Postcode, Borough) pair, joining the group's
# Neighbourhood values into a single ', '-separated string
df = (
    df.groupby(['Postcode', 'Borough'])['Neighbourhood']
    .apply(', '.join)
    .reset_index()  # Series.reset_index() already yields a DataFrame with the keys as columns
)

In [117]:
# If Neighbourhood is "Not assigned", mark it as value of Borough.
# The Series yielded by DataFrame.iterrows() may be a copy, so assigning to it is
# not guaranteed to change df. Select the affected rows with a boolean mask and
# write through .loc so the replacement is applied to df itself.
not_assigned = df['Neighbourhood'] == 'Not assigned'
df.loc[not_assigned, 'Neighbourhood'] = df.loc[not_assigned, 'Borough']

In [118]:
# Display the current contents of df (a bare last expression is rendered by the notebook)
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M1B,Scarborough,"Rouge, Malvern"
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
3,M1E,Scarborough,"Guildwood, Morningside, West Hill"
4,M1G,Scarborough,Woburn
...,...,...,...
175,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."
176,M9W,Etobicoke,Northwest
177,M9X,Not assigned,Not assigned
178,M9Y,Not assigned,Not assigned


In [119]:
# Download the latitude/longitude table (one row per postal code) as CSV
url = "http://cocl.us/Geospatial_data"
# Use a timeout so a stalled connection cannot hang the notebook, and raise on an
# HTTP error status so an error page is never decoded and parsed as CSV data.
response = requests.get(url, timeout=30)
response.raise_for_status()
s = response.content
df_loc = pd.read_csv(io.StringIO(s.decode('utf-8')))
df_loc

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [120]:
# Rename the coordinate table's columns (positionally, in place) so that its key
# column is called 'Postcode', matching the key column of df
df_loc.columns = ['Postcode', 'Latitude', 'Longitude']
# Join coordinates with the borough/neighbourhood table on the shared 'Postcode'
# key; merge's default is an inner join, so only codes present in both are kept
df = df_loc.merge(df, on='Postcode')
df

Unnamed: 0,Postcode,Latitude,Longitude,Borough,Neighbourhood
0,M1B,43.806686,-79.194353,Scarborough,"Rouge, Malvern"
1,M1C,43.784535,-79.160497,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae
...,...,...,...,...,...
98,M9N,43.706876,-79.518188,York,Weston
99,M9P,43.696319,-79.532242,Etobicoke,Westmount
100,M9R,43.688905,-79.554724,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,43.739416,-79.588437,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [121]:
# Work on a renamed copy: the key column becomes 'PostalCode', df itself is left unchanged
dfx2 = df.rename({'Postcode': 'PostalCode'}, axis='columns')

In [122]:
# Reordering columns.
# The order is spelled out by label rather than built from positional slices of the
# current columns: slice arithmetic re-applied to an already reordered frame moves
# the columns back, so re-running the cell would undo the reordering. An explicit
# label list gives the same result on every run.
cols = ['PostalCode', 'Borough', 'Neighbourhood', 'Latitude', 'Longitude']
dfx2 = dfx2[cols]
dfx2

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437
