# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

In [21]:
from bs4 import BeautifulSoup
import requests
import csv
import json
import xml
import pandas as pd
import numpy as np 
from sklearn.cluster import KMeans

# Lift pandas' display truncation for both columns and rows, so every
# DataFrame shown below is rendered in full.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

print('Importing the libraries is complete.')

Importing the libraries is complete.


## Extract Toronto neighborhood data from the Wikipedia page and create the dataframe

In [22]:
# Scrape the Toronto ("M") postal-code table from Wikipedia into main_df,
# one row per table row, with the columns taken from the table's header row.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Bound the request time and fail on HTTP errors, so that a hung connection
# or an error page is reported here rather than as a confusing parsing
# failure further down.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.text
parsed_results = BeautifulSoup(results, 'html.parser')

# find() returns the first table carrying the "wikitable" class, or None.
info_table = parsed_results.find('table', class_='wikitable')
if info_table is None:
    raise ValueError('No table with class "wikitable" was found at ' + url)
raw_rows = info_table.find_all('tr')

# Read every row cell by cell (<th> in the header row, <td> in data rows)
# instead of splitting the row text on newlines, which would misalign the
# columns as soon as a single cell contained a line break.
neighborhood_info = []
for row in raw_rows:
    info = [cell.get_text().strip() for cell in row.find_all(['th', 'td'])]
    if info:  # ignore rows that carry no cells at all
        neighborhood_info.append(info)

if not neighborhood_info:
    raise ValueError('The table found at ' + url + ' contains no rows')

# Force the last header to 'Neighborhood': the cells below address the
# column by exactly that name.
neighborhood_info[0][-1] = 'Neighborhood'
main_df = pd.DataFrame(neighborhood_info[1:], columns=neighborhood_info[0])

main_df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


## Remove the rows whose Borough is "Not assigned"

In [25]:
# Index labels of the rows whose Borough is the "Not assigned" placeholder.
not_assigned_boroughs = main_df.index[main_df['Borough'] == 'Not assigned']

# Drop those rows by label. (Feeding the labels back through
# main_df.index[...] treats them as positions, which is only correct while
# the index is an untouched 0..n-1 range.) A new frame is assigned instead
# of mutating in place, and the index is renumbered from 0.
# The "Not assigned" neighborhoods are not collected here: their labels
# would be stale after the reset, and the next step recomputes them.
main_df = main_df.drop(index=not_assigned_boroughs).reset_index(drop=True)
main_df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


## Replace "Not assigned" Neighborhood values with the corresponding Borough name

In [35]:
# Recompute the labels here: the index was reset after the "Not assigned"
# boroughs were dropped, so labels collected before that step are stale.
not_assigned_neighborhoods = main_df.index[main_df['Neighborhood'] == 'Not assigned']

# One .loc assignment writes into main_df itself and handles all rows at
# once. The chained form main_df['Neighborhood'][idx] = ... may instead
# write to a temporary copy (pandas reports it as SettingWithCopyWarning).
main_df.loc[not_assigned_neighborhoods, 'Neighborhood'] = (
    main_df.loc[not_assigned_neighborhoods, 'Borough']
)
main_df.head(12)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [27]:
# Report the (rows, columns) dimensions of main_df.
main_df.shape

(212, 3)

In [28]:
# Collapse main_df to one row per Postcode.
group = main_df.groupby('Postcode')

# All neighborhoods sharing a postcode become one comma-separated string.
grouped_neighborhoods = group['Neighborhood'].agg(', '.join)

# Take the first (non-null) borough listed for each postcode. This is
# deterministic, whereas set(x).pop() returns an arbitrary element whenever
# a postcode is listed under more than one borough.
grouped_boroughs = group['Borough'].first()

# Both Series are indexed by Postcode, so concat aligns them on that key
# rather than relying on positional order; reset_index() then turns the
# key back into the leading 'Postcode' column.
grouped_df = pd.concat([grouped_boroughs, grouped_neighborhoods], axis=1).reset_index()

grouped_df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [29]:
# Report the (rows, columns) dimensions of grouped_df, the per-postcode table.
grouped_df.shape

(103, 3)

In [30]:
# Load the postal-code coordinates CSV straight from its URL.
geospatial_csv_url = 'http://cocl.us/Geospatial_data'
coordinates_df = pd.read_csv(geospatial_csv_url)

print('Coordinates dataframe shape: ', coordinates_df.shape)
coordinates_df.head()

Coordinates dataframe shape:  (103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [34]:
# Attach the coordinate columns to every postcode row.

# Key the coordinates by 'Postal Code' so they can be looked up through
# grouped_df's 'Postcode' column; join() keeps every row of grouped_df.
coordinates_by_postcode = coordinates_df.set_index('Postal Code')
with_coordinates_df = grouped_df.join(coordinates_by_postcode, on='Postcode')
with_coordinates_df.head(12)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [33]:
# Report the (rows, columns) dimensions of the joined frame.
with_coordinates_df.shape

(103, 5)