# Segmenting and Clustering Neighborhoods in Toronto

Before we get the data and start exploring it, let's import all the dependencies that we will need.

In [1]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analysis
import json # library to handle JSON files
import requests # library to handle requests
from bs4 import BeautifulSoup

print('Libraries imported.')

Libraries imported.


<a id='item1'></a>

## 1. Get Dataset by scraping

#### Get html

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were the article.
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'html.parser')

# First <table> inside <body>; assumed to be the postal-code table -- TODO confirm against the live page.
wiki_table = soup.body.table.tbody


#### Parse html

In [3]:
def get_cell(element):
    """Return the text of every ``<td>`` cell found in ``element``.

    For each cell, the text of its first ``<a>`` link is used when that link
    exists and has non-empty text; otherwise the cell's own whitespace-stripped
    text is used.

    Parameters
    ----------
    element : object exposing ``find_all('td')`` (e.g. a BeautifulSoup ``<tr>`` tag)

    Returns
    -------
    list of str
        One entry per ``<td>`` cell, in document order. Empty when the
        element contains no ``<td>`` cells.
    """
    row = []

    for cell in element.find_all('td'):
        link = cell.a
        if link and link.text:
            row.append(link.text)
        else:
            # `cell.string` is None when the cell has several child nodes (or only
            # an empty link), which made `.strip()` raise AttributeError.
            # get_text() collects the text regardless of how the cell is nested.
            row.append(cell.get_text(strip=True))

    return row

def get_row(source):
    """Collect the cell texts of every ``<tr>`` in ``source`` that has exactly three cells.

    Each ``<tr>`` is passed through ``get_cell``; rows that do not yield
    exactly three values are skipped.

    Returns a list of three-element lists, in document order.
    """
    parsed_rows = (get_cell(tr) for tr in source.find_all('tr'))
    return [cells for cells in parsed_rows if len(cells) == 3]

In [4]:
# Assemble the scraped rows into a three-column DataFrame.
columns = ['Postcode', 'Borough', 'Neighbourhood']
data = get_row(wiki_table)
df = pd.DataFrame(data=data, columns=columns)
df.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


## 2. Clean Dataset

Remove the rows whose borough is "Not assigned".

In [5]:
# Keep only the rows whose Borough value is something other than 'Not assigned'.
df = df.loc[df['Borough'].ne('Not assigned')]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


Reset the index so the rows are numbered consecutively again after filtering.

In [6]:
# Renumber the rows 0..n-1 after filtering. drop=True discards the old index
# directly instead of first materialising it as an 'index' column and then
# dropping it in place, and it returns a new frame rather than mutating a
# filtered slice.
df = df.reset_index(drop=True)

df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


Group by postcode and borough, combining the neighbourhoods that share a postcode into one comma-separated string.

In [7]:
# Collapse to one row per (Postcode, Borough), joining the neighbourhood names with ", ".
df = (
    df.groupby(['Postcode', 'Borough'])['Neighbourhood']
      .apply(lambda names: ", ".join(names.astype(str)))
      .reset_index()
)
# Shuffle the row order. The fixed seed keeps the order (and the output shown
# below) reproducible across kernel restarts; any constant value would do.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head(12)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
1,M5C,Downtown Toronto,St. James Town
2,M5B,Downtown Toronto,"Ryerson, Garden District"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M6M,York,"Del Ray, Keelesdale, Mount Dennis, Silverthorn"
5,M2R,North York,Willowdale West
6,M9L,North York,Humber Summit
7,M5E,Downtown Toronto,Berczy Park
8,M5W,Downtown Toronto,Stn A PO Boxes 25 The Esplanade
9,M1S,Scarborough,Agincourt


In [8]:

# Dimensions of the grouped table as (rows, columns).
df.shape

(103, 3)

## 3. Add geospatial data

In [9]:
# Load the CSV of postal codes with their latitude/longitude coordinates.
url2 = "http://cocl.us/Geospatial_data"
geo_data = pd.read_csv(filepath_or_buffer=url2)
geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
# Attach coordinates to each postcode: index both frames by postal code,
# join them (left join, keeping every row of df), then restore Postcode as a column.
data = (
    df.set_index('Postcode')
      .join(geo_data.set_index('Postal Code'))
      .reset_index()
)
data.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
1,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
2,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M6M,York,"Del Ray, Keelesdale, Mount Dennis, Silverthorn",43.691116,-79.476013
5,M2R,North York,Willowdale West,43.782736,-79.442259
6,M9L,North York,Humber Summit,43.756303,-79.565963
7,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
8,M5W,Downtown Toronto,Stn A PO Boxes 25 The Esplanade,43.646435,-79.374846
9,M1S,Scarborough,Agincourt,43.7942,-79.262029
