# Segmenting and Clustering Neighborhoods in Toronto

## Creating a dataframe of the neighborhoods in Toronto

In [6]:
pip install BeautifulSoup4

Collecting BeautifulSoup4
  Downloading https://files.pythonhosted.org/packages/d1/41/e6495bd7d3781cee623ce23ea6ac73282a373088fcd0ddc809a047b18eae/beautifulsoup4-4.9.3-py3-none-any.whl (115kB)
     |████████████████████████████████| 122kB 5.1MB/s eta 0:00:01
Collecting soupsieve>1.2; python_version >= "3.0" (from BeautifulSoup4)
  Downloading https://files.pythonhosted.org/packages/6f/8f/457f4a5390eeae1cc3aeab89deb7724c965be841ffca6cfca9197482e470/soupsieve-2.0.1-py3-none-any.whl
Installing collected packages: soupsieve, BeautifulSoup4
Successfully installed BeautifulSoup4-4.9.3 soupsieve-2.0.1
Note: you may need to restart the kernel to use updated packages.


#### Import libraries

In [20]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analysis
import requests # HTTP client used to download the Wikipedia page
from bs4 import BeautifulSoup # HTML parser used to extract the table rows

## Scrape the Wikipedia page of the neighborhoods in Toronto

#### Download the data

In [74]:
# Wikipedia page listing the Canadian postal codes that start with "M".
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops an HTTP error page from being parsed as if it
# were the expected document.
response = requests.get(URL, timeout=30)
response.raise_for_status()

# Keep the page text in `r` and parse it with Python's built-in HTML parser.
r = response.text
soup = BeautifulSoup(r, 'html.parser')

#### Parsing data

In [78]:
# Rows of the first <tbody> on the page; this is assumed to be the postal-code table.
# NOTE(review): confirm this still holds if the Wikipedia page layout changes.
table_rows = soup.tbody.find_all("tr")

res = []
for tr in table_rows:
    # Stripped text of every <td> in the row; rows without <td> cells give an empty list.
    row = [cell.text.strip() for cell in tr.find_all("td")]

    # A usable row has exactly three cells (postal code, borough, neighborhood).
    # Skipping anything else avoids an IndexError on short rows and a
    # column-count mismatch when the DataFrame is built.
    if len(row) != 3:
        continue

    # Only process the cells that have an assigned borough.
    if "Not assigned" in row[1]:
        continue

    # If a cell has a borough but a "Not assigned" neighborhood,
    # the neighborhood will be the same as the borough.
    if "Not assigned" in row[2]:
        row[2] = row[1]
    res.append(row)

# Fail loudly instead of silently continuing with an empty table.
if not res:
    raise ValueError("No postal code rows were parsed; the page layout may have changed.")

# Dataframe with 3 columns
df = pd.DataFrame(res, columns=["PostalCode", "Borough", "Neighborhood"])
df.head(15)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


#### Combining the neighborhoods with the same postal code

In [79]:
# Merge the rows that share a postal code and borough into a single row whose
# Neighborhood is the comma-separated list of the merged names.
# sort=False keeps the groups in order of first appearance.
group_keys = ["PostalCode", "Borough"]
neighborhoods_by_code = df.groupby(group_keys, sort=False)["Neighborhood"]
df = neighborhoods_by_code.apply(", ".join).reset_index()
df.head(15)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [80]:
df.shape    # (number of rows, number of columns) of the dataframe

(103, 3)

#### Saving the data

In [81]:
# index=False: the default integer index carries no information and would
# otherwise be written as an unnamed first column, which shows up as
# "Unnamed: 0" when the file is read back with pd.read_csv.
df.to_csv('Toronto_data.csv', index=False)