# Segmenting and Clustering Neighborhoods in Toronto

### Importing all required packages

In [1]:
import re
from urllib.request import urlopen

import bs4 as bs
import numpy as np
import pandas as pd
from IPython.display import IFrame
from requests import get

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M" (Toronto).
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [3]:
# Embed the Toronto section of the Wikipedia page so the raw table can be
# inspected next to the scraped result.
IFrame(
    src='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M#Toronto_-_103_FSAs',
    width=1000,
    height=350,
)

### Scraping the wiki page using BeautifulSoup

In [4]:
def _first_text(cell):
    """Return the first text node of a bs4 tag, stripped; '' when the tag has no text."""
    texts = cell.find_all(string=True)
    return texts[0].strip() if texts else ""


def scrape_table(cname, cols):
    """Scrape one HTML table from the page at the module-level ``url`` into a DataFrame.

    Parameters
    ----------
    cname : str
        CSS class of the ``<table>`` element to extract (the first match is used).
    cols : int
        Expected number of ``<td>`` cells per data row; rows with any other
        cell count (for example the header row, which has none) are dropped.

    Returns
    -------
    pandas.DataFrame
        One row per kept table row. Column names are the first text node of
        every ``<th>`` found in the table.

    Raises
    ------
    ValueError
        If the page contains no table with class ``cname``.
    """
    # Close the HTTP response deterministically; BeautifulSoup reads the whole
    # document while it is being constructed, so nothing is needed afterwards.
    with urlopen(url) as page:
        soup = bs.BeautifulSoup(page, 'lxml')

    table = soup.find("table", class_=cname)
    if table is None:
        raise ValueError(f"No <table> with class {cname!r} found at {url}")

    header = [_first_text(th) for th in table.find_all("th")]
    rows = [[_first_text(td) for td in tr.find_all("td")]
            for tr in table.find_all("tr")]
    # Keep only well-formed data rows.
    rows = [row for row in rows if len(row) == cols]
    return pd.DataFrame(rows, columns=header)

- The dataframe will consist of three columns: Postal code, Borough, and Neighborhood

In [5]:
# Scrape the "wikitable" table, keeping only rows that have exactly three cells.
all_postal_codes = scrape_table(cname="wikitable", cols=3)

In [6]:
# Replace every existing slash '/' in the neighborhood names with a comma ','.
# One vectorized column assignment is used instead of a per-row
# `df['Neighborhood'][i] = ...` loop: that chained indexing triggers
# SettingWithCopyWarning, does not write to the frame under pandas
# Copy-on-Write, and only works when the index is exactly 0..n-1.
all_postal_codes['Neighborhood'] = (
    all_postal_codes['Neighborhood'].str.replace('/', ',', regex=False)
)

In [7]:
# Preview the first rows of the scraped table to sanity-check the columns.
all_postal_codes.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"


- Only process the cells that have an assigned borough. Ignore cells with a borough that is <b>Not assigned</b>.

In [8]:
# Keep only the rows whose borough has actually been assigned.
has_borough = all_postal_codes['Borough'] != 'Not assigned'
postal_codes = all_postal_codes[has_borough]
postal_codes.head()

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"
5,M6A,North York,"Lawrence Manor , Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


In [9]:
# Filtering left gaps in the row labels; renumber from 0 and discard the old
# labels. Row order is unchanged (no sorting happens here).
postal_codes = postal_codes.reset_index(drop=True)

In [10]:
# Confirm the row labels now start at 0 after the index reset.
postal_codes.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


- If a cell has a borough but a <b>Not assigned</b> neighborhood, then the neighborhood will be the same as the borough.

In [11]:
# A neighborhood counts as unassigned when it holds the literal 'Not assigned',
# is blank, or is missing. The scraped page renders unassigned neighborhoods as
# blank cells, so matching only the literal would never fire.
neighborhood_unassigned = (
    postal_codes['Neighborhood'].isna()
    | postal_codes['Neighborhood'].isin(['Not assigned', ''])
)
# Fall back to the borough name for exactly those rows.
postal_codes.loc[neighborhood_unassigned, 'Neighborhood'] = (
    postal_codes.loc[neighborhood_unassigned, 'Borough']
)

- More than one neighborhood can exist in one postal code area. In such a case, the rows sharing that postal code will be combined into one row with the neighborhoods separated by commas.

However, it seems the data has been modified since the assignment was last updated.

In [12]:
# Collapse rows that share a postal code and borough into a single row whose
# neighborhood field is the ', '-joined list of the original values.
postal_codes = (
    postal_codes
    .groupby(['Postal code', 'Borough'])['Neighborhood']
    .apply(', '.join)
    .reset_index()
)
postal_codes

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern , Rouge"
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village , St. Phillips , Martin Grov..."
101,M9V,Etobicoke,"South Steeles , Silverstone , Humbergate , Jam..."


In [13]:
# Preview the first rows of the grouped table.
postal_codes.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern , Rouge"
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


- Shape (number of rows, number of columns) of the final dataframe

In [14]:
# (rows, columns) of the grouped frame.
postal_codes.shape

(103, 3)

### Exporting data to reuse it

In [15]:
# Persist the cleaned table (without the row labels) for reuse elsewhere.
postal_codes.to_csv(path_or_buf='canada_postal_codes.csv', index=False)