# Segmenting and Clustering - Part 1

#### Let us first import all the required libraries 

In [2]:
import numpy as np
import pandas as pd
import json
import requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize
import lxml.html as lh

#### I will use BeautifulSoup to scrape the table from the Wikipedia page

In [3]:
# Download the Wikipedia page "List of postal codes of Canada: M".
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops on an HTTP error instead of parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
# NOTE: despite its name, `url` holds the page's HTML text (name kept from the original cell).
url = response.text
# Parse the HTML with the lxml parser.
soup = BeautifulSoup(url, 'lxml')

#### Use the `find_all` function to collect the `td` (table data) tags and load their contents into a dataframe

In [6]:
# Locate the first <table> on the page and collect all of its <td> cells.
table = soup.find('table')
if table is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the find_all call below.
    raise ValueError('No <table> element found in the downloaded page')
elements = table.find_all('td')

# The cells are read as consecutive (postal code, borough, neighbourhood) triples.
# A trailing incomplete triple is ignored so a stray cell cannot raise IndexError.
cell_text = [cell.text.strip() for cell in elements]
n_complete = len(cell_text) - len(cell_text) % 3
postalcode = cell_text[0:n_complete:3]
borough = cell_text[1:n_complete:3]
neighbourhood = cell_text[2:n_complete:3]

# Build the frame from named columns rather than transposing a list of lists.
df = pd.DataFrame({
    'Postal Code': postalcode,
    'Borough': borough,
    'Neighbourhood': neighbourhood,
})
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Removing rows whose borough is "Not assigned"

In [7]:
# Keep only rows whose borough is present and is not the 'Not assigned' placeholder.
# This replaces an in-place replace() on the df['Borough'] selection: under pandas
# Copy-on-Write such a chained in-place update does not modify df, so the
# placeholder rows would silently survive. An explicit filter has no such issue
# and gives the same result when the cell is re-run.
has_borough = df['Borough'].notna() & df['Borough'].ne('Not assigned')
# .copy() makes df an independent frame; the original index labels are preserved.
df = df.loc[has_borough].copy()

df.head(15)

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


#### Aggregating multiple neighbourhoods for a single postal code

In [8]:
# Collapse the frame to one row per (postal code, borough) pair, joining the
# neighbourhood strings of each group into a single comma-separated string.
neighbourhoods_by_code = df.groupby(['Postal Code', 'Borough'])['Neighbourhood']
df1 = neighbourhoods_by_code.apply(lambda names: ', '.join(names)).reset_index()
# Set the column labels explicitly after reset_index().
df1.columns = ['Postal Code', 'Borough', 'Neighbourhood']

df1.head(15)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [9]:
# Display the (rows, columns) dimensions of df1.
df1.shape

(103, 3)

#### Saving the dataframe to a file

In [10]:
# Write df1 to 'location.csv' in the current working directory;
# index=False leaves the row index out of the file.
df1.to_csv('location.csv', index=False)

### Stay tuned for part 2!!!