# Segmenting and Clustering neighbourhoods of Toronto

### This notebook covers the first part of the assignment: extracting the postal-code data from the website and preprocessing the resulting dataframe as required


#### Part 1

In [4]:
#Installing beautifulSoup and xml parser

!pip install beautifulsoup4
!pip install lxml



In [5]:
#Importing required libraries
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [6]:
# Download the Wikipedia page and parse its HTML into a BeautifulSoup object.

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# requests has no default timeout; set one so the cell cannot hang forever.
wikipagecontent = requests.get(url, timeout=30)

# Fail loudly on 4xx/5xx instead of silently parsing an error page.
wikipagecontent.raise_for_status()

soup = BeautifulSoup(wikipagecontent.text, 'lxml')

### Scrape the wikipedia page to create a dataframe

In [57]:

# Locate the postal-code table in the parsed page.
table_soup = soup.find("table", class_="wikitable sortable")
if table_soup is None:
    raise ValueError("No table with class 'wikitable sortable' found on the page")

# Prefer the <tbody> element; fall back to the <table> itself when the
# markup/parser did not produce a <tbody>.
table_body = table_soup.tbody if table_soup.tbody is not None else table_soup

# Header cells (<th>) and all rows (<tr>) of the table.
headings = table_body.find_all('th')
table_data = table_body.find_all('tr')

# Column names of the dataframe come from the header cells.
columns_dataframe = [th.text.strip() for th in headings]

# Every row without header cells becomes one record of stripped cell texts.
row_dataframe = []
for tr in table_data:
    if tr.find_all('th'):
        continue  # header row, already consumed for the column names
    tds = tr.find_all('td')
    if not tds:
        continue  # row without data cells; nothing to record
    row_dataframe.append([td.text.strip() for td in tds])

# Build the dataframe from the collected records.
postal_df = pd.DataFrame(row_dataframe, columns=columns_dataframe)

postal_df






Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West
286,M8Z,Etobicoke,South of Bloor


## Clean the data as per requirements
    

### Drop rows whose Borough is not assigned

In [58]:
# Keep only rows with an assigned Borough and renumber the index from 0.
# The result is assigned back so that postal_df itself (not just the displayed
# copy) carries the reset index; re-running the cell gives the same frame.
postal_df = postal_df[postal_df['Borough'] != 'Not assigned'].reset_index(drop=True)

postal_df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
...,...,...,...
206,M8Z,Etobicoke,Kingsway Park South West
207,M8Z,Etobicoke,Mimico NW
208,M8Z,Etobicoke,The Queensway West
209,M8Z,Etobicoke,Royal York South West


### Merge the neighbourhoods that share the same postal code into a single comma-separated value

In [59]:
# One row per (Postcode, Borough); the neighbourhoods of each group are joined
# into a single comma-separated string.
# Aggregating the named 'Neighbourhood' Series and then resetting the index
# always yields exactly the columns Postcode, Borough, Neighbourhood, so no
# follow-up rename of an auto-generated column is needed.
postal_grp = (
    postal_df
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(','.join)
    .reset_index()
)

postal_grp





Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie..."
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."


### For records that have a Borough but an unassigned Neighbourhood, make the Neighbourhood the same as the Borough

In [63]:


# Rows that have a Borough but whose Neighbourhood is still 'Not assigned'
# (in this data: postcode M7A).
unassigned_mask = postal_grp['Neighbourhood'] == 'Not assigned'
unassigned_neighbourhood = postal_grp[unassigned_mask]

print(unassigned_neighbourhood)

# Copy each affected row's own Borough into its Neighbourhood.
# Assigning through .loc on the frame updates postal_grp directly and aligns on
# the row index, so it works for zero, one or many unassigned rows.
not_assigned_borough = unassigned_neighbourhood['Borough']
postal_grp.loc[unassigned_mask, 'Neighbourhood'] = not_assigned_borough

# After the replacement no Neighbourhood is 'Not assigned'; the row for M7A
# now shows its Borough as the Neighbourhood.
postal_grp[postal_grp['Postcode'] == 'M7A']

   Postcode       Borough Neighbourhood
85      M7A  Queen's Park  Not assigned


Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Queen's Park


### shape of dataframe

In [12]:
# (number of rows, number of columns) of the cleaned, grouped dataframe
postal_grp.shape

(103, 3)