# Segmenting and Clustering Neighbourhoods 

### Web Scraping

In [2]:
#importing necessary modules for Web scraping
import requests 
from bs4 import BeautifulSoup 
import csv
import pandas as pd

In [3]:
#assigning the URL to be read
# Wikipedia page listing the Canadian postal codes that start with "M".
URL ="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the notebook from hanging forever on an unresponsive server.
r = requests.get(URL, timeout=30)
# Fail fast on 4xx/5xx responses instead of silently parsing an error page.
r.raise_for_status()

In [4]:
# Parse the downloaded page bytes with Python's built-in HTML parser.
soup = BeautifulSoup(r.content, features='html.parser')

In [5]:
#reading the table content
# Walk the first <table> of the page row by row and split every data row into
# postal code / borough / neighbourhood. Parsing per row (rather than counting
# cells 1-2-3 across the whole table) keeps the three lists aligned even if a
# row has an unexpected number of cells.
codes_list=[]
borough_list=[]
neighborhood_list=[]
table = soup.table
if table is None:
    raise ValueError("no <table> element found in the downloaded page")
for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Rows without exactly three <td> cells (e.g. a header row built from
    # <th> cells) carry no record and are skipped.
    if len(cells) != 3:
        continue
    codes_list.append(cells[0].text)
    borough_list.append(cells[1].text)
    neighborhood_list.append(cells[2].text)

In [6]:
# Assemble the three scraped lists into a single table, one column per list.
df = pd.DataFrame(
    {
        'Postalcode': codes_list,
        'Borough': borough_list,
        'Neighborhood': neighborhood_list,
    },
    columns=['Postalcode', 'Borough', 'Neighborhood'],
)
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


In [7]:
#remove the rows with 'Not assigned' Borough
# Index labels of the rows whose Borough is unassigned.
idx = df.index[df['Borough'] == "Not assigned"]
# Drop by label (not by position) and rebind instead of mutating in place,
# so the step does not depend on the frame having a default 0..n-1 index.
df = df.drop(index=idx)

In [8]:
#removing the unnecessary characters from the neighbourhood column
# Strip the newline characters scraped along with the cell text. The pattern is
# a real newline matched literally (regex=False), so the result does not depend
# on the pandas version's default for `regex`.
df = df.assign(Neighborhood=df['Neighborhood'].str.replace('\n', '', regex=False))

In [9]:
# Renumber the rows 0..n-1 and discard the old index labels (drop=True keeps
# them from being added back as a column).
df = df.reset_index(drop=True)
# Preview the first rows of the cleaned table.
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [10]:
#Changing values of Neighbourhood which has values as Not assigned
# Rows whose neighbourhood is unassigned take their borough name instead.
# A single .loc assignment writes to the frame itself; the chained form
# `df.Neighborhood[i] = ...` may write to a temporary copy and be lost.
unnamed = df['Neighborhood'] == 'Not assigned'
df.loc[unnamed, 'Neighborhood'] = df.loc[unnamed, 'Borough']

In [11]:
# Collapse rows that share a postal code and borough into one row whose
# Neighborhood is the comma-separated list of the grouped neighbourhoods.
df = (
    df.groupby(['Postalcode', 'Borough'])['Neighborhood']
      .agg(','.join)
      .reset_index()
)
df.columns = ['Postalcode', 'Borough', 'Neighborhood']
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [12]:
# Display the (rows, columns) size of the table as a quick sanity check.
df.shape

(103, 3)

In [13]:
# Save the table as CSV in the current working directory. to_csv writes the row
# index by default, so the file starts with an unnamed index column.
df.to_csv('df_toronto.csv')