# Capstone Project

This project explores, segments, and clusters the neighborhoods in the city of Toronto.

## Importing Data

In [1]:
# importing libraries

import requests
import lxml.html as lh
import pandas as pd


In [2]:
# Getting the data from the website

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Fetch the page; `page` is the HTTP response. A timeout keeps the cell from
# hanging indefinitely if the server does not answer.
page = requests.get(url, timeout=30)
# Fail fast on an HTTP error status instead of silently parsing an error page
page.raise_for_status()
# Parse the response body into an HTML element tree
doc = lh.fromstring(page.content)
# Collect every <tr> element in the document (from all tables on the page)
tr_elements = doc.xpath('//tr')

In [3]:
tr_elements[0:5]

[<Element tr at 0x2a9640c6368>,
 <Element tr at 0x2a9640c63b8>,
 <Element tr at 0x2a9640c6278>,
 <Element tr at 0x2a9640c6318>,
 <Element tr at 0x2a9640c61d8>]

In [4]:
# Select every <tr> element of the parsed page
tr_elements = doc.xpath('//tr')
# `col` holds one (header text, list of values) pair per table column
col = []
# The cells of the first row are the column headers; number them from 1
for position, header_cell in enumerate(tr_elements[0], start=1):
    header = header_cell.text_content()
    print('%d:%s' % (position, header))
    col.append((header, []))

1:Postal code

2:Borough

3:Neighborhood



In [5]:
# Since our first row is the header, data is stored on the second row onwards.
# NOTE: this cell appends to the lists inside `col`. Re-run the cell that builds
# `col` before re-running this one, otherwise every row is added a second time.
for j in range(1, len(tr_elements)):
    # T is our j'th row
    T = tr_elements[j]

    # A row whose cell count differs from the number of header columns is not
    # from our table; stop reading at the first such row
    if len(T) != len(col):
        break

    # i is the index of the column the current cell belongs to
    for i, t in enumerate(T.iterchildren()):
        data = t.text_content()
        # For every column after the first, convert purely numeric text to int
        if i > 0:
            try:
                data = int(data)
            except ValueError:
                # Not a number: keep the cell text unchanged
                pass
        # Append the data to the list of the i'th column
        col[i][1].append(data)

In [6]:
# Number of values collected for each column (should all be equal)
[len(values) for _, values in col]

[181, 181, 181]

In [7]:
# Map each header to its list of values, then build the DataFrame from that mapping
Dict = dict(col)
df = pd.DataFrame(data=Dict)

In [8]:
df.head()

Unnamed: 0,Postal code\n,Borough\n,Neighborhood\n
0,M1A\n,Not assigned\n,\n
1,M2A\n,Not assigned\n,\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,Regent Park / Harbourfront\n


## Data Cleaning

In [9]:
# Remove the newline characters from the cell values (column names are not affected)
df1 = df.replace(to_replace='\n', value='', regex=True)

In [10]:
df1.head()

Unnamed: 0,Postal code\n,Borough\n,Neighborhood\n
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [11]:
# Renaming the columns: remove the trailing newline from each header

header_renames = {
    'Postal code\n': 'Postal code',
    'Borough\n': 'Borough',
    'Neighborhood\n': 'Neighborhood',
}
df1.rename(columns=header_renames, inplace=True)

In [12]:
# Checking first 5 rows of a dataframe

df1.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [13]:
# Checking the shape of a dataframe
df1.shape

(181, 3)

In [14]:
# Count, per column, the rows whose Borough is 'Not assigned'

is_unassigned = df1['Borough'] == 'Not assigned'
df1.loc[is_unassigned].count()

Postal code     77
Borough         77
Neighborhood    77
dtype: int64

#### Dropping the rows whose 'Borough' column is 'Not assigned'

In [15]:
# Collect the index labels of rows without a borough, then drop them from df1 in place
unassigned_idx = df1.index[df1['Borough'] == 'Not assigned']
df1.drop(index=unassigned_idx, inplace=True)

In [16]:
# Checking the first 5 rows of the dataframe after dropping the unassigned boroughs

df1.head(5)

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


#### Checking the null values for all the 3 columns

In [17]:
df1['Postal code'].isnull().sum()

0

In [18]:
df1['Borough'].isnull().sum()

0

In [19]:
df1['Neighborhood'].isna().sum()

0

In [20]:
df1.isna().sum()

Postal code     0
Borough         0
Neighborhood    0
dtype: int64

In [21]:
df1

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
...,...,...,...
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,Business reply mail Processing CentrE
169,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...
178,M8Z,Etobicoke,Mimico NW / The Queensway West / South of Bloo...


#### Checking whether the column Neighborhood has any null values or is assigned the value 'Not assigned'

In [22]:
# Look for placeholder neighborhoods. The placeholder is spelled 'Not assigned'
# in this data (see the Borough column), which 'Not Assigned' does not match,
# so compare case-insensitively and ignore surrounding whitespace.
df1[df1['Neighborhood'].astype(str).str.strip().str.lower() == 'not assigned']

Unnamed: 0,Postal code,Borough,Neighborhood


In [23]:
df1['Neighborhood'].unique()

array(['Parkwoods', 'Victoria Village', 'Regent Park / Harbourfront',
       'Lawrence Manor / Lawrence Heights',
       "Queen's Park / Ontario Provincial Government", 'Islington Avenue',
       'Malvern / Rouge', 'Don Mills', 'Parkview Hill / Woodbine Gardens',
       'Garden District, Ryerson', 'Glencairn',
       'West Deane Park / Princess Gardens / Martin Grove / Islington / Cloverdale',
       'Rouge Hill / Port Union / Highland Creek', 'Woodbine Heights',
       'St. James Town', 'Humewood-Cedarvale',
       'Eringate / Bloordale Gardens / Old Burnhamthorpe / Markland Wood',
       'Guildwood / Morningside / West Hill', 'The Beaches',
       'Berczy Park', 'Caledonia-Fairbanks', 'Woburn', 'Leaside',
       'Central Bay Street', 'Christie', 'Cedarbrae', 'Hillcrest Village',
       'Bathurst Manor / Wilson Heights / Downsview North',
       'Thorncliffe Park', 'Richmond / Adelaide / King',
       'Dufferin / Dovercourt Village', 'Scarborough Village',
       'Fairview / Henry Far

In [24]:
df1[df1['Neighborhood'] == '']

Unnamed: 0,Postal code,Borough,Neighborhood
180,,Canadian postal codes,


In [25]:
# Dropping the unnecessary row: the one whose Borough is 'Canadian postal codes'

stray_idx = df1.index[df1['Borough'] == 'Canadian postal codes']
df1.drop(index=stray_idx, inplace=True)

In [26]:
df1.shape

(103, 3)

In [27]:
df1[df1['Neighborhood'] == '']

Unnamed: 0,Postal code,Borough,Neighborhood


The column 'Neighborhood' doesn't have any 'Not assigned', empty, or null values.

In [28]:
df1

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
...,...,...,...
160,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,Business reply mail Processing CentrE
169,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...


#### Checking the Duplicates

In [29]:
df1['Postal code'].duplicated().sum()

0

In [30]:
df1.duplicated().sum()

0

#### Replacing the character '/' with ','

In [31]:
# Turn the ' / ' separators into ', ' in every column; df1 itself is left unchanged
toronto_data = df1.replace(to_replace=' / ', value=', ', regex=True)

In [32]:
toronto_data.head()

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [33]:
# Renumber the rows from 0 and discard the old index labels
toronto_data = toronto_data.reset_index(drop=True)

In [34]:
toronto_data.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [35]:
toronto_data.shape

(103, 3)

After cleaning the data, the dataframe has 103 rows and 3 columns.