# Segmenting and Clustering Neighborhoods of Toronto

## Importing Libraries

In [7]:
import numpy as np 

import pandas as pd 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json 
#!conda install -c conda-forge geopy --yes  
from geopy.geocoders import Nominatim 
!pip install lxml
import lxml
import requests 
from pandas.io.json import json_normalize 


import matplotlib.cm as cm
import matplotlib.colors as colors


from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes 
import folium 


Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/bd/78/56a7c88a57d0d14945472535d0df9fb4bbad7d34ede658ec7961635c790e/lxml-4.6.2-cp36-cp36m-manylinux1_x86_64.whl (5.5MB)
[K     |████████████████████████████████| 5.5MB 4.8MB/s eta 0:00:01     |█████████                       | 1.6MB 4.8MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.6.2


## Webscraping with Pandas

In [8]:
data=pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
data

[    Postal Code           Borough  \
 0           M1A      Not assigned   
 1           M2A      Not assigned   
 2           M3A        North York   
 3           M4A        North York   
 4           M5A  Downtown Toronto   
 5           M6A        North York   
 6           M7A  Downtown Toronto   
 7           M8A      Not assigned   
 8           M9A         Etobicoke   
 9           M1B       Scarborough   
 10          M2B      Not assigned   
 11          M3B        North York   
 12          M4B         East York   
 13          M5B  Downtown Toronto   
 14          M6B        North York   
 15          M7B      Not assigned   
 16          M8B      Not assigned   
 17          M9B         Etobicoke   
 18          M1C       Scarborough   
 19          M2C      Not assigned   
 20          M3C        North York   
 21          M4C         East York   
 22          M5C  Downtown Toronto   
 23          M6C              York   
 24          M7C      Not assigned   
 25         

## Checking Data Type of Table

In [9]:
d1=data[0]

print(type(d1))

<class 'pandas.core.frame.DataFrame'>


In [10]:
d1

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


## Dropping Cells where Borough is NOT assigned

In [11]:
d1.drop(d1[d1['Borough'] == 'Not assigned'].index, inplace=True)
d1

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


## Checking if Postal Codes are Repetitive

In [25]:
d1['Postal Code'].value_counts()    #All are Unique

M4G    1
M4M    1
M1L    1
M1W    1
M1K    1
M8X    1
M4C    1
M6R    1
M4A    1
M5M    1
M5B    1
M4P    1
M5R    1
M1J    1
M6A    1
M2M    1
M5P    1
M5L    1
M8V    1
M5E    1
M1X    1
M4R    1
M5C    1
M6M    1
M3C    1
M5A    1
M1S    1
M9W    1
M4H    1
M9L    1
M4Y    1
M5S    1
M3J    1
M2H    1
M8Y    1
M6L    1
M3H    1
M7A    1
M4K    1
M2R    1
M6S    1
M2K    1
M3N    1
M1M    1
M4W    1
M9R    1
M2N    1
M9C    1
M4E    1
M4X    1
M3B    1
M5V    1
M1P    1
M8Z    1
M8W    1
M3A    1
M4V    1
M1V    1
M1G    1
M9P    1
M2P    1
M1N    1
M9B    1
M6J    1
M5N    1
M1C    1
M5W    1
M6G    1
M3M    1
M6B    1
M6E    1
M1E    1
M1H    1
M5G    1
M5H    1
M4T    1
M6P    1
M6K    1
M4J    1
M5T    1
M9V    1
M1B    1
M9N    1
M2J    1
M4S    1
M1T    1
M7R    1
M4N    1
M3K    1
M5J    1
M4L    1
M1R    1
M7Y    1
M5X    1
M5K    1
M4B    1
M6C    1
M9M    1
M2L    1
M6H    1
M6N    1
M3L    1
M9A    1
Name: Postal Code, dtype: int64

## Resetting Index of DataFrame

In [36]:
d2=d1.reset_index(drop=True)
d2

Unnamed: 0,level_0,index,Postal Code,Borough,Neighbourhood
0,0,2,M3A,North York,Parkwoods
1,1,3,M4A,North York,Victoria Village
2,2,4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,3,5,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,4,6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,5,8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,6,9,M1B,Scarborough,"Malvern, Rouge"
7,7,11,M3B,North York,Don Mills
8,8,12,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,9,13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [37]:
d2=d2.drop(['level_0','index'],axis=1)
d2

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


## Shape of DataFrame

In [38]:
d2.shape

(103, 3)