# Clustering Neighbourhoods in Toronto
### Part 1

In [1]:
from urllib.request import urlopen as uReq
from  bs4 import BeautifulSoup as soup
import json 
from geopy.geocoders import Nominatim 
import requests 
from pandas.io.json import json_normalize 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
from sklearn import metrics
import folium 
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist

# Reaching this line means every import above resolved without error.
print('Libraries imported.')

Libraries imported.


# ***Web Scraping Wikipedia***

In [2]:
# The oldid query parameter requests one specific revision of the article, so
# the table parsed in the following cells does not shift when the live page is
# edited.
my_url = 'https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=1011037969'

# Download the raw HTML bytes. The context manager closes the connection even
# if read() raises, which a separate close() call after read() cannot guarantee.
with uReq(my_url) as uClient:
    page_html = uClient.read()

In [3]:
# Parse the downloaded bytes with the "html.parser" backend.
page_soup = soup(page_html, features="html.parser")
# Show the first <h1> element to confirm the expected page was fetched.
page_soup.find("h1")

<h1 class="firstHeading" id="firstHeading">List of postal codes of Canada: M</h1>

In [4]:
# Locate the postal-code table and collect its <td> cells in document order.
myTable = page_soup.find("table", {"class": "wikitable sortable"})
if myTable is None:
    # find() returns None when nothing matches; fail here with a clear message
    # instead of an AttributeError on the lookup below.
    raise ValueError("No 'wikitable sortable' table found in the downloaded page")
# find_all is the current spelling of the legacy findAll alias.
# Note: despite its name, myTable_row holds individual cells, not rows.
myTable_row = myTable.find_all('td')
# Preview the first few cells to see how the data is laid out.
myTable_row[0:7]

[<td>M1A
 </td>,
 <td>Not assigned
 </td>,
 <td>Not assigned
 </td>,
 <td>M2A
 </td>,
 <td>Not assigned
 </td>,
 <td>Not assigned
 </td>,
 <td>M3A
 </td>]

In [5]:
# The cells repeat in groups of three (postal code, borough, neighborhood), so
# every third cell starting at index 0 is a postal code.
# get_text() also handles cells that contain nested tags (e.g. links), where
# "".join(cell) would raise a TypeError; newlines inside the cell are removed.
post_code = [cell.get_text().replace("\n", "") for cell in myTable_row[0::3]]

post_code[0:4]

['M1A', 'M2A', 'M3A', 'M4A']

In [6]:
# Every third cell starting at index 1 holds the borough.
# get_text() also handles cells that contain nested tags (e.g. links), where
# "".join(cell) would raise a TypeError; newlines inside the cell are removed.
borough = [cell.get_text().replace("\n", "") for cell in myTable_row[1::3]]
print(borough[:5])

['Not assigned', 'Not assigned', 'North York', 'North York', 'Downtown Toronto']


In [7]:
# Every third cell starting at index 2 holds the neighborhood name(s).
# get_text() also handles cells that contain nested tags (e.g. links), where
# "".join(cell) would raise a TypeError; newlines inside the cell are removed.
neig = [cell.get_text().replace("\n", "") for cell in myTable_row[2::3]]
print(neig[:5])

['Not assigned', 'Not assigned', 'Parkwoods', 'Victoria Village', 'Regent Park, Harbourfront']


In [8]:
# Assemble the three parallel lists into one table, one column per list.
df = pd.DataFrame(
    data={
        "Postal Code": post_code,
        "Borough": borough,
        "Neighborhood": neig,
    }
)

df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [9]:
# Turn the "Not assigned" placeholder into NaN so pandas treats it as missing.
# The result is assigned back to df instead of using inplace=True, which
# pandas discourages.
df = df.replace("Not assigned", np.nan)

df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [10]:
# Rows whose Borough and Neighborhood are both missing are dropped a few cells
# further down. "Not assigned" was already replaced with NaN in the previous
# cell, so repeating that replacement here would be a no-op; this cell only
# previews the table before the missing values are counted.
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [11]:
# Flag every missing entry, then report the missing/present counts per column.
missing_values = df.isna()

for column in missing_values.columns:
    # One name line, the True/False counts, then a blank separator line.
    print(column, missing_values[column].value_counts(), "", sep="\n")

Postal Code
False    180
Name: Postal Code, dtype: int64

Borough
False    103
True      77
Name: Borough, dtype: int64

Neighborhood
False    103
True      77
Name: Neighborhood, dtype: int64



In [12]:
# Discard postal codes that have no borough. The result is assigned back to df
# instead of using inplace=True; axis=0 (rows) is already dropna's default.
df = df.dropna(subset=["Borough"])

In [13]:
# Renumber the rows 0..n-1 and discard the old index labels (drop=True).
# The result is assigned back to df instead of using inplace=True.
df = df.reset_index(drop=True)

In [14]:
# Order the table by postal code. The result is assigned back to df instead of
# using inplace=True.
df = df.sort_values(by=["Postal Code"])

In [15]:
# Sorting keeps the old index labels, so renumber the rows to match the new
# order. The result is assigned back to df instead of using inplace=True.
df = df.reset_index(drop=True)
df.head(12)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [16]:
# Save the cleaned table to CSV; index=False leaves the row numbers out of the file.
df.to_csv(path_or_buf="postalcodeTable.csv", index=False)

In [17]:
# (rows, columns) of the cleaned table.
df.shape

(103, 3)