# Segmenting and Clustering Neighborhoods in Toronto

For this assignment, you will be required to explore and cluster the neighborhoods in Toronto.

#### Loading common libraries

In [1]:
import numpy as np 
import pandas as pd 

#### Loading Module BeautifulSoup

In [2]:
from bs4 import BeautifulSoup
import requests

#### Loading Wiki List of postal codes of Canada: M 

In [3]:
# Download the Wikipedia page listing Toronto ("M") postal codes.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as if it were data.
html = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
html.raise_for_status()

#### Parse the HTML and build a list of rows from the HTML table

In [4]:
# Parse the downloaded page text using the 'html.parser' backend
soup = BeautifulSoup(html.text, 'html.parser')
# find() returns only the first <table> element in the document
tablesoup=soup.find("table")

def get_string(ele):
    """Return the text of a table cell with newline characters removed.

    If the cell contains a link, the text of the first ``<a>`` tag is used;
    otherwise the cell's own text is used.

    Parameters
    ----------
    ele : bs4 Tag (or any object offering ``find``, ``string`` and ``get_text``)
        The cell to read.

    Returns
    -------
    str
        The selected text with every ``'\\n'`` removed.
    """
    link = ele.find("a")
    target = link if link is not None else ele
    text = target.string
    if text is None:
        # .string is None when the tag has several children (e.g. text mixed
        # with nested tags); fall back to the concatenated text instead of
        # raising AttributeError on None.replace().
        text = target.get_text()
    return text.replace('\n', '')
    
# Collect the text of the first three cells of every table row.
# Rows with fewer than three <td> cells (for example a row that has no <td> at
# all) are skipped, so indexing can never raise IndexError.
my_list = []
for row_td in tablesoup.find_all("tr"):
    row = row_td.find_all("td")
    if len(row) >= 3:
        my_list.append([get_string(cell) for cell in row[:3]])

#### Creating dataframe with above table

In [5]:
# Build the raw (not yet cleaned) dataframe from the scraped rows.
# pd.DataFrame assigns a default 0..n-1 index, so no reset_index() is required.
df = pd.DataFrame(my_list, columns=['PostalCode', 'Borough', 'Neighborhood'])
df.shape

(287, 3)

#### Removing rows where Borough = "Not assigned"

In [6]:
# Cleaning: keep only the rows that have a real borough.
# The explicit .copy() makes df an independent frame, so assigning to its
# columns afterwards does not raise SettingWithCopyWarning.
df = df[df['Borough'] != 'Not assigned'].copy()

df.shape

(210, 3)

#### In this step we will concatenate the Neighborhood values with ',' for rows that share the same PostalCode, and then remove the duplicate PostalCode rows
Using a lambda function with lookup

In [7]:
def concat_str(narray):
    """Join the values of ``narray`` into a single comma-separated string.

    The values are wrapped in a pandas Series and combined with
    ``Series.str.cat`` using ``','`` as the separator.
    """
    return pd.Series(narray).str.cat(sep=',')
    
# For every PostalCode, replace Neighborhood with the comma-joined unique
# neighborhoods of that code (a code with one row keeps its single name), then
# keep only the first row per PostalCode.
# groupby/transform computes each group once instead of re-filtering the whole
# frame for every row, and avoids positional `[0]` indexing on a labelled Series.
df = (
    df.assign(
        Neighborhood=df.groupby('PostalCode')['Neighborhood'].transform(
            lambda names: concat_str(names.unique())
        )
    )
    .drop_duplicates(subset='PostalCode', keep='first')
)

df.shape

(103, 3)

#### In this step, for each "Not assigned" Neighborhood we will assign the Borough value.
I used a lambda function in order to achieve it.

In [8]:
# Where Neighborhood is 'Not assigned', use the row's Borough instead.
# Series.mask does the replacement column-wise rather than calling a Python
# function once per row.
df = df.assign(
    Neighborhood=lambda frame: frame['Neighborhood'].mask(
        frame['Neighborhood'] == 'Not assigned', frame['Borough']
    )
)

df.shape


(103, 3)

#### Displaying shape

In [9]:
# Show the (rows, columns) tuple of df as the cell output
df.shape

(103, 3)

#### Displaying full dataframe


In [10]:
# Renumber the rows 0..n-1 (earlier filtering left gaps in the index).
# Rebinding df is used instead of inplace=True, which mutates the frame in place.
df = df.reset_index(drop=True)
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Downtown Toronto,Queen's Park
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"
