# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto #

#### Author : Ratnesh Mehrotra #####

### Part 1: Scraping Wikipedia to build a DataFrame with Postal Code, Borough and Neighbourhood columns ###

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Read the Wikipedia page
page = requests.get(" https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

In [5]:
# Use of Beautiful Soup HTML parser
soup = BeautifulSoup(page.content, 'html.parser')
table_body = soup.find_all('table', class_='wikitable sortable')
rows = table_body[0].find_all('tr')

all_rows = []
for row in rows:
    cols=row.find_all('td')
    if (len(cols) != 0):
        cols=[x.text.strip() for x in cols]
        all_rows.append(cols)

df=pd.DataFrame(all_rows,columns=['Postcode','Borough','Neighbourhood'])
df.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [10]:
# Get names of indexes for which Borough == "Not assigned", and remove those rows
indexNames = df[ df['Borough'] == 'Not assigned' ].index
 
# Delete these row indexes from dataFrame
df.drop(indexNames , inplace=True)
df = df.reset_index(drop=True)
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [11]:
# replace "Not assigned" for Neighbourhood with the Borough name
col_names =  ['Postcode', 'Borough', 'Neighbourhood']
df_new = pd.DataFrame(columns = col_names)
for index, row in df.iterrows():
    if row[2] == "Not assigned":
        #df_new = df_new.append(row)
        df_new.loc[index] = [row[0], row[1], row[1]]
    else:
        df_new.loc[index] = [row[0], row[1], row[2]]


df_new = df_new.reset_index(drop=True)
df_new.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [12]:
# concatenate neighbourhood belonging to same postal code 
# Iterate over rows in dataframe, and build a dictionary to check for Neighbourhood with same postal code
# Key in dictionary is Postal Code. Values are tuples with PostalCode, Borough, and Neighbour or string of neighbourhood

col_names =  ['Postcode', 'Borough', 'Neighbourhood']
temp_dict = {}

for index, row in df.iterrows():
    if row[0] in temp_dict:
        temp_dict[row[0]] = ( row[0], row[1], temp_dict.get(row[0],"")[2] + " , "+ row[2] )
    else: 
        temp_dict[row[0]] = ( row[0], row[1], row[2] )

temp_list = list(temp_dict.values())
df_groupedby = pd.DataFrame.from_records(temp_list, columns=col_names)
df_groupedby.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M8V,Etobicoke,"Humber Bay Shores , Mimico South , New Toronto"
1,M4R,Central Toronto,North Toronto West
2,M4V,Central Toronto,"Deer Park , Forest Hill SE , Rathnelly , South..."
3,M6H,West Toronto,"Dovercourt Village , Dufferin"
4,M1H,Scarborough,Cedarbrae
5,M4H,East York,Thorncliffe Park
6,M6C,York,Humewood-Cedarvale
7,M2H,North York,Hillcrest Village
8,M3M,North York,Downsview Central
9,M1C,Scarborough,"Highland Creek , Rouge Hill , Port Union"


#### Printing the number of rows in final dataframe. ####

In [13]:
df_groupedby.shape

(103, 3)