# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto #

#### Author : Ratnesh Mehrotra #####

### Part 2: Scraping Wikipedia to build a DataFrame with Postal Code, Borough and Neighbourhood columns and adding Latitude and Longitude columns to the dataframe ###

In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [3]:
# Read the Wikipedia page
page = requests.get(" https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

In [4]:
# Use of Beautiful Soup HTML parser
soup = BeautifulSoup(page.content, 'html.parser')
table_body = soup.find_all('table', class_='wikitable sortable')
rows = table_body[0].find_all('tr')

all_rows = []
for row in rows:
    cols=row.find_all('td')
    if (len(cols) != 0):
        cols=[x.text.strip() for x in cols]
        all_rows.append(cols)

df=pd.DataFrame(all_rows,columns=['Postcode','Borough','Neighbourhood'])
df.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [5]:
# Get names of indexes for which Borough == "Not assigned", and remove those rows
indexNames = df[ df['Borough'] == 'Not assigned' ].index
 
# Delete these row indexes from dataFrame
df.drop(indexNames , inplace=True)
df = df.reset_index(drop=True)
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [6]:
# replace "Not assigned" for Neighbourhood with the Borough name
col_names =  ['Postcode', 'Borough', 'Neighbourhood']
df_new = pd.DataFrame(columns = col_names)
for index, row in df.iterrows():
    if row[2] == "Not assigned":
        #df_new = df_new.append(row)
        df_new.loc[index] = [row[0], row[1], row[1]]
    else:
        df_new.loc[index] = [row[0], row[1], row[2]]


df_new = df_new.reset_index(drop=True)
df_new.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [7]:
# concatenate neighbourhood belonging to same postal code 
# Iterate over rows in dataframe, and build a dictionary to check for Neighbourhood with same postal code
# Key in dictionary is Postal Code. Values are tuples with PostalCode, Borough, and Neighbour or string of neighbourhood

col_names =  ['Postcode', 'Borough', 'Neighbourhood']
temp_dict = {}

for index, row in df.iterrows():
    if row[0] in temp_dict:
        temp_dict[row[0]] = ( row[0], row[1], temp_dict.get(row[0],"")[2] + " , "+ row[2] )
    else: 
        temp_dict[row[0]] = ( row[0], row[1], row[2] )

temp_list = list(temp_dict.values())
df_groupedby = pd.DataFrame.from_records(temp_list, columns=col_names)
df_groupedby.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M4J,East York,East Toronto
1,M9P,Etobicoke,Westmount
2,M4B,East York,"Woodbine Gardens , Parkview Hill"
3,M4Y,Downtown Toronto,Church and Wellesley
4,M2N,North York,Willowdale South
5,M4L,East Toronto,"The Beaches West , India Bazaar"
6,M4T,Central Toronto,"Moore Park , Summerhill East"
7,M1T,Scarborough,"Clarks Corners , Sullivan , Tam O'Shanter"
8,M1G,Scarborough,Woburn
9,M4E,East Toronto,The Beaches


#### Printing the number of rows in final dataframe. ####

In [8]:
df_groupedby.shape

(103, 3)

### Section 2: Adding latitude & Longitude information ###
##### Note: Using the provided CSV file as Google service failed to work hence using the CSV  #####

In [9]:
# read the lat long CSV file
df_latlong = pd.read_csv("http://cocl.us/Geospatial_data/Geospatial_Coordinates.csv")
df_latlong = df_latlong.rename(columns={'Postal Code': 'Postcode'})
df_groupedby_latlong = pd.merge(df_groupedby, df_latlong,on='Postcode')

In [10]:
df_groupedby_latlong.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4J,East York,East Toronto,43.685347,-79.338106
1,M9P,Etobicoke,Westmount,43.696319,-79.532242
2,M4B,East York,"Woodbine Gardens , Parkview Hill",43.706397,-79.309937
3,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
4,M2N,North York,Willowdale South,43.77012,-79.408493
5,M4L,East Toronto,"The Beaches West , India Bazaar",43.668999,-79.315572
6,M4T,Central Toronto,"Moore Park , Summerhill East",43.689574,-79.38316
7,M1T,Scarborough,"Clarks Corners , Sullivan , Tam O'Shanter",43.781638,-79.304302
8,M1G,Scarborough,Woburn,43.770992,-79.216917
9,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [11]:
df_groupedby.shape

(103, 3)