# Peer-Graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

Import necessary libraries for the assignment.

In [3]:
import pandas as pd # library to process data as dataframes
import urllib.request # library to import data from url
from bs4 import BeautifulSoup # library for working with HTML / XML

print('Libraries imported.')

Libraries imported.


## Assignment 1.1

### Scrape Wikipedia

In [4]:
# Specify which URL/web page we are going to be scraping
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [5]:
# Open the url using urllib.request and put the HTML into the page variable
page = urllib.request.urlopen(url)

In [6]:
# Use the BeautifulSoup library so we can parse HTML and XML documents
soup = BeautifulSoup(page, "lxml")

In [7]:
# Parse out everything contained in the table of "wikitable sortable" class
table=soup.find('table', class_='wikitable sortable')

In [8]:
# Check the type of the parsed table to ensure that Beautiful Soup methods can be used
type(table)

bs4.element.Tag

In [9]:
# Loop through rows of the table to parse out the three columns of data
A=[]
B=[]
C=[]

for row in table.findAll('tr'):
    cells=row.findAll('td')
    if len(cells)==3:
        A.append(cells[0].find(text=True))
        B.append(cells[1].find(text=True))
        C.append(cells[2].find(text=True))

# Remove unwanted delimiters from the lists
A = list([a.replace('\n', '') for a in A]) # remove all '\n' 
B = list([b.replace('\n', '') for b in B]) # remove all '\n' 
C = list([c.replace('\n', '') for c in C]) # remove all '\n' 

In [10]:
# Create a dataframe containing the three columns of data
df=pd.DataFrame(A, columns=['PostalCode'])
df['Borough']=B
df['Neighborhood']=C
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [11]:
# Delete rows containing 'Not assigned' boroughs
df.drop(df[df['Borough'] == 'Not assigned' ].index , inplace=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [12]:
# Assign borough name where neighborhood name is 'Not assigned'
df['Neighborhood'][df['Neighborhood'] == 'Not assigned'] = df['Borough']

# Reset the index
df.reset_index(inplace=True, drop=True)

In [13]:
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [14]:
df.shape

(103, 3)

## Assignment 1.2

In [15]:
df.to_csv('df_assignment_1_1.csv')

In [16]:
import geocoder # import geocoder

# initialize your variable to None
lat_lng_coords = None

# loop until you get the coordinates
while(lat_lng_coords is None):
  g = geocoder.google('M5G, Toronto, Ontario')
  lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

KeyboardInterrupt: 

In [20]:
# Load dataframe with coordinates as geocoder is not working
df_coords = pd.read_csv('Geospatial_Coordinates.csv')
df_coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [22]:
df_test = pd.merge(df, df_coords, how='inner', left_on='PostalCode', right_on='Postal Code',
                   sort=True, copy=True, indicator=False).drop('Postal Code', axis = 1)
df_test.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
