# Week Three Assignment - Part 1 #

## Imports ##

In [1]:
import urllib
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup

## Load the wiki page as raw bytes ##

In [2]:
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# Open the response as a context manager so the underlying HTTP connection is
# closed as soon as the body has been read, instead of lingering in the kernel.
with urllib.request.urlopen(wiki_url) as wiki_file:
    wiki_raw = wiki_file.read()  # the page body as raw bytes
# Peek at the first 200 bytes to confirm an HTML document came back.
wiki_raw[0:200]

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>List of postal codes of Canada: M - Wikipedia</title>\n<script>document.documentElement.className = '

## Extract the table ##

In [3]:
# Parse the downloaded page into a navigable tree using the lxml parser.
soup = BeautifulSoup(markup=wiki_raw, features="lxml")

In [4]:
# Walk <body> -> first <table> -> its <tbody>; each step takes the first match.
table_rows = soup.find("body").find("table").find("tbody")

## Convert the table to a 2d list structure ##

In [5]:
# Build one list of stripped cell texts per table row.
# The slice [1:] drops the first <tr>, which is the header row.
rows = [
    [cell.get_text().strip() for cell in tr.find_all('td')]
    for tr in table_rows.find_all('tr')[1:]
]
rows[0:5]

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront']]

## Clean the data ##

In [6]:
# Collapse the scraped rows into one record per postal code:
#   * rows whose borough is 'Not assigned' are dropped;
#   * a 'Not assigned' neighborhood falls back to the borough name;
#   * neighborhoods that share a postal code are merged into one
#     ', '-separated string.
pc_dict = {}
for row in rows:
    postal_code, borough = row[0], row[1]
    if borough == 'Not assigned':
        continue
    # Resolve the fallback into a local name instead of writing it back into
    # `row`, so `rows` stays exactly as scraped by the previous cell.
    neighborhood = borough if row[2] == 'Not assigned' else row[2]
    if postal_code in pc_dict:
        pc_dict[postal_code][1] += ', ' + neighborhood
    else:
        pc_dict[postal_code] = [borough, neighborhood]

# Flatten to [postal_code, borough, neighborhoods] records, ordered by postal code.
cleaned_data = sorted(
    [postal_code, borough, neighborhoods]
    for postal_code, (borough, neighborhoods) in pc_dict.items()
)
cleaned_data[0:5]

[['M1B', 'Scarborough', 'Rouge, Malvern'],
 ['M1C', 'Scarborough', 'Highland Creek, Rouge Hill, Port Union'],
 ['M1E', 'Scarborough', 'Guildwood, Morningside, West Hill'],
 ['M1G', 'Scarborough', 'Woburn'],
 ['M1H', 'Scarborough', 'Cedarbrae']]

## Convert the data to a Pandas DataFrame ##

In [7]:
# Name the columns at construction time rather than patching `.columns`
# afterwards; this also yields a well-formed (empty) frame if no rows
# survived cleaning, instead of a length-mismatch error.
pc_data = pd.DataFrame(cleaned_data, columns=['PostalCode', 'Borough', 'Neighborhood'])
pc_data.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


## The data's shape ##

In [8]:
# Report (row count, column count) of the cleaned postal-code frame.
pc_data.shape

(103, 3)

# Week Three Assignment - Part 2 #

## Load the latitudes and longitudes file using Pandas ##

In [9]:
# Download the latitude/longitude CSV straight into a DataFrame.
geo_url = "https://cocl.us/Geospatial_data"
lats_and_longs = pd.read_csv(geo_url)

In [10]:
# Overwrite the CSV's header with the names used in this notebook so that the
# key column is called 'PostalCode', matching `pc_data`.
# NOTE(review): this is a positional rename - it assumes the file's columns
# arrive in (postal code, latitude, longitude) order; confirm against the CSV.
lats_and_longs.columns = ['PostalCode', 'Latitude', 'Longitude']
lats_and_longs.head(20)

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


## Join the two dataframes on 'PostalCode' ##

In [11]:
# Index the coordinates by postal code, then left-join them onto the cleaned
# frame so every row of `pc_data` is kept.
coords_by_code = lats_and_longs.set_index('PostalCode')
to_data = pc_data.join(coords_by_code, on='PostalCode', how='left')
to_data.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [12]:
# Report (row count, column count) of the joined frame.
to_data.shape

(103, 5)