## Week 3 Assignment

### Importing the necessary packages

In [31]:
import requests
import json

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors

!pip install beautifulsoup4
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np

print('Libraries imported.')

Libraries imported.


## 1. Scraping the data

In [33]:
# Download the raw HTML of the Wikipedia page that lists the "M" postal codes.
# A timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops here on an HTTP error instead of letting an error page be parsed later.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
return_text = response.text


## 2. Find and extract the data from the relevant table on the Wikipedia page

In [46]:
# Parse the downloaded HTML. The parser is named explicitly so the result does
# not depend on which optional parsers happen to be installed (and so that
# BeautifulSoup does not warn about having to guess one).
soup = BeautifulSoup(return_text, 'html.parser')

# Locate the table whose class attribute is exactly 'wikitable sortable'.
table = soup.find('table', {'class': 'wikitable sortable'})

# find() returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError in a later cell.
if table is None:
    raise ValueError("No table with class 'wikitable sortable' was found in the page")

## 3. Create a data frame

In [36]:
# Build one list of cell texts per <tr> of the table, reading only <td> cells.
# A row that has no <td> cells contributes an empty list, which shows up as an
# all-empty row in the dataframe; it is removed in the clean-up step.
postalcode_rows = [
    [cell.text for cell in table_row.find_all('td')]
    for table_row in table.find_all('tr')
]

df = pd.DataFrame(
    postalcode_rows,
    columns=['PostalCode', 'Borough', 'Neighbourhood'],
)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,,,
1,M1A,Not assigned,Not assigned\n
2,M2A,Not assigned,Not assigned\n
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n
5,M5A,Downtown Toronto,Harbourfront\n
6,M5A,Downtown Toronto,Regent Park\n
7,M6A,North York,Lawrence Heights\n
8,M6A,North York,Lawrence Manor\n
9,M7A,Queen's Park,Not assigned\n


## 4. Clean up the data: remove '\n' from the Neighbourhood values, drop the empty first row, and drop rows whose Borough is 'Not assigned'

In [38]:
# Remove the newline characters left in the Neighbourhood column.
df['Neighbourhood'] = df['Neighbourhood'].replace('\n', '', regex=True)

# Drop the all-empty row produced by the table row that has no <td> cells.
# The row is selected by content rather than by position (df.index[0]) so the
# cell is idempotent: running it again no longer deletes a real data row.
df = df.dropna(how='all')

# Remove the rows whose Borough is 'Not assigned'. The explicit copy makes df
# an independent frame, so later .loc assignments do not raise
# SettingWithCopyWarning.
df = df[df.Borough != 'Not assigned'].copy()

df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor


## 5. For Neighbourhoods that contain 'Not assigned', assign the borough name

In [39]:
# Rows whose Neighbourhood is the placeholder 'Not assigned' take the value of
# their Borough instead. The right-hand side is aligned on the index, so each
# selected row receives its own Borough.
unnamed = df['Neighbourhood'].eq('Not assigned')
df.loc[unnamed, 'Neighbourhood'] = df['Borough']
df.head(15)

Unnamed: 0,PostalCode,Borough,Neighbourhood
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Queen's Park
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern
15,M3B,North York,Don Mills North


## 6. Group the data by PostalCode and Borough, joining the Neighbourhood names with commas

In [40]:
# Collapse all rows that share a PostalCode and Borough into a single row; the
# values of the remaining column are concatenated with ", " as the separator.
by_postal_code = df.groupby(["PostalCode", "Borough"], as_index=False)
df = by_postal_code.agg(", ".join)
df.head(15)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [41]:
# Display the dimensions of df as a (rows, columns) tuple.
df.shape

(102, 3)

## Part 2 - Latitude and Longitude for each Neighbourhood

In [42]:
# Load the coordinates CSV into its own dataframe.
url = 'https://cocl.us/Geospatial_data'
coord_columns = ['PostalCode', 'Latitude', 'Longitude']

df_coord = pd.read_csv(url)

# Relabel the columns positionally so the key column carries the same name
# ('PostalCode') as in the main dataframe.
df_coord.columns = coord_columns
df_coord.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [44]:
# Attach Latitude and Longitude to the main dataframe by matching PostalCode.
# The inner join keeps only postal codes that are present in both frames.
df_loc = df.merge(df_coord, how='inner', on='PostalCode')
df_loc.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [45]:
# Display the dimensions of the merged dataframe as a (rows, columns) tuple.
df_loc.shape

(102, 5)