# PART 1

## First import necessary libraries

In [27]:
# Array and table handling.
import numpy as np
import pandas as pd
# Passing None lifts pandas' display limits, so frames render every column and every row.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# JSON handling, geocoding, HTTP download and HTML parsing.
import json
from geopy.geocoders import Nominatim
import requests
from bs4 import BeautifulSoup
# NOTE(review): newer pandas releases expose this helper as pd.json_normalize;
# confirm this import path still works with the installed pandas version.
from pandas.io.json import json_normalize
# Color maps, k-means clustering and map rendering.
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

# Simple marker that the import cell ran to completion.
print("Done importing libraries.")

Done importing libraries.


## Full process for scraping the Wikipedia page into a dataframe

#### 1. scrape the data and make a BeautifulSoup object

In [28]:
# Download the Wikipedia page that lists the "M" postal codes.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting an
# error page be parsed as if it were the real table.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
webdata = response.text

# Parse the downloaded HTML so the table can be searched by tag name.
soupobj = BeautifulSoup(webdata, 'html.parser')

#### list placeholders for postal code, borough names and neighborhoods

In [29]:
# Three independent, empty accumulators: postal codes, borough names and
# neighborhood names, one entry per scraped table row.
postcode, borough, neighborhood = [], [], []

#### 2. look for table and get all data within cells using a for loop

In [30]:
# Look up the <tr> rows of the first table on the page once and reuse them.
# This cell is only a dry run of the traversal: nothing is collected, and
# after the loop `cells` holds the <td> elements of the last row visited.
table_rows = soupobj.find('table').find_all('tr')
for i in table_rows:
    cells = i.find_all('td')

#### 3. fill the lists with respective data by looping

In [31]:
# Start from empty lists on every run so that re-executing this cell does
# not append the same table rows a second time.
postcode = []
borough = []
neighborhood = []

for j in soupobj.find('table').find_all('tr'):
    cells = j.find_all('td')
    # Rows without <td> cells carry no data, and a row with fewer than three
    # cells cannot be split into code / borough / neighborhood, so skip both
    # rather than fail with an IndexError.
    if len(cells) < 3:
        continue
    # Strip surrounding whitespace (including the trailing newline) from every
    # field so later exact comparisons such as == "Not assigned" are reliable.
    postcode.append(cells[0].text.strip())
    borough.append(cells[1].text.strip())
    neighborhood.append(cells[2].text.strip())

#### 4. make a dataframe using filled lists

In [32]:
# Turn the three parallel lists into one table, one named column per list.
toronto1 = pd.DataFrame(
    data=dict(
        PostalCode=postcode,
        BoroughName=borough,
        NeighborhoodName=neighborhood,
    )
)
toronto1.head()

Unnamed: 0,PostalCode,BoroughName,NeighborhoodName
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### 5. drop rows with "Not assigned" boroughs

In [33]:
# Keep only the rows whose borough is not the "Not assigned" placeholder,
# then renumber the surviving rows from 0.
toronto2 = (
    toronto1
    .loc[lambda frame: frame["BoroughName"].ne("Not assigned")]
    .reset_index(drop=True)
)
toronto2.head()

Unnamed: 0,PostalCode,BoroughName,NeighborhoodName
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


#### 6. combine neighborhoods that share the same postal code (and borough) into one comma-separated row

In [34]:
# Collapse rows that share a postal code and borough into a single row whose
# remaining column holds the original values joined with ", ".
toronto3 = (
    toronto2
    .groupby(["PostalCode", "BoroughName"], as_index=False)
    .agg(", ".join)
)
toronto3.head()

Unnamed: 0,PostalCode,BoroughName,NeighborhoodName
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### 7. if a cell has a borough but a "Not assigned" neighborhood, then the neighborhood will be the same as the borough

In [35]:
# Rows whose neighborhood is still the "Not assigned" placeholder take their
# borough name instead. The write goes through a boolean mask with .loc so it
# lands in toronto3 itself; assigning to the row objects yielded by
# iterrows() is not guaranteed to reach the frame, because those rows can be
# copies.
unnamed = toronto3["NeighborhoodName"] == "Not assigned"
toronto3.loc[unnamed, "NeighborhoodName"] = toronto3.loc[unnamed, "BoroughName"]

toronto3.head()

Unnamed: 0,PostalCode,BoroughName,NeighborhoodName
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### 8. (optional) test output accuracy by comparing with question example

In [36]:
test_col = ["PostalCode", "BoroughName", "NeighborhoodName"]

test_pc = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# Look up each code separately so the rows come out in test_pc order, then
# stitch the pieces together with a single concat. DataFrame.append is
# deprecated and removed in pandas 2.0, and growing a frame inside a loop
# re-copies it on every iteration.
matches = [toronto3[toronto3["PostalCode"] == code] for code in test_pc]
test_df = pd.concat(matches, ignore_index=True)[test_col]

test_df

Unnamed: 0,PostalCode,BoroughName,NeighborhoodName
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Woodbine Gardens, Parkview Hill"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Maryvale, Wexford"
7,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo..."


#### 9. after visually checking the test output above, keep the result as toronto_clean and check its shape

In [37]:
# Take an independent copy rather than a second name for the same object, so
# later in-place edits to toronto_clean cannot silently change toronto3 (and
# with it the results of re-running earlier cells).
toronto_clean = toronto3.copy()
# (rows, columns) of the cleaned table.
toronto_clean.shape

(103, 3)