# Applied Data Science Capstone Project

## Toronto Neighborhoods

### Part 1. Data Loading, Preparation and Wrangling

*Importing all necessary libraries*

In [1]:
import urllib
import urllib.request

from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

*Downloading the page and parsing it with BeautifulSoup*

In [2]:
# Wikipedia page whose table is scraped in the cells below.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [3]:
# Download the raw page. The context manager closes the HTTP connection as
# soon as the body has been read, instead of leaving the response open for
# the lifetime of the kernel. `html_doc` holds the page as bytes.
with urllib.request.urlopen(url) as response:
    html_doc = response.read()

In [4]:
# Parse the downloaded markup with Python's built-in HTML parser.
soup = BeautifulSoup(html_doc, features='html.parser')

*Converting data to a dataframe*

In [5]:
# Take the first <table> element of the parsed page and all of its <tr> rows.
table = soup.find('table')
table_rows = table.find_all('tr')

# Collect the stripped text of every <td> cell, one list per table row.
# Rows that contain no <td> cells at all (presumably header rows built from
# <th> cells - confirm against the page) produce an empty list and are skipped.
rows = []
for tr in table_rows:
    cells = [td.text.strip() for td in tr.find_all('td')]
    if cells:
        rows.append(cells)

# Every collected row is expected to have exactly three cells.
neigh = pd.DataFrame(rows, columns=['Postcode', 'Borough', 'Neighborhood'])
neigh.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


*Removing rows not assigned to any borough*

In [6]:
# Placeholder values that mark a postcode without a borough.
dirty = ['Not assigned']

# Keep only rows whose Borough is not a placeholder. The explicit .copy()
# makes the result an independent frame, so modifying its columns later
# does not raise SettingWithCopyWarning.
neigh = neigh[~neigh['Borough'].isin(dirty)].copy()
neigh.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


*Replacing 'Not assigned' neighborhoods with borough names*

In [7]:
# Keep each Neighborhood value unless it is the placeholder 'Not assigned';
# in that case use the row's Borough instead. The result is assigned back
# explicitly (rather than mutating the selected column in place), so the
# update reaches `neigh` regardless of pandas' view/copy behaviour.
neigh = neigh.assign(
    Neighborhood=neigh['Neighborhood'].where(
        neigh['Neighborhood'] != 'Not assigned', neigh['Borough']
    )
)
neigh.shape

(211, 3)

*Grouping rows by postcode and borough, and combining each group's neighborhoods into a single comma-separated string*

In [8]:
# For every (Postcode, Borough) pair, join the neighborhoods of the group
# into one ', '-separated string and broadcast it to each row of the group.
combined = neigh.groupby(['Postcode', 'Borough'])['Neighborhood'].transform(', '.join)

# Rows of the same group are now identical, so dropping duplicates leaves one
# row per group while keeping the original row order and index labels.
# (Collecting the neighborhoods into Python lists was tried as an alternative
# but judged more trouble than it is worth for this task.)
neigh = neigh.assign(Neighborhood=combined).drop_duplicates()
neigh.head()

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Harbourfront, Regent Park"
6,M6A,North York,"Lawrence Heights, Lawrence Manor"
8,M7A,Queen's Park,Queen's Park


In [9]:
# Display the (rows, columns) dimensions of `neigh` after the grouping step.
neigh.shape

(103, 3)