# Neighborhoods in Toronto

## 1. Scrape the website using the BeautifulSoup package

In [13]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [7]:
# Download the Wikipedia list of "M" postal codes and parse the HTML.
# The timeout keeps the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() turns an HTTP error response into an
# exception instead of letting an error page be parsed as the article.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
url = response.text  # NOTE: despite its name, `url` holds the page's HTML text
soup = BeautifulSoup(url, 'lxml')

In [8]:
# Locate the postal-code table by its CSS classes and collect all of its
# <td> cells in document order.
my_table = soup.find('table', {'class': 'wikitable sortable'})
if my_table is None:
    # Fail with a clear message if the page layout changed, rather than an
    # AttributeError on None in the next statement.
    raise ValueError("No table with class 'wikitable sortable' was found on the page")
# find_all is the current Beautiful Soup 4 name; findAll is a legacy alias.
contents = my_table.find_all('td')

## 2. Transform the data into a pandas dataframe

In [9]:
# Number of <td> cells collected; the following step divides this by 3
# (one PostCode, Borough and Neighbourhood cell per table row).
len(contents)

864

In [10]:
# Three cells make up one table row, so the row count is a third of the cells.
row_num_total = len(contents) // 3

In [11]:
# Pull the text out of every collected table cell, keeping the original order
t = [cell.get_text() for cell in contents]

In [14]:
# Arrange the flat list of cell texts into rows of three and label the columns
df = pd.DataFrame(
    np.reshape(t, (row_num_total, 3)),
    columns=['PostCode', 'Borough', 'Neighbourhood'],
)

In [15]:
# Preview the raw frame; Neighbourhood values still end with a newline here.
df.head()

Unnamed: 0,PostCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


In [17]:
# Remove the newline character that the HTML cells leave at the end of each
# Neighbourhood. The pattern is a real newline ('\n', not the raw string r'\n')
# and regex=False is passed explicitly: the default of `regex` differs between
# pandas versions, and with literal matching r'\n' would mean a backslash
# followed by "n" and silently match nothing.
df['Neighbourhood'] = df['Neighbourhood'].str.replace('\n', '', regex=False)

In [18]:
# Check that the Neighbourhood values no longer end with a newline.
df.head()

Unnamed: 0,PostCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## 3. Ignore cells with a borough that is Not assigned

In [20]:
# Keep only the rows whose Borough is something other than 'Not assigned'
df1 = df.loc[df['Borough'].ne('Not assigned')]

In [21]:
# Preview the filtered frame; the original index labels are kept, so the
# labels of the dropped rows are missing.
df1.head()

Unnamed: 0,PostCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


## 4. Combine neighborhoods that share one postal code area

In [25]:
# Collapse to one row per (PostCode, Borough) pair, joining the remaining
# text column with ', '. sort=False keeps groups in first-appearance order;
# reset_index() turns the group keys back into ordinary columns.
df2 = (
    df1.groupby(['PostCode', 'Borough'], sort=False)
    .agg(', '.join)
    .reset_index()
)

In [26]:
# Preview the grouped frame: neighbourhoods sharing a postal code are now
# joined into one comma-separated string.
df2.head()

Unnamed: 0,PostCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned


## 5. If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. 

In [27]:
# Where the Neighbourhood is exactly 'Not assigned', take the Borough value of
# the same row instead. A boolean mask is used rather than
# Series.replace(..., <Series>): a Series is not a documented `value` for
# replace with a scalar `to_replace`, so that form is not reliable across
# pandas versions. mask() aligns `other` on the index row by row.
df2['Neighbourhood'] = df2['Neighbourhood'].mask(
    df2['Neighbourhood'].eq('Not assigned'),
    df2['Borough'],
)

In [28]:
# Check the fill: a row that had 'Not assigned' as its Neighbourhood should
# now show its Borough name there.
df2.head()

Unnamed: 0,PostCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


## 6. Print the number of rows

In [29]:
# shape is (rows, columns); the first element is the number of rows.
df2.shape

(103, 3)

### Woohoo, that's the end. 