# Web Scraping using BeautifulSoup

### Importing all required packages

In [1]:
import requests
import lxml.html as lh
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
result = requests.get(url).text

### Collecting required data and converting to dictionary then to dataframe

In [3]:
soup = BeautifulSoup(result, 'lxml')
col = []
values = []
table= soup.find('table', class_ = 'wikitable')

for match in table.find_all('th'):
    head = match.text.rstrip('\n')
    col.append((head,[]))

for item in table.find_all('td'):
    data = item.text.rstrip('\n')
    values.append(data)

for i in range(len(values)):
    col[i%3][1].append(values[i])
    i+=1

Dict={title:column for (title,column) in col}
df=pd.DataFrame(Dict)
df.tail()

Unnamed: 0,Postal code,Borough,Neighborhood
175,M5Z,Not assigned,
176,M6Z,Not assigned,
177,M7Z,Not assigned,
178,M8Z,Etobicoke,Mimico NW / The Queensway West / South of Bloo...
179,M9Z,Not assigned,


### Refining the dataframe

In [4]:
df = df[df.Borough!='Not assigned'] # Deleting rows having Not assigned values in Borough column

In [5]:
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [6]:
df.reset_index() # Resetting the index

Unnamed: 0,index,Postal code,Borough,Neighborhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,Regent Park / Harbourfront
3,5,M6A,North York,Lawrence Manor / Lawrence Heights
4,6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
...,...,...,...,...
98,160,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North
99,165,M4Y,Downtown Toronto,Church and Wellesley
100,168,M7Y,East Toronto,Business reply mail Processing CentrE
101,169,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...


In [7]:
df['Neighborhood'] = df['Neighborhood'].str.replace('/',',') # Replacing # with ,

In [8]:
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"
5,M6A,North York,"Lawrence Manor , Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


In [9]:
df['Neighborhood'] = df['Neighborhood'].str.replace(' , ',', ')

In [10]:
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [11]:
df[df['Neighborhood']==''] #Checking if column Neighborhood has any blank values

Unnamed: 0,Postal code,Borough,Neighborhood


In [12]:
df.shape

(103, 3)