## Scraping Data from a Table on a Website into a Pandas DataFrame


## Import Libraries

In [52]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Send the GET request to get the content of the website

In [53]:
# Send the GET request; the timeout keeps the cell from hanging on a stalled connection
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(url, timeout=30)
# Raise requests.HTTPError on a 4xx/5xx reply so an error page is never parsed as the article
page.raise_for_status()
# Store the contents of the website under doc
doc = page.text

## Scraping Table data with BeautifulSoup

In [54]:
# Parse the downloaded HTML and locate the table whose class attribute is "wikitable sortable"
soup = BeautifulSoup(doc, 'html.parser')
table = soup.find('table', {'class': 'wikitable sortable'})
# find() returns None when nothing matches; stop here so the failure is explicit
# instead of surfacing later as a confusing parse error
if table is None:
    raise ValueError("No table with class 'wikitable sortable' found in the page")

## Read and convert table into a pandas DataFrame

In [55]:
# pd.read_html returns a list of DataFrames, one per table it parses.
# The markup is wrapped in StringIO because passing a literal HTML string
# directly to read_html is deprecated in recent pandas releases.
df1 = pd.read_html(StringIO(str(table)), header=0)
# The first parsed frame corresponds to the table element selected above
df = df1[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Preprocessing

### Keep only rows with assigned Borough


In [56]:
# Keep only the rows whose Borough is something other than 'Not assigned'
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough]
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Check for duplicate Postal Codes


In [57]:
# Compare the number of rows with the number of distinct postal codes
postal_codes = df['Postal Code']
print('the dataset has ', len(postal_codes), 'postal Codes')
print('the dataset has ', len(postal_codes.unique()), 'unique postal Codes')
if postal_codes.is_unique:
    print("No Postal Code is listed more than once!")
else:
    # Report the offending codes so a failed check is never silent
    repeated = postal_codes[postal_codes.duplicated(keep=False)].unique()
    print("Postal Codes listed more than once:", list(repeated))

# Renumber the index from 0; rebinding (rather than inplace=True) keeps the cell
# free of in-place mutation
df = df.reset_index(drop=True)
df.head()

the dataset has  103 postal Codes
the dataset has  103 unique postal Codes
No Postal Code is listed more than once!


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Check whether any Neighborhoods have the value "Not assigned"

In [58]:
# Collect rows whose Neighborhood is unassigned. The scraped table displays
# unassigned Neighborhood cells as blanks (presumably read by pandas as NaN),
# so match missing values as well as the literal "Not assigned" string.
neighborhood = df['Neighborhood']
Notassigned = df[neighborhood.isna() | neighborhood.eq("Not assigned")]
print(Notassigned)
if Notassigned.shape[0]==0:
    print("Threre are No Neighborhoods with values= Not Assigned")

Empty DataFrame
Columns: [Postal Code, Borough, Neighborhood]
Index: []
Threre are No Neighborhoods with values= Not Assigned


In [59]:
# Number of rows currently in df
print(len(df))

103
