# 1. Scraping the data from Wikipedia into a pandas DataFrame

In [0]:
# The packages required for the exercise
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [0]:
# Getting the webpage from the url using requests package
website_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# requests applies no timeout unless one is given, so without it this cell
# could hang forever if the server never answers.
response = requests.get(website_url, timeout=30)

# Fail right here on a 4xx/5xx reply instead of handing an error page to the parser.
response.raise_for_status()

# The decoded HTML of the page, consumed by the parsing cell below
webpage = response.text

In [0]:
# Parsing the downloaded HTML into a searchable tree, using the lxml parser
soup = BeautifulSoup(webpage, 'lxml')

# Retrieving the specific table that contains the data that we require
table = soup.find('table', {
    'class': 'wikitable sortable'
})

# find() returns None when nothing matches; stop with a clear message here
# rather than failing in a later cell with an AttributeError on None.
if table is None:
    raise ValueError('Could not find a table with class "wikitable sortable" in the downloaded page')

In [0]:
# Retrieving all the <tr> tags from the table, skipping the first one
# (assumed to be the header row - the column names are set when the DataFrame is built)
# Each of the remaining tags will represent a row in our DataFrame
rows = table.find_all('tr')[1:]

# Populating a list with the cell values of every row
values = []
for row in rows:
  # get_text() collects the text of the whole cell, so a cell that is empty or whose
  # first child is a tag (for example a link) no longer raises; strip() removes the
  # surrounding whitespace/newline that the HTML source leaves around the value
  row_for_df = [val.get_text().strip() for val in row.find_all('td')]

  # A row without any <td> cells carries no data, so there is nothing to record
  if not row_for_df:
    continue

  # Converting all the multiple values of Neighborhood from '/' separated to comma separated
  row_for_df[2] = row_for_df[2].replace(' /', ',')

  # Appending the cleaned row to the list called values
  values.append(row_for_df)

In [5]:
# Building the DataFrame from the scraped rows, giving each of the three cells a column name
columns = ['PostalCode', 'Borough', 'Neighborhood']
toronto_df = pd.DataFrame(data=values, columns=columns)

# Reporting the number of rows, then previewing the first 5 of them
print(f'The length of the dataframe is {len(toronto_df)}')
toronto_df.head(5)

The length of the dataframe is 180


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


# 2. Data Wrangling - converting data from the raw form to a workable form

In [6]:
# Keeping only the rows whose Borough is something other than "Not assigned"
has_borough = toronto_df['Borough'].ne('Not assigned')
toronto_df = toronto_df.loc[has_borough]

# Reporting the number of remaining rows, then previewing the first 5 of them
print(f'The length of the dataframe is {len(toronto_df)}')
toronto_df.head(5)

# Rows listing several neighborhoods need no extra work here:
# the separators were already converted to commas while the table was read

The length of the dataframe is 103


Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [7]:
# Reporting the (rows, columns) dimensions of the toronto dataframe
df_shape = toronto_df.shape
print(f'The shape of the DataFrame is {df_shape}')

The shape of the DataFrame is (103, 3)
