In [56]:
import pandas as pd # library for data analysis
import requests # library to handle requests
from lxml import html # library to collect html data

## Data Collection

In [12]:
# Wikipedia page listing the Canadian postal codes that start with "M".
url_wiki = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# TLS certificate verification stays at the requests default (enabled).
# If a private CA is required, point `session.verify` at the CA bundle path
# instead of disabling verification.
session = requests.Session()

# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on HTTP errors instead of parsing an error page.
resp = session.get(url_wiki, timeout=30)
resp.raise_for_status()
root = html.fromstring(resp.content)

In [45]:
def extract_data(root):
    """Convert the rows of the page's wikitable(s) into a list of dictionaries.

    Args:
        root: Parsed HTML element (e.g. the result of ``lxml.html.fromstring``)
            that contains ``<table>`` elements whose class includes ``wikitable``.

    Returns:
        list[dict]: One dictionary per row after the header row. Keys are the
        header cell texts, stripped, lower-cased and with spaces replaced by
        underscores. Values are the stripped cell texts, or ``None`` when the
        cell text is exactly ``'Not assigned'``. An empty list is returned
        when no table rows are found.
    """
    def cell_text(cell):
        # text_content() gathers text nested inside child tags (links, bold, ...);
        # plain `.text` is None when a cell starts with a child element.
        return cell.text_content().strip()

    # Match rows both with and without an explicit <tbody> wrapper.
    row_combox = root.xpath(
        './/table[contains(@class, "wikitable")]/tbody/tr'
        ' | .//table[contains(@class, "wikitable")]/tr'
    )
    if not row_combox:
        return []

    # The first row holds the header cells; normalise them into column keys.
    cols = [cell_text(cell).replace(" ", "_").lower() for cell in row_combox[0]]

    toronto_data = []
    for row in row_combox[1:]:
        row_dict = {}
        for col_name, cell in zip(cols, row):
            text = cell_text(cell)
            # 'Not assigned' is the page's placeholder for a missing value.
            row_dict[col_name] = None if text == 'Not assigned' else text
        toronto_data.append(row_dict)

    return toronto_data

In [58]:
# Build the dataframe directly from the scraped records (one per table row),
# then preview the first few rows.
toronto_data = pd.DataFrame(extract_data(root))
toronto_data.head()

Unnamed: 0,postal_code,borough,neighbourhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Data Wrangling 

In [65]:
# Count the rows whose borough is missing and report the computed value
# rather than a hard-coded number, so the message stays correct if the data changes.
nan_borough = toronto_data['borough'].isnull().sum()
print(f"There are {nan_borough} rows with empty boroughs.")

There are 77 rows with empty boroughs


In [66]:
# Keep only the rows that have a borough, then confirm how many nulls
# remain in each column.
toronto_data = toronto_data.dropna(subset=['borough'])
toronto_data.isnull().sum()

postal_code      0
borough          0
neighbourhood    0
dtype: int64

In [68]:
# Summarise the dataframe: index range, column dtypes and non-null counts
toronto_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 2 to 178
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   postal_code    103 non-null    object
 1   borough        103 non-null    object
 2   neighbourhood  103 non-null    object
dtypes: object(3)
memory usage: 8.2+ KB


In [69]:
# Report how many rows the dataframe currently holds.
num_rows = len(toronto_data)
print(f'There are {num_rows} rows in the Toronto dataset.')

There are 103 in the Toronto dataset
