## Importing the libraries that will be used later on

In [125]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import matplotlib.pyplot as plt

## Getting and parsing the data from Wikipedia
    1. Retrieving the raw html using a get request.
    2. Extracting the first table's rows textual data
    3. Conversion to a DataFrame

### 1. Retrieving the html content of the Wikipedia page

In [126]:
wiki_page_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# Fail fast on network problems or HTTP error pages instead of parsing them.
response = requests.get(wiki_page_url, timeout=30)
response.raise_for_status()
# Use the decoded text: str(response.content) would yield the "b'...'" repr of
# the raw bytes, in which every newline is the two characters backslash + n,
# so later newline-based processing would never match.
page_content = response.text

### 2. Extracting the textual content of the first table

In [127]:
# The parser is selected through the `features` argument (`parser=` is not a
# BeautifulSoup constructor option). The soup is bound to `bs_instance`, the
# name the following cell reads, so the notebook runs on a fresh kernel.
bs_instance = BeautifulSoup(page_content, features="html.parser")

In [128]:
# Collect every <table> element of the page, in document order.
# `find_all` is the current spelling of the legacy `findAll` alias.
tables = bs_instance.find_all("table")

In [129]:
# All <tr> elements of the first table; the first one is the header row.
rows = tables[0].find_all("tr")

# Read the text cell by cell. Joining a row's cells with "," and splitting on
# "," again would break any cell that itself contains a comma (e.g. several
# neighbourhoods listed in one cell), producing rows of unequal length.
row_text = []
for row in rows:
    cells = [cell.get_text().strip() for cell in row.find_all(["th", "td"])]
    if cells:  # skip rows that contain no header/data cells
        row_text.append(cells)

### 3. Converting to a DataFrame. 
    NumPy could not infer a second dimension for my "row_text" list (this happens when the rows do not all have the same length), so I could not use it directly to construct a DataFrame. Instead, I used the loop below to build a dictionary of columns and then passed that to the DataFrame constructor.

In [130]:
# The first parsed row holds the table headers; they become the column names.
columns = row_text[0]
# Give every column its own empty list, to be filled row by row afterwards.
data_dict = dict((header, []) for header in columns)

In [131]:
# Inspect the freshly initialised mapping: one empty list per column name.
data_dict

{'Postal Code': [], 'Borough': [], 'Neighbourhood': []}

In [132]:
# Distribute each data row (the header row at index 0 is skipped) over the
# per-column lists: the value at position p goes to the p-th column name.
for record in row_text[1:]:
    for position, header in enumerate(columns):
        data_dict[header].append(record[position])

In [143]:
# Build the frame from the column-name -> values mapping, then preview it.
postal_df = pd.DataFrame(data=data_dict)
postal_df.head(5)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park


## Transforming the data:
    1. If a postal code is not assigned to a Borough, it will be dropped.
    2. If a Neighborhood is not assigned, it will be the same as the Borough.
    3. If a postal code has multiple neighborhoods, they will be combined into one row.


### 1. Getting rid of unassigned postal codes. Could have used boolean indexing and a subset of the original dataframe.

In [144]:
# Turn the "Not assigned" placeholder into a real missing value.
# The result is assigned back to the column: calling an in-place method on the
# selected column (`postal_df["Borough"].replace(..., inplace=True)`) is chained
# assignment and does not update `postal_df` under pandas Copy-on-Write.
postal_df["Borough"] = postal_df["Borough"].replace("Not assigned", np.nan)
# Number of postal codes without a borough.
postal_df["Borough"].isna().sum()

77

In [145]:
# Discard every row whose Borough is missing, modifying postal_df itself.
postal_df.dropna(axis="index", subset=["Borough"], inplace=True)
# Count the missing Borough values that remain after the drop.
postal_df["Borough"].isnull().sum()

0

### 2. Replacing unassigned Neighborhoods
    This step is unnecessary: as of today, whenever a Borough is unassigned, its Neighbourhood is unassigned as well, and vice versa. See the empty DataFrame below.

In [154]:
# Show the rows whose Neighbourhood still carries the "Not assigned" placeholder.
postal_df.loc[postal_df["Neighbourhood"].eq("Not assigned")]

Unnamed: 0,Postal Code,Borough,Neighbourhood


### 3. Combining postcodes with multiple neighborhoods
    This step is unnecessary as well: in its current state, the Wikipedia page has already done this combining.

In [189]:
# Count how many rows each postal code occupies and turn the result into a
# two-column frame (the code itself plus its number of occurrences).
value_counts = (
    postal_df["Postal Code"]
    .value_counts()
    .to_frame()
    .reset_index()
)
value_counts.columns = ["code", "counts"]
# Largest counts first: any code appearing more than once would show up on top.
value_counts.sort_values("counts", ascending=False)

Unnamed: 0,code,counts
0,M1R,1
65,M1C,1
75,M3A,1
74,M5X,1
73,M6C,1
...,...,...
31,M9W,1
30,M4M,1
29,M9L,1
28,M9P,1


## Tidying up

In [190]:
# Remove the space from the postal-code column name, modifying postal_df itself.
postal_df.rename({"Postal Code": "PostalCode"}, axis="columns", inplace=True)

In [192]:
# Preview the first rows of the frame after the column rename.
postal_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park
5,M6A,North York,Lawrence Manor
6,M7A,Downtown Toronto,Queen's Park


In [193]:
# (number of rows, number of columns) of the cleaned frame.
postal_df.shape

(103, 3)

In [194]:
# Persist the cleaned table next to the notebook. The row index is written as
# the first (unnamed) CSV column, which is the to_csv default made explicit.
postal_df.to_csv(path_or_buf="raw_scraped_canadian_postal_codes.csv", index=True)