# Generating dataframe from the wiki data table

In [1]:
from bs4 import BeautifulSoup
import requests

In [2]:
# Download the Wikipedia page that lists the postal codes starting with "M".
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
source = response.text

# Parse the HTML with the lxml backend.
soup = BeautifulSoup(source, 'lxml')

In [3]:
# print(soup.prettify())

In [4]:
# Attribute access on the soup returns the first tag with that name,
# i.e. the first <table> element of the page.
table = soup.table
# print(table.prettify())

Testing out how to fetch data from the HTML using only one "tr" element

In [5]:
# Take the first <tr> of the table so its structure can be inspected.
l = table.find('tr')
l

<tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>

In [6]:
# Children 1, 3 and 5 of the row are its three cells; the other positions
# presumably hold the newlines that separate the tags in the markup.
for position in (1, 3, 5):
    print(l.contents[position].text)

Postcode
Borough
Neighbourhood



Based on the exploration above, now extracting all the data from the table

In [7]:
# Column-wise accumulators. The header row is collected as entry 0 of each
# list and the cell text is stored unmodified; later cells drop the header
# entry and strip the trailing newline.
postCode, borough, neighbourhood = [], [], []

for row in table.find_all('tr'):
    # Look the cells up by tag instead of by position in `.contents`:
    # positional indexing only works while whitespace text nodes sit between
    # the cells, and raises IndexError on rows with fewer children.
    cells = row.find_all(['th', 'td'], recursive=False)
    if len(cells) < 3:
        # Not a complete three-cell row; nothing to extract.
        continue

    code_cell, borough_cell, neighbourhood_cell = cells[:3]

    # Rows whose borough is 'Not assigned' carry no usable data.
    if borough_cell.text != 'Not assigned':
        postCode.append(code_cell.text)
        borough.append(borough_cell.text)
        neighbourhood.append(neighbourhood_cell.text)

Removing the trailing '\n' from each entry of the neighbourhood column

In [8]:
# Strip trailing newlines in place. Unlike slicing off the last character,
# rstrip('\n') only removes newline characters that are actually present, so
# running this cell more than once never eats a real letter.
neighbourhood[:] = [name.rstrip('\n') for name in neighbourhood]

# print(neighbourhood)

Storing all the lists in a pandas DataFrame

In [9]:
import pandas as pd

# Skip entry 0 of each list (the table's header row), build one DataFrame row
# per list, then flip the frame so every list becomes a column (0, 1, 2).
data = [column[1:] for column in (postCode, borough, neighbourhood)]
df = pd.DataFrame(data).T
df.head()

Unnamed: 0,0,1,2
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


Renaming the columns and setting Postcode as the index

In [10]:
# Give the positional columns descriptive labels.
column_labels = {0: 'Postcode', 1: 'Borough', 2: 'Neighbourhood'}
df = df.rename(columns=column_labels)
print(df.columns)

# Remember the distinct postcodes before the column is moved into the index.
unique_code = df['Postcode'].unique()

# Index the frame by postcode so rows can be selected by label.
df = df.set_index('Postcode')
df.head()

Index(['Postcode', 'Borough', 'Neighbourhood'], dtype='object')


Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,Harbourfront
M6A,North York,Lawrence Heights
M6A,North York,Lawrence Manor


Checking how `loc` works

In [11]:
# Label-based lookup on the Postcode index; 'M6A' labels more than one row,
# so every matching row is returned.
df.loc['M6A']

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M6A,North York,Lawrence Heights
M6A,North York,Lawrence Manor


In [12]:
# Postcode now lives in the index, so only the remaining column labels are listed.
df.columns

Index(['Borough', 'Neighbourhood'], dtype='object')

Combine the rows that share the same postcode and reset the index

In [13]:
# Collapse all rows that share a postcode into a single row.
#
# Move Postcode back into a regular column if it is still the index. When the
# cell is re-run the frame is already flat, and the check makes that a no-op
# instead of an error.
flat = df.reset_index() if df.index.name == 'Postcode' else df

# One pass over the frame:
#   * sort=False keeps postcodes in order of first appearance,
#   * the first Borough of each postcode is kept,
#   * the neighbourhood names of each postcode are joined with ', '.
# as_index=False leaves Postcode as a column and yields a fresh 0..n-1 index.
df = (
    flat.groupby('Postcode', sort=False, as_index=False)
        .agg({'Borough': 'first', 'Neighbourhood': ', '.join})
)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


Get the shape of the final data frame

In [14]:
# Report the (rows, columns) pair of the consolidated frame.
final_shape = df.shape
print(f'The shape of the final data frame is {final_shape}')

The shape of the final data frame is (103, 3)


In [15]:
# Display the consolidated frame (the rendered output elides the middle rows).
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
101,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So..."
