In [27]:
#https://www.freecodecamp.org/news/scraping-wikipedia-articles-with-python/
#https://towardsdatascience.com/web-scraping-html-tables-with-python-c9baba21059
import requests

response = requests.get(
    url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
)
print(response.status_code)

200


In [9]:
#!pip install beautifulsoup4
from bs4 import BeautifulSoup

In [12]:
soup = BeautifulSoup(response.content, 'html.parser')
title = soup.find(id="firstHeading")
print(title.string)

List of postal codes of Canada: M


In [17]:
# Get all the links
import random

allLinks = soup.find(id="bodyContent").find_all("a")
random.shuffle(allLinks)
linkToScrape = 0

for link in allLinks:
    # We are only interested in other wiki articles
    if link['href'].find("/wiki/") == -1: 
        continue

    # Use this link to scrape
    linkToScrape = link
    break

print(linkToScrape)

<a href="/wiki/List_of_postal_codes_of_Canada:_H" title="List of postal codes of Canada: H">H</a>


In [28]:
import requests
import lxml.html as lh
import pandas as pd

url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
#Create a handle, page, to handle the contents of the website
page = requests.get(url)
#Store the contents of the website under doc
doc = lh.fromstring(page.content)
#Parse data that are stored between <tr>..</tr> of HTML
tr_elements = doc.xpath('//tr')

In [29]:
#Check the length of the first 12 rows
[len(T) for T in tr_elements[:12]]

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

In [30]:
tr_elements = doc.xpath('//tr')
#Create empty list
col=[]
i=0
#For each row, store each first element (header) and an empty list
for t in tr_elements[0]:
    i+=1
    name=t.text_content()
    print ('%d:"%s"'%(i,name))
    col.append((name,[]))

1:"Postal Code
"
2:"Borough
"
3:"Neighbourhood
"


In [31]:
#Since out first row is the header, data is stored on the second row onwards
for j in range(1,len(tr_elements)):
    #T is our j'th row
    T=tr_elements[j]
    
    #If row is not of size 10, the //tr data is not from our table 
    if len(T)!=10:
        break
    
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 
        #Check if row is empty
        if i>0:
        #Convert any numerical value to integers
            try:
                data=int(data)
            except:
                pass
        #Append the data to the empty list of the i'th column
        col[i][1].append(data)
        #Increment i for the next column
        i+=1

In [32]:
Dict={title:column for (title,column) in col}
df=pd.DataFrame(Dict)

In [33]:
df.head()

Unnamed: 0,Postal Code\n,Borough\n,Neighbourhood\n


In [35]:
tr_elements

[<Element tr at 0x1c7e2e77c20>,
 <Element tr at 0x1c7e2e77b80>,
 <Element tr at 0x1c7e2e77f40>,
 <Element tr at 0x1c7e2e77f90>,
 <Element tr at 0x1c7e2e77bd0>,
 <Element tr at 0x1c7e2e77b30>,
 <Element tr at 0x1c7e2e77cc0>,
 <Element tr at 0x1c7e2e77c70>,
 <Element tr at 0x1c7e1e44400>,
 <Element tr at 0x1c7e2e7a270>,
 <Element tr at 0x1c7e2e7a040>,
 <Element tr at 0x1c7e2e7a090>,
 <Element tr at 0x1c7e2e7a0e0>,
 <Element tr at 0x1c7e2e7a130>,
 <Element tr at 0x1c7e2e7a810>,
 <Element tr at 0x1c7e2e7a8b0>,
 <Element tr at 0x1c7e2e7a860>,
 <Element tr at 0x1c7e2e7a950>,
 <Element tr at 0x1c7e2e7a9a0>,
 <Element tr at 0x1c7e2e7a900>,
 <Element tr at 0x1c7e2e7aa40>,
 <Element tr at 0x1c7e2e7a9f0>,
 <Element tr at 0x1c7e2e7aae0>,
 <Element tr at 0x1c7e2e7a400>,
 <Element tr at 0x1c7e2e7a4f0>,
 <Element tr at 0x1c7e2e7a540>,
 <Element tr at 0x1c7e2e7a590>,
 <Element tr at 0x1c7e2e7a5e0>,
 <Element tr at 0x1c7e2e7a3b0>,
 <Element tr at 0x1c7e2e7a360>,
 <Element tr at 0x1c7e2e7a310>,
 <Elemen

In [36]:
#https://medium.com/@sateesh.gmc/how-to-scrape-wikipedia-table-using-python-beautiful-soup-cd0d8ee1a319
import requests
from bs4  import BeautifulSoup

In [37]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
s = requests.Session()
response = s.get(url, timeout=10)
response

<Response [200]>

In [38]:
soup.title.string

'List of postal codes of Canada: M - Wikipedia'

In [39]:
right_table = soup.find('table',{"class":'wikitable sortable'})

In [40]:
# Number of columns in the table
for row in right_table.findAll("tr"):
    cells = row.findAll('td')

len(cells)

3

In [42]:
# number of rows in the table including header
rows = right_table.findAll("tr")
len(rows)

181

In [43]:
# header attributes of the table
header = [th.text.rstrip() for th in rows[0].find_all('th')]
print(header)
print('------------')
print(len(header))

['Postal Code', 'Borough', 'Neighbourhood']
------------
3


In [44]:
lst_data = []
for row in rows[1:]:
            data = [d.text.rstrip() for d in row.find_all('td')]
            lst_data.append(data)

In [45]:
# select also works as find_all
lst_data1 = []
for row in rows[1:]:
            data = [d.text.rstrip() for d in row.select('td')]
            lst_data1.append(data)

In [46]:
# sample records
lst_data1[0:3]

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods']]

In [48]:
# html of each table record

list_row = []
for row in right_table.findAll("tr"):
    list_row.append(row)

    
print('Number of row :',len(list_row))
print('----------------')
print(list_row[1])
print('----------------')
print('Second Attribute is has link reference')
print('----------------')
print(list_row[1].findAll('th'))
print('----------------')
print(list_row[1].find('a'))

Number of row : 181
----------------
<tr>
<td>M1A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>
----------------
Second Attribute is has link reference
----------------
[]
----------------
None


In [53]:

#Scrap the data and append to respective lists

c1=[]
c2=[]
c3=[]

for row in right_table.findAll("tr"):
    cells = row.findAll('td')
    if len(cells)==6: #Only extract table body not heading
        c1.append(cells[0].find(text=True))
        c2.append(cells[1].find('a').text)  # fetch the text of the url in td tag. 
        c3.append(cells[2].find(text=True))

In [54]:
# create a dictionary
d = dict([(x,0) for x in header])
d

{'Postal Code': 0, 'Borough': 0, 'Neighbourhood': 0}

In [55]:
# append dictionary with corresponding data list.
d['Postal Code'] = c1
d['Borough']= c2
d['Neighbourhood']=c3

In [56]:
# convert dict to DataFrame
df_table = pd.DataFrame(d)

# Top 5 records
df_table.head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood


In [57]:
import pandas as pd 

import requests 

from bs4 import BeautifulSoup 

 

req = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M") 
soup = BeautifulSoup(req.content,'lxml') 
table = soup.find_all('table')[0]  
df = pd.read_html(str(table)) 

neighborhood=pd.DataFrame(df[0])

In [58]:
neighborhood

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."
