In [8]:
from __future__ import print_function

from bs4 import BeautifulSoup

import requests

# Beautiful Soup on test data

Documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/

Below, we create a simple HTML page that include some frequently used tags. 
Note, however, that we have also left one paragraph tag unclosed. 

In [9]:
source = """
<!DOCTYPE html> 
<html>  
  <head>
    <title>Scraping</title>
  </head>
  <body class="col-sm-12">
    <h1>section1</h1>
    <p>paragraph1</p>
    <p>paragraph2</p>
    <div class="col-sm-2">
      <h2>section2</h2>
      <p>paragraph3</p>
      <p>unclosed
    </div>
  </body>
</html>  
"""

soup = BeautifulSoup(source, "html.parser")

Once the soup object has been created successfully, we can execute a number of queries on the DOM. 
First we request all data from the `head` tag. 
Note that while it looks like a list of strings was returned, actually, a `bs4.element.Tag` type is returned. 
These examples explore how to extract tags, the text from tags, how to filter queries based on 
attributes, how to retreive attributes from a returned query, and how the BeautifulSoup engine 
is tolerant of unclosed tags. 
Notice in the actual HTML source, the last paragraph is not closed. 

In [10]:
print(soup.prettify())
# BeautifulSoup engine corrects the HTML source by including </p> to the unclosed paragraph

<!DOCTYPE html>
<html>
 <head>
  <title>
   Scraping
  </title>
 </head>
 <body class="col-sm-12">
  <h1>
   section1
  </h1>
  <p>
   paragraph1
  </p>
  <p>
   paragraph2
  </p>
  <div class="col-sm-2">
   <h2>
    section2
   </h2>
   <p>
    paragraph3
   </p>
   <p>
    unclosed
   </p>
  </div>
 </body>
</html>



In [11]:
print('Head:')
print('', soup.find_all("head"))
# [<head>\n<title>Scraping</title>\n</head>]

Head:
 [<head>
<title>Scraping</title>
</head>]


In [12]:
print('\nType of head:')
print('', map(type, soup.find_all("head")))
# [<class 'bs4.element.Tag'>]


Type of head:
 <map object at 0x000002977A5BB4F0>


In [13]:
print('\nTitle tag:')
print('', soup.find("title"))
# <title>Scraping</title>


Title tag:
 <title>Scraping</title>


In [14]:
print('\nTitle text:')
print('', soup.find("title").text)
# Scraping


Title text:
 Scraping


In [15]:
divs = soup.find_all("div", attrs={"class": "col-sm-2"})
print('\nDiv with class=col-sm-2:')
print('', divs)
# [<div class="col-sm-2">....</div>]


Div with class=col-sm-2:
 [<div class="col-sm-2">
<h2>section2</h2>
<p>paragraph3</p>
<p>unclosed
    </p></div>]


In [16]:
print('\nClass of first div:')
print('', divs[0].attrs['class'])
# [u'col-sm-2']


Class of first div:
 ['col-sm-2']


In [17]:
print('\nAll paragraphs:')
print('', soup.find_all("p"))
# [<p>paragraph1</p>, 
#  <p>paragraph2</p>, 
#  <p>paragraph3</p>, 
#  <p>unclosed\n    </p>]


All paragraphs:
 [<p>paragraph1</p>, <p>paragraph2</p>, <p>paragraph3</p>, <p>unclosed
    </p>]


# Beautilful soup on real data 

In this example I will show how you can use BeautifulSoup to retreive information from live web pages. 
We make use of The Guardian newspaper, and retreive the HTML from an arbitrary article. 
We then create the BeautifulSoup object, and query the links that were discovered in the DOM. 
Since a large number are returned, we then apply attribute filters that let us reduce significantly 
the number of returned links. 
I selected the filters selected for this example in order to focus on the names in the paper. 
The parameterisation of the attributes was discovered by using the `inspect` functionality available in a modern web broser. Web browsers including Google Chrome or Microsoft Firefox support this functionality. You could right click on a part of a web page, and click `inspect` to view source of that part of a web page. 

In [18]:
url = 'https://www.theguardian.com/books/2023/jan/18/ts-eliot-prize-winner-anthony-joseph-how-poetry-helped-me-love-my-absent-father'
req = requests.get(url)
source = req.text
soup = BeautifulSoup(source, 'html.parser')

In [19]:
print(source)
# To view the complete HTML source of a web page in a web browser, right click on the page, and click "View Page Source."

<!doctype html>
        <html lang="en" >
            <head>
			    <!-- Hello there, HTML enthusiast! -->

				<!-- DCR commit hash 17508a1f7e4f14228382fb1f2b4d42bdf1648e53 -->

                <title>TS Eliot prize winner Anthony Joseph: how poetry helped me love my absent father | Books | The Guardian</title>
                <meta name="description" content="The writer on switching from rock to poetry and how Sonnets for Albert explores his relationship with a missing parent who became an almost &#x2018;mythological figure&#x2019;" />
				<meta charset="utf-8">
				<link rel="canonical" href="https://www.theguardian.com/books/2023/jan/18/ts-eliot-prize-winner-anthony-joseph-how-poetry-helped-me-love-my-absent-father" />
				<meta name="viewport" content="width=device-width,minimum-scale=1,initial-scale=1">
                <meta name="theme-color" content="#052962" />
				<link rel="manifest" href="https://assets.guim.co.uk/static/frontend/manifest.json" />
				<link rel="apple-touch

In [20]:
links = soup.find_all('a') # Find all <a> tags 
links

[<a class="dcr-1gpubb" data-link-name="skip : main content" href="#maincontent">Skip to main content</a>,
 <a class="dcr-1gpubb" data-link-name="skip : navigation" href="#navigation">Skip to navigation</a>,
 <a class="dcr-1gpubb" data-link-name="skip : navigation" href="#navigation">Skip to navigation</a>,
 <a class="dcr-vxblrw" data-link-name="header : topbar : printsubs" href="https://support.theguardian.com/subscribe?REFPVID=undefined&amp;INTCMP=undefined&amp;acquisitionData=%7B%22source%22%3A%22GUARDIAN_WEB%22%2C%22componentId%22%3A%22PrintSubscriptionsHeaderLink%22%2C%22componentType%22%3A%22ACQUISITIONS_HEADER%22%2C%22referrerUrl%22%3A%22%22%7D">Print subscriptions</a>,
 <a class="dcr-vxblrw" data-link-name="header : topbar : job-cta" href="https://jobs.theguardian.com">Search jobs</a>,
 <a class="dcr-140c513" data-link-name="header : topbar : signin" href="https://profile.theguardian.com/signin?INTCMP=DOTCOM_NEWHEADER_SIGNIN&amp;ABCMP=ab-sign-in&amp;componentEventParams=componen

In [21]:
# Open the URL: https://www.theguardian.com/books/2023/jan/18/ts-eliot-prize-winner-anthony-joseph-how-poetry-helped-me-love-my-absent-father 
# in a web browser, right click on the link text "Poetry" formatted in brown colour and click Inspect Element.
# Compare the attributes of this link with other links of the web page. 
# You will notice, this link has its "data-component" set as "auto-linked-tag".

# Find all <a></a> whose value of attribute 'data-component' is 'auto-linked-tag'

links = soup.find_all('a', attrs={
    'data-component': 'auto-linked-tag'
})
print(links)
for link in links:
    print(link['href'], link.text)

[<a data-component="auto-linked-tag" data-link-name="in body link" href="https://www.theguardian.com/books/poetry">Poetry</a>]
https://www.theguardian.com/books/poetry Poetry


## Task 1: Extracting Linked News Stories

URL: https://www.theguardian.com/books/2023/jan/18/ts-eliot-prize-winner-anthony-joseph-how-poetry-helped-me-love-my-absent-father

Open the above URL in your browser. You will notice that parts of the main news story are hyperlinked to other news stories published previously. For instance, the first paragraph is linked to a new story on animal welfare (sentience) bill. Your first task is to extract the links to these other news stories in the main news.  

Hint 1: 05_web_scraping_beautiful_soup.ipynb has a code block to extract topics hyperlinked within a new story. 

Hint 2: These can be identified with attribute 'data-link-name':'in body link' in \<a> \</a>

In [22]:
# Solution to Task 1
# This task is similar to the the code block above where we try to extract some links. 
# Open the web page in a web browser and inspect the hyper links of other linked news stories. 
# Notice each of these links have attribute "data-link-name" set as "in body link"
newurl = 'https://www.theguardian.com/books/2023/jan/18/ts-eliot-prize-winner-anthony-joseph-how-poetry-helped-me-love-my-absent-father'
# Let us extract these:
response = requests.get(newurl)
soup = BeautifulSoup(response.text, 'html.parser')
###
# WRITE YOUR CODE HERE
links = soup.find_all('a', attrs={'data-link-name': 'in body link'})
for link in links:
    print(link['href'], link.text)
###

https://www.theguardian.com/books/2023/jan/16/anthony-joseph-wins-ts-eliot-prize-for-luminous-poetry-collection TS Eliot prize
https://www.theguardian.com/books/2018/aug/04/kitch-anthony-joseph-review-windrush-trinidad-calypso in his Guardian review
https://www.theguardian.com/books/poetry Poetry
https://guardianbookshop.com/sonnets-for-albert-9781526649942 guardianbookshop.com


## Task 2: Extracting Topics or Categories 

URL: https://www.theguardian.com/books/2023/jan/18/ts-eliot-prize-winner-anthony-joseph-how-poetry-helped-me-love-my-absent-father 

~~Guardian's website tags the news story with a list of topics. Your second task is to find these topics. 

Hint: In the source we see the topics are in a div with attribute `data-print-layout':'hide'` and that contains the text `Explore more on these topics`

In [23]:
# Solution to Task 2

# In the source we see topics are in a div with 'data-print-layout':'hide'"

url = 'https://www.theguardian.com/books/2023/jan/18/ts-eliot-prize-winner-anthony-joseph-how-poetry-helped-me-love-my-absent-father'
req = requests.get(url)
source = req.text
soup = BeautifulSoup(source, 'html.parser')

potential_topics_div = soup.find_all('div', attrs={'data-print-layout':'hide'})

# One of these divs has the topics. 
# We notice that text "Explore more on these topics" is within a span. We need to find the parent div containing this span. 

# for div in potential_topics_div:
    ###
    # WRITE YOUR CODE HERE
for div in potential_topics_div:
    span = div.find('span', string='Explore more on these topics')
    if span:
        topic_ul = span.find_next('ul')
        if topic_ul:
            for topic in topic_ul.find_all('a'):
                print(topic.text)
        break
    ###

Books
TS Eliot prize for poetry
Poetry
Awards and prizes
interviews


# Chaining queries

Now, let us conisder a more general query that might be done on a website such as this. 
We will query the base technology page, and attempt to list all articles that pertain to this main page

In [24]:
url = 'https://www.theguardian.com/uk/technology'
req = requests.get(url)
source = req.text
soup = BeautifulSoup(source, 'html.parser')

After inspecting the DOM (via the `inspect` tool in my browser), I see that the links to `technology` article are in the tag `<main>...<\main>` with `id='maincontent'`: 
        
On inspecting the links, i.e., `<a>...<\a>` in `<main>...<\main>`, I see that the link text is under attribute `aria-label`

In [25]:
potential_divs = soup.find_all('main', attrs={'id':'maincontent'})
for div in potential_divs:
    links = div.find_all('a')
    for link in links:
        if link.has_attr('aria-label'):
            print(link['aria-label'], '\n', link['href'], "\n")

Global disunity, energy concerns and the shadow of Musk: key takeaways from the Paris AI summit  
 /technology/2025/feb/14/global-disunity-energy-concerns-and-the-shadow-of-musk-key-takeaways-from-the-paris-ai-summit 

OpenAI rejects $97.4bn Musk bid and says company is not for sale 
 /technology/2025/feb/14/openai-elon-musk 

Arm looks to launch its own chip after landing Meta contract 
 /business/2025/feb/14/arm-looks-to-launch-its-own-chip-after-landing-meta-contract 

 TikTok returns to Apple App Store and Google Play in US 
 /technology/2025/feb/14/tiktok-returns-to-apple-app-store-and-google-play-in-us 

Elon Musk says he’ll drop his $97bn bid for OpenAI if it remains a non-profit 
 /technology/2025/feb/13/elon-musk-openai-non-profit 

Rogue states could use AI to do ‘real harm’, warns ex-Google CEO 
 /technology/2025/feb/13/former-google-ceo-warns-ai-could-be-used-by-rogue-states-to-harm-people 

Parents are desperate to protect kids on social media. Why did the US let a safety 

With this set of articles, it is now possible to chain further querying, for example with code 
similar to the following 

```python
for article in articles: 
    req = requests.get(article['href'])
    source = req.text 
    soup = BeautifulSoup(source, 'html.parser') 
    
    ... and so on...
```

However, I won't go into much detail about this now. For scraping like this tools, such as `scrapy` are more 
appropriate than `BeautifulSoup` since they are designed for multithreadded web crawling. 
Once again, however, I urge caution and hope that before any crawling is initiated you determine whether 
crawling is within the terms of use of the website. 
If in doubt contact the website administrators. 

https://scrapy.org/

## Task 3. Listing All News Stories in a Section  

URL: https://www.theguardian.com/uk/technology 

In the chaining code block above (in 05_web_scraping_beautiful_soup.ipynb), we listed all news stories on the technology page of the Guardian's website. On the webpage, we see that these stories are grouped as "Technology", "Spotlight", "Smartphone reviews" and so on.  

Your task is to fetch only technology related stories that are highlighted under Technology and highlighted under Spotlight of the Guardian's Technology webpage  

Hint 1: This requires extracting the source of the divs that list the Technology and Spotlight new stories. 

Hint 2: For Technology stories, filter for div with attribute 'id': 'container-technology'

Hint 2: For Spotlight stories, filter for div with attribute 'id': 'container-spotlight'

In [26]:
# Solution to Task 3

print("Printing stories under Technology\n")

potential_technology_div = soup.find_all('div', attrs={'id':'container-technology'})
# print(potential_technology_div)
for div in potential_technology_div:
    technology_links = div.find_all('a')
    for link in technology_links:
#         print("\n\n", link['href'])
        if link.has_attr('aria-label'):
            print(link['aria-label'], '\n', link['href'], "\n")

print("\n\nPrinting stories under Spotlight\n")

###
# WRITE YOUR CODE HERE

potential_spotlight_div = soup.find_all('div', attrs={'id':'container-spotlight'})
for div in potential_spotlight_div:
    spotlights = div.find_all('a')
    for spotlight in spotlights:
        if spotlight.has_attr('aria-label'):
            print(spotlight['aria-label'], '\n', spotlight['href'], "\n")
###

Printing stories under Technology

Global disunity, energy concerns and the shadow of Musk: key takeaways from the Paris AI summit  
 /technology/2025/feb/14/global-disunity-energy-concerns-and-the-shadow-of-musk-key-takeaways-from-the-paris-ai-summit 

OpenAI rejects $97.4bn Musk bid and says company is not for sale 
 /technology/2025/feb/14/openai-elon-musk 

Arm looks to launch its own chip after landing Meta contract 
 /business/2025/feb/14/arm-looks-to-launch-its-own-chip-after-landing-meta-contract 

 TikTok returns to Apple App Store and Google Play in US 
 /technology/2025/feb/14/tiktok-returns-to-apple-app-store-and-google-play-in-us 

Elon Musk says he’ll drop his $97bn bid for OpenAI if it remains a non-profit 
 /technology/2025/feb/13/elon-musk-openai-non-profit 

Rogue states could use AI to do ‘real harm’, warns ex-Google CEO 
 /technology/2025/feb/13/former-google-ceo-warns-ai-could-be-used-by-rogue-states-to-harm-people 



Printing stories under Spotlight

Parents are 

## Task 4. List 50 Most Recent Technology-Related News Stories  

URL: https://www.theguardian.com/uk/technology 

On Guardian's technology home page, you will notice a link to “All Stories.” If you cannot locate it visually, use the browser’s find tool (Ctrl + F) and search for “All Stories.” Click the link to “All Stories” and observe the structure of the web page listing all stories. Here is the direct link to that web page: https://www.theguardian.com/technology/all 

Your task is to extract 50 most recent technology stories published by Guardian.  

Hint 1: This will require you to loop through multiple pages and maintain a counter of stories. 

Hint 2: Click on page 2. Observe the URL string.  

Hint 3: URL "../technology/all" maps to "/technology?page=1". 

In [27]:
# Solution to Task 4
# This task is similar to the other tasks. 
# The only difference is we need to loop through multiple pages of Guardian's website while maintaining a counter of the links fetched 

###
# WRITE YOUR CODE HERE
url = 'https://www.theguardian.com/uk/technology/all'
source = requests.get(url)
soup = BeautifulSoup(source.text, 'html.parser')

count = 0
guardian_url = 'https://www.theguardian.com'

def find_articles(soup, limit):
    links = list(soup.find_all('a', attrs={'data-link-name':lambda x:x and 'card-@' in x}, limit = 50))
    
    if len(links) >= limit:
        return links[:limit]

    else:
        next_page = soup.find('a', href=lambda x: x and '/technology?page=' in x)
        if next_page:
            next_url = guardian_url + next_page['href']
            remaining = limit - len(links)
            next_links = find_articles(BeautifulSoup(requests.get(next_url).text, 'html.parser'), remaining)

            if next_links:
                links.extend(next_links)

    return links

articles = find_articles(soup, 50)
for article in articles:
    count += 1
    print(count, ': ', article['aria-label'])





###

1 :  Parents are desperate to protect kids on social media. Why did the US let a safety bill die?
2 :  Protesters target Tesla showrooms in US over Elon Musk’s government cost-cutting
3 :  Elon Musk’s mass government cuts could make private companies millions
4 :  A tale of two suckers: Donald Trump’s plastic straws and Keir Starmer
5 :  If the AI Roundheads go to war with tech royalty, don’t bet against them
6 :  ‘Everyone knows the Centrelink song’: how we learned to love – and remix – hold music
7 :  OpenAI rejects $97.4bn Musk bid and says company is not for sale
8 :  US watchdog to investigate Musk ‘Doge’ team’s access to payment systems
9 :  Every picture tells a story: the joy of analogue photography 
10 :  Musk-linked group offered $5m for proof of voter fraud – and came up with nothing
11 :  Arm looks to launch its own chip after landing Meta contract
12 :  Tell us: how has artificial intelligence affected your work? 
13 :  I met the ‘godfathers of AI’ in Paris – here’s what t