In [5]:
from __future__ import print_function

from bs4 import BeautifulSoup

import requests

# Beautiful Soup on test data

Documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/

Below, we create a simple HTML page that include some frequently used tags. 
Note, however, that we have also left one paragraph tag unclosed. 

In [6]:
source = """
<!DOCTYPE html>  
<html>  
  <head>
    <title>Scraping</title>
  </head>
  <body class="col-sm-12">
    <h1>section1</h1>
    <p>paragraph1</p>
    <p>paragraph2</p>
    <div class="col-sm-2">
      <h2>section2</h2>
      <p>paragraph3</p>
      <p>unclosed
    </div>
  </body>
</html>  
"""

soup = BeautifulSoup(source, "html.parser")

Once the soup object has been created successfully, we can execute a number of queries on the DOM. 
First we request all data from the `head` tag. 
Note that while it looks like a list of strings was returned, actually, a `bs4.element.Tag` type is returned. 
These examples explore how to extract tags, the text from tags, how to filter queries based on 
attributes, how to retreive attributes from a returned query, and how the BeautifulSoup engine 
is tolerant of unclosed tags. 
Notice in the actual HTML source, the last paragraph is not closed. 

In [7]:
print(soup.prettify())
# BeautifulSoup engine corrects the HTML source by including </p> to the unclosed paragraph

<!DOCTYPE html>
<html>
 <head>
  <title>
   Scraping
  </title>
 </head>
 <body class="col-sm-12">
  <h1>
   section1
  </h1>
  <p>
   paragraph1
  </p>
  <p>
   paragraph2
  </p>
  <div class="col-sm-2">
   <h2>
    section2
   </h2>
   <p>
    paragraph3
   </p>
   <p>
    unclosed
   </p>
  </div>
 </body>
</html>



In [8]:
print('Head:')
print('', soup.find_all("head"))
# [<head>\n<title>Scraping</title>\n</head>]

Head:
 [<head>
<title>Scraping</title>
</head>]


In [9]:
print('\nType of head:')
print('', map(type, soup.find_all("head")))
# [<class 'bs4.element.Tag'>]


Type of head:
 <map object at 0x7f6409f48190>


In [10]:
print('\nTitle tag:')
print('', soup.find("title"))
# <title>Scraping</title>


Title tag:
 <title>Scraping</title>


In [11]:
print('\nTitle text:')
print('', soup.find("title").text)
# Scraping


Title text:
 Scraping


In [12]:
divs = soup.find_all("div", attrs={"class": "col-sm-2"})
print('\nDiv with class=col-sm-2:')
print('', divs)
# [<div class="col-sm-2">....</div>]


Div with class=col-sm-2:
 [<div class="col-sm-2">
<h2>section2</h2>
<p>paragraph3</p>
<p>unclosed
    </p></div>]


In [13]:
print('\nClass of first div:')
print('', divs[0].attrs['class'])
# [u'col-sm-2']


Class of first div:
 ['col-sm-2']


In [14]:
print('\nAll paragraphs:')
print('', soup.find_all("p"))
# [<p>paragraph1</p>, 
#  <p>paragraph2</p>, 
#  <p>paragraph3</p>, 
#  <p>unclosed\n    </p>]


All paragraphs:
 [<p>paragraph1</p>, <p>paragraph2</p>, <p>paragraph3</p>, <p>unclosed
    </p>]


# Beautilful soup on real data 

In this example I will show how you can use BeautifulSoup to retreive information from live web pages. 
We make use of The Guardian newspaper, and retreive the HTML from an arbitrary article. 
We then create the BeautifulSoup object, and query the links that were discovered in the DOM. 
Since a large number are returned, we then apply attribute filters that let us reduce significantly 
the number of returned links. 
I selected the filters selected for this example in order to focus on the names in the paper. 
The parameterisation of the attributes was discovered by using the `inspect` functionality of Google Chrome

In [15]:
url = 'https://www.theguardian.com/money/2023/jan/24/british-households-businesses-cut-power-use-national-grid'
req = requests.get(url)
source = req.text
soup = BeautifulSoup(source, 'html.parser')

In [16]:
print(source)

<!doctype html>
        <html lang="en">
            <head>
			    
<!--

We are hiring, ever thought about joining us?
https://workforus.theguardian.com/careers/product-engineering/


                                    GGGGGGGGG
                           GGGGGGGGGGGGGGGGGGGGGGGGGG
                       GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
                    GGGGGGGGGGGGGGGGG      GG   GGGGGGGGGGGGG
                  GGGGGGGGGGGG        GGGGGGGGG      GGGGGGGGGG
                GGGGGGGGGGG         GGGGGGGGGGGGG       GGGGGGGGG
              GGGGGGGGGG          GGGGGGGGGGGGGGGGG     GGGGGGGGGGG
             GGGGGGGGG           GGGGGGGGGGGGGGGGGGG    GGGGGGGGGGGG
            GGGGGGGGG           GGGGGGGGGGGGGGGGGGGGGG  GGGGGGGGGGGGG
           GGGGGGGGG            GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
           GGGGGGGG             GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
          GGGGGGGG              GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
          GGGGGGGG              GGGGGGGGGGG

In [17]:
links = soup.find_all('a')
links

[<a class="dcr-vg6br7" href="#maincontent">Skip to main content</a>,
 <a class="dcr-vg6br7" href="#navigation">Skip to navigation</a>,
 <a class="dcr-1086n7e" data-link-name="nav3 : topbar : printsubs" href="https://support.theguardian.com/subscribe?REFPVID=&amp;INTCMP=undefined&amp;acquisitionData=%7B%22source%22%3A%22GUARDIAN_WEB%22%2C%22componentId%22%3A%22PrintSubscriptionsHeaderLink%22%2C%22componentType%22%3A%22ACQUISITIONS_HEADER%22%2C%22referrerPageviewId%22%3A%22%22%2C%22referrerUrl%22%3A%22%22%7D"><svg height="18" viewbox="-4 -4 32 32" width="18" xmlns="http://www.w3.org/2000/svg"><path d="M12,-4 a15,15 0,0,0 0,32 15,15 0,0,0 0,-32     M20 21L19 21H5L4 21V3L5 2H16L20 6V21Z     M18 8H6V9.5H18V8Z     M18 11H6V12.5H18V11Z     M13 14H6V15.5H13V14Z"></path></svg>Print subscriptions</a>,
 <a class="dcr-xg94hc" data-link-name="nav3 : topbar : signin" href="https://profile.theguardian.com/signin?INTCMP=DOTCOM_NEWHEADER_SIGNIN&amp;ABCMP=ab-sign-in&amp;componentEventParams=componentTyp

In [18]:
links = soup.find_all('a', attrs={
    'data-component': 'auto-linked-tag'
})

for link in links: 
    print(link['href'], link.text)

# Chaining queries

Now, let us conisder a more general query that might be done on a website such as this. 
We will query the base technology page, and attempt to list all articles that pertain to this main page

In [19]:
url = 'https://www.theguardian.com/uk/technology'
req = requests.get(url)
source = req.text
soup = BeautifulSoup(source, 'html.parser')

After inspecting the DOM (via the `inspect` tool in my browser), I see that the attributes that define 
a `technology` article are: 
    
    class = "js-headline-text"

In [20]:
articles = soup.find_all('a', attrs={
    'class': 'js-headline-text'
})

for article in articles: 
    print(article['href'][:], article.text[:20])

https://www.theguardian.com/technology/2023/jan/27/elon-musk-doesnt-seem-like-right-person-to-own-twitter-says-co-founder Tesla CEO ‘doesn’t s
https://www.theguardian.com/us-news/live/2023/jan/26/trump-facebook-ban-meta-biden-economy-politics-live-updates Outrage greets Meta 
https://www.theguardian.com/technology/2023/jan/26/state-linked-hackers-in-russia-and-iran-are-targeting-uk-groups-ncsc-warns State-linked hackers
https://www.theguardian.com/technology/2023/jan/25/first-uk-industrial-action-against-amazon-is-making-an-impact-says-gmb First UK industrial 
https://www.theguardian.com/science/audio/2023/jan/26/how-will-chatgpt-transform-creative-work-podcast How will ChatGPT tra
https://www.theguardian.com/technology/2023/jan/25/microsoft-investigates-outage-affecting-teams-and-outlook-users-worldwide Company investigates
https://www.theguardian.com/media/2023/jan/26/the-one-change-that-didnt-work-giving-up-social-media-left-me-bored The one change that 
https://www.theguardian.com/

With this set of articles, it is now possible to chain further querying, for example with code 
similar to the following 

```python
for article in articles: 
    req = requests.get(article['href'])
    source = req.text 
    soup = BeautifulSoup(source, 'html.parser') 
    
    ... and so on...
```

However, I won't go into much detail about this now. For scraping like this tools, such as `scrapy` are more 
appropriate than `BeautifulSoup` since they are designed for multithreadded web crawling. 
Once again, however, I urge caution and hope that before any crawling is initiated you determine whether 
crawling is within the terms of use of the website. 
If in doubt contact the website administrators. 

https://scrapy.org/

# Task 1

In [21]:
url = 'https://www.theguardian.com/books/2023/jan/18/ts-eliot-prize-winner-anthony-joseph-how-poetry-helped-me-love-my-absent-father'
req = requests.get(url)
source = req.text
soup = BeautifulSoup(source, 'html.parser')

links = soup.find_all('a', attrs={
    'data-link-name': 'in body link'
})

for link in links: 
    print(link['href'][:], link.text[:20])

https://www.theguardian.com/books/2023/jan/16/anthony-joseph-wins-ts-eliot-prize-for-luminous-poetry-collection TS Eliot prize
https://www.theguardian.com/books/2018/aug/04/kitch-anthony-joseph-review-windrush-trinidad-calypso in his Guardian revi
https://www.theguardian.com/books/poetry Poetry
https://guardianbookshop.com/sonnets-for-albert-9781526649942 guardianbookshop.com


# Task 2

In [22]:
topics = soup.find_all('a', attrs={
    'class': 'dcr-viu5to'
})

for topic in topics: 
    print(topic['href'][:], topic.text[:20])

/books/books Books
/books/ts-eliot-prize-for-poetry TS Eliot prize for p
/books/poetry Poetry
/culture/awards-and-prizes Awards and prizes
/tone/interview interviews


# Task 3

In [23]:
url = 'https://www.theguardian.com/uk/technology'
req = requests.get(url)
source = req.text
soup = BeautifulSoup(source, 'html.parser')

section = soup.find('section', attrs={
    'id': 'technology'
})

articles = section.find_all('a', attrs={
    'class': 'js-headline-text'
})

for article in articles: 
    print(article['href'][:], article.text[:20])


https://www.theguardian.com/technology/2023/jan/27/elon-musk-doesnt-seem-like-right-person-to-own-twitter-says-co-founder Tesla CEO ‘doesn’t s
https://www.theguardian.com/us-news/live/2023/jan/26/trump-facebook-ban-meta-biden-economy-politics-live-updates Outrage greets Meta 
https://www.theguardian.com/technology/2023/jan/26/state-linked-hackers-in-russia-and-iran-are-targeting-uk-groups-ncsc-warns State-linked hackers
https://www.theguardian.com/technology/2023/jan/25/first-uk-industrial-action-against-amazon-is-making-an-impact-says-gmb First UK industrial 
https://www.theguardian.com/science/audio/2023/jan/26/how-will-chatgpt-transform-creative-work-podcast How will ChatGPT tra
https://www.theguardian.com/technology/2023/jan/25/microsoft-investigates-outage-affecting-teams-and-outlook-users-worldwide Company investigates


# Task 4

In [24]:
num = 0
for i in range(1, 5):
    url = f"https://www.theguardian.com/technology?page={i}"
    req = requests.get(url)
    source = req.text
    soup = BeautifulSoup(source, 'html.parser')
    articles = soup.find_all('a', attrs={
        'class': 'js-headline-text'
    })    

    for article in articles:
        if num < 50: print(article['href'][:], article.text[:20])
        num += 1

https://www.theguardian.com/technology/2023/jan/27/elon-musk-doesnt-seem-like-right-person-to-own-twitter-says-co-founder Elon Musk ‘doesn’t s
https://www.theguardian.com/us-news/2023/jan/26/uber-lyft-new-york-zero-emission-2030 Uber and Lyft in New
https://www.theguardian.com/society/2023/jan/27/covid-lockdowns-created-online-backdoor-for-child-abusers-says-charity Covid lockdowns crea
https://www.theguardian.com/media/2023/jan/26/buzzfeed-artifical-intelligence-content-quizzes-chatgpt BuzzFeed to use AI t
https://www.theguardian.com/us-news/live/2023/jan/26/trump-facebook-ban-meta-biden-economy-politics-live-updates Biden vows to veto R
https://www.theguardian.com/science/2023/jan/26/science-journals-ban-listing-of-chatgpt-as-co-author-on-papers Science journals ban
https://www.theguardian.com/us-news/2023/jan/26/hive-ransomware-servers-seized-us US authorities seize
https://www.theguardian.com/us-news/2023/jan/26/donald-trump-truth-social-posts-bode-ill-return-facebook Donald Trump’

# Task 5

In [28]:
import re
for i in range(1, 5):
    url = f"https://www.theguardian.com/technology?page={i}"
    req = requests.get(url)
    source = req.text
    soup = BeautifulSoup(source, 'html.parser')
    articles = soup.find_all('a', attrs={
        'class': 'js-headline-text',
        'href': re.compile('https://www.theguardian.com/.*/2023/jan/[2-9][1-9]')
    })    
    for article in articles:
        print(article['href'][:], article.text[:20])


https://www.theguardian.com/technology/2023/jan/27/elon-musk-doesnt-seem-like-right-person-to-own-twitter-says-co-founder Elon Musk ‘doesn’t s
https://www.theguardian.com/us-news/2023/jan/26/uber-lyft-new-york-zero-emission-2030 Uber and Lyft in New
https://www.theguardian.com/society/2023/jan/27/covid-lockdowns-created-online-backdoor-for-child-abusers-says-charity Covid lockdowns crea
https://www.theguardian.com/media/2023/jan/26/buzzfeed-artifical-intelligence-content-quizzes-chatgpt BuzzFeed to use AI t
https://www.theguardian.com/us-news/live/2023/jan/26/trump-facebook-ban-meta-biden-economy-politics-live-updates Biden vows to veto R
https://www.theguardian.com/science/2023/jan/26/science-journals-ban-listing-of-chatgpt-as-co-author-on-papers Science journals ban
https://www.theguardian.com/us-news/2023/jan/26/hive-ransomware-servers-seized-us US authorities seize
https://www.theguardian.com/us-news/2023/jan/26/donald-trump-truth-social-posts-bode-ill-return-facebook Donald Trump’