### Question 1. Extract at least 10 United Nations press releases containing the word “crisis”. 

In [3]:
import urllib.parse
from collections import deque

import requests
from bs4 import BeautifulSoup

def is_press_release(soup):
    """Look up the English "press release" category link in a parsed page.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find`` method.

    Returns:
        Whatever ``soup.find`` yields for an ``<a>`` element whose ``hreflang``
        is ``"en"`` and whose ``href`` is ``"/en/press-release"``. Callers use
        the truthiness of the result as the "is a press release" test.
    """
    category_link = {'hreflang': "en", 'href': "/en/press-release"}
    return soup.find('a', attrs=category_link)

# Breadth-first crawl of press.un.org that collects press releases whose
# text contains the word "crisis".
START_URL = "https://press.un.org/en"
SITE_ROOT = "https://press.un.org"
TARGET_COUNT = 10      # stop once this many matching press releases are found
REQUEST_TIMEOUT = 30   # seconds; without a timeout requests.get can block forever

# `visited` holds every URL that has ever been queued (pending or fetched), so
# each page is queued and requested at most once, even when it fails to load.
visited = {START_URL}
seen = deque([START_URL])  # FIFO frontier of URLs still to fetch
press_releases = []

print("Starting with url=" + START_URL)
while seen and len(press_releases) < TARGET_COUNT:
    url = seen.popleft()

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        # Best-effort crawl: one unreachable page must not abort the whole run.
        continue
    if response.status_code != 200:
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    if is_press_release(soup) and "crisis" in soup.get_text().lower():
        press_releases.append(url)

    # Extract site-relative links and queue the ones not encountered before.
    for tag in soup.find_all('a', href=True):
        if tag['href'].startswith('/'):
            full_link = SITE_ROOT + tag['href']
            if full_link not in visited:
                visited.add(full_link)
                seen.append(full_link)

press_releases

Starting with url=https://press.un.org/en




['https://press.un.org/en/2023/sgsm21967.doc.htm',
 'https://press.un.org/en/2023/sgsm21947.doc.htm',
 'https://press.un.org/en/2023/dsgsm1874.doc.htm',
 'https://press.un.org/en/2023/sgsm21952.doc.htm',
 'https://press.un.org/en/2023/sgsm21876.doc.htm',
 'https://press.un.org/en/2023/sgsm21852.doc.htm',
 'https://press.un.org/en/2023/sgsm21806.doc.htm',
 'https://press.un.org/en/2023/dsgsm1848.doc.htm',
 'https://press.un.org/en/2023/sgsm21765.doc.htm',
 'https://press.un.org/en/2023/sgsm21767.doc.htm']

In [4]:
# Re-download and parse every collected press release. Iterate over the URLs
# themselves: len(press_releases) is an int and cannot be looped over.
# After the loop, `response` and `soup` refer to the last release in the list.
for release_url in press_releases:
    response = requests.get(release_url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')


<!DOCTYPE html>

<html dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="width" name="MobileOptimized"/>
<meta content="true" name="HandheldFriendly"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<link href="/themes/custom/un_press_theme/favicon.ico" rel="icon" type="image/vnd.microsoft.icon"/>
<link href="https://press.un.org/en/2023/sgsm21967.doc.htm" hreflang="en" rel="alternate"/>
<link href="https://press.un.org/en/2023/sgsm21967.doc.htm" rel="canonical"/>
<link href="https://press.un.org/en/node/323882" rel="shortlink"/>
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='https://www.googletagmanager.com/gtm.js?id='+i+dl+'';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-NWVSS4R');</script>
<title>Stressing ‘1.5°C Limit Is Possible’, Secretary-G

In [8]:
# Save the prettified HTML of each UN press release to 1_<n>.txt (n starts at 1).
# Looping over the list itself avoids an IndexError when the crawl found fewer
# than 10 releases.
for i, release_url in enumerate(press_releases, start=1):
    file_name = "1_" + str(i) + ".txt"

    # Download and parse before opening the file so that a failed request
    # does not leave an empty file behind.
    response = requests.get(release_url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(soup.prettify())  # Write the HTML content to the file


### Question 2. Extract at least 10 European Parliament press releases containing the word “crisis”.

In [5]:
from bs4 import BeautifulSoup
import urllib.request

def is_plenary_session_press_release(soup):
    """Check whether a parsed page carries both the plenary and press-release labels.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find`` method.

    Returns:
        A truthy value (the second match) when the page contains a
        ``<span class="ep_name">`` whose string is ``'Plenary session'`` and
        another whose string is ``'Press Releases'``; otherwise a falsy value.
    """
    # `string=` is the current name of the keyword formerly spelled `text=`,
    # which Beautiful Soup has deprecated.
    tag1 = soup.find('span', class_="ep_name", string='Plenary session')
    tag2 = soup.find('span', class_="ep_name", string='Press Releases')
    return tag1 and tag2

# Breadth-first crawl of the European Parliament press room that collects
# plenary-session press releases whose text contains the word "crisis".
seed_url = "https://www.europarl.europa.eu/news/en/press-room"

urls = deque([seed_url])   # FIFO queue of URLs to crawl
seen = {seed_url}          # every URL ever queued; includes the seed so it is not crawled twice
press_releases_ep = []     # Store found press releases

maxNumUrl = 10  # Stop once this many matching press releases have been found
FETCH_TIMEOUT = 30  # seconds; keeps a stalled connection from blocking the crawl

print("Starting with url=" + str(list(urls)))
while urls and len(press_releases_ep) < maxNumUrl:
    # Dequeue a URL from urls and try to open and read it
    curr_url = urls.popleft()
    try:
        req = urllib.request.Request(curr_url, headers={'User-Agent': 'Mozilla/5.0'})
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(req, timeout=FETCH_TIMEOUT) as page:
            webpage = page.read()
    except Exception:
        # Best-effort crawl: skip any URL that cannot be accessed.
        continue

    # If URL opens, check if it's a plenary session page and contains "crisis"
    soup = BeautifulSoup(webpage, 'html.parser')

    if is_plenary_session_press_release(soup) and "crisis" in soup.get_text().lower():
        press_releases_ep.append(curr_url)

    # Queue child URLs that stay inside the press room and were not queued before.
    for tag in soup.find_all('a', href=True):
        # Resolve relative links against the page they were found on.
        child_url = urllib.parse.urljoin(curr_url, tag['href'])
        # Prefix test: a substring test would also accept foreign URLs that
        # merely embed the seed URL (for example in a query string).
        if child_url.startswith(seed_url) and child_url not in seen:
            seen.add(child_url)
            urls.append(child_url)

press_releases_ep

Starting with url=['https://www.europarl.europa.eu/news/en/press-room']


['https://www.europarl.europa.eu/news/en/press-room/20230929IPR06132/nagorno-karabakh-meps-demand-review-of-eu-relations-with-azerbaijan',
 'https://www.europarl.europa.eu/news/en/press-room/20221209IPR64426/eu-long-term-budget-needs-urgent-revision-to-cope-with-current-crises',
 'https://www.europarl.europa.eu/news/en/press-room/20210304IPR99207/parliament-gives-green-light-for-new-eu4health-programme',
 'https://www.europarl.europa.eu/news/en/press-room/20220909IPR40138/parliament-adopts-new-rules-on-adequate-minimum-wages-for-all-workers-in-the-eu',
 'https://www.europarl.europa.eu/news/en/press-room/20230310IPR77232/minimum-income-schemes-increasing-support-accessibility-and-inclusion',
 'https://www.europarl.europa.eu/news/en/press-room/20230210IPR74806/green-deal-industrial-plan-securing-the-eu-s-clean-tech-leadership',
 'https://www.europarl.europa.eu/news/en/press-room/20230707IPR02421/parliament-adopts-new-rules-to-boost-energy-savings',
 'https://www.europarl.europa.eu/news/e

In [9]:
# Save the prettified HTML of each European Parliament press release to
# 2_<n>.txt (n starts at 1). Looping over the list itself avoids an IndexError
# when the crawl found fewer than 10 releases.
for i, release_url in enumerate(press_releases_ep, start=1):
    file_name = "2_" + str(i) + ".txt"

    # Download and parse before opening the file so that a failed request
    # does not leave an empty file behind. The User-Agent header is the same
    # one the crawl cell sends to this site.
    response = requests.get(
        release_url,
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=30,
    )
    soup = BeautifulSoup(response.content, 'html.parser')

    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(soup.prettify())  # Write the HTML content to the file