In [15]:
import urllib.parse
import urllib.request
from collections import deque

import requests
from bs4 import BeautifulSoup

## Q1: UN press releases mentioning "crisis"

In [4]:
def is_press_release(soup):
    """Tell whether a parsed page carries the English press-release link.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        bool: True when an ``<a>`` tag with ``href="/en/press-release"`` and
        ``hreflang="en"`` is found, False otherwise.
    """
    anchor = soup.find('a', href="/en/press-release", hreflang="en")
    if anchor is None:
        return False
    return True

def scrape(url, timeout=10):
    """Download ``url`` and report it if it is a press release mentioning "crisis".

    Args:
        url: Absolute URL of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled host.

    Returns:
        ``url`` when the page was fetched with status 200, passes
        ``is_press_release`` and its text contains "crisis"
        (case-insensitive); otherwise ``None``.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # Connection errors and timeouts are treated like a non-200 reply so
        # that one unreachable page does not abort the caller's crawl.
        return None

    if response.status_code != 200:
        return None

    # Parse the whole document so both the link check and the text search
    # run against the same tree.
    soup = BeautifulSoup(response.content, 'html.parser')

    if is_press_release(soup) and "crisis" in soup.get_text().lower():
        return url

    return None

# Crawl configuration for the UN press site.
seed_url = "https://press.un.org/en"
base_url = "https://press.un.org"
max_releases = 10      # stop once this many matching releases are found
request_timeout = 10   # seconds to wait for each page

visited = set()               # URLs that have been taken off the queue
queued = {seed_url}           # every URL ever enqueued; keeps duplicates out of the queue
to_visit = deque([seed_url])  # FIFO queue, so the crawl is breadth-first
press_releases = []

while to_visit and len(press_releases) < max_releases:
    url = to_visit.popleft()
    visited.add(url)

    try:
        response = requests.get(url, timeout=request_timeout)
    except requests.exceptions.RequestException:
        # Unreachable or stalled page: skip it and keep crawling.
        continue
    if response.status_code != 200:
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    # Enqueue site-relative English links found on this page.
    for link in soup.find_all('a', href=True):
        next_url = link['href']
        if next_url.startswith('/en'):
            full_link = base_url + next_url
            if full_link not in queued:
                queued.add(full_link)
                to_visit.append(full_link)

    # Test the page that was just parsed instead of calling scrape(url),
    # which would download the same page a second time.
    if is_press_release(soup) and "crisis" in soup.get_text().lower():
        press_releases.append(url)

# Report the matching press releases, numbered from 1.
for i, release_url in enumerate(press_releases, start=1):
    print(f"Press Release {i}: {release_url}")


Press Release 1: https://press.un.org/en/2023/sgsm21982.doc.htm
Press Release 2: https://press.un.org/en/2023/sgsm21980.doc.htm
Press Release 3: https://press.un.org/en/2023/sgsm21978.doc.htm
Press Release 4: https://press.un.org/en/2023/sgsm21947.doc.htm
Press Release 5: https://press.un.org/en/2023/dsgsm1874.doc.htm
Press Release 6: https://press.un.org/en/2023/sgsm21952.doc.htm
Press Release 7: https://press.un.org/en/2023/sgsm21876.doc.htm
Press Release 8: https://press.un.org/en/2023/sgsm21852.doc.htm
Press Release 9: https://press.un.org/en/2023/sgsm21806.doc.htm
Press Release 10: https://press.un.org/en/2023/dsgsm1848.doc.htm


## Q2: European Parliament plenary-session press releases mentioning "crisis"

In [21]:
def is_plenary_session_press_release(soup):
    """Tell whether a parsed page is labelled as a plenary-session press release.

    The page qualifies only if it contains a ``<span class="ep_name">`` whose
    string is exactly 'Plenary session' AND another whose string is exactly
    'Press Releases'.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        bool: True when both labels are present, False otherwise.
    """
    required_labels = ('Plenary session', 'Press Releases')
    # `string=` is the current name of the filter formerly spelled `text=`,
    # which Beautiful Soup has deprecated.
    return all(
        soup.find('span', class_='ep_name', string=label) is not None
        for label in required_labels
    )

# Crawl configuration for the European Parliament press room.
seed_url = "https://www.europarl.europa.eu/news/en/press-room"
url_num = 10           # maximum number of press releases to collect
request_timeout = 10   # seconds to wait for each page

urls = [seed_url]   # discovered URLs in crawl order; grows while it is walked
old = {seed_url}    # every URL already placed in `urls` (O(1) duplicate check)
pr_eps = []         # matching press releases
fetch_errors = []   # (url, exception) pairs for pages that could not be downloaded

# Walk `urls` with an explicit cursor because the list is extended during the
# crawl; new entries are appended at the end, giving breadth-first order.
next_idx = 0
while next_idx < len(urls) and len(pr_eps) < url_num:
    url = urls[next_idx]
    next_idx += 1

    try:
        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(req, timeout=request_timeout) as resp:
            webpage = resp.read()
    except Exception as ex:
        # Best-effort crawl: remember the failure for inspection and move on.
        fetch_errors.append((url, ex))
        continue

    soup = BeautifulSoup(webpage, 'html.parser')

    # Keep pages that are plenary-session press releases mentioning "crisis".
    if is_plenary_session_press_release(soup) and "crisis" in soup.get_text().lower():
        pr_eps.append(url)

    # Queue links that stay inside the press room and have not been seen yet.
    for tag in soup.find_all('a', href=True):
        # Relative hrefs are relative to the page they appear on, so resolve
        # them against the current URL rather than the seed.
        child_url = urllib.parse.urljoin(url, tag['href'])
        # Prefix test (not substring) so that foreign URLs which merely embed
        # the seed, e.g. in a query string, are not crawled.
        if child_url.startswith(seed_url) and child_url not in old:
            old.add(child_url)
            urls.append(child_url)

# Print the extracted press releases, numbered from 1.
for idx, release_url in enumerate(pr_eps, start=1):
    print(f"Press Release {idx}: {release_url}")


Press Release 1: https://www.europarl.europa.eu/news/en/press-room/20230929IPR06132/nagorno-karabakh-meps-demand-review-of-eu-relations-with-azerbaijan
Press Release 2: https://www.europarl.europa.eu/news/en/press-room/20221209IPR64426/eu-long-term-budget-needs-urgent-revision-to-cope-with-current-crises
Press Release 3: https://www.europarl.europa.eu/news/en/press-room/20210304IPR99207/parliament-gives-green-light-for-new-eu4health-programme
Press Release 4: https://www.europarl.europa.eu/news/en/press-room/20220909IPR40138/parliament-adopts-new-rules-on-adequate-minimum-wages-for-all-workers-in-the-eu
Press Release 5: https://www.europarl.europa.eu/news/en/press-room/20230310IPR77232/minimum-income-schemes-increasing-support-accessibility-and-inclusion
Press Release 6: https://www.europarl.europa.eu/news/en/press-room/20230210IPR74806/green-deal-industrial-plan-securing-the-eu-s-clean-tech-leadership
Press Release 7: https://www.europarl.europa.eu/news/en/press-room/20230707IPR02421/