# b9122_hw2_sol_code_Qiantong Li

# Q1-1 bs4 (Code 1: closer to the approach covered in class; please ignore the XML warning)

In [3]:
!pip install lxml



In [4]:
import requests
from bs4 import BeautifulSoup
from collections import deque
import time

BASE_URL = "https://press.un.org"

# Function to check if a given page is a press release
def is_press_release(soup):
    """Report whether a parsed page carries the press-release marker link.

    A page counts as a press release when it contains an ``<a>`` element whose
    ``href`` is ``/en/press-release`` and whose ``hreflang`` is ``en``.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        True when the marker link is present, False otherwise.
    """
    marker_attrs = {'href': '/en/press-release', 'hreflang': 'en'}
    marker = soup.find('a', attrs=marker_attrs)
    return marker is not None

def get_soup(url, timeout=10):
    """Download *url* and parse the response body with the lxml parser.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the crawl forever.

    Returns:
        A ``BeautifulSoup`` document built from the raw response bytes.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'lxml')

def bfs_crawl(start_url, max_results=12):
    """Breadth-first crawl from *start_url*, saving press releases that mention "crisis".

    Each matching page is written to ``1_<n>.txt`` (prettified HTML) and its
    URL is appended to ``crisis_1.txt``. The crawl stops once *max_results*
    matches have been saved or no unvisited links remain.

    Args:
        start_url: Absolute URL where the crawl begins. Only links on the same
            host as this URL are followed.
        max_results: Number of matching press releases to collect.

    Returns:
        None. Results are written to files in the current directory.
    """
    from urllib.parse import urldefrag, urljoin, urlsplit

    site_host = urlsplit(start_url).netloc
    seen = {start_url}  # URLs already queued or fetched, so each is queued once
    queue = deque([start_url])
    counter = 0

    while queue and counter < max_results:
        url = queue.popleft()

        try:
            soup = get_soup(url)
        except requests.RequestException as exc:
            # One unreachable page must not abort the whole crawl.
            print(f"Skipping {url}: {exc}")
            continue
        if soup is None:
            # Tolerate a get_soup that signals failure with None instead of raising.
            continue

        if is_press_release(soup) and "crisis" in soup.get_text().lower():
            counter += 1
            # Save the content to a file
            with open(f"1_{counter}.txt", 'w', encoding='utf-8') as f:
                f.write(soup.prettify())
            # Save the URL to crisis_1.txt
            with open("crisis_1.txt", 'a', encoding='utf-8') as f:
                f.write(url + '\n')

            if counter >= max_results:
                break

        # Extract links to enqueue
        for link in soup.find_all('a', href=True):
            try:
                # urljoin resolves "?query", "/root-relative" and "relative/path"
                # hrefs against the current page; urldefrag drops "#fragment" so
                # the same page is not queued under several names.
                full_url = urldefrag(urljoin(url, link['href'].strip())).url
                parts = urlsplit(full_url)
            except ValueError:
                continue  # Malformed href that cannot be parsed as a URL

            # Drops javascript:, mailto:, tel: and similar non-web schemes.
            if parts.scheme not in ('http', 'https'):
                continue
            # Stay on the site being crawled instead of wandering the web.
            if parts.netloc != site_host:
                continue

            if full_url not in seen:
                seen.add(full_url)
                queue.append(full_url)

# Run the crawl, starting from the site's /en entry page.
bfs_crawl("https://press.un.org/en")

# Q1-1 scrapy (Code 2: runs more smoothly on my computer)

In [1]:
!pip install scrapy



In [2]:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import CloseSpider

# Define the spider class
class UNPressSpider(scrapy.Spider):
    """Spider that saves press releases containing the word "crisis".

    Starting from ``start_urls``, it follows every link whose href begins with
    ``/en/``. Each page that carries the press-release marker link and contains
    "crisis" is written to ``1_<n>.txt``; its URL is appended to
    ``crisis_1.txt``. The spider closes itself after ``MAX_COUNT`` matches.
    """

    name = 'un_press_releases'
    start_urls = ['https://press.un.org/en']

    # Number of press releases containing "crisis" saved so far.
    count = 0
    # Number of matches after which the spider stops.
    MAX_COUNT = 10

    def parse(self, response):
        """Save the page if it is a matching press release, then follow its links.

        Args:
            response: The downloaded page.

        Yields:
            Follow-up requests for every ``/en/`` link found on the page.

        Raises:
            CloseSpider: Once ``MAX_COUNT`` matching press releases are saved.
        """
        # CloseSpider shuts the spider down gracefully, so responses that were
        # already in flight still reach this callback. Without this guard they
        # could write files beyond MAX_COUNT.
        if self.count >= self.MAX_COUNT:
            return

        # Check if the current page is a press release
        if response.css('a[href="/en/press-release"][hreflang="en"]'):

            # Check if the word "crisis" exists in the press release
            if "crisis" in response.text.lower():
                self.count += 1
                # Save the content to a file
                with open(f"1_{self.count}.txt", 'w', encoding='utf-8') as f:
                    f.write(response.text)

                # Save the URL to crisis_1.txt
                with open("crisis_1.txt", 'a', encoding='utf-8') as f:
                    f.write(response.url + '\n')

                if self.count >= self.MAX_COUNT:
                    raise CloseSpider('Reached maximum count of press releases containing "crisis".')

        # Extract all links and recursively scrape them. An href that starts
        # with '/en/' cannot also start with 'mailto:', so only the
        # 'javascript:' test is needed.
        for link in response.css('a::attr(href)').extract():
            if link.startswith('/en/') and 'javascript:' not in link:
                yield response.follow(link, self.parse)



# Build the crawler process with its settings, register the spider and run it.
process = CrawlerProcess(
    settings=dict(
        USER_AGENT='Mozilla/5.0',
        LOG_LEVEL='ERROR',  # Switch to 'INFO' for more verbose logs
    )
)
process.crawl(UNPressSpider)
process.start()


# Q1-2

In [10]:
!pip install requests_cache==0.5.2



In [11]:
import requests
import requests_cache
from bs4 import BeautifulSoup
from collections import deque, namedtuple

# Set up caching (installed before any request in this section is made)
requests_cache.install_cache()

BASE_URL = "https://www.europarl.europa.eu/news/en/press-room"
SEARCH_URL = BASE_URL  # Perform the search against the same URL

# Queue entry for the crawl: a URL together with the depth recorded for it.
URLData = namedtuple('URLData', ['url', 'depth'])

def is_plenary_session_press_release(soup):
    """Report whether a parsed page is labelled as a plenary-session release.

    The page qualifies when it contains a ``<span>`` with class ``ep_name``
    whose string is exactly ``Plenary session``.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        True when the label is found, False otherwise.
    """
    label = soup.find('span', class_='ep_name', string='Plenary session')
    if label is None:
        return False
    return True

def get_soup(url, params=None):
    """Fetch *url* with browser-like headers and parse it with ``html.parser``.

    Args:
        url: URL to request.
        params: Optional mapping of query-string parameters passed to
            ``requests.get``.

    Returns:
        A ``BeautifulSoup`` document, or ``None`` when the request fails,
        times out, or returns an HTTP error status.
    """
    # Browser-like headers. Four headers from the captured browser request are
    # deliberately NOT sent:
    #   * If-Modified-Since / If-None-Match make the request conditional, so
    #     the server may answer 304 with an empty body. That passes
    #     raise_for_status() and would be parsed as an empty page.
    #   * Host is derived from the URL by requests; a fixed value is wrong for
    #     every other host the crawl reaches.
    #   * Accept-Encoding is chosen by requests to match the decoders actually
    #     installed; advertising "br" by hand can yield undecodable bodies.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Referer": "https://www.europarl.europa.eu/news/en/press-room",
        "Sec-Ch-Ua": '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": "macOS",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, params=params, timeout=5)
        response.raise_for_status()
    except requests.RequestException:
        # Best effort: callers treat None as "skip this page".
        return None
    return BeautifulSoup(response.content, 'html.parser')


def bfs_crawl(search_keyword, max_depth=1, max_pages=3, max_results=15):
    """Breadth-first crawl of the press-room search results for *search_keyword*.

    The crawl is seeded with the search-results page and its first *max_pages*
    numbered pages. Every fetched page that is a plenary-session press release
    and contains "crisis" is written to ``2_<n>.txt`` (prettified HTML); its
    URL is appended to ``crisis_2.txt``.

    Args:
        search_keyword: Term sent as the ``searchQuery`` parameter.
        max_depth: Links are followed only from pages whose depth is below
            this value (seed pages have depth 0).
        max_pages: Number of numbered result pages to seed the queue with.
        max_results: Number of matching press releases to collect.

    Returns:
        None. Results are written to files in the current directory.
    """
    from urllib.parse import urldefrag, urlencode, urljoin, urlsplit

    # URL-encode the keyword once so spaces and special characters survive.
    query = urlencode({'searchQuery': search_keyword})
    search_url = f"{SEARCH_URL}?{query}"

    # Send a GET request using the search keyword; give up early if the search
    # page itself cannot be fetched.
    if get_soup(search_url) is None:
        print("Failed to get the search results.")
        return

    # Initialize the queue from the search results: the results page itself
    # (with the keyword attached, so the search is actually applied) followed
    # by the first max_pages numbered pages.
    seeds = [search_url]
    seeds.extend(f"{BASE_URL}/page/{page}?{query}" for page in range(1, max_pages + 1))
    queue = deque(URLData(seed, 0) for seed in seeds)
    seen = set(seeds)  # URLs already queued or fetched, so each is queued once
    counter = 0

    while queue and counter < max_results:
        url, depth = queue.popleft()

        soup = get_soup(url)
        if soup is None:
            continue

        if is_plenary_session_press_release(soup) and "crisis" in soup.get_text().lower():
            counter += 1
            with open(f"2_{counter}.txt", 'w', encoding='utf-8') as f:
                f.write(soup.prettify())
            with open("crisis_2.txt", 'a', encoding='utf-8') as f:
                f.write(url + '\n')

            if counter >= max_results:
                break

        if depth >= max_depth:
            continue

        for link in soup.find_all('a', href=True):
            try:
                # Resolve against the current page. BASE_URL contains a path,
                # so gluing a root-relative href onto it would duplicate that
                # path. urldefrag drops "#fragment" variants of the same page.
                full_url = urldefrag(urljoin(url, link['href'].strip())).url
                scheme = urlsplit(full_url).scheme
            except ValueError:
                continue  # Malformed href that cannot be parsed as a URL

            # Drops javascript:, mailto:, tel: and similar non-web schemes.
            if scheme not in ('http', 'https'):
                continue

            if full_url not in seen:
                seen.add(full_url)
                queue.append(URLData(full_url, depth + 1))

# Run the crawl with "crisis" as the search keyword.
bfs_crawl("crisis")