# Q1-1

In [10]:
import requests
from bs4 import BeautifulSoup

def is_press_release(soup):
    """Tell whether a parsed page carries the English "press release" link.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        True when an ``<a>`` element with href ``/en/press-release`` and
        hreflang ``en`` is found, False otherwise.
    """
    wanted_attrs = {'href': '/en/press-release', 'hreflang': 'en'}
    anchor = soup.find('a', wanted_attrs)
    return True if anchor else False

def save_to_file(content, filename):
    """Write ``content`` to ``filename`` as UTF-8 text.

    The file is opened in ``'w'`` mode, so an existing file is overwritten.

    Args:
        content: Text to store (here: the HTML source of a page).
        filename: Path of the file to create.
    """
    with open(filename, mode='w', encoding='utf-8') as out:
        out.write(content)

# Initial seed URL
seed_url = "https://press.un.org/en"

# Fetch and parse the seed page. A timeout keeps a stalled server from hanging
# the cell, and raise_for_status() stops us from parsing an HTTP error page.
response = requests.get(seed_url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Site-relative English links, de-duplicated in page order. Navigation links
# repeat on the page; without this the same press release could be fetched,
# counted and saved more than once.
links = list(dict.fromkeys(
    a['href'] for a in soup.find_all('a', href=True) if a['href'].startswith("/en/")
))

# Counter for the number of press releases found containing the word "crisis"
counter = 0

# List to store press release URLs containing the word "crisis"
crisis_press_releases = []

for link in links:
    # Construct full URL
    full_url = "https://press.un.org" + link

    # Fetch the candidate page; one unreachable or failing page is skipped
    # instead of aborting the whole run.
    try:
        response = requests.get(full_url, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        continue
    press_release_soup = BeautifulSoup(response.content, 'html.parser')

    # Keep the page only if it is a press release that mentions "crisis"
    if is_press_release(press_release_soup) and "crisis" in press_release_soup.get_text().lower():
        crisis_press_releases.append(full_url)
        counter += 1

        # Save the complete HTML source code to a .txt file
        save_to_file(response.text, f"1_{counter}.txt")

        # Stop once 10 press releases containing the word "crisis" were saved
        if counter >= 10:
            break

print(crisis_press_releases)


['https://press.un.org/en/2023/sgsm21967.doc.htm']


# Q1-1 bfs ☑️

In [26]:
import requests
from bs4 import BeautifulSoup
from collections import deque
import time

# Root (scheme + host) of the UN press site targeted by this crawler.
BASE_URL = "https://press.un.org"

def is_press_release(soup):
    """Return True if the parsed page links to the English press-release section.

    The page is probed for an ``<a>`` element whose href is
    ``/en/press-release`` and whose hreflang is ``en``.
    """
    if soup.find('a', {'href': '/en/press-release', 'hreflang': 'en'}):
        return True
    return False

def get_soup(url):
    """Download ``url`` and return it parsed with BeautifulSoup's ``html.parser``.

    Args:
        url: Absolute URL to fetch.

    Returns:
        A ``BeautifulSoup`` object built from the response body.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Without a timeout, requests waits indefinitely on a stalled server and
    # the whole crawl hangs.
    response = requests.get(url, timeout=10)
    time.sleep(1)  # Pause for 1 second after each request
    return BeautifulSoup(response.content, 'html.parser')

def bfs_crawl(start_url):
    """Crawl breadth-first from ``start_url`` and save press releases mentioning "crisis".

    Every fetched page accepted by ``is_press_release`` whose text contains
    "crisis" (case-insensitive) is written to ``1_<n>.txt``, and its URL is
    appended to ``crisis_1.txt``. The crawl ends after 10 such pages or when
    no unseen links remain.

    Args:
        start_url: Absolute URL of the page the crawl starts from.
    """
    # Imported locally so the function does not rely on the cell's import list.
    from urllib.parse import urldefrag, urljoin, urlparse

    seen = {start_url}  # every URL ever queued, so nothing is fetched twice
    queue = deque([start_url])

    counter = 0

    while queue:
        url = queue.popleft()

        try:
            soup = get_soup(url)
        except requests.RequestException:
            # One unreachable page must not abort the whole crawl.
            continue
        if soup is None:
            continue

        if is_press_release(soup) and "crisis" in soup.get_text().lower():
            counter += 1
            # Save the content to a file
            with open(f"1_{counter}.txt", 'w', encoding='utf-8') as f:
                f.write(soup.prettify())
            # Record the URL of the saved page
            with open("crisis_1.txt", 'a', encoding='utf-8') as f:
                f.write(url + '\n')

            if counter >= 10:
                break

        # Extract links to enqueue
        for link in soup.find_all('a', href=True):
            try:
                # urljoin resolves "?query", "/site-absolute" and "relative"
                # hrefs against the current page; the fragment is dropped
                # because it does not change which document is fetched.
                full_url = urldefrag(urljoin(url, link['href'].strip())).url
                scheme = urlparse(full_url).scheme
            except ValueError:
                continue  # malformed href
            if scheme not in ('http', 'https'):
                continue  # javascript:, mailto:, tel:, ...
            if full_url not in seen:
                seen.add(full_url)
                queue.append(full_url)

# Start the crawl at the English landing page of the UN press site.
bfs_crawl(start_url="https://press.un.org/en")


# Q1-1 scrapy

In [None]:
!pip install scrapy
# Make sure Scrapy is installed in the Jupyter environment; if it is not, install it with `!pip install scrapy`.

In [15]:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import CloseSpider

class UNPressSpider(scrapy.Spider):
    """Spider that collects UN press releases whose HTML contains "crisis".

    Starting from the English landing page, it follows every ``/en/`` link.
    A page counts as a press release when it contains the
    ``/en/press-release`` anchor; matching pages are saved to ``1_<n>.txt``
    and their URLs appended to ``crisis_1.txt``. The spider asks Scrapy to
    close once ``MAX_COUNT`` pages have been saved.
    """

    name = 'un_press_releases'
    start_urls = ['https://press.un.org/en']

    # Number of matching press releases saved so far
    count = 0
    # Number of matching press releases to collect before closing the spider
    MAX_COUNT = 10

    def parse(self, response):
        """Save the page if it matches, then follow its ``/en/`` links.

        Args:
            response: The Scrapy response for the page being processed.

        Yields:
            Requests for every site-relative ``/en/`` link on the page.

        Raises:
            CloseSpider: Once ``MAX_COUNT`` matching pages have been saved.
        """
        # Raising CloseSpider does not cancel responses that are already
        # downloaded; they may still reach this callback. Ignore them so no
        # file beyond 1_<MAX_COUNT>.txt is written.
        if self.count >= self.MAX_COUNT:
            return

        is_release = response.css('a[href="/en/press-release"][hreflang="en"]')
        # NOTE: the keyword is searched in the raw HTML source, not only in
        # the visible text.
        if is_release and "crisis" in response.text.lower():
            self.count += 1
            # Save the content to a file
            with open(f"1_{self.count}.txt", 'w', encoding='utf-8') as f:
                f.write(response.text)

            # Record the URL of the saved page
            with open("crisis_1.txt", 'a', encoding='utf-8') as f:
                f.write(response.url + '\n')

            if self.count >= self.MAX_COUNT:
                raise CloseSpider('Reached maximum count of press releases containing "crisis".')

        # Follow every site-relative English link. A link starting with
        # "/en/" can never start with "mailto:", so only the javascript:
        # check is kept.
        for link in response.css('a::attr(href)').getall():
            if link.startswith('/en/') and 'javascript:' not in link:
                yield response.follow(link, self.parse)



# Build a crawler process with a browser-like user agent and quiet logging,
# register the spider, and run it.
process = CrawlerProcess(dict(
    USER_AGENT='Mozilla/5.0',
    LOG_LEVEL='ERROR',  # Change to 'INFO' to see more logs
))
process.crawl(UNPressSpider)
process.start()


# Q1-2
Skips some websites (pages that cannot be fetched are ignored).

In [None]:
import requests
from bs4 import BeautifulSoup
from collections import deque
import time

# English press-room landing page of the European Parliament news site.
BASE_URL = "https://www.europarl.europa.eu/news/en/press-room"

def is_plenary_session_press_release(soup):
    """Return True if the page is labelled as a plenary-session press release.

    The label is a ``<span class="ep_name">`` whose string is exactly
    ``Plenary session``.
    """
    label = soup.find('span', class_='ep_name', string='Plenary session')
    return label is not None

def get_soup(url):
    """Fetch ``url`` and return the parsed page, or ``None`` if the request fails.

    Connection errors, timeouts (5 s) and HTTP 4xx/5xx responses all count
    as failures.
    """
    try:
        reply = requests.get(url, timeout=5)
        time.sleep(1)  # Pause for 1 second after each request
        reply.raise_for_status()  # HTTPError for 4xx and 5xx responses
    except requests.RequestException:
        # Best effort: the caller skips pages that could not be fetched.
        return None
    return BeautifulSoup(reply.content, 'html.parser')

def bfs_crawl(start_url):
    """Crawl breadth-first from ``start_url`` and save plenary press releases about "crisis".

    Every fetched page accepted by ``is_plenary_session_press_release`` whose
    text contains "crisis" (case-insensitive) is written to ``2_<n>.txt``,
    and its URL is appended to ``crisis_2.txt``. Pages that ``get_soup``
    cannot fetch are skipped. The crawl ends after 10 saved pages or when no
    unseen links remain.

    Args:
        start_url: Absolute URL of the page the crawl starts from.
    """
    # Imported locally so the function does not rely on the cell's import list.
    from urllib.parse import urldefrag, urljoin, urlparse

    seen = {start_url}  # every URL ever queued; failed URLs are not retried
    queue = deque([start_url])
    counter = 0

    while queue:
        url = queue.popleft()

        soup = get_soup(url)
        if soup is None:
            continue  # Skip pages that could not be fetched

        if is_plenary_session_press_release(soup) and "crisis" in soup.get_text().lower():
            counter += 1
            with open(f"2_{counter}.txt", 'w', encoding='utf-8') as f:
                f.write(soup.prettify())
            with open("crisis_2.txt", 'a', encoding='utf-8') as f:
                f.write(url + '\n')

            if counter >= 10:
                break

        # Extract links to enqueue
        for link in soup.find_all('a', href=True):
            try:
                # Resolve against the current page. Concatenating BASE_URL
                # (which already has a path) with a "/site-absolute" href
                # produced non-existent URLs; urljoin also handles "?query"
                # and relative hrefs. The fragment is dropped because it
                # does not change which document is fetched.
                full_url = urldefrag(urljoin(url, link['href'].strip())).url
                scheme = urlparse(full_url).scheme
            except ValueError:
                continue  # malformed href
            if scheme not in ('http', 'https'):
                continue  # javascript:, mailto:, tel:, ...
            if full_url not in seen:
                seen.add(full_url)
                queue.append(full_url)

# Start the crawl at the press-room landing page.
bfs_crawl(BASE_URL)


# Q1-2 optimized

In [None]:
import requests
from bs4 import BeautifulSoup
from collections import deque, namedtuple
import time

# English press-room landing page of the European Parliament news site.
BASE_URL = "https://www.europarl.europa.eu/news/en/press-room"

# A queue entry: the URL together with its depth in the crawl
URLData = namedtuple('URLData', ['url', 'depth'])

cache = {}  # Dictionary caching parsed page content, keyed by URL

def is_plenary_session_press_release(soup):
    """Check for the ``Plenary session`` label on a parsed page.

    Returns:
        True if a ``<span class="ep_name">`` with the exact string
        ``Plenary session`` exists, False otherwise.
    """
    if soup.find('span', class_='ep_name', string='Plenary session') is None:
        return False
    return True

def get_soup(url):
    """Download and parse ``url``; return ``None`` when the request fails.

    A failure is any ``requests.RequestException``: connection problems,
    exceeding the 5-second timeout, or an HTTP 4xx/5xx status.
    """
    parsed = None
    try:
        page = requests.get(url, timeout=5)
        time.sleep(1)  # Pause for 1 second after each request
        page.raise_for_status()  # HTTPError for 4xx and 5xx responses
        parsed = BeautifulSoup(page.content, 'html.parser')
    except requests.RequestException:
        # Best effort: leave the result as None so the caller skips the page.
        pass
    return parsed

def bfs_crawl(start_url, max_depth=3):
    """Depth-limited breadth-first crawl saving plenary press releases about "crisis".

    Every fetched page accepted by ``is_plenary_session_press_release`` whose
    text contains "crisis" (case-insensitive) is written to ``2_<n>.txt``,
    and its URL is appended to ``crisis_2.txt``. Parsed pages are kept in the
    module-level ``cache`` so a later call does not download them again.
    The crawl ends after 10 saved pages or when the queue is empty.

    Args:
        start_url: Absolute URL of the page the crawl starts from (depth 0).
        max_depth: Links are only followed from pages shallower than this.
    """
    # Imported locally so the function does not rely on the cell's import list.
    from urllib.parse import urldefrag, urljoin, urlparse

    seen = {start_url}  # every URL ever queued; failed URLs are not retried
    queue = deque([URLData(start_url, 0)])
    counter = 0

    while queue:
        url, depth = queue.popleft()

        # Use the cache to avoid downloading a page again
        soup = cache.get(url)
        if soup is None:
            soup = get_soup(url)
            if soup is None:
                continue  # Skip pages that could not be fetched
            cache[url] = soup

        if is_plenary_session_press_release(soup) and "crisis" in soup.get_text().lower():
            counter += 1
            # Save the content to a file
            with open(f"2_{counter}.txt", 'w', encoding='utf-8') as f:
                f.write(soup.prettify())
            # Record the URL of the saved page
            with open("crisis_2.txt", 'a', encoding='utf-8') as f:
                f.write(url + '\n')

            if counter >= 10:
                break

        # Only follow links while below the maximum depth
        if depth >= max_depth:
            continue

        for link in soup.find_all('a', href=True):
            try:
                # Resolve against the current page. Concatenating BASE_URL
                # (which already has a path) with a "/site-absolute" href
                # produced non-existent URLs; urljoin also handles "?query"
                # and relative hrefs. The fragment is dropped because it
                # does not change which document is fetched.
                full_url = urldefrag(urljoin(url, link['href'].strip())).url
                scheme = urlparse(full_url).scheme
            except ValueError:
                continue  # malformed href
            if scheme not in ('http', 'https'):
                continue  # javascript:, mailto:, tel:, ...
            if full_url not in seen:
                seen.add(full_url)
                queue.append(URLData(full_url, depth + 1))

# Start the depth-limited crawl at the press-room landing page.
bfs_crawl(BASE_URL)


# Q1-2 V2.0 optimized

In [36]:
!pip install requests_cache==0.5.2

Collecting requests_cache
  Downloading requests_cache-1.1.0-py3-none-any.whl (60 kB)
[K     |████████████████████████████████| 60 kB 376 kB/s eta 0:00:01
[?25hCollecting platformdirs>=2.5
  Downloading platformdirs-3.11.0-py3-none-any.whl (17 kB)
Collecting url-normalize>=1.4
  Downloading url_normalize-1.4.3-py2.py3-none-any.whl (6.8 kB)
Collecting cattrs>=22.2
  Downloading cattrs-23.1.2-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 1.0 MB/s eta 0:00:01
Collecting exceptiongroup
  Downloading exceptiongroup-1.1.3-py3-none-any.whl (14 kB)
Collecting typing_extensions>=4.1.0
  Downloading typing_extensions-4.8.0-py3-none-any.whl (31 kB)
Installing collected packages: typing-extensions, exceptiongroup, url-normalize, platformdirs, cattrs, requests-cache
  Attempting uninstall: typing-extensions
    Found existing installation: typing-extensions 3.10.0.2
    Uninstalling typing-extensions-3.10.0.2:
      Successfully uninstalled typing-extensions-3.10.0.2
S

In [None]:
import requests
import requests_cache
from bs4 import BeautifulSoup
from collections import deque, namedtuple

# Set up the cache: enable requests_cache for requests made in this session
requests_cache.install_cache()

# English press-room landing page of the European Parliament news site.
BASE_URL = "https://www.europarl.europa.eu/news/en/press-room"

# A queue entry: the URL together with its depth in the crawl
URLData = namedtuple('URLData', ['url', 'depth'])

def is_plenary_session_press_release(soup):
    """Report whether the parsed page carries the ``Plenary session`` label.

    The label looked for is a ``span`` with class ``ep_name`` whose string
    equals ``Plenary session``.
    """
    criteria = {'class_': 'ep_name', 'string': 'Plenary session'}
    return soup.find('span', **criteria) is not None

def get_soup(url):
    """Return the parsed page at ``url``, or ``None`` if it cannot be fetched.

    Any ``requests.RequestException`` (connection error, 5-second timeout,
    HTTP 4xx/5xx status) results in ``None``.
    """
    try:
        reply = requests.get(url, timeout=5)
        reply.raise_for_status()
    except requests.RequestException:
        # Best effort: the caller skips pages that could not be fetched.
        return None
    return BeautifulSoup(reply.content, 'html.parser')

def bfs_crawl(start_url, max_depth=3):
    """Depth-limited breadth-first crawl saving plenary press releases about "crisis".

    Every fetched page accepted by ``is_plenary_session_press_release`` whose
    text contains "crisis" (case-insensitive) is written to ``2_<n>.txt``,
    and its URL is appended to ``crisis_2.txt``. Pages that ``get_soup``
    cannot fetch are skipped. The crawl ends after 10 saved pages or when the
    queue is empty.

    Args:
        start_url: Absolute URL of the page the crawl starts from (depth 0).
        max_depth: Links are only followed from pages shallower than this.
    """
    # Imported locally so the function does not rely on the cell's import list.
    from urllib.parse import urldefrag, urljoin, urlparse

    seen = {start_url}  # every URL ever queued; failed URLs are not retried
    queue = deque([URLData(start_url, 0)])
    counter = 0

    while queue:
        url, depth = queue.popleft()

        soup = get_soup(url)
        if soup is None:
            continue  # Skip pages that could not be fetched

        if is_plenary_session_press_release(soup) and "crisis" in soup.get_text().lower():
            counter += 1
            with open(f"2_{counter}.txt", 'w', encoding='utf-8') as f:
                f.write(soup.prettify())
            with open("crisis_2.txt", 'a', encoding='utf-8') as f:
                f.write(url + '\n')

            if counter >= 10:
                break

        if depth >= max_depth:
            continue  # deepest level: inspect the page but do not follow links

        for link in soup.find_all('a', href=True):
            try:
                # Resolve against the current page. Concatenating BASE_URL
                # (which already has a path) with a "/site-absolute" href
                # produced non-existent URLs; urljoin also handles "?query"
                # and relative hrefs. The fragment is dropped because it
                # does not change which document is fetched.
                full_url = urldefrag(urljoin(url, link['href'].strip())).url
                scheme = urlparse(full_url).scheme
            except ValueError:
                continue  # malformed href
            if scheme not in ('http', 'https'):
                continue  # javascript:, mailto:, tel:, ...
            if full_url not in seen:
                seen.add(full_url)
                queue.append(URLData(full_url, depth + 1))

# Start the depth-limited crawl at the press-room landing page.
bfs_crawl(BASE_URL)


# Q1-2 Use the requests library to send a GET request carrying the search keyword, V1.0


In [None]:
import requests
import requests_cache
from bs4 import BeautifulSoup
from collections import deque, namedtuple

# Set up the cache: enable requests_cache for requests made in this session
requests_cache.install_cache()

# English press-room landing page of the European Parliament news site.
BASE_URL = "https://www.europarl.europa.eu/news/en/press-room"
SEARCH_URL = BASE_URL  # The search is performed against the same URL

# A queue entry: the URL together with its depth in the crawl
URLData = namedtuple('URLData', ['url', 'depth'])

def is_plenary_session_press_release(soup):
    """Return True when the page shows the ``Plenary session`` label.

    The page is searched for ``<span class="ep_name">Plenary session</span>``.
    """
    tag = soup.find('span', class_='ep_name', string='Plenary session')
    return not (tag is None)

def get_soup(url, params=None):
    """Fetch ``url`` with browser-like headers and return the parsed page.

    Args:
        url: Absolute URL to fetch.
        params: Optional dict of query parameters passed on to ``requests.get``.

    Returns:
        A ``BeautifulSoup`` object, or ``None`` if the request fails
        (connection error, 5-second timeout, or HTTP 4xx/5xx status).
    """
    # Headers copied from a browser session, minus the ones that break a crawler:
    #   * If-Modified-Since / If-None-Match ask the server for "304 Not
    #     Modified", which has an empty body, passes raise_for_status() and
    #     would yield an empty soup.
    #   * Host is only right for www.europarl.europa.eu; requests derives the
    #     correct value from each URL, including off-site links.
    #   * Accept-Encoding advertised "br", which can only be decoded when a
    #     Brotli package is installed; requests sets a value it can decode.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Referer": "https://www.europarl.europa.eu/news/en/press-room",
        "Sec-Ch-Ua": '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": "macOS",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, params=params, timeout=5)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')
    except requests.RequestException:
        # Best effort: the caller skips pages that could not be fetched.
        return None

def bfs_crawl(search_keyword, max_depth=1):
    """Crawl outward from the press-room search results for ``search_keyword``.

    The search-result page is fetched first and used as the depth-0 page.
    Every visited page accepted by ``is_plenary_session_press_release`` whose
    text contains the keyword (case-insensitive) is written to ``2_<n>.txt``,
    and its URL is appended to ``crisis_2.txt`` (the file names are fixed
    regardless of the keyword). The crawl ends after 10 saved pages or when
    the queue is empty.

    Args:
        search_keyword: Term sent as the ``searchQuery`` parameter and looked
            for in each page's text.
        max_depth: Links are only followed from pages shallower than this.

    Returns:
        None. Prints a message and returns early if the search request fails.
    """
    # Imported locally so the function does not rely on the cell's import list.
    from urllib.parse import urldefrag, urlencode, urljoin, urlparse

    counter = 0
    keyword = search_keyword.lower()

    # Send a GET request carrying the search keyword
    first_soup = get_soup(SEARCH_URL, params={'searchQuery': search_keyword})
    if first_soup is None:
        print("Failed to get the search results.")
        return

    # Initialise the queue from the search results. Previously the fetched
    # result page was thrown away and the unfiltered SEARCH_URL was queued,
    # so the keyword never influenced the crawl.
    search_url = f"{SEARCH_URL}?{urlencode({'searchQuery': search_keyword})}"
    prefetched = {search_url: first_soup}  # avoids downloading it a second time
    seen = {search_url}  # every URL ever queued; failed URLs are not retried
    queue = deque([URLData(search_url, 0)])

    while queue:
        url, depth = queue.popleft()

        soup = prefetched.pop(url, None)
        if soup is None:
            soup = get_soup(url)
        if soup is None:
            continue  # Skip pages that could not be fetched

        if is_plenary_session_press_release(soup) and keyword in soup.get_text().lower():
            counter += 1
            with open(f"2_{counter}.txt", 'w', encoding='utf-8') as f:
                f.write(soup.prettify())
            with open("crisis_2.txt", 'a', encoding='utf-8') as f:
                f.write(url + '\n')

            if counter >= 10:
                break

        if depth >= max_depth:
            continue  # deepest level: inspect the page but do not follow links

        for link in soup.find_all('a', href=True):
            try:
                # Resolve against the current page. Concatenating BASE_URL
                # (which already has a path) with a "/site-absolute" href
                # produced non-existent URLs; urljoin also handles "?query"
                # and relative hrefs. The fragment is dropped because it
                # does not change which document is fetched.
                full_url = urldefrag(urljoin(url, link['href'].strip())).url
                scheme = urlparse(full_url).scheme
            except ValueError:
                continue  # malformed href
            if scheme not in ('http', 'https'):
                continue  # javascript:, mailto:, tel:, ...
            if full_url not in seen:
                seen.add(full_url)
                queue.append(URLData(full_url, depth + 1))

# Search the press room for "crisis" and crawl the results.
bfs_crawl(search_keyword="crisis")


# Q1-2 Use the requests library to send a GET request carrying the search keyword, V2.0 ☑️

In [None]:
!pip install requests_cache==0.5.2

In [51]:
import requests
import requests_cache
from bs4 import BeautifulSoup
from collections import deque, namedtuple

# Set up the cache: enable requests_cache for requests made in this session
requests_cache.install_cache()

# English press-room landing page of the European Parliament news site.
BASE_URL = "https://www.europarl.europa.eu/news/en/press-room"
SEARCH_URL = BASE_URL  # The search is performed against the same URL

# A queue entry: the URL together with its depth in the crawl
URLData = namedtuple('URLData', ['url', 'depth'])

def is_plenary_session_press_release(soup):
    """Decide whether a parsed page is tagged ``Plenary session``.

    Returns:
        True if ``soup`` contains a ``span`` of class ``ep_name`` whose
        string is ``Plenary session``; False otherwise.
    """
    found = soup.find('span', class_='ep_name', string='Plenary session')
    if found is not None:
        return True
    return False

def get_soup(url, params=None):
    """Fetch ``url`` with browser-like headers and return the parsed page.

    Args:
        url: Absolute URL to fetch.
        params: Optional dict of query parameters passed on to ``requests.get``.

    Returns:
        A ``BeautifulSoup`` object, or ``None`` if the request fails
        (connection error, 5-second timeout, or HTTP 4xx/5xx status).
    """
    # Headers copied from a browser session, minus the ones that break a crawler:
    #   * If-Modified-Since / If-None-Match ask the server for "304 Not
    #     Modified", which has an empty body, passes raise_for_status() and
    #     would yield an empty soup.
    #   * Host is only right for www.europarl.europa.eu; requests derives the
    #     correct value from each URL, including off-site links.
    #   * Accept-Encoding advertised "br", which can only be decoded when a
    #     Brotli package is installed; requests sets a value it can decode.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Referer": "https://www.europarl.europa.eu/news/en/press-room",
        "Sec-Ch-Ua": '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": "macOS",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, params=params, timeout=5)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')
    except requests.RequestException:
        # Best effort: the caller skips pages that could not be fetched.
        return None


def bfs_crawl(search_keyword, max_depth=1, max_pages=3):
    """Crawl outward from the first pages of press-room search results.

    The search-result page is fetched first and used as a depth-0 page,
    followed by result pages ``1..max_pages``. Every visited page accepted by
    ``is_plenary_session_press_release`` whose text contains the keyword
    (case-insensitive) is written to ``2_<n>.txt``, and its URL is appended
    to ``crisis_2.txt`` (the file names are fixed regardless of the keyword).
    The crawl ends after 10 saved pages or when the queue is empty.

    Args:
        search_keyword: Term sent as the ``searchQuery`` parameter and looked
            for in each page's text.
        max_depth: Links are only followed from pages shallower than this.
        max_pages: Number of paginated result pages queued as start pages.

    Returns:
        None. Prints a message and returns early if the search request fails.
    """
    # Imported locally so the function does not rely on the cell's import list.
    from urllib.parse import urldefrag, urlencode, urljoin, urlparse

    counter = 0
    keyword = search_keyword.lower()
    # URL-encoded so keywords with spaces or "&" still form a valid query.
    query = urlencode({'searchQuery': search_keyword})

    # Send a GET request carrying the search keyword
    first_soup = get_soup(SEARCH_URL, params={'searchQuery': search_keyword})
    if first_soup is None:
        print("Failed to get the search results.")
        return

    # Initialise the queue from the search results. Previously the fetched
    # result page was thrown away and the unfiltered SEARCH_URL was queued.
    search_url = f"{SEARCH_URL}?{query}"
    prefetched = {search_url: first_soup}  # avoids downloading it a second time
    queue = deque([URLData(search_url, 0)])

    # Add the first max_pages result pages to the queue
    for page in range(1, max_pages + 1):
        queue.append(URLData(f"{BASE_URL}/page/{page}?{query}", 0))

    seen = {entry.url for entry in queue}  # every URL ever queued

    while queue:
        url, depth = queue.popleft()

        soup = prefetched.pop(url, None)
        if soup is None:
            soup = get_soup(url)
        if soup is None:
            continue  # Skip pages that could not be fetched

        if is_plenary_session_press_release(soup) and keyword in soup.get_text().lower():
            counter += 1
            with open(f"2_{counter}.txt", 'w', encoding='utf-8') as f:
                f.write(soup.prettify())
            with open("crisis_2.txt", 'a', encoding='utf-8') as f:
                f.write(url + '\n')

            if counter >= 10:
                break

        if depth >= max_depth:
            continue  # deepest level: inspect the page but do not follow links

        for link in soup.find_all('a', href=True):
            try:
                # Resolve against the current page. Concatenating BASE_URL
                # (which already has a path) with a "/site-absolute" href
                # produced non-existent URLs; urljoin also handles "?query"
                # and relative hrefs. The fragment is dropped because it
                # does not change which document is fetched.
                full_url = urldefrag(urljoin(url, link['href'].strip())).url
                scheme = urlparse(full_url).scheme
            except ValueError:
                continue  # malformed href
            if scheme not in ('http', 'https'):
                continue  # javascript:, mailto:, tel:, ...
            if full_url not in seen:
                seen.add(full_url)
                queue.append(URLData(full_url, depth + 1))

# Search the press room for "crisis" and crawl the first result pages.
bfs_crawl(search_keyword="crisis")

# Q1-2 Brute force: start crawling directly from the complete search URL https://www.europarl.europa.eu/news/en/press-room?searchQuery=crisis

In [None]:
import requests
from bs4 import BeautifulSoup
from collections import deque, namedtuple

# English press-room landing page of the European Parliament news site.
BASE_URL = "https://www.europarl.europa.eu/news/en/press-room"
# Press-room URL with the search query for "crisis" already attached.
SEARCH_URL = "https://www.europarl.europa.eu/news/en/press-room?searchQuery=crisis"

# A queue entry: the URL together with its depth in the crawl
URLData = namedtuple('URLData', ['url', 'depth'])

# Browser-like request headers sent with every fetch.
# Two headers from the copied browser session are deliberately left out:
#   * Host: only right for www.europarl.europa.eu; requests derives the
#     correct value from each URL, including off-site links.
#   * Accept-Encoding: it advertised "br", which can only be decoded when a
#     Brotli package is installed; requests sets a value it can decode.
HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    "Referer": "https://www.europarl.europa.eu/news/en/press-room",
    "Sec-Ch-Ua": '"Google Chrome";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Ch-Ua-Platform": "macOS",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
}

def is_plenary_session_press_release(soup):
    """Return whether the parsed page is marked as a plenary-session item.

    A page qualifies when it contains a ``span`` with class ``ep_name`` whose
    string is exactly ``Plenary session``.
    """
    marker = soup.find('span', class_='ep_name', string='Plenary session')
    return False if marker is None else True

def get_soup(url):
    """Fetch ``url`` using ``HEADERS`` and return the parsed page.

    Returns:
        A ``BeautifulSoup`` object, or ``None`` when the request raises a
        ``requests.RequestException`` (connection error, 5-second timeout,
        or HTTP 4xx/5xx status).
    """
    try:
        page = requests.get(url, headers=HEADERS, timeout=5)
        page.raise_for_status()
    except requests.RequestException:
        # Best effort: the caller skips pages that could not be fetched.
        return None
    return BeautifulSoup(page.content, 'html.parser')

def bfs_crawl(start_url, max_depth=2, max_results=12):
    """Depth-limited breadth-first crawl saving plenary press releases about "crisis".

    Every fetched page accepted by ``is_plenary_session_press_release`` whose
    text contains "crisis" (case-insensitive) is written to ``2_<n>.txt``,
    and its URL is appended to ``crisis_2.txt``. Pages that ``get_soup``
    cannot fetch are skipped.

    Args:
        start_url: Absolute URL of the page the crawl starts from (depth 0).
        max_depth: Links are only followed from pages shallower than this.
        max_results: Stop after this many pages have been saved. The default
            of 12 is the limit that was previously hard-coded.
    """
    # Imported locally so the function does not rely on the cell's import list.
    from urllib.parse import urldefrag, urljoin, urlparse

    seen = {start_url}  # every URL ever queued; failed URLs are not retried
    queue = deque([URLData(start_url, 0)])
    counter = 0

    while queue:
        url, depth = queue.popleft()

        soup = get_soup(url)
        if soup is None:
            continue  # Skip pages that could not be fetched

        if is_plenary_session_press_release(soup) and "crisis" in soup.get_text().lower():
            counter += 1
            with open(f"2_{counter}.txt", 'w', encoding='utf-8') as f:
                f.write(soup.prettify())
            with open("crisis_2.txt", 'a', encoding='utf-8') as f:
                f.write(url + '\n')

            if counter >= max_results:
                break

        if depth >= max_depth:
            continue  # deepest level: inspect the page but do not follow links

        for link in soup.find_all('a', href=True):
            try:
                # Resolve against the current page. Concatenating BASE_URL
                # (which already has a path) with a "/site-absolute" href
                # produced non-existent URLs, and appending "?query" to the
                # start URL (which already has a query) produced a second
                # "?". The fragment is dropped because it does not change
                # which document is fetched.
                full_url = urldefrag(urljoin(url, link['href'].strip())).url
                scheme = urlparse(full_url).scheme
            except ValueError:
                continue  # malformed href
            if scheme not in ('http', 'https'):
                continue  # javascript:, mailto:, tel:, ...
            if full_url not in seen:
                seen.add(full_url)
                queue.append(URLData(full_url, depth + 1))

# Start crawling directly from the complete search URL.
bfs_crawl(start_url=SEARCH_URL)
