<a target="_blank" href="https://colab.research.google.com/github/sappw1/Dissertation/blob/main/Notebooks/Notebooks-Working/ETL/prac_scraper.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [None]:
import requests
from bs4 import BeautifulSoup
import time
import json

base_url = "https://pandemicoversight.gov"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36'
}

def safe_request(url):
    """Issue a GET request for ``url`` and return the response, or ``None``.

    The request is sent with the module-level ``headers`` and a 10-second
    timeout. Any ``requests`` exception, including the one raised by
    ``raise_for_status()`` for an error status code, is reported on stdout
    together with the URL and then suppressed.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The ``requests.Response`` on success, ``None`` on any request failure.
    """
    try:
        reply = requests.get(url, headers=headers, timeout=10)
        reply.raise_for_status()
    except requests.exceptions.RequestException as exc:
        # Best-effort fetch: log and signal failure instead of propagating.
        print(f"Request error: {exc} - URL: {url}")
        return None
    else:
        return reply

def get_reports(page_number):
    """Scrape one page of the report listing.

    Args:
        page_number: Value substituted into the ``page`` query parameter of
            the listing URL.

    Returns:
        A list of dicts with ``'title'``, ``'date'`` and ``'link'`` keys, one
        for each ``.views-row`` element that contains a title anchor with a
        non-empty ``href`` and a date element. The list is empty when the
        request fails or when no row matches.
    """
    url = f"https://pandemicoversight.gov/oversight/reports?f%5B0%5D=report_type_taxonomy%3A85&page={page_number}"
    response = safe_request(url)
    if response is None:  # safe_request signals failure with None
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    reports = []

    for item in soup.select('.views-row'):
        title_tag = item.select_one('.display__condensed--title a')
        date_tag = item.select_one('.display__condensed--footer time')

        if not (title_tag and date_tag):
            continue

        # An anchor without an href would raise KeyError on subscript access
        # and abort the whole crawl; skip rows that have no usable link.
        link = title_tag.get('href')
        if not link:
            continue

        reports.append({
            'title': title_tag.text.strip(),
            'date': date_tag.text.strip(),
            'link': link,
        })

    return reports

def get_press_release(url):
    """Return the stripped body text of the page at ``url``.

    The page is fetched through ``safe_request`` and the first element
    matching ``.node-body .field_body`` is extracted.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The element's text with surrounding whitespace removed, or an empty
        string when the request fails or no such element is present.
    """
    page = safe_request(url)
    if not page:
        return ""

    body = BeautifulSoup(page.text, 'html.parser').select_one('.node-body .field_body')
    if body:
        return body.text.strip()
    return ""

# Walk the listing page by page, optionally stopping after `page_limit` pages
all_reports = []
page_limit = None  # Set limit here for testing, None for no limit
page_count = 0

while True:
    # Stop once the optional page cap has been reached
    if page_limit is not None and page_count >= page_limit:
        break

    print(f"Scraping page: {page_count + 1}")
    reports = get_reports(page_count)

    # get_reports returns an empty list both for a failed request and for a
    # page without matching rows; either one ends the crawl
    if not reports:
        break

    for report in reports:
        print(f"Fetching press release for: {report['title']}")
        # Links that do not already start with "http" are prefixed with the site root
        if report['link'].startswith('http'):
            full_url = report['link']
        else:
            full_url = base_url + report['link']
        report['press_release'] = get_press_release(full_url)
        time.sleep(1)  # pause between press release requests

    all_reports += reports
    page_count += 1

    # Rewrite the output file after every page so partial progress is kept
    with open('pandemic_reports.json', 'w', encoding='utf-8') as f:
        json.dump(all_reports, f, ensure_ascii=False, indent=4)

    time.sleep(2)  # pause between listing page requests

# Print everything that was collected
for report in all_reports:
    print(f"Title: {report['title']}\nDate: {report['date']}\nLink: {report['link']}\nPress Release:\n{report['press_release']}\n{'-'*80}\n")
