In [5]:
import requests
from bs4 import BeautifulSoup

# Sitemap URL
SITEMAP_URL = "https://www.pittsburghpa.gov/sitemap.xml"

# Headers to mimic a real browser request
HEADERS = {"User-Agent": "Mozilla/5.0"}

# Function to extract links from sitemap.xml
def get_links_from_sitemap():
    """Download the sitemap at SITEMAP_URL and return every URL it lists.

    Returns:
        list[str]: The whitespace-stripped text of each ``<loc>`` element, in
        document order. An empty list is returned when the request raises a
        ``requests`` exception (connection error, timeout, ...) or when the
        server answers with a status code other than 200.
    """
    try:
        # A timeout keeps a stalled connection from hanging the notebook cell;
        # 10 seconds matches the value scrape_page uses for individual pages.
        response = requests.get(SITEMAP_URL, headers=HEADERS, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve sitemap: {e}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve sitemap. Status Code: {response.status_code}")
        return []

    # Parse as XML (not HTML) so <loc> elements are handled as sitemap nodes.
    soup = BeautifulSoup(response.text, "xml")

    # Each <loc> element holds one URL.
    links = [loc.text.strip() for loc in soup.find_all("loc")]

    print(f"Extracted {len(links)} links from sitemap.")
    return links

# Smoke test: fetch the sitemap once and bind the resulting URL list to `links`.
links = get_links_from_sitemap()
print(links[:5])  # Show only the first 5 URLs so the cell output stays short


Extracted 2694 links from sitemap.
['https://www.pittsburghpa.gov/City-Government/Mayor/Mayor-banner/Youth-Civic-Leadership-Academy', 'https://www.pittsburghpa.gov/City-Government/Finances-Budget/Management-Budget/Community-Development/Public-Participation', 'https://www.pittsburghpa.gov/News-articles/Public-Safety-Blotter/2024/UPDATE-Police-Seek-the-Publics-Assistance-to-Locate-Missing-Female', 'https://www.pittsburghpa.gov/City-Government/City-Council/Districts/Bob-Charland-District-3/Newsletter-and-other-Resources', 'https://www.pittsburghpa.gov/News-articles/Public-Safety-Blotter/2024/Juvenile-Male-Injured-in-Overnight-Shooting-in-Spring-Hill-Detectives-Investigating']


In [6]:
import requests
from bs4 import BeautifulSoup
import time

# Headers to mimic a real browser request
HEADERS = {"User-Agent": "Mozilla/5.0"}

# Output files
DATA_FILE = "pittsburgh_gov_data.txt"
ERROR_LOG = "scraping_errors.log"

# Function to scrape a single webpage
def scrape_page(url):
    """Fetch one page and return its title, headings and paragraphs as text.

    Args:
        url: Address of the page to download.

    Returns:
        str | None: A newline-joined string that starts with a
        ``Title: ...`` / ``URL: ...`` header, followed by every non-empty
        ``<h1>``/``<h2>``/``<h3>`` (underlined with dashes) and ``<p>`` text in
        document order. ``None`` when the request raises a ``requests``
        exception or the response status is not 200; in both cases a line is
        appended to ERROR_LOG.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)  # Timeout after 10s
    except requests.exceptions.RequestException as e:
        print(f"Skipping {url} due to error: {e}")
        _log_scrape_error(f"Error: {url} ({e})")
        return None

    if response.status_code != 200:
        print(f"Skipping {url} (Status Code: {response.status_code})")
        _log_scrape_error(f"Failed: {url} (Status Code: {response.status_code})")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    title_tag = soup.find("title")
    title = title_tag.text.strip() if title_tag else "No Title"

    content_list = [f"Title: {title}\nURL: {url}\n"]

    # Extract headings & paragraphs in document order.
    for tag in soup.find_all(["h1", "h2", "h3", "p"]):
        text = tag.get_text().strip()
        if not text:
            # Skip empty tags of either kind; an empty heading would otherwise
            # add blank lines and a zero-length underline to the output.
            continue
        if tag.name == "p":
            content_list.append(text)
        else:
            # find_all only yields h1/h2/h3/p, so anything else is a heading.
            content_list.append(f"\n{text}\n" + "-" * len(text) + "\n")

    return "\n".join(content_list)


def _log_scrape_error(message):
    """Append a single line containing `message` to the ERROR_LOG file."""
    with open(ERROR_LOG, "a", encoding="utf-8") as err_file:
        err_file.write(message + "\n")

# Function to scrape all pages and write data immediately
def scrape_all_pages(links):
    """Scrape each URL in `links` and append its text to DATA_FILE as it arrives.

    Every page for which scrape_page returns content is written, followed by
    a blank line, and flushed straight away, so content already collected is
    kept on disk if the run is interrupted. Pages with no content are skipped.
    The loop pauses one second after every URL, scraped or not.

    Args:
        links: Sequence of page URLs to visit, in order.
    """
    total = len(links)
    print(f"\nScraping {total} pages and writing immediately...\n")

    # Opened in append mode so repeated runs add to the file rather than truncate it.
    with open(DATA_FILE, "a", encoding="utf-8") as out_file:
        for position, url in enumerate(links, start=1):
            print(f"Scraping {position}/{total}: {url}")
            text = scrape_page(url)

            if text:
                out_file.write(text + "\n\n")
                out_file.flush()  # push this page to disk before moving on

            time.sleep(1)  # throttle: one request per second

    print(f"\nScraping completed. Data saved in '{DATA_FILE}'")

# Fetch the sitemap URLs again. get_links_from_sitemap is defined in an earlier
# cell, so that cell must have been run first in this kernel.
links = get_links_from_sitemap()

# Scrape every extracted link. scrape_all_pages sleeps 1 second per URL, so the
# run takes at least len(links) seconds; results are appended to DATA_FILE.
scrape_all_pages(links)


Extracted 2694 links from sitemap.

Scraping 2694 pages and writing immediately...

Scraping 1/2694: https://www.pittsburghpa.gov/City-Government/Mayor/Mayor-banner/Youth-Civic-Leadership-Academy
Scraping 2/2694: https://www.pittsburghpa.gov/City-Government/Finances-Budget/Management-Budget/Community-Development/Public-Participation
Scraping 3/2694: https://www.pittsburghpa.gov/News-articles/Public-Safety-Blotter/2024/UPDATE-Police-Seek-the-Publics-Assistance-to-Locate-Missing-Female
Scraping 4/2694: https://www.pittsburghpa.gov/City-Government/City-Council/Districts/Bob-Charland-District-3/Newsletter-and-other-Resources
Scraping 5/2694: https://www.pittsburghpa.gov/News-articles/Public-Safety-Blotter/2024/Juvenile-Male-Injured-in-Overnight-Shooting-in-Spring-Hill-Detectives-Investigating
Scraping 6/2694: https://www.pittsburghpa.gov/News-articles/Public-Safety-Blotter/2023/Pittsburgh-Police-EOD-Unit-Responds-for-Threat-to-Warhol-Museum
Scraping 7/2694: https://www.pittsburghpa.gov/New

### Scraping Events through https://pittsburgh.events/