# Web Scraping

This is a web scraper for the GOV.UK website, designed to scrape content from various categories and topics from "Services and Information." It uses BeautifulSoup for HTML parsing, requests for HTTP requests, and includes error handling and structured output saving.

### Step 0: Load dependencies

In [None]:
from bs4 import BeautifulSoup
import requests
from html.parser import HTMLParser
import re
import time
import os
from urllib.parse import urljoin

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /rhome/zchapman/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

**Imports**:
- `Beautiful Soup`: used for parsing HTML
- `html.parser`: parser library used by Beautiful Soup
- `Requests`: used for making HTTP requests
- `Pipeline`: used for text generation
- `nltk`: library for natural language processing
- `sent_tokenize`: used to split text into sentences
- `re`: regular expressions


### Step 2: Expand the scraper for additional `www.gov.uk` URLs

In [2]:
# Directory to save the output files.
OUTPUT_DIR = "scraped_content"

# Site root; relative hrefs found on scraped pages are resolved against it.
BASE_URL = "https://www.gov.uk"
# Page from which the top-level category links are collected.
BROWSE_URL = urljoin(BASE_URL, "/browse")

def get_soup(session, url):
    """Fetch *url* through *session* and parse the response body as HTML.

    Args:
        session: Object providing ``get(url, timeout=...)``; the caller in
            this file passes a ``requests.Session``.
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend, or
        ``None`` if the request failed. Failures are printed, not raised.
    """
    try:
        # The 10-second timeout prevents a stalled connection from hanging.
        reply = session.get(url, timeout=10)
        # Convert 4xx/5xx responses into an HTTPError so they are handled below.
        reply.raise_for_status()
    except requests.exceptions.RequestException as exc:
        # Covers network issues, timeouts and the HTTPError raised above.
        print(f"Error fetching {url}: {exc}")
        return None
    else:
        return BeautifulSoup(reply.text, "html.parser")

def get_main_category_links(session):
    """Collect the top-level category URLs linked from the browse page.

    Args:
        session: Session object forwarded to ``get_soup``.

    Returns:
        A sorted list of unique absolute URLs, one per category. The list is
        empty when the browse page could not be fetched.
    """
    print(f"Fetching categories from: {BROWSE_URL}")
    browse_page = get_soup(session, BROWSE_URL)
    if not browse_page:
        return []

    # A set removes categories that are linked more than once on the page.
    unique_urls = set()
    for anchor in browse_page.select('main a[href]'):
        href = anchor['href']
        # '/browse/<name>' (e.g. '/browse/benefits') has exactly two slashes;
        # hrefs with any other slash count are skipped.
        if href.startswith('/browse/') and href.count('/') == 2:
            unique_urls.add(urljoin(BASE_URL, href))

    # Sorting provides a consistent order between runs.
    return sorted(unique_urls)

def get_topics_from_category(session, category_url):
    """List the topic links found under a category page's "Topics" heading.

    Args:
        session: Session object forwarded to ``get_soup``.
        category_url: Absolute URL of the category page.

    Returns:
        A list of ``(absolute_url, link_text)`` tuples in page order. The list
        is empty when the page cannot be fetched, when no ``h2``/``h3``
        containing "Topics" exists, or when that heading has no following
        sibling ``<ul>``.
    """
    page = get_soup(session, category_url)
    if not page:
        return []

    def _is_topics_heading(tag):
        # Match the first h2/h3 whose text contains the word "Topics".
        return tag.name in ["h2", "h3"] and "Topics" in tag.get_text()

    heading = page.find(_is_topics_heading)
    if not heading:
        print(f"  -> No 'Topics' section found on {category_url}")
        return []

    link_list = heading.find_next_sibling("ul")
    if not link_list:
        return []

    # Keep the link text too: the caller uses it as the sub-topic title.
    topic_pairs = []
    for anchor in link_list.select('a[href]'):
        target = urljoin(BASE_URL, anchor['href'])
        title = anchor.get_text(strip=True)
        topic_pairs.append((target, title))
    return topic_pairs

def _is_content_href(href):
    """Return True for site-relative hrefs that are not browse pages."""
    # '//host/path' is protocol-relative: urljoin would resolve it to a
    # different host, so it must not be treated as a same-site path.
    if href.startswith('//'):
        return False
    # Ensure we are not re-visiting browse pages.
    return href.startswith('/') and not href.startswith('/browse')


def get_content_links_from_topic(session, topic_url):
    """Find the links to final content articles on a topic landing page.

    Args:
        session: Session object forwarded to ``get_soup``.
        topic_url: Absolute URL of the topic page.

    Returns:
        A ``(content_links, found_selector)`` tuple. ``content_links`` is a
        list of unique absolute URLs in page order. ``found_selector`` is the
        CSS selector that matched, or ``None`` if none of them matched or the
        page could not be fetched.
    """
    soup = get_soup(session, topic_url)
    if not soup:
        return [], None

    # Pages use different structures, so several selectors are tried in turn.
    possible_selectors = [
        'main .gem-c-document-list a[href]',           # For pages like 'living-abroad'
        'main .browse-container .govuk-list a[href]',  # For pages like 'benefits'
        'main ul.govuk-list a[href]',                  # A more general fallback for lists
    ]

    link_elements = []
    found_selector = None
    for selector in possible_selectors:
        link_elements = soup.select(selector)
        if link_elements:
            # The first selector that matches anything wins.
            found_selector = selector
            break

    # dict.fromkeys drops repeated URLs while preserving page order, so the
    # caller does not scrape and save the same article more than once.
    content_links = list(dict.fromkeys(
        urljoin(BASE_URL, link['href'])
        for link in link_elements
        if _is_content_href(link['href'])
    ))
    return content_links, found_selector

def scrape_page_content(session, page_url):
    """Extract the readable text from a page's ``<main>`` element.

    Args:
        session: Session object forwarded to ``get_soup``.
        page_url: Absolute URL of the page to scrape.

    Returns:
        ``None`` if the page could not be fetched. Otherwise a dict with the
        single key ``"body_text"``: one line per ``h1``/``h2``/``h3``/``p``/
        ``li`` element (list items prefixed with ``"* "``), or the placeholder
        ``"No main content found"`` when the page has no ``<main>`` element.
    """
    page = get_soup(session, page_url)
    if not page:
        return None

    # Only the main content area of the page is considered.
    main_area = page.find('main')
    if not main_area:
        return {"body_text": "No main content found"}

    # Strip table-of-contents style blocks before collecting text.
    for unwanted_class in ['gem-c-contents-list', 'contents', 'toc']:
        for node in main_area.find_all(class_=unwanted_class):
            node.decompose()

    def _render(tag):
        # Collapse the tag's text fragments into one space-separated line.
        text = ' '.join(tag.stripped_strings)
        return f"* {text}" if tag.name == 'li' else text

    # NOTE(review): find_all also returns matching tags nested inside another
    # match (e.g. a <p> within an <li>), so such text is emitted twice.
    rendered = [
        _render(tag)
        for tag in main_area.find_all(['h1', 'h2', 'h3', 'p', 'li'])
    ]
    return {"body_text": '\n'.join(rendered)}

def save_content_to_file(body_text, file_heading, file_counter):
    """Write one scraped page to ``target<file_counter>.txt`` in OUTPUT_DIR.

    The file holds ``file_heading``, a blank line, then ``body_text``.

    Args:
        body_text: Text of the scraped page.
        file_heading: First line written to the file.
        file_counter: Sequence number used in the file name.

    Returns:
        ``file_counter + 1`` after a successful write, or ``file_counter``
        unchanged if the write failed, so the number is reused.
    """
    target_path = os.path.join(OUTPUT_DIR, f"target{file_counter}.txt")
    payload = f"{file_heading}\n\n{body_text}"
    try:
        with open(target_path, 'w', encoding='utf-8') as handle:
            handle.write(payload)
        print(f"        -> Saved to {target_path}")
    except IOError as err:
        # Report the failure and keep going rather than aborting the run.
        print(f"        -> ERROR: Could not write file {target_path}: {err}")
        return file_counter
    return file_counter + 1
    
def _scrape_and_save(session, page_url, file_heading, file_counter):
    """Scrape one page, save it if it has content, and return the next counter."""
    data = scrape_page_content(session, page_url)
    # Skip failed fetches, blank pages and the "no <main>" placeholder text.
    if data and data['body_text'].strip() and data['body_text'] != "No main content found":
        file_counter = save_content_to_file(data['body_text'], file_heading, file_counter)
    # Pause after every page request, whether or not anything was saved.
    time.sleep(0.5)
    return file_counter


def main():
    """Orchestrate the crawl: categories -> topics -> content pages -> files.

    Creates OUTPUT_DIR if it does not exist, walks every category returned by
    ``get_main_category_links`` and every topic within it, and saves each
    scraped page to a sequentially numbered file. Progress is printed as the
    crawl proceeds, followed by a final count of saved files.
    """
    if not os.path.exists(OUTPUT_DIR):
        # exist_ok avoids a crash if the directory appears after the check.
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        print(f"Created directory: {OUTPUT_DIR}")

    file_counter = 1
    with requests.Session() as session:
        session.headers.update({'User-Agent': 'MyCoolScraper/1.0'})
        category_links = get_main_category_links(session)
        print(f"Found {len(category_links)} categories.\n")

        for category_url in category_links:
            # e.g. "https://www.gov.uk/browse/abroad" -> "Abroad".
            # str.split always returns at least one item, so an empty slug
            # (not an IndexError) is the only case needing the fallback name.
            main_category_slug = category_url.strip('/').split('/')[-1]
            main_category_name = (
                main_category_slug.replace('-', ' ').title() or "Unknown Category"
            )
            print(f"Processing Category: {main_category_name}")

            topic_links = get_topics_from_category(session, category_url)
            for topic_url, sub_topic_title in topic_links:
                print(f"  -> Processing Sub-Topic: {sub_topic_title}")
                content_links, found_selector = get_content_links_from_topic(session, topic_url)
                file_heading = f"{main_category_name}: {sub_topic_title}"

                if content_links:
                    # Case 1: The topic page is a landing page with more links.
                    print(f"    -> Found {len(content_links)} content pages (using selector: '{found_selector}')")
                    for content_url in content_links:
                        print(f"      -> Scraping: {content_url}")
                        file_counter = _scrape_and_save(
                            session, content_url, file_heading, file_counter
                        )
                else:
                    # Case 2: The topic page is the final content page itself.
                    print("    -> No further links found. Scraping page directly.")
                    file_counter = _scrape_and_save(
                        session, topic_url, file_heading, file_counter
                    )
            print("-" * 50)

    # file_counter is the next unused number, so one less were actually saved.
    print(f"\nScraping complete. Total files saved: {file_counter - 1} in the '{OUTPUT_DIR}' directory.")

# Start the scraper only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


Created directory: scraped_content
Fetching categories from: https://www.gov.uk/browse
Found 16 categories.

Processing Category: Abroad
  -> Processing Sub-Topic: Living abroad
    -> Found 21 content pages (using selector: 'main ul.govuk-list a[href]')
      -> Scraping: https://www.gov.uk/world/all
        -> Saved to scraped_content/target1.txt
      -> Scraping: https://www.gov.uk/foreign-travel-advice
        -> Saved to scraped_content/target2.txt
      -> Scraping: https://www.gov.uk/get-document-legalised
        -> Saved to scraped_content/target3.txt
      -> Scraping: https://www.gov.uk/marriages-civil-partnerships-abroad
        -> Saved to scraped_content/target4.txt
      -> Scraping: https://www.gov.uk/register-a-death
        -> Saved to scraped_content/target5.txt
      -> Scraping: https://www.gov.uk/register-birth-abroad
        -> Saved to scraped_content/target6.txt
      -> Scraping: https://www.gov.uk/voting-when-abroad
        -> Saved to scraped_content/target