In [1]:
import datetime
import hashlib
import json
import os
import re
import time
from urllib.parse import urldefrag, urljoin, urlparse

import requests
from bs4 import BeautifulSoup

In [11]:
# Browser-like request headers passed to every requests.get call
HEADERS = {"User-Agent": "Mozilla/5.0"}

# Crawl entry points; each start URL maps to the category/subcategory labels
# stored in the metadata of documents scraped from it
BASE_URLS = {
    "https://www.visitpittsburgh.com/things-to-do/": {
        "category": "Pittsburgh",
        "subcategory": "Events",
    },
}

# Output files
DATA_FILE = "output.txt"  # NOTE(review): not referenced by the code visible in this notebook
ERROR_LOG = "scraping_errors.log"  # failed fetches are appended here

In [None]:

# Visited URLs (to prevent duplicate scraping).
# Module-level state consulted by both scrape_page and recursive_scrape before
# fetching a page. It lives in the kernel until this cell is re-run, so re-run
# this cell to start a fresh crawl.
visited_urls = set()

# Characters that are illegal or awkward in file names on common platforms
# (path separators, Windows-reserved punctuation, control characters).
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]+')


def _safe_filename_part(text, max_length=100):
    """Return `text` with unsafe file-name characters replaced by underscores."""
    cleaned = _UNSAFE_FILENAME_CHARS.sub("_", text).strip(" .")
    return cleaned[:max_length].rstrip(" .") or "untitled"


def _next_document_id(directory):
    """Return one more than the largest numeric `<id>_` prefix among *.json files."""
    highest = 0
    for name in os.listdir(directory):
        if not name.endswith(".json"):
            continue
        try:
            highest = max(highest, int(name.split("_")[0]))
        except ValueError:
            # Unrelated JSON file without a numeric prefix; ignore it.
            continue
    return highest + 1


def _log_scrape_failure(url, reason):
    """Print a skip notice for `url` and append the failure to ERROR_LOG."""
    print(f"Skipping {url} ({reason})")
    with open(ERROR_LOG, "a", encoding="utf-8") as err_file:
        err_file.write(f"Failed: {url} ({reason})\n")


# Function to scrape a single webpage
def scrape_page(url, depth, max_depth, category, subcategory):
    """Fetch one page, save its headings and paragraphs as JSON, and return its soup.

    The page title, <h1>-<h3> headings (underlined with dashes) and non-empty
    <p> paragraphs are joined into one text blob and written, together with
    metadata, to `<id>_<title>.json` in the current working directory. `<id>`
    is one more than the highest numeric prefix among the existing *.json files.

    Args:
        url: Address of the page to fetch.
        depth: Crawl depth of this page; stored in the metadata.
        max_depth: Pages with `depth > max_depth` are not fetched.
        category: Label stored in the document metadata.
        subcategory: Label stored in the document metadata.

    Returns:
        The parsed BeautifulSoup document, or None when the URL was already
        visited, is too deep, or could not be fetched (network error or a
        non-200 status; both are appended to ERROR_LOG).
    """
    if url in visited_urls or depth > max_depth:
        return None  # Avoid re-scraping and exceeding depth

    # Mark the URL before fetching so a dead link shared by many pages is
    # attempted once instead of once per referring page.
    visited_urls.add(url)

    try:
        response = requests.get(url, headers=HEADERS, timeout=10)  # 10s timeout
    except requests.RequestException as exc:
        # A timeout or connection error on one page must not abort the crawl.
        _log_scrape_failure(url, exc)
        return None
    if response.status_code != 200:
        _log_scrape_failure(url, f"Status Code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    title_tag = soup.find("title")
    title = title_tag.text.strip() if title_tag else "No Title"

    content_list = [f"Title: {title}\nURL: {url}\n"]

    # Extract headings & paragraphs in document order
    for tag in soup.find_all(["h1", "h2", "h3", "p"]):
        text = tag.get_text().strip()
        if tag.name == "p":
            if text:
                content_list.append(text)
        else:
            # Headings are underlined with dashes of matching length.
            content_list.append(f"\n{text}\n" + "-" * len(text) + "\n")

    content = "\n".join(content_list)

    doc_id = _next_document_id(os.getcwd())
    document = {
        "content": content,
        "metadata": {
            "source": url,
            "title": title,
            "category": category,
            "subcategory": subcategory,
            "date_scraped": datetime.datetime.now().isoformat(),
            "depth": depth,
            "id": doc_id,
        },
    }

    # The raw title may contain "/", "|", ":" or newlines, which are not valid
    # in file names; the metadata above keeps the unmodified title.
    filename = f"{doc_id}_{_safe_filename_part(title)}.json"

    with open(filename, "w", encoding="utf-8") as f:
        json.dump(document, f, ensure_ascii=False, indent=2)
    return soup

# Recursive function to scrape a page and the same-site pages it links to
def recursive_scrape(url, depth, category, subcategory, max_depth=3, base_url=None, delay=2):
    """Scrape `url`, then depth-first follow its same-site links up to `max_depth`.

    Args:
        url: Page to scrape.
        depth: Depth of `url` in the crawl (the start page is 0).
        category: Label forwarded to scrape_page for the document metadata.
        subcategory: Label forwarded to scrape_page for the document metadata.
        max_depth: Deepest level that is still scraped (default 3).
        base_url: URL whose host defines the site being crawled; defaults to
            `url`. Only links on exactly this host are followed.
        delay: Seconds to wait before each followed link (default 2).

    Returns:
        None. Pages are saved as a side effect of scrape_page.
    """
    if depth > max_depth or url in visited_urls:
        return

    if not base_url:
        base_url = url

    print(f"Scraping (Depth {depth}): {url}")
    soup = scrape_page(url, depth, max_depth, category, subcategory)

    if not soup:
        return

    # Children would sit at depth + 1; when that exceeds max_depth there is
    # nothing to follow, so skip link extraction and the per-link sleep.
    if depth >= max_depth:
        return

    site_host = urlparse(base_url).netloc.lower()

    links = []
    queued = set()

    # Extract all links and keep only those pointing at the same host
    for anchor in soup.find_all("a", href=True):
        try:
            # urljoin resolves root-relative, page-relative and
            # protocol-relative hrefs against the current page; urldefrag
            # drops "#fragment" so in-page anchors map back to the page itself.
            target, _fragment = urldefrag(urljoin(url, anchor["href"].strip()))
            parts = urlparse(target)
        except ValueError:
            continue  # malformed href
        if parts.scheme not in ("http", "https"):
            continue  # mailto:, tel:, javascript:, ...
        if parts.netloc.lower() != site_host:
            continue  # compare hosts exactly, not as a substring of the URL
        if target not in queued:
            queued.add(target)
            links.append(target)

    for link in links:
        if link not in visited_urls:
            time.sleep(delay)  # be polite to the server between requests
            recursive_scrape(link, depth + 1, category, subcategory, max_depth, base_url, delay)


In [3]:

# Crawl each configured start URL, labelling its documents with that entry's metadata
for url, meta in BASE_URLS.items():
    recursive_scrape(
        url,
        depth=0,
        category=meta['category'],
        subcategory=meta['subcategory'],
        max_depth=5,
        base_url=url,
    )


NameError: name 'BASE_URLS' is not defined