In [7]:
import datetime
import json
import os
import re
import time
import urllib
import urllib.parse

import requests
from bs4 import BeautifulSoup

In [9]:
# Headers to mimic a real browser request
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
BASE_URLS = {
    # "https://www.visitpittsburgh.com/things-to-do/":{'category': 'Pittsburgh', 'subcategory':'Events'},
    # "https://pittsburghrestaurantweek.com/": {'category': 'Pittsburgh', 'subcategory': 'Events'},
    # "https://www.pghtacofest.com/":{'category': 'Pittsburgh', 'subcategory': 'Events'},
    # "https://littleitalydays.com/":{'category': 'Pittsburgh', 'subcategory': 'Events'},
    # "https://www.picklesburgh.com/":{'category': 'Pittsburgh', 'subcategory': 'Events'},   
    # "https://pittsburghsymphony.org/":{'category': 'Pittsburgh', 'subcategory': 'Events'},
    # "https://www.pghtech.org/":{'category': 'Pittsburgh', 'subcategory': 'Events'},
    # "https://www.thefrickpittsburgh.org/":{'category': 'Pittsburgh', 'subcategory': 'Events'},
    # "https://www.visitpittsburgh.com/blog/":{'category': 'Pittsburgh', 'subcategory': 'Events'},
    # "https://trustarts.org/":{'category': 'Pittsburgh', 'subcategory': 'Events'},
    # "https://www.visitpittsburgh.com/events-festivals/":{'category': 'Pittsburgh', 'subcategory': 'Events'},
    # "https://www.visitpittsburgh.com/hotels-resorts/":{'category': 'Pittsburgh', 'subcategory': 'Events'},
    # "https://www.visitpittsburgh.com/restaurants-culinary/":{'category': 'Pittsburgh', 'subcategory': 'Events'},
    #  "https://en.wikipedia.org/wiki/Cork_Run_Tunnel":{'category': 'Pittsburgh', 'subcategory': 'Facts'},
    #  "https://en.wikipedia.org/wiki/Fort_Pitt_Tunnel":{'category': 'Pittsburgh', 'subcategory': 'Facts'},
    #  "https://en.wikipedia.org/wiki/Liberty_Tunnel":{'category': 'Pittsburgh', 'subcategory': 'Facts'},
    #  "https://en.wikipedia.org/wiki/Mount_Washington_Transit_Tunnel":{'category': 'Pittsburgh', 'subcategory': 'Facts'},
    #  "https://en.wikipedia.org/wiki/Squirrel_Hill_Tunnel":{'category': 'Pittsburgh', 'subcategory': 'Facts'},
    #  "https://en.wikipedia.org/wiki/Schenley_Tunnel":{'category': 'Pittsburgh', 'subcategory': 'Facts'},
    #  "https://en.wikipedia.org/wiki/List_of_bridges_of_Pittsburgh":{'category': 'Pittsburgh', 'subcategory': 'Facts'}
    "https://carnegiemuseums.org/events/":{'category': 'Pittsburgh', 'subcategory': 'Events'}
    
}

# Output files
OUTPUT_DIRECTORY = os.path.join(os.getcwd(), "json_documents")
os.makedirs(OUTPUT_DIRECTORY, exist_ok = True)
ERROR_LOG = "scraping_errors.log"

In [10]:
def carnegie_museum_events(soup):
    event_cards = soup.find_all('article', class_='event-card')
    
    content_list = []

    for card in event_cards:
        try:
            # Extract event start and end dates
            start_date = card.get('data-event-start')
            end_date = card.get('data-event-end')
            
            # Extract recurrence information
            recurrence_event = card.get('data-recurring-event', False)
            recurrence_frequency = card.get('data-recurring-frequency', None)
            recurrence_day_index = card.get('data-day-index', None)
            weekday = card.get('data-day-weekday', None)
            
            # Extract event title and link
            title_element = card.find('div', class_='event-card__content')
            title = title_element.text.strip() if title_element else "No title"
            link = None
            if title_element and title_element.find('a'):
                link = title_element.find('a').get('href')
            
            # Extract venue information
            venue_element = card.find('div', class_='event-card__venue')
            venue = None
            venue_link = None
            if venue_element and venue_element.find('a'):
                venue = venue_element.find('a').text.strip()
                venue_link = venue_element.find('a').get('href')
            
            # Extract details
            details_element = card.find('div', class_='event-card__details')
            details = details_element.text.strip() if details_element else None
            
            content = f"{title} starts on {start_date} and ends on {end_date}. It will be located at {venue}. {details}"
            
            if recurrence_event:
                content += f"The event occurs on {recurrence_frequency} on {weekday}"
            
            content_list.append(content)
        
        except Exception as e:
            print(f"Error parsing event card: {e}")
            continue
        
        return "\n".join(content_list)
        
        

In [11]:

# def get_base_domain(url):
#     pattern = r'^(?:https?:\/\/)?(?:www\.)?([^\/]+?)\.(?:com|org|gov|edu)(?:\/|$)'
#     match = re.search(pattern, url)
#     return match.group(1) if match else None

# Function to scrape a single webpage
def scrape_page(url, depth, max_depth, category, subcategory, scraped_urls):
    """Scrapes headings and paragraphs from a given webpage."""
    if depth > max_depth:
        return None 

    response = requests.get(url, headers=HEADERS, timeout=10)  # 10s timeout
    if response.status_code != 200:
        print(f"Skipping {url} (Status Code: {response.status_code})")
        with open(ERROR_LOG, "a", encoding="utf-8") as err_file:
            err_file.write(f"Failed: {url} (Status Code: {response.status_code})\n")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    
    if url not in scraped_urls:
        print(f"Scraping (Depth {depth}): {url}")
        title_tag = soup.find("title")
        title = title_tag.text.strip() if title_tag else "No Title"
        title = re.sub(r'[<>:"/\\|?*\x00-\x1F]', '', title)

        content_list = [f"Title: {title}\nURL: {url}\n"]

        # Extract headings & paragraphs
        for tag in soup.find_all(["h1", "h2", "h3", "p"]):
            if tag.name in ["h1", "h2", "h3"]:  # Headings
                content_list.append(f"\n{tag.text.strip()}\n" + "-" * len(tag.text.strip()) + "\n")
            elif tag.name == "p":  # Paragraphs
                text = tag.get_text().strip()
                if text:
                    content_list.append(text)
                    content = "\n".join(content_list)
                    
        if "https://carnegiemuseums.org/events" in url:
            content = carnegie_museum_events(soup)
        
        
        max_id = max(
            [0] + [
                int(s.split("_")[0])
                for s in os.listdir(OUTPUT_DIRECTORY)
                if s.endswith('.json')
            ]
        )
        document = {
                "content": content,
                "metadata": {
                    "source": url,
                    "title": title,
                    "category": category,
                    "subcategory": subcategory,
                    "date_scraped": datetime.datetime.now().isoformat(),
                    "depth": depth,
                    'id':max_id + 1
                }
            }
        doc_id = document["metadata"]["id"]
        
        filename = f"{doc_id}_{title}.json"
        filepath = os.path.join(OUTPUT_DIRECTORY, filename)

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(document, f, ensure_ascii=False, indent=2)
    return soup


def recursive_scrape(url, depth, category, subcategory, visited_urls, scraped_urls, max_depth = 3, base_url=None):
    if depth > max_depth or url == "https://carnegiemuseums.org/" or url == "https://carnegiemuseums.org":
        return
    
    if not base_url:
        # parsed_url = urllib.parse.urlparse(url)
        # base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
        base_url = url
    
    visited_urls.add(url)
    
    soup = None
    soup = scrape_page(url, depth, max_depth, category, subcategory, scraped_urls)
    scraped_urls.add(url)
    if not soup:
        return
    
    
    if max_depth > 0 and depth < max_depth:
        links = []
        
        if "en.wikipedia" in url:
            tables = soup.find_all("table", class_="wikitable")

            links = []
            for table in tables:
                for row in table.find_all("tr")[1:]: 
                    if row and len(row) > 0:
                        link_tags = row.find_all("a", href=True)
                        if link_tags and link_tags[0]["href"].startswith("/wiki/"):
                            link = link_tags[0]
                            links.append("https://en.wikipedia.org" + link["href"])
            

        else:
            full_url = None
            for link in soup.find_all("a", href=True):
                href = link["href"]
                full_url = None

                if href.startswith("/"):
                    full_url = urllib.parse.urljoin(base_url, href)  # Use urljoin to correctly resolve relative paths
                elif href.startswith(("http://", "https://")):
                    parsed_href = urllib.parse.urlparse(href)
                    parsed_base = urllib.parse.urlparse(base_url)

                    # Ensure the link belongs to the same domain
                    if parsed_href.netloc == parsed_base.netloc:
                        full_url = href

                if full_url and full_url not in visited_urls:
                    links.append(full_url)
        
        print(f"Found {len(links)} links at {url}")
        # print(links)
        for link in links:
            if link not in visited_urls:
                time.sleep(2)
                recursive_scrape(link, depth+1, category, subcategory, visited_urls, scraped_urls, max_depth) 


In [12]:
def get_scraped_urls(documents_dir):
    scraped_urls = set()
    
    for filename in os.listdir(documents_dir):
        if not filename.endswith('.json'):
            continue
        filepath = os.path.join(documents_dir, filename)
        
        with open(filepath, 'r', encoding='utf-8') as f:
            document = json.load(f)
            if 'source' in document['metadata']:
                url = document['metadata']['source']
                scraped_urls.add(url)
    
    return scraped_urls

In [13]:
# Visited URLs (to prevent duplicate scraping)
scraped_urls = get_scraped_urls(OUTPUT_DIRECTORY)
visited_urls = set()

for url in BASE_URLS:
    recursive_scrape(url, 
                     depth=0, 
                     category = BASE_URLS[url]['category'], 
                     subcategory =BASE_URLS[url]['subcategory'], 
                     visited_urls = visited_urls, 
                     scraped_urls=scraped_urls, 
                     max_depth=3, 
                     base_url=url)


Found 150 links at https://carnegiemuseums.org/events/
Found 111 links at https://carnegiemuseums.org/join-support/membership/joinrenew/
Found 109 links at https://carnegiemuseums.org/timed-tickets/
Scraping (Depth 1): https://carnegiemuseums.org/events?museum=carnegie-museum-of-art
Found 16 links at https://carnegiemuseums.org/events?museum=carnegie-museum-of-art
Scraping (Depth 2): https://carnegiemuseums.org/events?event_type=exhibitions
Found 3 links at https://carnegiemuseums.org/events?event_type=exhibitions
Scraping (Depth 3): https://carnegiemuseums.org/events?museum=carnegie-science-center
Scraping (Depth 2): https://carnegiemuseums.org/events?event_type=activities
Found 23 links at https://carnegiemuseums.org/events?event_type=activities
Scraping (Depth 3): https://carnegiemuseums.org/events?museum=carnegie-museum-of-natural-history
Scraping (Depth 3): https://carnegiemuseums.org/events?museum=the-andy-warhol-museum
Scraping (Depth 3): https://carnegiemuseums.org/events?audie

In [25]:
SKIP_PATTERN = re.compile(r"_Terms|_Notice|_Become|_Privacy")

for filename in os.listdir(OUTPUT_DIRECTORY):
    count = 0
    if SKIP_PATTERN.search(filename): 
        count+=1
        os.remove(os.path.join(OUTPUT_DIRECTORY, filename))
print(count)

0
