In [None]:
!pip install selenium
!pip install webdriver-manager

### Pittsburgh Events 

In [None]:
import re
import json
import time
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

def clean_text(text):
    """Trim ``text`` and collapse each run of whitespace into one space."""
    trimmed = text.strip()
    return re.sub(r"\s+", " ", trimmed)

def get_driver(headless=False):
    """Create a Chrome WebDriver using a driver binary from webdriver-manager.

    Args:
        headless: When true, ``--headless`` is added to the Chrome arguments.

    Returns:
        The started ``webdriver.Chrome`` instance; the caller must quit it.
    """
    flags = [
        "--disable-blink-features=AutomationControlled",
        "--disable-extensions",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ]
    if headless:
        # Keep "--headless" first, matching the order arguments were added in.
        flags.insert(0, "--headless")

    opts = webdriver.ChromeOptions()
    for flag in flags:
        opts.add_argument(flag)

    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=opts,
    )

def click_show_more_buttons(driver):
    """Click each kind of "show more" control once, if it is present.

    Looks, in order, for a clickable ``div`` whose class contains
    ``gsb-show-more`` and one whose class contains ``ptb-show-more``, waiting
    up to 3 seconds for each. Every control found is clicked via JavaScript
    and followed by a 2-second pause.

    Args:
        driver: The Selenium WebDriver whose current page is inspected.

    Returns:
        True if at least one control was clicked, otherwise False.
    """
    clicked = False
    for css_class in ("gsb-show-more", "ptb-show-more"):
        xpath = f"//div[contains(@class, '{css_class}')]"
        try:
            button = WebDriverWait(driver, 3).until(
                EC.element_to_be_clickable((By.XPATH, xpath))
            )
            # Click through JavaScript rather than WebElement.click().
            driver.execute_script("arguments[0].click();", button)
        except Exception:
            # Best effort: a missing/stale control just means "nothing to
            # click". Catching Exception (not a bare except) lets Ctrl-C and
            # SystemExit still stop the caller's loop.
            continue
        time.sleep(2)
        clicked = True

    return clicked


def scrape_events_page(url, headless=False):
    """Load an events page in Chrome, expand it fully, and extract its events.

    The page is opened with Selenium, the "show more" controls are clicked
    until none is left, and the resulting HTML is parsed with BeautifulSoup.
    Two kinds of listing are collected: ``li.date-row`` entries and
    ``a.v-list-char`` entries.

    Args:
        url: Address of the events page to scrape.
        headless: Whether to run Chrome headless. Defaults to False, which is
            what this function always used before the parameter existed.

    Returns:
        A newline-joined string. Date-row events are formatted as
        ``name | date | time | venue | price`` and v-list-char events as
        ``name | count``. Missing fields are empty; a missing name is
        ``"No name"``.
    """
    driver = get_driver(headless=headless)
    try:
        driver.get(url)
        # Keep expanding until a pass clicks nothing.
        while click_show_more_buttons(driver):
            pass
        print("No more 'Show More' All events should be loaded.")
        page_source = driver.page_source
    finally:
        # Always release the browser, even if loading or expanding failed.
        driver.quit()

    soup = BeautifulSoup(page_source, "html.parser")

    def text_or(tag, default=""):
        """Return the cleaned text of ``tag``, or ``default`` if it is None."""
        return clean_text(tag.get_text()) if tag is not None else default

    events = []

    date_rows = soup.find_all("li", class_="date-row")
    print("Found", len(date_rows), "date-row events.")
    for row in date_rows:
        try:
            # The event name is the link inside the div with class "venue".
            name_tag = row.find("div", class_="venue")
            name_link = name_tag.find("a") if name_tag else None

            name = text_or(name_link, "No name")
            date_text = text_or(row.find("div", class_="date"))
            time_text = text_or(row.find("div", class_="time"))
            venue_text = text_or(row.find("div", class_="date-desc"))
            price_text = text_or(row.find("div", class_="from-price"))

            events.append(
                f"{name} | {date_text} | {time_text} | {venue_text} | {price_text}"
            )
        except Exception as e:
            print(f"Skipping one date-row event due to error: {e}")
            continue

    char_listings = soup.find_all("a", class_="v-list-char")
    print("Found", len(char_listings), "v-list-char events.")
    for link in char_listings:
        try:
            name = text_or(link.find("div", class_="pname"), "No name")
            pcount_text = text_or(link.find("div", class_="pcount"))
            events.append(f"{name} | {pcount_text}")
        except Exception as e:
            print(f"Skipping one v-list-char event due to error: {e}")
            continue

    print(f"Scraped {len(events)} events.")

    return "\n".join(events)

def main():
    """Scrape the pittsburgh.events page of each month and save it as JSON.

    For every month from March to December, the month page is scraped with
    ``scrape_events_page`` and written to ``pittsburgh_events_<month>.json``
    in the current working directory. Each file holds the scraped text under
    ``"content"`` and the source URL, a title and the scrape timestamp under
    ``"metadata"``.
    """
    months = (
        "march", "april", "may", "june", "july",
        "august", "september", "october", "november", "december",
    )
    # URLs for each month
    urls_by_month = {
        month: f"https://pittsburgh.events/{month}/" for month in months
    }

    for month_name, url in urls_by_month.items():
        print(f"Scraping {month_name.title()} => {url}")
        events_data = scrape_events_page(url)
        doc = {
            "content": events_data,
            "metadata": {
                "source": url,
                "title": f"Pittsburgh Events - {month_name.title()}",
                "date_scraped": datetime.now().isoformat(),
            },
        }
        filename = f"pittsburgh_events_{month_name}.json"
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(doc, f, ensure_ascii=False, indent=2)

        print(f"Saved {filename}")


if __name__ == "__main__":
    main()


### Pittsburgh Downtown Events 

In [None]:
def clean_text(text):
    """Return ``text`` trimmed, with every run of spaces, tabs or newlines replaced by one space."""
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", text.strip())

def get_driver(headless=True):
    """Start Chrome through Selenium and return the WebDriver.

    Chrome runs headless unless ``headless`` is false. The driver binary is
    obtained via webdriver-manager.
    """
    chrome_options = webdriver.ChromeOptions()
    switches = (["--headless"] if headless else []) + [
        "--disable-blink-features=AutomationControlled",
        "--disable-extensions",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ]
    for switch in switches:
        chrome_options.add_argument(switch)

    driver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(driver_path), options=chrome_options)

def scrape_downtown_pittsburgh_events():
    """Scrape the Downtown Pittsburgh events page into one formatted string.

    The page is loaded in headless Chrome, given 3 seconds to render, and its
    HTML is parsed with BeautifulSoup. Every ``div.copyContent`` element is
    treated as one event.

    Returns:
        The events joined with ``" --- "``, each formatted as
        ``category | title | date/time | description``. Fields that cannot be
        found fall back to ``"No Category"``, ``"No Title"`` or
        ``"No Date/Time"``.
    """
    url = "https://downtownpittsburgh.com/events/"
    driver = get_driver(headless=True)
    try:
        driver.get(url)
        time.sleep(3)
        page_source = driver.page_source
    finally:
        # Always shut the browser down, even if the page failed to load.
        driver.quit()

    soup = BeautifulSoup(page_source, "html.parser")
    events = []
    event_items = soup.find_all("div", class_="copyContent")

    for event in event_items:
        try:
            category_div = event.find("div", class_="term")
            category = clean_text(category_div.get_text()) if category_div else "No Category"

            title_h1 = event.find("h1")
            title_link = title_h1.find("a") if title_h1 else None
            title = clean_text(title_link.get_text()) if title_link else "No Title"

            date_div = event.find("div", class_="eventdate")
            date_time = clean_text(date_div.get_text()) if date_div else "No Date/Time"

            # The description is whatever text remains in the element once
            # the "READ MORE" label and the already-extracted fields are removed.
            event_description = event.get_text().replace("READ MORE", "").strip()
            event_description = clean_text(
                event_description.replace(date_time, "").replace(title, "").replace(category, "")
            )

            events.append(f"{category} | {title} | {date_time} | {event_description}")
        except Exception as e:
            print(f"Skipping an event due to error: {e}")
            continue

    print(f"Scraped {len(events)} events.")
    return " --- ".join(events)

def main():
    """Scrape the Downtown Pittsburgh events and save them as one JSON document.

    The text returned by ``scrape_downtown_pittsburgh_events`` is stored under
    ``"content"`` in ``pittsburgh_downtown_events.json`` (current working
    directory), with the source URL, a title and the scrape timestamp under
    ``"metadata"``.
    """
    event_text = scrape_downtown_pittsburgh_events()  # Get cleaned event text
    doc = {
        "content": event_text,
        "metadata": {
            "source": "https://downtownpittsburgh.com/events/",
            "title": "pittsburgh_downtown_events",
            "date_scraped": datetime.now().isoformat(),
        },
    }
    filename = "pittsburgh_downtown_events.json"
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(doc, f, ensure_ascii=False, indent=2)

    # event_text is a single string, so its length is a character count, not
    # an event count; the message now says so.
    print(f"Saved {filename} with {len(event_text)} characters of event text.")


if __name__ == "__main__":
    main()


### CMU Events Scraper

In [None]:
from urllib.parse import urljoin
import json
# Scrape every event listed on https://events.cmu.edu/all and save the result
# to cmu_all_events.json as a single text blob plus metadata.
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)
base_url = "https://events.cmu.edu"
url = base_url + "/all"


def _event_field(event, class_name, default):
    """Return the stripped text of the child with ``class_name``, or ``default`` on any lookup error."""
    try:
        return event.find_element(By.CLASS_NAME, class_name).text.strip()
    except Exception:
        return default


# Output fragments are collected in a list and joined once at the end.
content_parts = []
current_date = None

try:
    driver.get(url)

    # Click "Show 50 more" button until all events are loaded
    while True:
        try:
            show_more = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "a.lw_cal_next"))
            )
            show_more.click()
            time.sleep(2)
        except Exception:
            print("No more 'Show 50 more' button found or error.")
            break

    events_container = driver.find_element(By.ID, "lw_cal_events")
    children = events_container.find_elements(By.XPATH, "./*")

    # Direct children of the container are walked in order: each <h3> starts
    # a new date section and each <div> holds the events under that date.
    for child in children:
        tag = child.tag_name.lower()

        if tag == "h3":
            if current_date:
                # Separator between consecutive date sections.
                content_parts.append(" ##### ")
            current_date = child.text.strip()
            content_parts.append(f"{current_date}: ")

        elif tag == "div":
            try:
                event_list = child.find_element(By.CLASS_NAME, "lw_cal_event_list")
                events = event_list.find_elements(By.CLASS_NAME, "lw_cal_event_info")

                for event in events:
                    try:
                        title_elem = event.find_element(By.CSS_SELECTOR, "div.lw_events_title a")
                        event_title = title_elem.text.strip()
                        event_href = urljoin(base_url, title_elem.get_attribute("href"))

                        event_location = _event_field(event, "lw_events_location", "Location: N/A")
                        event_time = _event_field(event, "lw_events_time", "Time: N/A")
                        event_summary = _event_field(event, "lw_events_summary", "No summary available.")

                        content_parts.append(
                            f"{event_title}, {event_time}, {event_location}, {event_summary}, {event_href} || "
                        )

                    except Exception as e:
                        print(f"Error processing event: {e}")

            except Exception as e:
                print(f"Error processing event list for date {current_date}: {e}")
                continue
finally:
    # Close the browser even when scraping fails part-way through.
    driver.quit()

content_output = "".join(content_parts)
data = {
    "content": content_output.strip(),
    "metadata": {
        "source": "https://events.cmu.edu/all",
        "title": "Carnegie Mellon University Events",
        "date_scraped": datetime.now().isoformat(),
        "depth": 2,
        "id": 1
    }
}
with open("cmu_all_events.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("Scraping complete. Data saved to cmu_all_events.json")


### CMU Drama Calendar

In [None]:
!pip install ics

In [None]:
from ics import Calendar

def clean_text(text):
    """Collapse whitespace runs in ``text`` to single spaces and trim the ends.

    Falsy input (``None`` or an empty string) yields ``"No Description"``.
    """
    if not text:
        return "No Description"
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

# Convert a downloaded CMU Drama .ics calendar into cmu_drama_calendar.json:
# one text blob listing every event, plus metadata.
ics_file_path = r"C:\Users\maitr\OneDrive\Desktop\cmu-2025\anlp\assignments\assignment2-pitt-rag-local-copy\scrapper\cmu-drama-8e20d4b7c6d.ics"  # Replace with your file path
with open(ics_file_path, "r", encoding="utf-8") as file:
    calendar = Calendar(file.read())

# calendar.events has no defined iteration order; sort by start time (then
# name) so the output is chronological and identical from run to run.
ordered_events = sorted(calendar.events, key=lambda ev: (ev.begin, ev.name or ""))

entries = []
for event in ordered_events:
    event_date = event.begin.date().isoformat()
    event_title = clean_text(event.name) if event.name else "No Title"
    # Start and end are converted to the local timezone of the machine running this.
    event_start = event.begin.to('local').strftime("%Y-%m-%d %H:%M:%S")
    event_end = event.end.to('local').strftime("%Y-%m-%d %H:%M:%S")
    event_location = clean_text(event.location) if event.location else "Location: N/A"
    event_description = clean_text(event.description) if event.description else "No Description"

    entries.append(
        f"{event_date}: {event_title}, {event_start} - {event_end}, {event_location}, {event_description} || "
    )

content_output = "".join(entries)

data = {
    "content": content_output.strip(),
    "metadata": {
        "source": "events.ics",
        "title": "Extracted Events",
        "date_scraped": datetime.now().isoformat(),
        "depth": 1,
        "id": 1
    }
}

json_file_path = "cmu_drama_calendar.json"
with open(json_file_path, "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print(f"Conversion complete! Data saved to {json_file_path}")


### CMU DRAMA

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import json
import os
import time
from datetime import datetime

# Configuration
BASE_URL = "https://drama.cmu.edu/"  # crawl entry point; crawl() only follows links on this host
MAX_DEPTH = 7  # crawl() returns without fetching once depth exceeds this (start page is depth 1)
OUTPUT_DIR = "cmu_drama"  # directory that save_to_json() writes into
visited_urls = set()  # URLs crawl() has already begun processing

# Create the output directory up front; no error if it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def clean_text(text):
    """Normalize whitespace: split ``text`` on any whitespace and rejoin with single spaces."""
    words = text.split()
    return " ".join(words)

def extract_text(soup):
    """Return the readable text of a parsed page as one whitespace-normalized string.

    Text is taken from heading, paragraph, list-item, ``span`` and ``div``
    elements. Only the outermost matching element of each nested group is
    used, because its text already includes that of its descendants; emitting
    the descendants as well would repeat the same text once per nesting
    level.

    Args:
        soup: A BeautifulSoup document (or tag) to extract text from.

    Returns:
        The extracted text with all whitespace collapsed by ``clean_text``.
    """
    text_tags = ["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "span", "div"]
    chunks = []
    for elem in soup.find_all(text_tags):
        # Skip elements contained in another matched element.
        if elem.find_parent(text_tags) is not None:
            continue
        # A space separator keeps adjacent inline strings from being glued
        # together (e.g. "<b>Hello</b><i>World</i>" -> "Hello World").
        text = elem.get_text(separator=" ", strip=True)
        if text:
            chunks.append(text)
    return clean_text("\n".join(chunks))

def generate_filename(url):
    """Derive the JSON output filename for a crawled URL.

    The name is built from the URL path and, when present, the query string,
    so pages that differ only in their query (``?id=3`` vs ``?id=4``) no
    longer map to the same file and overwrite each other. The characters
    ``/``, ``?``, ``=`` and ``&`` are replaced with underscores.

    Args:
        url: Absolute URL of the page.

    Returns:
        ``"cmu_drama_<slug>.json"``, or ``"cmu_drama_home.json"`` when the
        URL has neither a path nor a query.
    """
    parsed = urlparse(url)
    pieces = [part for part in (parsed.path.strip("/"), parsed.query) if part]
    slug = "_".join(pieces)
    for separator in "/?=&":
        slug = slug.replace(separator, "_")
    return f"cmu_drama_{slug}.json" if slug else "cmu_drama_home.json"

def save_to_json(data, filename):
    """Write ``data`` as indented UTF-8 JSON to ``filename`` inside ``OUTPUT_DIR`` and report the path."""
    filepath = os.path.join(OUTPUT_DIR, filename)
    with open(filepath, "w", encoding="utf-8") as out_file:
        json.dump(data, out_file, ensure_ascii=False, indent=2)
    print(f"✔ Saved: {filepath}")

def crawl(url, depth=1):
    """Recursively fetch a page, save its text as JSON, and follow same-host links.

    A page is skipped when ``depth`` exceeds ``MAX_DEPTH`` or when its URL
    (ignoring any ``#fragment``) is already in ``visited_urls``. Fetch errors
    are printed and end that branch of the crawl.

    Args:
        url: Absolute URL of the page to fetch.
        depth: Link distance from the start page; the start page is depth 1.
    """
    # Local import: urldefrag is not part of this file's import block.
    from urllib.parse import urldefrag

    # "page" and "page#section" are the same document; drop the fragment so
    # it is fetched and saved only once.
    url = urldefrag(url)[0]

    if depth > MAX_DEPTH or url in visited_urls:
        return
    visited_urls.add(url)
    print(f"Crawling ({depth}): {url}")

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f" Error fetching {url}: {e}")
        return

    soup = BeautifulSoup(response.text, "html.parser")

    # .string is None when <title> is empty or contains nested markup, so it
    # must be checked before calling .strip().
    title_text = soup.title.string if soup.title else None
    title = title_text.strip() if title_text else "No Title"

    page_text = extract_text(soup)
    data = {
        "content": page_text,
        "metadata": {
            "title": title,
            "source": url,
            "date_scraped": datetime.now().isoformat(),
            "depth": depth
        }
    }
    save_to_json(data, generate_filename(url))

    site_host = urlparse(BASE_URL).netloc
    for link in soup.find_all("a", href=True):
        # Resolve relative hrefs against the page they appear on, not the
        # site root, so links found on sub-pages point to the right place.
        full_url = urldefrag(urljoin(url, link["href"]))[0]
        # Stay on the configured host; this also drops mailto:/tel: links,
        # which have no network location.
        if urlparse(full_url).netloc == site_host:
            crawl(full_url, depth + 1)

# Start the crawl at the site root; depth defaults to 1.
crawl(BASE_URL)

# NOTE(review): visited_urls also contains URLs whose fetch failed, so this
# count can be higher than the number of files actually written.
print(f"\nScraping complete! {len(visited_urls)} pages saved in '{OUTPUT_DIR}/'.")
