In [1]:
import calendar
import json
import os
import random
import re
import time
from datetime import datetime, timedelta
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

In [2]:
# Move the working directory one level up. The target is captured only once
# per kernel session so that re-running this cell does not climb one more
# directory level on every execution.
if 'PROJECT_ROOT' not in globals():
    PROJECT_ROOT = os.path.dirname(os.getcwd())
os.chdir(PROJECT_ROOT)
os.getcwd()  # * Check working directory

'/Users/xx/Documents/Repositories/anlp-spring2025-hw2'

In [None]:
# Directory (relative to the working directory) that save_events_to_json writes into
RAW_DATA_DIR = 'raw_data/events_pittsburgh_cmu'

In [4]:
# Inclusive date range used as the default window by get_cmu_events
STARTING_DATE = datetime(year=2025, month=3, day=19)
ENDING_DATE = datetime(year=2025, month=12, day=31)

In [5]:
# Entry-point URLs of the event sites scraped in this notebook
PITTSBURGH_EVENTS_URL = 'https://pittsburgh.events/'  # a lowercase month name is joined onto this
DOWNTOWN_PITTSBURGH_URL = 'https://downtownpittsburgh.com/events/'
PGH_CITY_PAPER_URL = 'https://www.pghcitypaper.com/pittsburgh/EventSearch'
CMU_EVENT_URL = 'https://events.cmu.edu/day/date/'  # a YYYYMMDD date string is appended to this
CMU_COMMUNITY_URL = 'https://community.cmu.edu/s/events'

In [6]:
# Shared Selenium Chrome session used as a global by the scraping functions below
driver = webdriver.Chrome()
# NOTE(review): presumably a 2-second implicit wait applied to every element lookup,
# so each lookup of a missing element costs up to that long — confirm against the Selenium docs
driver.implicitly_wait(2)

In [7]:
def save_events_to_json(filename: str, events: list[dict]):
    """
    Write scraped events to a JSON file inside RAW_DATA_DIR

    Args:
        filename (str): Output file name; '.json' is appended when it is missing
        events (list[dict]): Scraped event dictionaries to serialize
    """
    filepath = os.path.join(RAW_DATA_DIR, filename)
    if not filepath.endswith('.json'):
        filepath += '.json'
    # Create the output directory on first use instead of failing with FileNotFoundError
    os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)
    # Pin the encoding so the result does not depend on the platform default
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(events, f, indent=4)
    print(f'Saved scraped data to {filepath}')

# Pittsburgh Events

In [None]:
def load_pgh_event_by_month(month: str):
    """
    Load all the events in this month on pittsburgh.events

    Opens the month page in the shared driver and keeps clicking the element
    with class 'ldm' until it is no longer clickable within 10 seconds.

    Args:
        month (str): full month name in lowercase
    """
    driver.get(urljoin(PITTSBURGH_EVENTS_URL, month))
    while True:
        try:
            show_more_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CLASS_NAME, 'ldm'))
            )
            show_more_button.click()
            time.sleep(5)  # Give the newly requested events time to render
        except Exception:
            # Any failure to find or click the button ends the loading loop.
            # `Exception` (not a bare except) keeps KeyboardInterrupt working.
            break


def scrape_pgh_event_by_month(month: str) -> list[dict]:
    """
    Scrape event info from a month-specific page of pittsburgh.events

    Reads the page currently loaded in the shared driver; `month` is only used
    to label the progress bar.

    Args:
        month (str): full month name in lowercase

    Returns:
        list[dict]: A list of scraped event dictionaries
    """
    # Output key -> locator of the element inside each event <li>
    find_mapping = {
        'url': (By.TAG_NAME, 'a'),
        'title': (By.TAG_NAME, 'a'),
        'date': (By.CLASS_NAME, 'date'),
        'time': (By.CLASS_NAME, 'time'),
        'datedesc': (By.CLASS_NAME, 'date-desc'),
        'location': (By.CLASS_NAME, 'location'),
        'price': (By.CLASS_NAME, 'from-price'),
    }
    events = []
    try:
        event_ul = driver.find_element(By.XPATH, '//ul[contains(@class, "dates-list")]')
        # Extract event info from each <li> tag
        for event_li in tqdm(event_ul.find_elements(By.TAG_NAME, 'li'),
                             desc=f'Scraping {month}'):
            event = {}
            for key, (by, value) in find_mapping.items():
                try:
                    element = event_li.find_element(by, value)
                    if key == 'url':
                        text = element.get_attribute('href')
                    else:
                        text = element.text
                    if text is None:  # e.g. an <a> without href: skip this field
                        continue
                    event[key] = text.replace('\n', ' ').strip()
                except Exception:
                    # Best effort: a missing field is simply left out of the event.
                    # `Exception` (not a bare except) keeps KeyboardInterrupt working.
                    pass
            if 'date' in event and 'time' in event:  # Format datetime
                # Double-quoted f-string: reusing the outer quote inside the
                # braces is a SyntaxError before Python 3.12
                event['datetime'] = f"{event['date']}, {event['time']}"
            if len(event) > 0:  # Valid event info
                events.append(event)
    except Exception:
        print(f'{driver.current_url}: No event list found.')
    time.sleep(random.uniform(2, 10))  # Random pause between page scrapes
    return events


# Try both helpers on a single month: load the March page, then scrape it
load_pgh_event_by_month('march')
scrape_pgh_event_by_month('march')

Scraping march: 100%|██████████| 105/105 [00:05<00:00, 17.90it/s]


[{'url': 'https://pittsburgh.events/tickets/avery-sunshine/',
  'title': 'Avery Sunshine',
  'date': 'MAR 10 2025',
  'time': '7:30 PM MON',
  'datedesc': 'City Winery Pittsburgh',
  'location': '15222, Pittsburgh, Pennsylvania, US',
  'price': 'Prices from $87',
  'datetime': 'MAR 10 2025, 7:30 PM MON'},
 {'url': 'https://pittsburgh.events/tickets/willis/',
  'title': 'WILLIS',
  'date': 'MAR 10 2025',
  'time': '8:00 PM MON',
  'datedesc': 'Bottlerocket Social Hall',
  'location': '15210, Pittsburgh, Pennsylvania, US',
  'price': 'Prices from $798',
  'datetime': 'MAR 10 2025, 8:00 PM MON'},
 {'url': 'https://pittsburgh.events/tickets/vegas-golden-knights/',
  'title': 'Vegas Golden Knights',
  'date': 'MAR 11 2025',
  'time': '7:00 PM TUE',
  'datedesc': 'PPG Paints Arena',
  'location': '15219, Pittsburgh, Pennsylvania, US',
  'price': 'Prices from $24',
  'datetime': 'MAR 11 2025, 7:00 PM TUE'},
 {'url': 'https://pittsburgh.events/tickets/jenna-nicholls/',
  'title': 'Jenna Nichol

In [10]:
def get_pittsburgh_events(filename: str, save_file=True):
    """
    Scrape pittsburgh.events month by month and optionally save the result

    Args:
        filename (str): Output file name passed to save_events_to_json
        save_file (bool, optional): Whether to write the events to disk. Defaults to True.
    """
    print(f'Scraping from {PITTSBURGH_EVENTS_URL}')

    # Month pages from March (index 3) through December
    all_events = []
    for month_name in calendar.month_name[3:]:
        month_slug = month_name.lower()
        load_pgh_event_by_month(month_slug)
        all_events += scrape_pgh_event_by_month(month_slug)
    print(f'Scraped {len(all_events)} events')

    if not save_file:
        return
    save_events_to_json(filename, all_events)


# Run the full pittsburgh.events scrape and save it as pittsburgh_events.json
get_pittsburgh_events('pittsburgh_events.json')

Scraping from https://pittsburgh.events/


Scraping march: 100%|██████████| 105/105 [00:05<00:00, 18.15it/s]
Scraping april: 100%|██████████| 164/164 [00:09<00:00, 17.33it/s]
Scraping may: 100%|██████████| 86/86 [00:04<00:00, 17.53it/s]
Scraping june: 100%|██████████| 68/68 [00:03<00:00, 18.73it/s]
Scraping july: 100%|██████████| 39/39 [00:02<00:00, 17.56it/s]
Scraping august: 100%|██████████| 45/45 [00:02<00:00, 17.89it/s]
Scraping september: 100%|██████████| 41/41 [00:02<00:00, 16.98it/s]
Scraping october: 100%|██████████| 7/7 [00:00<00:00, 18.65it/s]
Scraping november: 100%|██████████| 3/3 [00:00<00:00, 15.58it/s]
Scraping december: 0it [00:00, ?it/s]


Scraped 558 events
Saved scraped data to raw_data/pittsburgh_events.json


# Downtown Pittsburgh Events

In [8]:
def scrape_pdp_event_item(event_item: Tag) -> dict:
    """
    Scrape one event from the Downtown Pittsburgh event list and its detail page

    Args:
        event_item (bs4.element.Tag): eventitem div tag from Downtown Pittsburgh event page

    Returns:
        dict: 'url', 'title' and 'datetime' taken from the list item, plus
        'description', 'location' and one key per classed element found on the
        detail page. When the detail page cannot be fetched or parsed,
        'description' and 'location' are empty strings.
    """
    # Extract basic information from eventitem
    result = {
        'url': urljoin(DOWNTOWN_PITTSBURGH_URL, event_item.a.attrs['href']),
        'title': event_item.a.get_text(strip=True),
        'datetime': re.sub(r'\s{1,}', ' ',
                           event_item.find(class_='eventdate').get_text(' ', strip=True).replace('|', ',')),
    }

    # Load the event page and extract event details from READ MORE.
    # A failed request is reported and the basic info is still returned, so a
    # single bad page does not abort the whole scrape (the previous `assert`
    # did, and would also be stripped when Python runs with -O).
    copy_content = None
    try:
        response = requests.get(result['url'], timeout=30)
    except requests.RequestException as error:
        print(f"Failed to fetch {result['url']}: {error}")
    else:
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            detail_item = soup.find('div', class_='eventitem')
            if detail_item is not None:
                copy_content = detail_item.find('div', class_='copyContent')
        else:
            print(f'Failed to fetch {response.url}: {response.status_code}')

    description = []
    elements = copy_content.contents if copy_content is not None else []
    for element in elements:
        text = element.get_text(' ', strip=True)
        # Bare strings between tags may lack `name` / `attrs`, hence getattr
        if len(text) == 0 or getattr(element, 'name', None) == 'h1':  # Skip empty text or title
            continue
        attrs = getattr(element, 'attrs', None) or {}
        if 'class' in attrs:
            link = element.find('a')
            if link is not None:  # Extract hyperlink instead of plain text
                text += f": {link.get('href')}"
            result[attrs['class'][0]] = text
        else:  # Add all other text to description
            if not text.endswith('.'):
                text += '.'
            description.append(text)
    result['description'] = ' '.join(description)
    result['location'] = result.pop('eventlocation', '')  # Rename location key
    result.pop('eventdate', None)  # Remove redundant datetime
    return result


def get_pdp_events(filename: str, save_file=True):
    """
    Scrape every event listed on the Downtown Pittsburgh events page

    Args:
        filename (str): Output file name passed to save_events_to_json
        save_file (bool, optional): Whether to write the events to disk. Defaults to True.

    Returns:
        list[dict]: The scraped events; empty when the listing page could not be fetched
    """
    print(f'Scraping from {DOWNTOWN_PITTSBURGH_URL}')
    events = []
    response = requests.get(DOWNTOWN_PITTSBURGH_URL)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        for event_item in tqdm(soup.find_all('div', class_='eventitem'),
                               desc='Scraping events'):
            curr_event = scrape_pdp_event_item(event_item)
            events.append(curr_event)
        print(f'Scraped {len(events)} events')
        if save_file:
            save_events_to_json(filename, events)
    else:
        print(f'Failed to fetch {response.url}: {response.status_code}')
    # Hand the events back so the result is usable even with save_file=False
    return events


# Run the full Downtown Pittsburgh scrape and save it as downtown_pittsburgh_events.json
event_item = get_pdp_events('downtown_pittsburgh_events.json')

Scraping from https://downtownpittsburgh.com/events/


Scraping events: 100%|██████████| 125/125 [03:19<00:00,  1.60s/it]

Scraped 125 events
Saved scraped data to raw_data/downtown_pittsburgh_events.json





# Pittsburgh City Paper Events

In [None]:
def scrape_pghcitypaper_search_page(url: str) -> list[dict]:
    """
    Scrape every event teaser on one Pittsburgh City Paper search-result page

    Args:
        url (str): Pittsburgh City Paper event calendar page URL,
        e.g. https://www.pghcitypaper.com/pittsburgh/EventSearch?v=d

    Returns:
        list[dict]: A list of scraped event dictionaries
    """
    # Output key -> class name of the element inside each event <li>
    find_mapping = {
        'url': 'fdn-teaser-headline',
        'title': 'fdn-teaser-headline',
        'datetime': 'fdn-teaser-subheadline',
        'location': 'fdn-event-teaser-location-block',
        'category': 'fdn-teaser-tag-link',
        'price': 'fdn-pres-details-split',
        'ticketlink': 'fdn-teaser-ticket-link',
        'description': 'fdn-teaser-description',
    }
    events = []
    driver.get(url)
    try:
        event_ul = driver.find_element(By.CLASS_NAME, 'search-results')
        # Extract event info from each <li> tag
        for event_li in tqdm(event_ul.find_elements(By.TAG_NAME, 'li'),
                             desc=f'Scraping {urlparse(url).query}'):
            event = {}
            for key, class_name in find_mapping.items():
                try:
                    element = event_li.find_element(By.CLASS_NAME, class_name)
                    if key == 'url':
                        text = element.find_element(By.TAG_NAME, 'a').get_attribute('href')
                    elif key == 'ticketlink':
                        text = element.get_attribute('href')
                    else:
                        text = element.text.strip()
                        text = re.sub(r'\s{1,}', ' ', text)  # Remove extra spaces
                    event[key] = text
                except Exception:
                    # Best effort: a missing field is simply left out of the event.
                    # `Exception` (not a bare except) keeps KeyboardInterrupt working.
                    pass
            if len(event) > 0:  # <li> tags without any matching field are dropped
                events.append(event)
    except Exception:
        print(f'{url}: No event list found.')
    time.sleep(random.uniform(2, 10))  # Random pause between page scrapes
    return events


# Try the page scraper on one search-result page and show how many events it returns
len(scrape_pghcitypaper_search_page('https://www.pghcitypaper.com/pittsburgh/EventSearch?v=d'))

Scraping v=d: 100%|██████████| 43/43 [04:20<00:00,  6.06s/it]


30

In [10]:
def get_pghcitypaper_events(filename: str, save_file=True):
    """
    Scrape the Pittsburgh City Paper event search, following the "next" link page by page

    Args:
        filename (str): Output file name passed to save_events_to_json
        save_file (bool, optional): Whether to write the events to disk. Defaults to True.
    """
    events = []
    driver.get(PGH_CITY_PAPER_URL)
    print(f'Scraping from {PGH_CITY_PAPER_URL}')

    # Scrape page by page
    while True:
        events.extend(scrape_pghcitypaper_search_page(driver.current_url))
        try:  # Turn to next page
            next_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(
                    (By.XPATH,
                     '//a[contains(@class, "fdn-page-navigation-prev-next") and contains(text(), "next")]'))
            )
            next_button.click()
        except Exception:
            # No clickable "next" link within 10 seconds: treat as the last page.
            # `Exception` (not a bare except) keeps KeyboardInterrupt working.
            print('No next page found.')
            break
    print(f'Scraped {len(events)} events')

    # Save to file
    if save_file:
        save_events_to_json(filename, events)


# Run the full Pittsburgh City Paper scrape and save it as pittsburgh_city_paper_events.json
get_pghcitypaper_events('pittsburgh_city_paper_events.json')

Parsing from https://www.pghcitypaper.com/pittsburgh/EventSearch


Scraping v=d: 100%|██████████| 43/43 [04:20<00:00,  6.06s/it]
Scraping page=2&v=d: 100%|██████████| 43/43 [06:41<00:00,  9.35s/it]
Scraping page=3&v=d: 100%|██████████| 43/43 [07:04<00:00,  9.88s/it]
Scraping page=4&v=d: 100%|██████████| 43/43 [06:27<00:00,  9.02s/it]
Scraping page=5&v=d: 100%|██████████| 43/43 [04:50<00:00,  6.75s/it]
Scraping page=6&v=d: 100%|██████████| 43/43 [05:01<00:00,  7.01s/it]
Scraping page=7&v=d: 100%|██████████| 43/43 [04:46<00:00,  6.67s/it]
Scraping page=8&v=d: 100%|██████████| 43/43 [04:44<00:00,  6.62s/it]
Scraping page=9&v=d: 100%|██████████| 43/43 [04:26<00:00,  6.19s/it]
Scraping page=10&v=d: 100%|██████████| 43/43 [04:32<00:00,  6.35s/it]
Scraping page=11&v=d: 100%|██████████| 43/43 [04:39<00:00,  6.50s/it]
Scraping page=12&v=d: 100%|██████████| 43/43 [04:18<00:00,  6.01s/it]
Scraping page=13&v=d: 100%|██████████| 43/43 [04:24<00:00,  6.14s/it]
Scraping page=14&v=d: 100%|██████████| 43/43 [03:58<00:00,  5.54s/it]
Scraping page=15&v=d: 100%|█████████

No next page found.
Scraped 643 events
Saved scraped data to raw_data/pittsburgh_city_paper_events.json


# CMU Events

In [None]:
def scrape_cmu_event_calendar(url: str) -> list[str]:
    """
    Collect the event page links listed on one day of the CMU Events Calendar

    Args:
        url (str): CMU Events Calendar by day page URL,
        e.g. https://events.cmu.edu/day/date/20250319

    Returns:
        list[str]: A list of event URLs
    """
    event_urls = []
    driver.get(url)
    try:  # Find the event list container
        event_container = driver.find_element(By.CLASS_NAME, 'lw_cal_event_list')
        # Select all child elements
        for event_item in event_container.find_elements(By.XPATH, './*'):
            try:
                event_url_tag = (event_item
                                 .find_element(By.CLASS_NAME, 'lw_events_title')
                                 .find_element(By.TAG_NAME, 'a'))
                event_url = event_url_tag.get_attribute('href')
                if event_url:  # Skip links without an href instead of collecting None
                    event_urls.append(event_url)
            except Exception:
                # Children without a title link are skipped.
                # `Exception` (not a bare except) keeps KeyboardInterrupt working.
                pass
    except Exception:
        print(f'{url}: No event list found.')
    time.sleep(random.uniform(2, 10))  # Random pause between page scrapes
    return event_urls


# Try the calendar scraper on a single day (2025-03-19)
scrape_cmu_event_calendar('https://events.cmu.edu/day/date/20250319')

['https://events.cmu.edu/event/13096-libraries-workshop-data-communication-data-for',
 'https://events.cmu.edu/event/13599-uncommon-perfection-illustrations-by-derek-norman',
 'https://events.cmu.edu/event/13097-libraries-workshop-getting-the-most-out-ofheinonlines-',
 'https://events.cmu.edu/event/13098-libraries-workshop-collaborating-on-github',
 'https://events.cmu.edu/event/13672-mosaic-conference-for-intersectionality-series',
 'https://events.cmu.edu/event/13318-masters-student-support-group',
 'https://events.cmu.edu/event/13639-predatory-data-eugenics-in-big-tech-feminist-solidarit',
 'https://events.cmu.edu/event/13009-paws-to-relax-weekly-pet-therapy-program',
 'https://events.cmu.edu/event/7307-here-to-stay-celebrating-40-years-at-the',
 'https://events.cmu.edu/event/12499-lent-great-lent',
 'https://events.cmu.edu/event/10974-like-totally-transformative-cmu-in-the-1980s',
 'https://events.cmu.edu/event/12502-naw-ruz',
 'https://events.cmu.edu/event/12496-ramadan',
 'https:

In [None]:
# Same check on a day (2025-12-03) for which the scraper returns an empty list
scrape_cmu_event_calendar('https://events.cmu.edu/day/date/20251203')  # No events found

[]

In [None]:
def scrape_cmu_event_page(url: str) -> dict:
    """
    Scrape the details of one event page of the CMU Events Calendar

    Args:
        url (str): CMU Event Calendar event page URL, e.g. https://events.cmu.edu/event/12496-ramadan

    Returns:
        dict: Always contains 'url'; 'title', 'startdate', 'datetime', 'contact'
        and 'description' are added for each element found in the main content
    """
    result = {'url': url}
    driver.get(url)
    contents = driver.find_elements(By.ID, 'main-content')
    if len(contents) > 0:  # Check if main content is found
        content = contents[0]
        find_mapping = {
            'title': (By.TAG_NAME, 'h1'),
            'startdate': (By.ID, 'lw_cal_this_day'),
            # Leading '.' keeps the XPath relative to `content`; a bare '//'
            # searches the whole document even when called on an element
            'datetime': (By.XPATH, './/h1/following-sibling::p'),
            'contact': (By.ID, 'lw_cal_event_leftcol'),
            'description': (By.ID, 'lw_cal_event_rightcol'),
        }
        for key, (by, value) in find_mapping.items():
            try:
                text = content.find_element(by, value).text.strip()
                text = re.sub(r'\s{1,}', ' ', text)  # Remove extra spaces
                result[key] = text
            except Exception:
                # Best effort: a missing field is simply left out of the result.
                # `Exception` (not a bare except) keeps KeyboardInterrupt working.
                pass
    time.sleep(random.uniform(2, 10))  # Random pause between page scrapes
    return result


# Try the event-page scraper on a single events.cmu.edu event
scrape_cmu_event_page('https://events.cmu.edu/event/13318-masters-student-support-group')

{'url': 'https://events.cmu.edu/event/13318-masters-student-support-group',
 'title': 'Master’s Student Support Group',
 'startdate': 'March 19, 2025',
 'datetime': '4 - 5 p.m. CaPS Office, Highmark Center for Health, Wellness and Athletics Repeats weekly (to May 7)',
 'contact': '',
 'description': 'The Master’s Student Support Group provides an opportunity for students to come together and discuss the unique experiences and challenges of being a Master’s student. To join this group, please email co-facilitator Adrijana Vukelic, MSCP, at avukelic@andrew.cmu.edu or call 412-268-2922 to schedule your pre-group consultation. Add to my calendar'}

In [None]:
def scrape_timely_event_page(url: str) -> dict:
    """
    Scrape the details of one Timely event page with requests and BeautifulSoup

    Args:
        url (str): Timely event page URL, e.g. https://events.time.ly/vdibqnd/43900770

    Returns:
        dict: Always contains 'url'; 'title', the detail fields (with WHEN, WHERE,
        COST and CONTACT renamed to 'datetime', 'location', 'price' and 'contact')
        and 'description' are added when found on the page
    """
    result = {'url': url}
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        if soup.h1 is not None:  # A page without <h1> simply has no title
            result['title'] = soup.h1.get_text(' ', strip=True)
        # Scrape details
        container = soup.find(class_='timely-event-details')
        if container:
            for detail in container.find_all(class_='timely-details'):
                parts = [tag.get_text(' ', strip=True) for tag in detail.contents]
                parts = [part for part in parts if len(part) > 0]
                if len(parts) < 2:  # No label/value pair to record
                    continue
                # First non-empty piece is the label; everything after it is the
                # value (a plain two-way unpack raised ValueError on extra pieces)
                result[parts[0]] = ' '.join(parts[1:])
        # Rename detail keys
        key_mapping = {
            'WHEN': 'datetime',
            'WHERE': 'location',
            'COST': 'price',
            'CONTACT': 'contact',
        }
        for old_key, new_key in key_mapping.items():
            if old_key in result:
                result[new_key] = result.pop(old_key)
        # Scrape description
        description = soup.find(class_='timely-event-has-description')
        if description:
            text = description.get_text(' ', strip=True)
            text = re.sub(r'\s{1,}', ' ', text)  # Remove extra spaces
            result['description'] = text
    else:
        print(f'Failed to fetch {url}: {response.status_code}')
    return result


# Try the Timely scraper on a single event page
scrape_timely_event_page('https://events.time.ly/vdibqnd/43900777')

{'url': 'https://events.time.ly/vdibqnd/43900777',
 'title': 'Carnegie Mellon Philharmonic',
 'datetime': 'Sunday, April 27, 2025 from 08:00 pm to 10:00 pm',
 'location': 'Heinz Hall at 600 Penn Ave Pittsburgh, PA, 15222',
 'price': '$10 / $5 Buy Tickets',
 'contact': 'School of Music Box Office 412-268-2383 412-268-2383',
 'description': 'Carnegie Mellon Philharmonic Daniel Nesta Curtis , conductor Andrew Gray , violin ( Winner of the 2024 Philharmonic Soloist Competition ) Program Lili Boulanger - D’Un Matin de Printemps Jerod Impichchaachaaha’ Tate - Spider Brings Fire from Lowak Shoppala (Fire and Light) Richard Strauss - Death and Transfiguration Karol Szymanowski - Violin Concerto No. 1, Op. 35 Alexander Scriabin - Le Poème de l’extase (The Poem of Ecstasy), Op. 54 Cost: $10 Regular Admission $5 Senior Citizen (65+) Admission All students, along with CMU/Pitt faculty and staff, are free w/ valid ID'}

In [None]:
def get_cmu_events(filename: str, save_file=True,
                   start_date=STARTING_DATE, end_date=ENDING_DATE):
    """
    Scrape the CMU Events Calendar day by day, then scrape every event page found

    Args:
        filename (str): Output file name passed to save_events_to_json
        save_file (bool, optional): Whether to write the events to disk. Defaults to True.
        start_date (datetime, optional): First day to scrape. Defaults to STARTING_DATE.
        end_date (datetime, optional): Last day to scrape (inclusive). Defaults to ENDING_DATE.

    Returns:
        list[dict]: The scraped events, one per distinct event URL
    """
    # Get all the event URLs from the Event Calendar
    event_urls = []
    days = (end_date - start_date).days
    print(f'Scraping from {CMU_EVENT_URL}')
    for d in tqdm(range(days + 1), desc='Scraping CMU calendar'):
        curr_date = start_date + timedelta(days=d)
        url = CMU_EVENT_URL + curr_date.strftime('%Y%m%d')
        event_urls.extend(scrape_cmu_event_calendar(url))

    # De-duplicate while keeping first-seen order (set() order varies between
    # runs for strings) and drop empty entries
    unique_urls = [url for url in dict.fromkeys(event_urls) if url]

    # Scrape each event page
    events = []
    for url in tqdm(unique_urls, desc='Scraping events'):
        domain = urlparse(url).netloc
        if domain == 'events.cmu.edu':
            curr_event = scrape_cmu_event_page(url)
        else:  # Every other domain is handled by the Timely scraper
            curr_event = scrape_timely_event_page(url)
        events.append(curr_event)
    print(f'Scraped {len(events)} events')

    # Save to file
    if save_file:
        save_events_to_json(filename, events)
    # Hand the events back so the result is usable even with save_file=False
    return events


# Run the full CMU calendar scrape over the configured date range and save it as cmu_events.json
events = get_cmu_events('cmu_events.json',
                        start_date=STARTING_DATE,
                        end_date=ENDING_DATE)

Scraping CMU calendar: 100%|██████████| 288/288 [39:37<00:00,  8.25s/it]
Scraping events: 100%|██████████| 293/293 [32:01<00:00,  6.56s/it]

Scraped 293 events
Saved scraped data to raw_data/cmu_events.json





# Campus Events

In [None]:
def scrape_cmu_community_search_grid(url='') -> list[str]:
    """
    Collect the event links shown in the event grid of the current page

    Args:
        url (str, optional): Page to load first; when empty, the page currently
        open in the shared driver is scraped. Defaults to ''.

    Returns:
        list[str]: A list of event URLs from CMU Alumni Community event
    """
    if url != '':
        driver.get(url)

    # Get the page number (only used to label the progress bar)
    try:
        page = driver.find_element(By.CLASS_NAME, 'slds-text-body_small').text
    except Exception:
        page = 'no page number'

    # Extract all the event urls in the grid
    event_urls = []
    try:
        event_container = driver.find_element(By.CSS_SELECTOR, '.slds-grid.slds-wrap.cCMU_Theme')
        # Extract event url from each <div> tag
        for event_div in tqdm(event_container.find_elements(By.XPATH, './*'),
                              desc=f'Scraping {page}'):
            try:
                event_tag = event_div.find_element(By.CLASS_NAME, 'evt_name')
                event_url = event_tag.get_attribute('href')
                if event_url:  # Skip elements without an href instead of collecting None
                    event_urls.append(event_url)
            except Exception:
                # Grid children without an event link are skipped.
                # `Exception` (not a bare except) keeps KeyboardInterrupt working.
                pass
    except Exception:
        print(f'{driver.current_url}: No event list found.')
    time.sleep(random.uniform(2, 10))  # Random pause between page scrapes
    return event_urls


# Try the grid scraper on the CMU community events landing page
scrape_cmu_community_search_grid(CMU_COMMUNITY_URL)

Scraping Page 1 /: 100%|██████████| 9/9 [00:00<00:00, 121.46it/s]


['https://www.givecampus.com/schools/CarnegieMellonUniversity/events/phl-women-s-history-month-dinner-in-philly',
 'https://www.givecampus.com/schools/CarnegieMellonUniversity/events/pit-cmu-night-at-the-penguins',
 'https://www.givecampus.com/schools/CarnegieMellonUniversity/events/sd-san-diego-clippers-vs-mexico-city-basketball-game',
 'https://www.givecampus.com/schools/CarnegieMellonUniversity/events/bos-cmu-boston-pi-day',
 'https://www.givecampus.com/schools/CarnegieMellonUniversity/events/pit-pizza-party-at-alumni-house',
 'https://www.givecampus.com/schools/CarnegieMellonUniversity/events/pit-trivia-night-at-over-the-bar-bicycle-cafe',
 'https://www.givecampus.com/schools/CarnegieMellonUniversity/events/faculty-dialogues-sitting-with-discomfort-mindfulness-equanimity-and-health-featuring-david-creswell-in-seattle',
 'https://www.givecampus.com/schools/CarnegieMellonUniversity/events/den-laughs-and-legacy-an-evening-with-cmu-alumna-paula-newman',
 'https://www.givecampus.com/sch

In [12]:
def scrape_givecampus_event_page(url: str) -> dict:
    """
    Scrape the details of one Givecampus event page in a fresh Chrome session

    Args:
        url (str): Givecampus event page URL,
        e.g. https://www.givecampus.com/schools/CarnegieMellonUniversity/events/phl-women-s-history-month-dinner-in-philly

    Returns:
        dict: Always contains 'url'; 'title', 'datetime', 'location', 'price',
        'message' and 'description' are added for each element found
    """
    # Output key -> chain of locators, each applied inside the previous match
    find_mapping = {
        'title': [(By.TAG_NAME, 'h1')],
        'datetime': [(By.ID, 'event-when'), (By.CLASS_NAME, 'text-left')],
        'location': [(By.ID, 'event-where'), (By.CLASS_NAME, 'text-left')],
        'price': [(By.ID, 'event-price'), (By.CLASS_NAME, 'text-left')],
        'message': [(By.ID, 'event-purchase-message')],
        'description': [(By.ID, 'event-description')],
    }
    result = {'url': url}
    temp_driver = webdriver.Chrome()  # Reinitialize to pass human verification
    try:
        temp_driver.get(url)
        contents = temp_driver.find_elements(By.ID, 'main-content')
        if len(contents) > 0:  # Check if main content is found
            content = contents[0]
            for key, by_value_pairs in find_mapping.items():
                try:
                    element = content
                    for (by, value) in by_value_pairs:
                        element = element.find_element(by, value)
                    text = element.text.strip()
                    if key == 'datetime' or key == 'location':
                        text = text.replace('\n', ', ')
                    text = re.sub(r'\s{1,}', ' ', text)  # Remove extra spaces
                    result[key] = text
                except Exception:
                    # Best effort: a missing field is simply left out of the result.
                    # `Exception` (not a bare except) keeps KeyboardInterrupt working.
                    pass
        time.sleep(random.uniform(2, 10))  # Random pause between page scrapes
    finally:
        # Always close the temporary browser, even when loading or scraping
        # raises, so failed pages do not leave Chrome processes behind
        temp_driver.quit()
    return result


# Try the Givecampus scraper on a single event page
scrape_givecampus_event_page(
    'https://www.givecampus.com/schools/CarnegieMellonUniversity/events/phl-women-s-history-month-dinner-in-philly')

{'url': 'https://www.givecampus.com/schools/CarnegieMellonUniversity/events/phl-women-s-history-month-dinner-in-philly',
 'title': 'PHL - Women’s History Month Dinner in Philly',
 'datetime': 'March 10, 2025, 7:00pm - 8:00pm EDT',
 'location': 'Osteria Philly, 640 N Broad St, Philadelphia, Pennsylvania 19130',
 'price': '$30.00 per person',
 'message': 'Event registrations closed March 6th at 11:59 PM EST',
 'description': 'Join the CMU Philadelphia Network for an intimate dinner to celebrate Women’s History Month. Come reconnect with your former classmates and meet new friends over a delicious Neapolitan pizza dinner. We look forward to seeing you! Space is limited due to venue capacity restrictions, so register today. Registration is required for attendance and walk-ins are NOT permitted. Questions? Contact alumni-house@andrew.cmu.edu. There are over 4,200 alumni, students, parents and friends in the Philadelphia Network throughout Montgomery, Philadelphia, Chester, Bucks and Delawar

In [13]:
def get_cmu_community_events(filename: str, save_file=True):
    """
    Scrape the CMU community event grid page by page, then scrape every event page found

    Args:
        filename (str): Output file name passed to save_events_to_json
        save_file (bool, optional): Whether to write the events to disk. Defaults to True.
    """
    # Get all the event URLs from the Event Calendar
    event_urls = []
    driver.get(CMU_COMMUNITY_URL)
    print(f'Scraping from {CMU_COMMUNITY_URL}')

    # Scrape page by page
    max_click_retries = 3  # Consecutive failed page turns tolerated before giving up
    failed_clicks = 0
    next_button_enabled = True
    while next_button_enabled:
        event_urls.extend(scrape_cmu_community_search_grid())
        # Check next button status
        for button in driver.find_elements(By.TAG_NAME, 'button'):
            if button.text == 'Next' and button.get_attribute('disabled'):
                next_button_enabled = False
        if not next_button_enabled:
            break  # Last page: skip the pointless 10-second wait for a click
        try:  # Turn to next page
            next_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CLASS_NAME, 'slds-p-left--xx-small'))
            )
            next_button.click()
            failed_clicks = 0
        except Exception:
            # The same page is scraped again on the next pass (duplicates are
            # removed below); give up after repeated failures so the loop
            # cannot run forever when the page can never be turned
            failed_clicks += 1
            if failed_clicks >= max_click_retries:
                print('No next page found.')
                break

    # De-duplicate while keeping first-seen order (set() order varies between
    # runs for strings) and drop empty entries
    unique_urls = [url for url in dict.fromkeys(event_urls) if url]

    # Scrape each event page
    events = []
    for url in tqdm(unique_urls, desc='Scraping events'):
        curr_event = scrape_givecampus_event_page(url)
        events.append(curr_event)
    print(f'Scraped {len(events)} events')

    # Save to file
    if save_file:
        save_events_to_json(filename, events)


# Run the full CMU community scrape and save it as cmu_community_events.json
get_cmu_community_events('cmu_community_events.json')

Scraping from https://community.cmu.edu/s/events


Scraping Page 1 /: 100%|██████████| 9/9 [00:00<00:00, 129.34it/s]
Scraping Page 2 / 5: 100%|██████████| 9/9 [00:00<00:00, 149.12it/s]
Scraping Page 3 / 5: 100%|██████████| 9/9 [00:00<00:00, 126.18it/s]
Scraping Page 4 / 5: 100%|██████████| 9/9 [00:00<00:00, 146.33it/s]
Scraping Page 5 / 5: 100%|██████████| 2/2 [00:00<00:00, 98.37it/s]
Scraping events: 100%|██████████| 38/38 [06:06<00:00,  9.66s/it]

Scraped 38 events
Saved scraped data to raw_data/cmu_community_events.json



