In [1]:
import calendar
import json
import os
import random
import re
import time
from datetime import datetime, timedelta
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

In [2]:
os.chdir(os.path.dirname(os.getcwd()))
os.getcwd()  # * Check working directory

'/Users/xx/Documents/Repositories/anlp-spring2025-hw2'

In [None]:
RAW_DATA_DIR = 'raw_data/events_food_related'
os.makedirs(RAW_DATA_DIR, exist_ok=True)

In [7]:
driver = webdriver.Chrome()
driver.implicitly_wait(2)

In [5]:
def is_scrapable_url(url: str) -> bool:
    skip_sites = ['instagram.com', 'facebook.com', 'x.com']
    for site in skip_sites:
        if site in url:
            return False
    return True

In [8]:
def scrape_webpage(url: str):
    driver.get(url)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    contents = [driver.title + '.']
    for element in soup.find('body').find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        text = element.text.strip()
        if text:
            if element.name.startswith('h'):
                text = f'\n{text}'
            if re.search(r'[\w\]\}\)]$', text):
                text += '.'
            contents.append(text)

    filename = re.sub(r'[^a-zA-Z0-9-_]', '_', driver.title.lower())
    filepath = f'{RAW_DATA_DIR}/{filename}.txt'
    with open(filepath, 'w') as f:
        f.write('\n'.join(contents).strip())

    time.sleep(random.uniform(2, 5))


def get_food_festivals():
    url = 'https://www.visitpittsburgh.com/events-festivals/food-festivals'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract events in main content
    contents = []
    main_body = soup.find('main')
    for event in main_body.children:
        if isinstance(event, str):
            if not event.isspace():
                contents.append(event.text)
        else:  # One event
            for child in event.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
                text = child.get_text(' ', strip=True)
                if re.search(r'[\w\]\}\)]$', text):
                    text += '.'
                contents.append(text)
            contents.append('')  # New line separator between events

    # Save to file
    filename = [s.lower() for s in urlparse(url).path.split('/') if len(s) > 0][-1]
    filepath = f'{RAW_DATA_DIR}/{filename}.txt'
    with open(filepath, 'w') as f:
        f.write('\n'.join(contents).strip())

    # Iteratively scrape hyperlinks
    for a in tqdm(main_body.find_all('a', href=True)):
        hyperlink = a['href']
        if is_scrapable_url(hyperlink):
            scrape_webpage(hyperlink)


get_food_festivals()

100%|██████████| 60/60 [06:46<00:00,  6.78s/it]
