In [377]:
# Imports
import pandas as pd
import json
import requests
from bs4 import BeautifulSoup
import re

## Schools

In [396]:
# List out bay area schools (colleges and universities)
bay_area_schools = [
    {
        "Name": "UC Berkeley",
        "Links": [
            {
                "Link": "https://art.berkeley.edu/department-calendar",
                "Description": "Departmental Calendar"
            },
            {
                "Link": "https://events.berkeley.edu/ah/",
                "Description": "Arts & Humanities Calendar"
            }
        ]
    },
    # "Stanford",
    {
        "Name": "California College of the Arts (CCA)",
        "Links": [
            {
                "Link": "https://portal.cca.edu/events-calendar/?daterange_predefined=ALL&p=1",
                "Description": "Events Calendar"
            }
        ]
    },
    # "San Francisco State University (SFSU)",
    # "Univserity of San Francisco (USF)",
]

In [212]:
bay_area_schools

[{'Name': 'UC Berkeley',
  'Links': [{'Link': 'https://art.berkeley.edu/department-calendar',
    'Description': 'Departmental Calendar'}]},
 {'Name': 'California College of the Arts (CCA)',
  'Links': [{'Link': 'https://portal.cca.edu/events-calendar/?daterange_predefined=ALL&p=1',
    'Description': 'Events Calendar'}]}]

In [213]:
# List school names
[school["Name"] for school in bay_area_schools]

['UC Berkeley', 'California College of the Arts (CCA)']

In [219]:
# List all links
[[link["Link"] for link in school["Links"]][0] for school in bay_area_schools]

['https://art.berkeley.edu/department-calendar',
 'https://portal.cca.edu/events-calendar/?daterange_predefined=ALL&p=1']

In [208]:
# URL of the website to scrape
url = 'https://art.berkeley.edu/department-calendar'

# Send a GET request to fetch the webpage content
response = requests.get(url)
html_content = response.content

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Get p elements --> each p is a time period
p_elements = soup.find_all("p")

# Store event data
event_list = []

for p in p_elements:
    # Skip first p element if it's the intro text
    if "For more information on each event, visit the corresponding page on this website" not in p.get_text():
        events_raw = p.get_text()
        # print(events_raw)

        # How many events in this period?
        # Use this regex pattern to match occurrences of a left and right parenthesis with 3 or 4 of any character in bewteen --> (XXX) or (XXXX)
        pattern = r"\(.{3,4}\)"
        results = re.findall(pattern, events_raw)
        # Remove occurrences of "(Zoom)"
        results = [r for r in results if r != "(Zoom)"]
        num_events = len(results)
        # We don't care about the text that doesn't contain any events
        if num_events > 0:
            events = events_raw.split(") ")
            events = [e for e in events if e != "" and e not in ["(Mon", "(Tue", "(Tues", "(Wed", "(Th", "(Thu", "(Thurs", "(Fri", "(Sat", "(Sun"]]
            for e in events:
                if len(e) > 20: # This takes care of small bits of text that got caught
                    event_dict = {}
                    print(e)
                    date = e.split(": ")[0]
                    name = e.split(": ", 1)[1].split(",")[0]
                    try:
                        time = e.split(": ", 1)[1].split(",")[1].strip()
                    except IndexError:
                        time = "unknown"
                    try:
                        location = e.split(": ", 1)[1].split(",")[2].strip().replace("(Mon", "").replace("(Tu", "")\
                        .replace("(Tues", "").replace("(Wed", "").replace("(Th", "").replace("(Thu", "").replace("(Thurs", "")\
                        .replace("(Fri", "").replace("(Sat", "").replace("(Sun", "").split(".")[0]
                    except IndexError:
                        location = 'unknown'
                    print("Date:", date)
                    print("Name:", name)
                    print("Time:", time)
                    print("Location:", location)
                    print()
                    event_dict["Date"] = date
                    event_dict["Name"] = name
                    event_dict["Time"] = time
                    event_dict["Location"] = location
                    event_list.append(event_dict)

August 30: Reception: “Black chicagoland is…”,  5-7 pm, Worth Ryder Art Gallery (AAPB 116)
Date: August 30
Name: Reception: “Black chicagoland is…”
Time: 5-7 pm
Location: Worth Ryder Art Gallery (AAPB 116)

May. 3: Senior Exhibit “Textured Overflow.”, 5-7 pm, Worth Ryder Art Gallery (AAPB 116)(Wed
Date: May. 3
Name: Senior Exhibit “Textured Overflow.”
Time: 5-7 pm
Location: Worth Ryder Art Gallery (AAPB 116)

April. 12: Blood Pressure and Here Lies, 4-7 pm, Worth Ryder Art Gallery (AAPB 116)(Wed
Date: April. 12
Name: Blood Pressure and Here Lies
Time: 4-7 pm
Location: Worth Ryder Art Gallery (AAPB 116)

Jan. 18: Reception: Cacophonies of Resistance, 4-7 pm, Worth Ryder Art Gallery (AAPB 116). Featuring a performance by Valencia James.(Wed
Date: Jan. 18
Name: Reception: Cacophonies of Resistance
Time: 4-7 pm
Location: Worth Ryder Art Gallery (AAPB 116)

Feb. 8: Closing Reception and Catalogue Preview: Cacophonies of Resistance, 4-7 pm, Worth Ryder Art Gallery (AAPB 116)(Wed
Date: Feb. 8

In [207]:
p.contents

[<a href="https://dac.berkeley.edu/web-accessibility"><span style="text-decoration:underline">Accessibility</span></a>,
 <span style="text-decoration:underline"><br/></span>,
 <a href="https://ophd.berkeley.edu/policies-and-procedures/nondiscrimination-policy-statement"><span style="text-decoration:underline">Nondiscrimination</span></a>,
 <span style="text-decoration:underline"><br/></span>,
 <a href="https://security.berkeley.edu/policy/privacy-statement-uc-berkeley-websites"><span style="text-decoration:underline">Privacy</span></a>]

In [209]:
event_list

[{'Date': 'August 30',
  'Name': 'Reception: “Black chicagoland is…”',
  'Time': '5-7 pm',
  'Location': 'Worth Ryder Art Gallery (AAPB 116)'},
 {'Date': 'May. 3',
  'Name': 'Senior Exhibit “Textured Overflow.”',
  'Time': '5-7 pm',
  'Location': 'Worth Ryder Art Gallery (AAPB 116)'},
 {'Date': 'April. 12',
  'Name': 'Blood Pressure and Here Lies',
  'Time': '4-7 pm',
  'Location': 'Worth Ryder Art Gallery (AAPB 116)'},
 {'Date': 'Jan. 18',
  'Name': 'Reception: Cacophonies of Resistance',
  'Time': '4-7 pm',
  'Location': 'Worth Ryder Art Gallery (AAPB 116)'},
 {'Date': 'Feb. 8',
  'Name': 'Closing Reception and Catalogue Preview: Cacophonies of Resistance',
  'Time': '4-7 pm',
  'Location': 'Worth Ryder Art Gallery (AAPB 116)'},
 {'Date': 'Feb. 22',
  'Name': 'Reception: Emergenc(y): Afghan Lives Beyond the Forever War',
  'Time': '5-7 pm',
  'Location': 'Worth Ryder Art Gallery (AAPB 116)'},
 {'Date': 'Feb 23 - 28',
  'Name': 'Remembering a Future#2',
  'Time': 'Performance Lecture 

## Museums

In [260]:
# List out bay area museums
bay_area_museums = [
    {
        "Name": "Cantor Arts Center",
        "Links": [
            {
                "Link": "https://museum.stanford.edu/exhibitions",
                "Description": "Exhibitions"
            },
            {
                "Link": "https://museum.stanford.edu/programs", # need to use selenium
                "Description": "Programs & Events"
            }
        ]
    },
    {
        "Name": "de Young",
        "Links": [
            {
                "Link": "https://www.famsf.org/calendar",
                "Description": "Calendar"
            }
        ]
    },
    "Legion of Honor",
    "SF MoMA",
    "Institute of Contemporary Art San Jose",
    "Asian Art Museum",
    "Oakland Museum of California (OMCA)",
    "Cartoon Art Museum",
]

#### Cantor Arts Center - Programs & Events

In [233]:
# URL of the website to scrape
url = 'https://museum.stanford.edu/programs'

# Send a GET request to fetch the webpage content
response = requests.get(url)
html_content = response.content

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

In [257]:
# Find elements a class
item_elements = soup.find_all("ul")

for i, item in enumerate(item_elements):
    print(i, item.get_text())

0 

Visit


Exhibitions


Programs


Collections


Students & Faculty


About


Support


1 

Student Programs


Family Programs


Member Programs


Academic Programs




In [254]:
item_elements[0]

<ul class="nav navbar-nav">
<li class="nav-item">
<a class="nav-link" data-drupal-link-system-path="node/4" href="/visit">Visit</a>
</li>
<li class="nav-item">
<a class="nav-link" data-drupal-link-system-path="node/5" href="/exhibitions">Exhibitions</a>
</li>
<li class="nav-item active">
<a class="nav-link is-active" data-drupal-link-system-path="node/9" href="/programs">Programs</a>
</li>
<li class="nav-item">
<a class="nav-link" href="http://cantorcollection.stanford.edu/">Collections</a>
</li>
<li class="nav-item">
<a class="nav-link" data-drupal-link-system-path="node/14" href="/students-faculty">Students &amp; Faculty</a>
</li>
<li class="nav-item">
<a class="nav-link" data-drupal-link-system-path="node/125" href="/about">About</a>
</li>
<li class="nav-item">
<a class="nav-link" data-drupal-link-system-path="node/18" href="/support">Support</a>
</li>
</ul>

### de Young - Calendar

In [537]:
def get_de_young_events():
    """
    Uses BeautifulSoup to scrape event info from the de Young
    Museum's calendar.
    """
    # Collect event info
    events_list = []
    
    # URL of the website to scrape
    url = "https://www.famsf.org/calendar"

    # Iterate through the pages
    for i in range(1, 10):
        
        print("PAGE ", i, " +"*40)
        url = "https://www.famsf.org/calendar" + f"?page={i}"
    
        # Send a GET request to fetch the webpage content
        response = requests.get(url)
        html_content = response.content
        
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')
        
        # Find elements a class
        group_elements = soup.find_all(class_="mt-24 xl:mt-32")

        # If no pages left, exit loop
        if len(group_elements) == 0: # this will be 0 when we've gone through all the pages
            break
        
        for e in group_elements:
            # Instantiate list to collect tags
            tags = []
            # Extract title
            title = e.find("a").find("h3").get_text().strip()
            # Extract link
            link = e.find("a").get("href")
            # Extract date info
            date = e.find(class_="mt-12 text-secondary f-subheading-1").get_text()
            # Extract venue
            try:
                venue = e.find(class_="text-inherit pt-2 ml-8").get_text()
            except AttributeError:
                venue = "unknown"
            # Add tags
            event_type = e.find(class_="text-inherit pt-2").get_text().lower() # this is Exhibition or Event
            if event_type == "exhibition":
                tags.append("exhibition")
            if "tour" in title.lower():
                tags.append("tour")
            if "family" in title.lower():
                tags.append("family")
            if "reading" in title.lower():
                tags.append("reading")
            if "concert" in title.lower():
                tags.append("audio")
            if "song bath" in title.lower():
                tags.append("audio")
            if "workshop" in title.lower():
                tags.append("workshop")
            if "free" in title.lower():
                tags.append("free")
            if "opening" in title.lower():
                tags.append("opening")
            # if "Through" in date:
            #     multi_day = True # if multi_day = True, then date is the end date, otherwise date is the date of the event
            #     multi_day_type = 'daily'
            # Collect data
            events_list.append(
                {
                    "Title": title,
                    "Links": link,
                    "Date": date,
                    "Venue": venue,
                    "Tags": tags
                }
            )

    print(f"Collected {len(events_list)} events.")
    return events_list
    # return pd.DataFrame.from_records(events_list)
    
data = get_de_young_events()

PAGE  1  + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
PAGE  2  + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
PAGE  3  + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
PAGE  4  + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Collected 53 events.


In [395]:
# Save data as json
file_path = 'site/data.json'

with open(file_path, 'w') as json_file:
    json.dump(data, json_file)

## Galleries

In [397]:
# List out bay area galleries
bay_area_galleries = [
    {
        "Name": "Berkeley Art Center",
        "Links": [
            {
                "Link": "https://www.berkeleyartcenter.org/calendar",
                "Description": "Calendar"
            },
        ]
    },
]

In [542]:
def get_berkeley_art_center_events():
    """
    Uses BeautifulSoup to scrape event info from Berkeley Art 
    Center's calendar.
    """

    # Collect event info
    events_list = []
    
    # URL of the website to scrape
    url = "https://www.berkeleyartcenter.org/calendar"
    
    # Send a GET request to fetch the webpage content
    response = requests.get(url)
    html_content = response.content
    
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # We'll use this later to identify dates
    days_of_week = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]

    # Get elements
    elements = soup.find_all(class_="col sqs-col-6 span-6")

    # Iterate through elements
    for e in elements:
        # If looking at past events, stop the loop
        if "past events" in e.find_previous("h1").text.lower():
            break
        # Otherwise, we're looking at current events --> collect events
        else:
            h3s = e.find_all("h3")
            # If there are any h3 elements
            if len(h3s) > 0:
                # Collect date(s)
                dates = []
                # Identify title
                title = h3s[0].get_text().strip()
                # Collect tags
                tags = []
                # Collect links
                links = []
                # Iterate through h3 elements
                for h in h3s:
                    if any(x in h.get_text().lower() for x in days_of_week):
                        dates.append(h.get_text().strip())
                    # Tag events
                    if "opening" in h.get_text().lower():
                        tags.append("opening")
                    if "in conversation" in h.get_text().lower():
                        tags.append("talk")
                    if "performance" in h.get_text().lower():
                        tags.append("performance")
                    if "workshop" in h.get_text().lower():
                        tags.append("workshop")
                # Combined dates
                date = " | ".join(dates)
                # Identify location
                if "on zoom" in date.lower():
                    venue = "Virtual"
                try:
                    "berkeley art center" in h3s[2].get_text().lower()
                    venue = h3s[2].get_text()
                except IndexError:
                    venue = "Berkeley Art Center"
                else:
                    venue = "Berkeley Art Center"
                # Get link
                link_elements = e.find_all("a")
                if len(link_elements) > 0:
                    for l in link_elements:
                        link_url = l.get("href")
                        if "eventbrite" in link_url.lower():
                            links.append({
                                "Link": link_url,
                                "Text": "Eventbrite"
                            })
                        elif ("berkeleyartcenter" in link_url.lower() and link_url.lower() != "https://www.berkeleyartcenter.org/upcoming-exhibitions"):
                            links.append({
                                "Link": link_url,
                                "Text": "Event Page"
                            })
                        else:
                            links.append({
                                "Link": link_url,
                                "Text": "unknown"
                            })
                # Collect event data
                events_list.append(
                    {
                        "Title": title,
                        "Links": links,
                        "Date": date,
                        "Venue": venue,
                        "Tags": tags
                    }
                )
    print(f"Collected {len(events_list)} events.")
    return events_list

data = get_berkeley_art_center_events()

Collected 3 events.


In [543]:
data

[{'Title': 'Artists in conversation: Arleene Correa Valencia',
  'Links': [{'Link': 'https://www.eventbrite.com/e/697676016747?aff=oddtdtcreator',
    'Text': 'Eventbrite'}],
  'Date': 'Tuesday, August 29 from 7–8PM on zoom.',
  'Venue': 'Berkeley Art Center',
  'Tags': ['talk']},
 {'Title': 'OÑI OCAN: A RITUAL PERFORMANCE BY COURTNEY DESIREE MORRIS',
  'Links': [{'Link': 'https://www.eventbrite.com/e/oni-ocan-a-ritual-performance-by-courtney-desiree-morris-tickets-697681232347?aff=oddtdtcreator',
    'Text': 'Eventbrite'}],
  'Date': 'Friday, september 8, 6–8PM',
  'Venue': 'Berkeley Art Center',
  'Tags': ['performance']},
 {'Title': 'Community Dinner',
  'Links': [{'Link': 'https://www.eventbrite.com/e/705586196307?aff=oddtdtcreator',
    'Text': 'Eventbrite'}],
  'Date': 'Tuesday, August 29 from 7–8PM on zoom.',
  'Venue': 'Berkeley Art Center',
  'Tags': []}]

## Selenium exp

In [279]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service as ChromeService

# Create a new instance of Chrome driver
driver = webdriver.Chrome()

# Open the URL
url = "https://www.famsf.org/calendar"
driver.get(url)

# Wait for the page to load (you might need to adjust the time)
driver.implicitly_wait(10)

# Find event elements
event_elements = driver.find_elements(By.CLASS_NAME, 'group')

# # Iterate through event elements and print event titles
# for event_element in event_elements:
#     title_element = event_element.find_element(By.CLASS_NAME, 'views-field-title')
#     event_title = title_element.text
#     print(event_title)

# Close the browser
driver.quit()

In [281]:
event_elements[0].text

MaxRetryError: HTTPConnectionPool(host='localhost', port=53924): Max retries exceeded with url: /session/d426708e3a39b73193e484b33dea1cdd/element/CC5CF4F38ADB67F31F262A39E7C7B575_element_31/text (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x10d770370>: Failed to establish a new connection: [Errno 61] Connection refused'))