In [263]:
# Imports
import pandas as pd
import dateparser
import json
import requests
import re
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

If I need to install a package:

In [9]:
# !pip install beautifulsoup4

## Read data and save a copy

In [18]:
# Define the source and destination file paths
source_file_path = '../website/data.json'
destination_file_path = '../website/data_copy.json'

# Step 1: Open and read the source JSON file
with open(source_file_path, 'r') as source_json_file:
    # Load the JSON content into a Python data structure
    data = json.load(source_json_file)

# Step 2: Create or open the destination JSON file and write the data to it
with open(destination_file_path, 'w') as destination_json_file:
    # Write the data to the destination file
    json.dump(data, destination_json_file, indent=4)  # You can use indent for pretty formatting if desired

In [27]:
# Instantiate list to collect data
data = []

## Schools

In [216]:
# List out bay area schools (colleges and universities)
bay_area_schools = [
    {
        "Name": "UC Berkeley",
        "Links": [
            {
                "Link": "https://art.berkeley.edu/department-calendar",
                "Description": "Departmental Calendar"
            },
            {
                "Link": "https://events.berkeley.edu/ah/",
                "Description": "Arts & Humanities Calendar"
            },
            {
                "Link": "https://cnmat.berkeley.edu/events",
                "Description": "Center for New Media & Audio (CNMAT)"
            }
        ]
    },
    # "Stanford",
    {
        "Name": "California College of the Arts (CCA)",
        "Links": [
            {
                "Link": "https://portal.cca.edu/events-calendar/?daterange_predefined=ALL&p=1",
                "Description": "Events Calendar"
            }
        ]
    },
    # "San Francisco State University (SFSU)",
    # "Univserity of San Francisco (USF)",
]

In [6]:
bay_area_schools

[{'Name': 'UC Berkeley',
  'Links': [{'Link': 'https://art.berkeley.edu/department-calendar',
    'Description': 'Departmental Calendar'},
   {'Link': 'https://events.berkeley.edu/ah/',
    'Description': 'Arts & Humanities Calendar'}]},
 {'Name': 'California College of the Arts (CCA)',
  'Links': [{'Link': 'https://portal.cca.edu/events-calendar/?daterange_predefined=ALL&p=1',
    'Description': 'Events Calendar'}]}]

In [7]:
# List school names
[school["Name"] for school in bay_area_schools]

['UC Berkeley', 'California College of the Arts (CCA)']

In [8]:
# List all links
[[link["Link"] for link in school["Links"]][0] for school in bay_area_schools]

['https://art.berkeley.edu/department-calendar',
 'https://portal.cca.edu/events-calendar/?daterange_predefined=ALL&p=1']

In [9]:
# URL of the website to scrape
url = 'https://art.berkeley.edu/department-calendar'

# Send a GET request to fetch the webpage content
response = requests.get(url)
html_content = response.content

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Get p elements --> each p is a time period
p_elements = soup.find_all("p")

# Store event data
event_list = []

for p in p_elements:
    # Skip first p element if it's the intro text
    if "For more information on each event, visit the corresponding page on this website" not in p.get_text():
        events_raw = p.get_text()
        # print(events_raw)

        # How many events in this period?
        # Use this regex pattern to match occurrences of a left and right parenthesis with 3 or 4 of any character in bewteen --> (XXX) or (XXXX)
        pattern = r"\(.{3,4}\)"
        results = re.findall(pattern, events_raw)
        # Remove occurrences of "(Zoom)"
        results = [r for r in results if r != "(Zoom)"]
        num_events = len(results)
        # We don't care about the text that doesn't contain any events
        if num_events > 0:
            events = events_raw.split(") ")
            events = [e for e in events if e != "" and e not in ["(Mon", "(Tue", "(Tues", "(Wed", "(Th", "(Thu", "(Thurs", "(Fri", "(Sat", "(Sun"]]
            for e in events:
                if len(e) > 20: # This takes care of small bits of text that got caught
                    event_dict = {}
                    print(e)
                    date = e.split(": ")[0]
                    name = e.split(": ", 1)[1].split(",")[0]
                    try:
                        time = e.split(": ", 1)[1].split(",")[1].strip()
                    except IndexError:
                        time = "unknown"
                    try:
                        location = e.split(": ", 1)[1].split(",")[2].strip().replace("(Mon", "").replace("(Tu", "")\
                        .replace("(Tues", "").replace("(Wed", "").replace("(Th", "").replace("(Thu", "").replace("(Thurs", "")\
                        .replace("(Fri", "").replace("(Sat", "").replace("(Sun", "").split(".")[0]
                    except IndexError:
                        location = 'unknown'
                    print("Date:", date)
                    print("Name:", name)
                    print("Time:", time)
                    print("Location:", location)
                    print()
                    event_dict["Date"] = date
                    event_dict["Name"] = name
                    event_dict["Time"] = time
                    event_dict["Location"] = location
                    event_list.append(event_dict)

NameError: name 'BeautifulSoup' is not defined

In [207]:
p.contents

[<a href="https://dac.berkeley.edu/web-accessibility"><span style="text-decoration:underline">Accessibility</span></a>,
 <span style="text-decoration:underline"><br/></span>,
 <a href="https://ophd.berkeley.edu/policies-and-procedures/nondiscrimination-policy-statement"><span style="text-decoration:underline">Nondiscrimination</span></a>,
 <span style="text-decoration:underline"><br/></span>,
 <a href="https://security.berkeley.edu/policy/privacy-statement-uc-berkeley-websites"><span style="text-decoration:underline">Privacy</span></a>]

In [209]:
event_list

[{'Date': 'August 30',
  'Name': 'Reception: “Black chicagoland is…”',
  'Time': '5-7 pm',
  'Location': 'Worth Ryder Art Gallery (AAPB 116)'},
 {'Date': 'May. 3',
  'Name': 'Senior Exhibit “Textured Overflow.”',
  'Time': '5-7 pm',
  'Location': 'Worth Ryder Art Gallery (AAPB 116)'},
 {'Date': 'April. 12',
  'Name': 'Blood Pressure and Here Lies',
  'Time': '4-7 pm',
  'Location': 'Worth Ryder Art Gallery (AAPB 116)'},
 {'Date': 'Jan. 18',
  'Name': 'Reception: Cacophonies of Resistance',
  'Time': '4-7 pm',
  'Location': 'Worth Ryder Art Gallery (AAPB 116)'},
 {'Date': 'Feb. 8',
  'Name': 'Closing Reception and Catalogue Preview: Cacophonies of Resistance',
  'Time': '4-7 pm',
  'Location': 'Worth Ryder Art Gallery (AAPB 116)'},
 {'Date': 'Feb. 22',
  'Name': 'Reception: Emergenc(y): Afghan Lives Beyond the Forever War',
  'Time': '5-7 pm',
  'Location': 'Worth Ryder Art Gallery (AAPB 116)'},
 {'Date': 'Feb 23 - 28',
  'Name': 'Remembering a Future#2',
  'Time': 'Performance Lecture 

## Museums

In [260]:
# List out bay area museums
bay_area_museums = [
    {
        "Name": "Cantor Arts Center",
        "Links": [
            {
                "Link": "https://museum.stanford.edu/exhibitions",
                "Description": "Exhibitions"
            },
            {
                "Link": "https://museum.stanford.edu/programs", # need to use selenium
                "Description": "Programs & Events"
            }
        ]
    },
    {
        "Name": "de Young",
        "Links": [
            {
                "Link": "https://www.famsf.org/calendar",
                "Description": "Calendar"
            }
        ]
    },
    "Legion of Honor",
    "SF MoMA",
    "Institute of Contemporary Art San Jose",
    "Asian Art Museum",
    "Oakland Museum of California (OMCA)",
    "Cartoon Art Museum",
]

#### Cantor Arts Center - Programs & Events

In [233]:
# URL of the website to scrape
url = 'https://museum.stanford.edu/programs'

# Send a GET request to fetch the webpage content
response = requests.get(url)
html_content = response.content

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

In [257]:
# Find elements a class
item_elements = soup.find_all("ul")

for i, item in enumerate(item_elements):
    print(i, item.get_text())

0 

Visit


Exhibitions


Programs


Collections


Students & Faculty


About


Support


1 

Student Programs


Family Programs


Member Programs


Academic Programs




In [254]:
item_elements[0]

<ul class="nav navbar-nav">
<li class="nav-item">
<a class="nav-link" data-drupal-link-system-path="node/4" href="/visit">Visit</a>
</li>
<li class="nav-item">
<a class="nav-link" data-drupal-link-system-path="node/5" href="/exhibitions">Exhibitions</a>
</li>
<li class="nav-item active">
<a class="nav-link is-active" data-drupal-link-system-path="node/9" href="/programs">Programs</a>
</li>
<li class="nav-item">
<a class="nav-link" href="http://cantorcollection.stanford.edu/">Collections</a>
</li>
<li class="nav-item">
<a class="nav-link" data-drupal-link-system-path="node/14" href="/students-faculty">Students &amp; Faculty</a>
</li>
<li class="nav-item">
<a class="nav-link" data-drupal-link-system-path="node/125" href="/about">About</a>
</li>
<li class="nav-item">
<a class="nav-link" data-drupal-link-system-path="node/18" href="/support">Support</a>
</li>
</ul>

### de Young - Calendar

In [98]:
def get_de_young_events():
    """
    Uses BeautifulSoup to scrape event info from the de Young
    Museum's calendar.
    """
    # Collect event info
    events_list = []

    # Iterate through the pages
    for i in range(1, 10):
        
        print("PAGE ", i, " +"*40)
        url = "https://www.famsf.org/calendar" + f"?page={i}"
    
        # Send a GET request to fetch the webpage content
        response = requests.get(url)
        html_content = response.content
        
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')
        
        # Find elements a class
        group_elements = soup.find_all(class_="mt-24 xl:mt-32")

        # If no pages left, exit loop
        if len(group_elements) == 0: # this will be 0 when we've gone through all the pages
            break
        
        for e in group_elements:
            # Instantiate list to collect tags
            tags = []
            # Extract title
            title = e.find("a").find("h3").get_text().strip()
            # Extract link
            link = e.find("a").get("href")
            # Extract date info
            date = e.find(class_="mt-12 text-secondary f-subheading-1").get_text()
            # Extract venue
            try:
                venue = e.find(class_="text-inherit pt-2 ml-8").get_text()
            except AttributeError:
                venue = "unknown"
            # Add tags
            event_type = e.find(class_="text-inherit pt-2").get_text().lower() # this is Exhibition or Event
            if event_type == "exhibition":
                tags.append("exhibition")
            if "tour" in title.lower():
                tags.append("tour")
            if "family" in title.lower():
                tags.append("family")
            if "youngster" in title.lower():
                tags.append("family")
            if "reading" in title.lower():
                tags.append("reading")
            if "concert" in title.lower():
                tags.append("audio")
            if "song bath" in title.lower():
                tags.append("audio")
            if "workshop" in title.lower():
                tags.append("workshop")
            if "free" in title.lower():
                tags.append("free")
            if "opening" in title.lower():
                tags.append("opening")
            if "member" in title.lower():
                tags.append("members only")
            if "symposium" in title.lower():
                tags.append("symposium")
            if "lecture" in title.lower():
                tags.append("talk")
            if "talk" in title.lower():
                tags.append("talk")
            if "conversation" in title.lower():
                tags.append("talk")
            if "party" in title.lower():
                tags.append("party")
            if "queer" in title.lower():
                tags.append("queer")
            if "virtual" in title.lower():
                tags.append("virtual")
            # if "Through" in date:
            #     multi_day = True # if multi_day = True, then date is the end date, otherwise date is the date of the event
            #     multi_day_type = 'daily'
            # Collect data
            events_list.append(
                {
                    "Title": title,
                    "Links": [{
                        "Link": link,
                        "Text": "Event Page",
                    }],
                    "Date": date,
                    "Venue": venue,
                    "Tags": list(set(tags)) # get unique list of tags
                }
            )

    print(f"Collected {len(events_list)} events.")
    return events_list
    # return pd.DataFrame.from_records(events_list)
    
de_young_data = get_de_young_events()

PAGE  1  + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
PAGE  2  + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
PAGE  3  + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
PAGE  4  + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Collected 53 events.


In [99]:
data = []

In [100]:
# Collect data
data += de_young_data

In [78]:
# # Save data as json
# destination_file_path = '../website/data.json'

# with open(destination_file_path, 'w') as json_file:
#     json.dump(data, json_file, indent=4)

## Galleries

In [397]:
# List out bay area galleries
bay_area_galleries = [
    {
        "Name": "Berkeley Art Center",
        "Links": [
            {
                "Link": "https://www.berkeleyartcenter.org/calendar",
                "Description": "Calendar"
            },
        ]
    },
]

In [246]:
def standardize_text(string):
    # Remove punctuation
    string = string.replace('.', '')

    # Lowercase
    string = string.lower()

    # Remove spaces
    string = string.replace(' ', '')
    
    return string

In [266]:
def get_berkeley_art_center_events():
    """
    Uses BeautifulSoup to scrape event info from Berkeley Art 
    Center's calendar.
    """
    print("Collecting events from Berkeley Art Center...")

    def parse_start_end_times(string):
        times = []

        # Define a regular expression pattern to match time ranges
        time_pattern = r'(\d{1,2}(?::\d{1,2})?(?:[apAP][mM])?)\s*-\s*(\d{1,2}(?::\d{1,2})?(?:[apAP][mM])?)'

        # Use re.search to find the time pattern in the event string
        match = re.search(time_pattern, string)

        if match:
            # If a match is found, extract the start and end times
            start_time = match.group(1)
            end_time = match.group(2)
            times.append((start_time, end_time))
        else:
            # If no match is found, append None values
            times.append((None, None))

        return times

    # Collect event info
    events_list = []
    
    # URL of the website to scrape
    url = "https://www.berkeleyartcenter.org/calendar"
    
    # Send a GET request to fetch the webpage content
    response = requests.get(url)
    html_content = response.content
    
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # We'll use this later to identify dates
    days_of_week = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]

    # Get elements
    elements = soup.find_all(class_="col sqs-col-6 span-6")

    # Iterate through elements
    for e in elements:
        # If looking at past events, stop the loop
        if "past events" in e.find_previous("h1").text.lower():
            break
        # Otherwise, we're looking at current events --> collect events
        else:
            h3s = e.find_all("h3")
            # If there are any h3 elements
            if len(h3s) > 0:
                # Collect date(s)
                dates = []
                # Identify title
                title = h3s[0].get_text().strip()
                # Collect tags
                tags = []
                # Collect links
                links = []
                # Iterate through h3 elements
                for h in h3s:
                    if any(x in h.get_text().lower() for x in days_of_week):
                        dates.append(h.get_text().strip())
                    # Tag events
                    if "opening" in h.get_text().lower():
                        tags.append("opening")
                    if "conversation" in h.get_text().lower():
                        tags.append("talk")
                    if "talk" in h.get_text().lower():
                        tags.append("talk")
                    if "dialogue" in h.get_text().lower():
                        tags.append("talk")
                    if "performance" in h.get_text().lower():
                        tags.append("performance")
                    if "workshop" in h.get_text().lower():
                        tags.append("workshop")
                    if "party" in h.get_text().lower():
                        tags.append("party")
                    if "queer" in h.get_text().lower():
                        tags.append("queer")
                    if "virtual" in h.get_text().lower():
                        tags.append("virtual")
                    if "zoom" in h.get_text().lower():
                        tags.append("virtual")
                # Combined dates
                date = " | ".join(dates)
                date_text = date # copy date text to use for other purpose
                date = date.replace(" on zoom.", "")
                # Identify location
                try:
                    "berkeley art center" in h3s[2].get_text().lower()
                    venue = h3s[2].get_text()
                except IndexError:
                    venue = "Berkeley Art Center"
                else:
                    venue = "Berkeley Art Center"
                if "on zoom" in date_text.lower():
                    venue = "Virtual"
                # Get link
                link_elements = e.find_all("a")
                if len(link_elements) > 0:
                    for l in link_elements:
                        link_url = l.get("href")
                        if "eventbrite" in link_url.lower():
                            links.append({
                                "Link": link_url,
                                "Text": "Eventbrite"
                            })
                        elif ("berkeleyartcenter" in link_url.lower() and link_url.lower() != "https://www.berkeleyartcenter.org/upcoming-exhibitions"):
                            links.append({
                                "Link": link_url,
                                "Text": "Event Page"
                            })
                        else:
                            links.append({
                                "Link": link_url,
                                "Text": "unknown"
                            })
                # Get time
                print(date)
                print(dateparser.parse(date))
                      
                date = standardize_text(date)
                # print(date)
                time = dparser.parse(date)
                # print(time)
                # Collect event data
                events_list.append(
                    {
                        "Title": title,
                        "Links": links,
                        "Date": date,
                        "Time": time,
                        "Venue": venue,
                        "Tags": list(set(tags)) # get unique list of tags
                    }
                )
    print("Completed. Collected {:,} events.".format(len(events_list)))
    return events_list

berkeley_art_center_data = get_berkeley_art_center_events()

Collecting events from Berkeley Art Center...
Friday, september 8, 6–8PM
None


ParserError: Unknown string format: friday,september8,6–8pm

In [102]:
# Collect data
data += berkeley_art_center_data

In [109]:
data[:2]

[{'Title': 'Nampeyo and the Sikyátki Revival',
  'Links': [{'Link': 'https://www.famsf.org/exhibitions/nampeyo-and-sikyatki-revival',
    'Text': 'Event Page'}],
  'Date': 'Through Sep 15, 2024',
  'Venue': 'de Young',
  'Tags': ['exhibition']},
 {'Title': 'Lhola Amira: Facing the Future',
  'Links': [{'Link': 'https://www.famsf.org/exhibitions/lhola-amira',
    'Text': 'Event Page'}],
  'Date': 'Through Dec 3, 2023',
  'Venue': 'de Young',
  'Tags': ['exhibition']}]

In [249]:
berkeley_art_center_data

[{'Title': 'OÑI OCAN: A RITUAL PERFORMANCE BY COURTNEY DESIREE MORRIS',
  'Links': [{'Link': 'https://www.eventbrite.com/e/oni-ocan-a-ritual-performance-by-courtney-desiree-morris-tickets-697681232347?aff=oddtdtcreator',
    'Text': 'Eventbrite'}],
  'Date': 'friday,september8,6–8pm',
  'Time': [(None, None)],
  'Venue': 'Berkeley Art Center',
  'Tags': ['performance']},
 {'Title': 'Community Dinner',
  'Links': [{'Link': 'https://www.eventbrite.com/e/705586196307?aff=oddtdtcreator',
    'Text': 'Eventbrite'}],
  'Date': 'tuesday,august29from7–8pm',
  'Time': [(None, None)],
  'Venue': 'Virtual',
  'Tags': ['virtual']}]

In [262]:
# import dateutil.parser as dparser

# print(dparser.parse("monkey 2010-07-10 love banana",fuzzy=True))

2010-07-10 00:00:00


In [259]:
# Sample event strings
event_strings = [
    "1-2am",
    "3-4pm",
    "5:30-6:45pm",
    "7-8:15pm",
    "9:45-10:30pm",
    "11-12:00pm",
    "1:30-2:pm",
    "9:45pm-10:30pm",
]

In [260]:
for e in event_strings:
    print(e)
    print(extract_time(e))
    print()

1-2am
None

3-4pm
None

5:30-6:45pm
None

7-8:15pm
None

9:45-10:30pm
None

11-12:00pm
None

1:30-2:pm
None

9:45pm-10:30pm
None



In [239]:
import re

def extract_time_from_event(event_str):
    # Define a regular expression pattern to match time in various formats
    time_pattern = r'(\d{1,2}(?::\d{2})?)\s*-\s*(\d{1,2}(?::\d{2})?)([ap]\.?m\.?)?'

    # Use re.search to find the time pattern in the event string
    match = re.search(time_pattern, event_str, re.IGNORECASE)

    if match:
        # If a match is found, extract the start and end times and am/pm if present
        start_time = match.group(1)
        end_time = match.group(2)
        am_pm = match.group(3)
        
        # # Add "am" or "pm" if not already present
        # if am_pm and 'm' not in am_pm:
        #     am_pm = am_pm + 'm'
        
        return start_time + am_pm, end_time + am_pm
    else:
        # If no match is found, return None
        return None, None

# Sample event strings
event_strings = [
    "1-2am",
    "3-4pm",
    "5:30-6:45pm",
    "7-8:15PM",
    "9:45-10:30pm",
    "11-12:00pm",
    "1:30-2:45PM",
    "9:45pm-10:30pm",
]

# Extract time from each event string
for event_str in event_strings:
    start_time, end_time = extract_time_from_event(event_str.lower())
    print(f"Event: {event_str}")
    print(f"Start Time: {start_time}")
    print(f"End Time: {end_time}")
    print("---")


Event: 1-2am
Start Time: 1am
End Time: 2am
---
Event: 3-4pm
Start Time: 3pm
End Time: 4pm
---
Event: 5:30-6:45pm
Start Time: 5:30pm
End Time: 6:45pm
---
Event: 7-8:15PM
Start Time: 7pm
End Time: 8:15pm
---
Event: 9:45-10:30pm
Start Time: 9:45pm
End Time: 10:30pm
---
Event: 11-12:00pm
Start Time: 11pm
End Time: 12:00pm
---
Event: 1:30-2:45PM
Start Time: 1:30pm
End Time: 2:45pm
---
Event: 9:45pm-10:30pm
Start Time: None
End Time: None
---


In [233]:
extract_time_from_event(berkeley_art_center_data[0]['Date'])

(None, None)

In [241]:
def parse_start_end_times(event_strings):
    times = []

    for event_str in event_strings:
        # Define a regular expression pattern to match time ranges
        time_pattern = r'(\d{1,2}(?::\d{1,2})?(?:[apAP][mM])?)\s*-\s*(\d{1,2}(?::\d{1,2})?(?:[apAP][mM])?)'

        # Use re.search to find the time pattern in the event string
        match = re.search(time_pattern, event_str)

        if match:
            # If a match is found, extract the start and end times
            start_time = match.group(1)
            end_time = match.group(2)
            times.append((start_time, end_time))
        else:
            # If no match is found, append None values
            times.append((None, None))

    return times

# Sample event strings
event_strings = [
    "1-2:30pm",
    "4:30-4pm",
    "6-8pm",
    "3:45-4:50pm",
]

# Parse start and end times
parsed_times = parse_start_end_times(event_strings)

# Print the results
for i, (start_time, end_time) in enumerate(parsed_times, start=1):
    print(f"Event {i}:")
    print(f"Start Time: {start_time}")
    print(f"End Time: {end_time}")
    print("---")


Event 1:
Start Time: 1
End Time: 2:30pm
---
Event 2:
Start Time: 4:30
End Time: 4pm
---
Event 3:
Start Time: 6
End Time: 8pm
---
Event 4:
Start Time: 3:45
End Time: 4:50pm
---


In [112]:
import re

def extract_event_info(event_str):
    event_info = {
        "event_string": event_str,
        "start_date": None,
        "start_time": None,
        "end_time": None,
        "is_recurring": False,
    }

    # Check for recurring pattern (e.g., "Day of the week - day of the week")
    recurring_match = re.search(r'(Mon|Tues|Wed|Thurs|Fri|Sat|Sun)(?: - (Mon|Tues|Wed|Thurs|Fri|Sat|Sun))?', event_str)
    if recurring_match:
        event_info["is_recurring"] = True
        event_info["recurring_days"] = (recurring_match.group(1), recurring_match.group(2)) if recurring_match.group(2) else (recurring_match.group(1),)
    elif 'through' in event_str:
        event_info["is_recurring"] = True
    
    # Check for date (e.g., "Through Month day, year")
    date_match = re.search(r'Through (\w+ \d{1,2}, \d{4})', event_str)
    if date_match:
        event_info["start_date"] = date_match.group(1)

    # Check for time (e.g., "Day of the week \ start time", "start time - end time")
    time_match = re.search(r'(\d{1,2}:\d{2} (?:am|pm))(?: - (\d{1,2}:\d{2} (?:am|pm)))?', event_str)
    if time_match:
        event_info["start_time"] = time_match.group(1)
        event_info["end_time"] = time_match.group(2) if time_match.group(2) else None

    return event_info

# Sample event strings
event_strings = [
    "Through Sep 15, 2024",
    "Sat \ 4 pm",
    "Sat \ 9:30 am – 5:15 pm",
    "Tues – Fri, Sun \ Noon – 1 pm, 2 – 3 pm",
    "Tues – Sun \ 11:30 am + 1:30 pm",
    "Friday, september 8, 6–8PM",
]

# Extract event info
event_data = [extract_event_info(event_str) for event_str in event_strings]

# Display the extracted event information
for event_info in event_data:
    print(event_info)

{'event_string': 'Through Sep 15, 2024', 'start_date': 'Sep 15, 2024', 'start_time': None, 'end_time': None, 'is_recurring': False}
{'event_string': 'Sat \\ 4 pm', 'start_date': None, 'start_time': None, 'end_time': None, 'is_recurring': True, 'recurring_days': ('Sat',)}
{'event_string': 'Sat \\ 9:30 am – 5:15 pm', 'start_date': None, 'start_time': '9:30 am', 'end_time': None, 'is_recurring': True, 'recurring_days': ('Sat',)}
{'event_string': 'Tues – Fri, Sun \\ Noon – 1 pm, 2 – 3 pm', 'start_date': None, 'start_time': None, 'end_time': None, 'is_recurring': True, 'recurring_days': ('Tues',)}
{'event_string': 'Tues – Sun \\ 11:30 am + 1:30 pm', 'start_date': None, 'start_time': '11:30 am', 'end_time': None, 'is_recurring': True, 'recurring_days': ('Tues',)}
{'event_string': 'Friday, september 8, 6–8PM', 'start_date': None, 'start_time': None, 'end_time': None, 'is_recurring': True, 'recurring_days': ('Fri',)}


In [140]:
def get_closest_weekday(day):
    # Get the current date
    current_date = datetime.now()
    
    # Get the current weekday (0 = Monday, 6 = Sunday)
    current_weekday = current_date.weekday()
    
    # Map day names to weekday numbers
    weekday_mapping = {
        "mon": 0,
        "tues": 1,
        "wed": 2,
        "thurs": 3,
        "fri": 4,
        "sat": 5,
        "sun": 6
    }
    
    # Calculate the difference in days to the desired day
    days_difference = (weekday_mapping[str(day).lower()] - current_weekday) % 7
    
    # Calculate the date of the desired day of the week
    target_date = current_date + timedelta(days=days_difference)
    
    return target_date.strftime("%b %d, %Y")

def extract_event_info(event_str):
    event_info = {
        "event_string": event_str,
        "start_date": None,
        "start_time": None,
        "end_time": None,
        "is_recurring": False,
    }

    # Check for recurring pattern (e.g., "Day of the week - day of the week")
    recurring_match = re.search(r'(Mon|Tues|Wed|Thurs|Fri|Sat|Sun)(?: - (Mon|Tues|Wed|Thurs|Fri|Sat|Sun))?', event_str)
    if recurring_match:
        event_info["is_recurring"] = True
        event_info["recurring_days"] = (recurring_match.group(1), recurring_match.group(2)) if recurring_match.group(2) else (recurring_match.group(1),)
    elif 'through' in event_str.lower():
        event_info["is_recurring"] = True
    
    # Check for date (e.g., "Through Month day, year")
    date_match = re.search(r'Through (\w+ \d{1,2}, \d{4})', event_str)
    if date_match:
        event_info["start_date"] = date_match.group(1)
    elif any(d in str(event_str).lower() for d in ['mon', 'tues', 'wed', 'thu', 'thurs', 'fri', 'sat', 'sun']):
        weekdays = ['mon', 'tues', 'wed', 'thu', 'thurs', 'fri', 'sat', 'sun']
        pattern = r'\b(?:' + '|'.join(weekdays) + r')\b'
        matches = re.findall(pattern, event_str, re.IGNORECASE)
        if len(matches) == 1:
            event_info["start_date"] = get_closest_weekday(matches[0])
    
    # Check for time (e.g., "Day of the week \ start time", "start time - end time")
    time_match = re.search(r'(\d{1,2}:\d{2} (?:am|pm))(?: - (\d{1,2}:\d{2} (?:am|pm)))?', event_str)
    if time_match:
        event_info["start_time"] = time_match.group(1)
        event_info["end_time"] = time_match.group(2) if time_match.group(2) else None
    else:
        # Regular expression pattern to match time in various formats
        time_pattern = r'(\d{1,2}(?::\d{2})?(?:\s?[ap]m)?)'
        # Find all time matches in the string
        matches = re.findall(time_pattern, event_str, re.IGNORECASE)
        if len(matches) == 1:
            event_info["start_time"] = matches[0]

    return event_info

# Sample event strings
event_strings = [
    "Through Sep 15, 2024",
    "Sat \ 4 pm",
    "Sat \ 9:30 am – 5:15 pm",
    "Tues – Fri, Sun \ Noon – 1 pm, 2 – 3 pm",
    "Tues – Sun \ 11:30 am + 1:30 pm",
    "Friday, september 8, 6–8PM",
    "Sat \ 4pm",
    "Mon \ 7:30 pm",
]

# Extract event info
event_data = [extract_event_info(event_str) for event_str in event_strings]

# Adjust the start_date for "Day of the week \ start time" pattern
for event_info in event_data:
    if re.search(r'^\w+ \\ \d{1,2}:\d{2} (?:am|pm)$', event_info["event_string"]):
        # Extract the day of the week
        day_match = re.search(r'^(\w+) \\', event_info["event_string"])
        if day_match:
            day = day_match.group(1)
            event_info["start_date"] = get_closest_weekday(day)

# Display the extracted event information
for event_info in event_data:
    print(event_info)


{'event_string': 'Through Sep 15, 2024', 'start_date': 'Sep 15, 2024', 'start_time': None, 'end_time': None, 'is_recurring': True}
{'event_string': 'Sat \\ 4 pm', 'start_date': 'Sep 09, 2023', 'start_time': '4 pm', 'end_time': None, 'is_recurring': True, 'recurring_days': ('Sat',)}
{'event_string': 'Sat \\ 9:30 am – 5:15 pm', 'start_date': 'Sep 09, 2023', 'start_time': '9:30 am', 'end_time': None, 'is_recurring': True, 'recurring_days': ('Sat',)}
{'event_string': 'Tues – Fri, Sun \\ Noon – 1 pm, 2 – 3 pm', 'start_date': None, 'start_time': None, 'end_time': None, 'is_recurring': True, 'recurring_days': ('Tues',)}
{'event_string': 'Tues – Sun \\ 11:30 am + 1:30 pm', 'start_date': None, 'start_time': '11:30 am', 'end_time': None, 'is_recurring': True, 'recurring_days': ('Tues',)}
{'event_string': 'Friday, september 8, 6–8PM', 'start_date': None, 'start_time': None, 'end_time': None, 'is_recurring': True, 'recurring_days': ('Fri',)}
{'event_string': 'Sat \\ 4pm', 'start_date': 'Sep 09, 20

In [141]:
import re

# Sample strings
event_strings = [
    "Sat \\ 4 pm",
    "Sat \\ 4pm",
    "Mon-Tues \\ 7:30 pm",
    "Mon-Tues \\ 7:30pm",
]

# Regular expression pattern to match time in various formats
time_pattern = r'(\d{1,2}(?::\d{2})?(?:\s?[ap]m)?)'

for event_str in event_strings:
    # Find all time matches in the string
    matches = re.findall(time_pattern, event_str, re.IGNORECASE)

    if matches:
        print("Extracted times from '{}':".format(event_str))
        for match in matches:
            print(match)
    else:
        print("No time found in '{}'.".format(event_str))


Extracted times from 'Sat \ 4 pm':
4 pm
Extracted times from 'Sat \ 4pm':
4pm
Extracted times from 'Mon-Tues \ 7:30 pm':
7:30 pm
Extracted times from 'Mon-Tues \ 7:30pm':
7:30pm


In [143]:
def get_closest_weekday(day_of_week):
    # Get the current date
    current_date = datetime.now()
    
    # Get the current weekday (0 = Monday, 6 = Sunday)
    current_weekday = current_date.weekday()
    
    # Map day names to weekday numbers
    weekday_mapping = {
        "mon": 0,
        "tues": 1,
        "wed": 2,
        "thurs": 3,
        "fri": 4,
        "sat": 5,
        "sun": 6
    }
    
    # Calculate the difference in days to the desired day
    days_difference = (weekday_mapping[str(day).lower()] - current_weekday) % 7
    
    # Calculate the date of the desired day of the week
    target_date = current_date + timedelta(days=days_difference)
    
    return target_date.strftime("%b %d, %Y")

# Function to extract date, time, and recurrence information from event strings
def extract_event_info(event_str):
    event_info = {
        "event_string": event_str,
        "start_date": None,
        "start_time": None,
        "end_time": None,
        "is_recurring": False,
    }

    # Regular expression patterns
    date_pattern = r'Through (\w+ \d{1,2}, \d{4})'
    time_pattern = r'(\d{1,2}(?::\d{2})?(?:\s?[ap]m)?)'
    recurring_pattern = r'(?i)(Mon|Tue|Wed|Thu|Fri|Sat|Sun)'
    
    # Check for recurring pattern
    recurring_match = re.search(rf'{recurring_pattern} – {recurring_pattern}', event_str)
    if recurring_match:
        event_info["is_recurring"] = True
        event_info["recurring_days"] = (recurring_match.group(1), recurring_match.group(2))
    elif 'through' in event_str.lower():
        event_info["is_recurring"] = True

    # Check for date
    date_match = re.search(date_pattern, event_str)
    if date_match:
        event_info["start_date"] = date_match.group(1)

    # Check for time
    time_match = re.search(rf'{time_pattern}(?:\s?–\s?{time_pattern})?', event_str, re.IGNORECASE)
    if time_match:
        event_info["start_time"] = time_match.group(1)
        event_info["end_time"] = time_match.group(2) if time_match.group(2) else None

    # Check for day of the week (e.g., "Sat \ 4 pm")
    weekday_match = re.search(recurring_pattern, event_str, re.IGNORECASE)
    if weekday_match:
        event_info["start_date"] = get_closest_weekday(weekday_match.group(0))

    return event_info

# Sample event strings
event_strings = [
    "Through Sep 15, 2024",
    "Sat \ 4 pm",
    "Sat \ 9:30 am – 5:15 pm",
    "Tues – Fri, Sun \ Noon – 1 pm, 2 – 3 pm",
    "Tues – Sun \ 11:30 am + 1:30 pm",
    "Friday, september 8, 6–8PM",
    "Mon-Tues \ 7:30 pm",
    "Mon-Tues \ 7:30pm",
]

# Extract event info
event_data = [extract_event_info(event_str) for event_str in event_strings]

# Display the extracted event information
for event_info in event_data:
    print(event_info)


{'event_string': 'Through Sep 15, 2024', 'start_date': 'Sep 15, 2024', 'start_time': '15', 'end_time': None, 'is_recurring': True}
{'event_string': 'Sat \\ 4 pm', 'start_date': 'Sep 11, 2023', 'start_time': '4 pm', 'end_time': None, 'is_recurring': False}
{'event_string': 'Sat \\ 9:30 am – 5:15 pm', 'start_date': 'Sep 11, 2023', 'start_time': '9:30 am', 'end_time': '5:15 pm', 'is_recurring': False}
{'event_string': 'Tues – Fri, Sun \\ Noon – 1 pm, 2 – 3 pm', 'start_date': 'Sep 11, 2023', 'start_time': '1 pm', 'end_time': None, 'is_recurring': False}
{'event_string': 'Tues – Sun \\ 11:30 am + 1:30 pm', 'start_date': 'Sep 11, 2023', 'start_time': '11:30 am', 'end_time': None, 'is_recurring': False}
{'event_string': 'Friday, september 8, 6–8PM', 'start_date': 'Sep 11, 2023', 'start_time': '8', 'end_time': None, 'is_recurring': False}
{'event_string': 'Mon-Tues \\ 7:30 pm', 'start_date': 'Sep 11, 2023', 'start_time': '7:30 pm', 'end_time': None, 'is_recurring': False}
{'event_string': 'Mon

In [169]:
# def extract_dates_from_event(event_str):
event_dates = {
    "raw_text": event_str,
    "start_date": None,
    "end_date": None
}

# Regular expression pattern to match date formats
date_pattern = r'(?i)(through\s)?(\w{3,9})?\.?\s?(\d{1,2})(?:,)?(?:\s(\d{2,4}))?'

# Find all matches of the date pattern in the event string
date_matches = re.finditer(date_pattern, event_str)

for match in date_matches:
    print(match)
    # is_end_date = match.group(1) and "through" in match.group(1).lower()
    # month = match.group(2)
    # day = match.group(3)
    # year = match.group(4)

    # if not year:  # If year is missing, use a default year (e.g., current year)
    #     year = datetime.now().year
    # elif len(year) == 2:  # If year is in YY format, convert it to YYYY
    #     year = f'20{year}'

    # date_string = f'{year}-{month}-{day}'

    # if is_end_date:
    #     event_dates["end_date"] = date_string
    # else:
    #     event_dates["start_date"] = date_string

# return event_dates

# Sample event strings
event_strings = [
    "Through Sep 15, 2024",
    "Sat, Sep 15",
    "October 5",
    "Tue. November 22, 23",
    "Thursday, 10/6, 2022",
    "Fri 12, 2023",
]

# # Extract event dates
# event_dates_data = [extract_dates_from_event(event_str) for event_str in event_strings]

# # Display the extracted event dates
# for event_dates in event_dates_data:
#     print(event_dates['raw_text'])
#     print('\t', event_dates['start_date'])
#     print('\t', event_dates['end_date'])


<re.Match object; span=(8, 20), match='september 8,'>
<re.Match object; span=(20, 22), match=' 6'>
<re.Match object; span=(23, 24), match='8'>


In [170]:
# Sample event strings
event_strings = [
    "Event from Jan 01 – Feb 15, 2024",
    "Another event from Mar 10 – Mar 20, 22",
    "No date range here",
    "Event from Dec 25 – Jan 01",
]

# Regular expression pattern to find date ranges
date_range_pattern = r'\b\w{3} \d{1,2} – \w{3} \d{1,2}(?:,? (?:20)?\d{2,4})?\b'

for event_str in event_strings:
    date_ranges = re.findall(date_range_pattern, event_str, re.IGNORECASE)
    if date_ranges:
        print(f"Event: {event_str}")
        for date_range in date_ranges:
            print(f"Date Range: {date_range}")
        print()

Event: Event from Jan 01 – Feb 15, 2024
Date Range: Jan 01 – Feb 15, 2024

Event: Another event from Mar 10 – Mar 20, 22
Date Range: Mar 10 – Mar 20, 22

Event: Event from Dec 25 – Jan 01
Date Range: Dec 25 – Jan 01



In [175]:
def is_event_recurring(event_str):
    """
    Function that takes in a string containing date and time information
    about an event and identifies whether it is a recurring event or not.
    """
    # Check if the string contains "through" (case insensitive)
    if re.search(r'through', event_str, re.IGNORECASE):
        return True

    # Check if the string contains a "–" or "," between two days of the week or abbreviations
    if re.search(r'(\w{3} – \w{3}|\w{3}, \w{3})', event_str):
        return True

    # Check if the string contains a "+" and has more than one day of the week or abbreviations
    if re.search(r'\+\s*(\w{3}|\w{3},)+', event_str):
        return True

    return False

# Sample event strings
event_strings = [
    "Event through Sep 15, 2024",
    "Event from Mon – Fri",
    "Event on Mon, Wed, Fri",
    "Recurring event: Tue + Thu",
    "One-time event: Wed",
]

for event_str in event_strings:
    recurring = is_event_recurring(event_str)
    print(f"'{event_str}' is recurring: {recurring}")


'Event through Sep 15, 2024' is recurring: True
'Event from Mon – Fri' is recurring: True
'Event on Mon, Wed, Fri' is recurring: True
'Recurring event: Tue + Thu' is recurring: True
'One-time event: Wed' is recurring: False


In [230]:
def remove_chars_before_first_number(input_string):
    # Use regular expression to find the first number
    match = re.search(r'\d', input_string)
    
    if match:
        # If a number is found, return the substring starting from that position
        return input_string[match.start():]
    else:
        # If no number is found, return the original string
        return input_string

def extract_times(event_str):
    # # Regular expression to match times in various formats
    # time_pattern_1 = r'(\d{1,2}\s*-\s*\d{1,2}\s*(?:[APap][Mm])?)'
    # # Find all matches in the event string
    # matches_1 = re.findall(time_pattern_1, event_str)

    # # Regular expression to match times in various formats
    # time_pattern_2 = r'(\d{1,2}(?::\d{2})?\s*(?:am|pm))\s*[-–]\s*(\d{1,2}(?::\d{2})?\s*(?:am|pm))'
    # # Find all matches in the event string
    # matches_2 = re.findall(time_pattern_2, event_str)

    # # Regular expression to match times in various formats
    # time_pattern_3 = r'(\d{1,2}:\d{2}\s*[APap][Mm])\s*[+-,]\s*(\d{1,2}:\d{2}\s*[APap][Mm])'
    # # Find all matches in the event string
    # matches_3 = re.findall(time_pattern_3, event_str, re.IGNORECASE)

    # # Regular expression to match times in various formats
    # time_pattern_4 = r'(\d{1,2}(?::\d{2})?\s*(?:am|pm))'
    # # Find all matches in the event string
    # matches_4 = re.findall(time_pattern_4, event_str, re.IGNORECASE)

    # # If we found a match, we're good
    # if len(matches_1) > 0:
    #     return matches_1
    # # Otherwise, keep looking 
    # elif len(matches_2) > 0:
    #     return matches_2
    # # Otherwise, keep looking 
    # elif len(matches_3) > 0:
    #     return matches_3

    # Lowercase string
    event_str = event_str.lower()

    # Remove all characters before the first number
    event_str = remove_chars_before_first_number(event_str)



    # # Start list to collect times
    # times = []

    # # Split by cases of '+'
    # times = event_str.split('+')

    # list_to_remove = [
    #     'vent from ',
    #     'event on ',
        
    # ]

    # for t in times:
    #     if 

    # if '-' in event_str:
        

    return event_str

# Sample event strings
event_strings = [
    "Event from 6-8PM",
    "Event on 6 - 8 PM",
    "Recurring event: 6-8pm",
    "One-time event: 6 - 8 pm",
    "Event from 10-11am",
    "Tues – Sun \ 11:30 am + 1:30 pm",
    "11:30am, 1:30pm",
    "11:30 am, 1:30 pm",
    "11:30AM, 1:30 pm",
    "11:30 am,1:30 pm",
    "Sat \ 9:30 am – 5:15 pm",
]

for event_str in event_strings:
    times = extract_times(event_str)
    print(f"Times in '{event_str}': {times}")

print()

for event_str in [d["Date"] for d in data]:
    times = extract_times(event_str)
    print(f"Times in '{event_str}': {times}")

Times in 'Event from 6-8PM': 6-8pm
Times in 'Event on 6 - 8 PM': 6 - 8 pm
Times in 'Recurring event: 6-8pm': 6-8pm
Times in 'One-time event: 6 - 8 pm': 6 - 8 pm
Times in 'Event from 10-11am': 10-11am
Times in 'Tues – Sun \ 11:30 am + 1:30 pm': 11:30 am + 1:30 pm
Times in '11:30am, 1:30pm': 11:30am, 1:30pm
Times in '11:30 am, 1:30 pm': 11:30 am, 1:30 pm
Times in '11:30AM, 1:30 pm': 11:30am, 1:30 pm
Times in '11:30 am,1:30 pm': 11:30 am,1:30 pm
Times in 'Sat \ 9:30 am – 5:15 pm': 9:30 am – 5:15 pm

Times in 'Through Sep 15, 2024': 15, 2024
Times in 'Through Dec 3, 2023': 3, 2023
Times in 'Sat \ 4 pm': 4 pm
Times in 'Sat \ 9:30 am – 5:15 pm': 9:30 am – 5:15 pm
Times in 'Tues – Fri, Sun \ Noon – 1 pm, 2 – 3 pm': 1 pm, 2 – 3 pm
Times in 'Through Oct 15, 2023': 15, 2023
Times in 'Through Oct 1, 2023': 1, 2023
Times in 'Sat \ 9:30 am – 5:15 pm': 9:30 am – 5:15 pm
Times in 'Through Sep 24, 2023': 24, 2023
Times in 'Tues – Sun \ 11:30 am + 1:30 pm': 11:30 am + 1:30 pm
Times in 'Sat \ noon – 12:

In [None]:
# Times in 'Event from 6-8PM': ['6-8PM']
# Times in 'Event on 6 - 8 PM': ['6 - 8 PM']
# Times in 'Recurring event: 6-8pm': ['6-8pm']
# Times in 'One-time event: 6 - 8 pm': ['6 - 8 pm']
# Times in 'Event from 10-11am': ['10-11am']
# Times in 'Tues – Sun \ 11:30 am + 1:30 pm': [('11:30 am', '1:30 pm')]
# Times in '11:30am, 1:30pm': [('11:30am', '1:30pm')]
# Times in '11:30 am, 1:30 pm': [('11:30 am', '1:30 pm')]
# Times in '11:30AM, 1:30 pm': [('11:30AM', '1:30 pm')]
# Times in '11:30 am,1:30 pm': [('11:30 am', '1:30 pm')]
# Times in 'Sat \ 9:30 am – 5:15 pm': [('9:30 am', '5:15 pm')]

In [212]:
def standardize_text(event_str):
    # Remove punctuation
    event_str = event_str.replace('.', '')

    # Lowercase
    event_str = event_str.lower()

    # Remove spaces
    event_str = event_str.replace(' ', '')
    
    return event_str

In [217]:
def name_tbd(event_str):
    # Add recurring dates, end date, end_time as needed
    
    event_dt_info = {
        "event_string": event_str,
        "start_date": None,
        "start_time": None,
        "is_recurring": False,
    }

    # Standardize text
    event_str_standardized = standardize_text(event_str)
    event_dt_info["standardized"] = event_str_standardized
        
    # Is it a recurring event?
    event_dt_info["is_recurring"] = is_event_recurring(event_str_standardized)

    # If it is a recurring event
    if event_dt_info["is_recurring"]:
        # Get list of dates
        dates = []
        # # Get start date
        # start_date = min(dates)
        # # Get end date
        # end_date = max(dates)
    # If it is not a recurring event
    else:
        pass
        # Identify date
        # Identify time

    # Extrace times
    times = extract_times(event_str_standardized)
    event_dt_info["times"] = times

    return event_dt_info

# Sample event strings
event_strings = [
    "Tues – Sun \ 11:30 am + 1:30 pm",
    "Through Sep 15, 2024",
    "Sat \ 4 pm",
    "Sat \ 9:30 am – 5:15 pm",
    "Apr 6 – Aug 18, 2024",
    "Friday, september 8, 6–8PM",
    "Friday, september 8, 6 - 8PM",
    "Friday, september 8, 6 - 8 PM",
]

for event_str in event_strings:
    info = name_tbd(event_str)
    # print(info)
    print(info["standardized"])
    print(info["times"])
    print()
    # recurring = info['is_recurring']
    # event_string = info["event_string"]
    # print(f"'{event_string}' is recurring: {recurring}")

tues–sun\11:30am+1:30pm
[('11:30am', '1:30pm')]

throughsep15,2024
None

sat\4pm
None

sat\9:30am–5:15pm
[('9:30am', '5:15pm')]

apr6–aug18,2024
None

friday,september8,6–8pm
None

friday,september8,6-8pm
['6-8pm']

friday,september8,6-8pm
['6-8pm']



In [220]:
[d["Date"] for d in data]

['Through Sep 15, 2024',
 'Through Dec 3, 2023',
 'Sat \\ 4 pm',
 'Sat \\ 9:30 am – 5:15 pm',
 'Tues – Fri, Sun \\ Noon – 1 pm, 2 – 3 pm',
 'Through Oct 15, 2023',
 'Through Oct 1, 2023',
 'Sat \\ 9:30 am – 5:15 pm',
 'Through Sep 24, 2023',
 'Tues – Sun \\ 11:30 am + 1:30 pm',
 'Sat \\ noon – 12:40 pm, 3 – 3:40 pm',
 'Through Mar 24, 2024',
 'Through Dec 31, 2023',
 'Sat, Sun, Tues \\ 11:30 am – 1:30 pm',
 'Select Saturdays \\ 10:30 am',
 'Through Feb 25, 2024',
 'Select Saturdays \\ 11 am – 3 pm',
 'Through Mar 10, 2024',
 'Select Saturdays \\ 11 am – 3 pm',
 'Sat, Sun, Free Tues \\ 11:30 am + 1 pm',
 'Wed, Sep 6 \\ Noon PDT',
 'Wed, Sep 6 \\ 10:30 am – 3 pm',
 'Thu, Sep 7 \\ 1 – 2:15 pm',
 'Sat, Sep 9 \\ 2 – 3:30 pm',
 'Mon, Sep 11',
 'Sat, Sep 16 \\ 1 pm',
 'Sat, Sep 16 \\ 3 – 5 pm',
 'Sat, Sep 23 \\ 10 am – noon + 1 – 3 pm',
 'Sat, Sep 23 \\ 10 – 11 am + 1 – 2 pm',
 'Thu, Sep 28 + Fri, Sep 29 \\ 9:30 am – 5:15 pm',
 'Sep 30, 2023 – Jan 7, 2024',
 'Sat, Sep 30 \\ 11 am – 4 pm',
 'S

## Save data

In [103]:
# Define the file path
file_path = '../website/data.json'

# Step 3: Save the modified data back to the JSON file
with open(file_path, 'w') as json_file:
    json.dump(data, json_file, indent=4)  # You can use indent for pretty formatting if desired

## Selenium exp

In [279]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service as ChromeService

# Create a new instance of Chrome driver
driver = webdriver.Chrome()

# Open the URL
url = "https://www.famsf.org/calendar"
driver.get(url)

# Wait for the page to load (you might need to adjust the time)
driver.implicitly_wait(10)

# Find event elements
event_elements = driver.find_elements(By.CLASS_NAME, 'group')

# # Iterate through event elements and print event titles
# for event_element in event_elements:
#     title_element = event_element.find_element(By.CLASS_NAME, 'views-field-title')
#     event_title = title_element.text
#     print(event_title)

# Close the browser
driver.quit()

In [281]:
event_elements[0].text

MaxRetryError: HTTPConnectionPool(host='localhost', port=53924): Max retries exceeded with url: /session/d426708e3a39b73193e484b33dea1cdd/element/CC5CF4F38ADB67F31F262A39E7C7B575_element_31/text (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x10d770370>: Failed to establish a new connection: [Errno 61] Connection refused'))